fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup

- nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead
  of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state
- build: remove bee-nccl-gpu-stress from rm -f list so shell script from
  overlay is not silently dropped from the ISO
- smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress,
  bee-nccl-gpu-stress, all_reduce_perf
- netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors
- auto/config: reduce loglevel 7→3 to show clean systemd output on boot
- auto/config: blacklist snd_hda_intel and related audio modules (unused on servers)
- package-lists: remove firmware-intel-sound and firmware-amd-graphics from
  base list; move firmware-amd-graphics to bee-amd variant only
- bible-local: mark memtest ADR resolved, document working solution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-01 21:25:23 +03:00
parent 2baf3be640
commit eb60100297
9 changed files with 148 additions and 25 deletions

View File

@@ -32,7 +32,7 @@ lb config noauto \
--memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
--apt-recommends false \
--chroot-squashfs-compression-type zstd \
"${@}"

View File

@@ -862,7 +862,6 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \

View File

@@ -1,3 +1,6 @@
# AMD GPU firmware
firmware-amd-graphics
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%%
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%

View File

@@ -71,9 +71,7 @@ lightdm
firmware-linux-free
firmware-linux-nonfree
firmware-misc-nonfree
firmware-amd-graphics
firmware-realtek
firmware-intel-sound
firmware-bnx2
firmware-bnx2x
firmware-cavium

View File

@@ -52,6 +52,14 @@ else
fail "nvidia-smi: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
else
fail "$tool: NOT FOUND"
fi
done
echo ""
echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia"

View File

@@ -3,6 +3,11 @@
# Type 'a' at any prompt to abort, 'b' to go back.
set -e
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
if [ "$(id -u)" -ne 0 ]; then
exec sudo "$0" "$@"
fi
abort() { echo "Aborted."; exit 0; }
ask() {