From eb60100297f51c59d0e7180dedfbabaa2902e480 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Wed, 1 Apr 2026 21:25:23 +0300 Subject: [PATCH] fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state - build: remove bee-nccl-gpu-stress from rm -f list so shell script from overlay is not silently dropped from the ISO - smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress, bee-nccl-gpu-stress, all_reduce_perf - netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors - auto/config: reduce loglevel 7→3 to show clean systemd output on boot - auto/config: blacklist snd_hda_intel and related audio modules (unused on servers) - package-lists: remove firmware-intel-sound and firmware-amd-graphics from base list; move firmware-amd-graphics to bee-amd variant only - bible-local: mark memtest ADR resolved, document working solution Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/collector/nvidia.go | 80 ++++++++++++++----- audit/internal/collector/nvidia_test.go | 8 +- .../2026-04-01-memtest-build-strategy.md | 64 ++++++++++++++- iso/builder/auto/config | 2 +- iso/builder/build.sh | 1 - .../config/package-lists/bee-amd.list.chroot | 3 + .../config/package-lists/bee.list.chroot | 2 - iso/builder/smoketest.sh | 8 ++ iso/overlay/usr/local/bin/netconf | 5 ++ 9 files changed, 148 insertions(+), 25 deletions(-) diff --git a/audit/internal/collector/nvidia.go b/audit/internal/collector/nvidia.go index c7c0db0..afda777 100644 --- a/audit/internal/collector/nvidia.go +++ b/audit/internal/collector/nvidia.go @@ -13,14 +13,18 @@ import ( const nvidiaVendorID = 0x10de type nvidiaGPUInfo struct { - BDF string - Serial string - VBIOS string - TemperatureC *float64 - PowerW *float64 - ECCUncorrected *int64 
- ECCCorrected *int64 - HWSlowdown *bool + BDF string + Serial string + VBIOS string + TemperatureC *float64 + PowerW *float64 + ECCUncorrected *int64 + ECCCorrected *int64 + HWSlowdown *bool + PCIeLinkGenCurrent *int + PCIeLinkGenMax *int + PCIeLinkWidthCur *int + PCIeLinkWidthMax *int } // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi. @@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) { out, err := exec.Command( "nvidia-smi", - "--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", + "--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max", "--format=csv,noheader,nounits", ).Output() if err != nil { @@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) { if len(rec) == 0 { continue } - if len(rec) < 9 { - return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec)) + if len(rec) < 13 { + return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec)) } bdf := normalizePCIeBDF(rec[1]) @@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) { } info := nvidiaGPUInfo{ - BDF: bdf, - Serial: strings.TrimSpace(rec[2]), - VBIOS: strings.TrimSpace(rec[3]), - TemperatureC: parseMaybeFloat(rec[4]), - PowerW: parseMaybeFloat(rec[5]), - ECCUncorrected: parseMaybeInt64(rec[6]), - ECCCorrected: parseMaybeInt64(rec[7]), - HWSlowdown: parseMaybeBool(rec[8]), + BDF: bdf, + Serial: strings.TrimSpace(rec[2]), + VBIOS: strings.TrimSpace(rec[3]), + TemperatureC: parseMaybeFloat(rec[4]), + 
PowerW: parseMaybeFloat(rec[5]), + ECCUncorrected: parseMaybeInt64(rec[6]), + ECCCorrected: parseMaybeInt64(rec[7]), + HWSlowdown: parseMaybeBool(rec[8]), + PCIeLinkGenCurrent: parseMaybeInt(rec[9]), + PCIeLinkGenMax: parseMaybeInt(rec[10]), + PCIeLinkWidthCur: parseMaybeInt(rec[11]), + PCIeLinkWidthMax: parseMaybeInt(rec[12]), } result[bdf] = info } @@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 { return &n } +func parseMaybeInt(v string) *int { + v = strings.TrimSpace(v) + if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") { + return nil + } + n, err := strconv.Atoi(v) + if err != nil { + return nil + } + return &n +} + +func pcieLinkGenLabel(gen int) string { + return fmt.Sprintf("Gen%d", gen) +} + func parseMaybeBool(v string) *bool { v = strings.TrimSpace(strings.ToLower(v)) switch v { @@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) { if info.HWSlowdown != nil { dev.HWSlowdown = info.HWSlowdown } + // Override PCIe link speed/width with nvidia-smi driver values. + // sysfs current_link_speed reflects the instantaneous physical link state and + // can show Gen1 when the GPU is idle due to ASPM power management. The driver + // knows the negotiated speed regardless of the current power state. 
+ if info.PCIeLinkGenCurrent != nil { + s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent) + dev.LinkSpeed = &s + } + if info.PCIeLinkGenMax != nil { + s := pcieLinkGenLabel(*info.PCIeLinkGenMax) + dev.MaxLinkSpeed = &s + } + if info.PCIeLinkWidthCur != nil { + dev.LinkWidth = info.PCIeLinkWidthCur + } + if info.PCIeLinkWidthMax != nil { + dev.MaxLinkWidth = info.PCIeLinkWidthMax + } } diff --git a/audit/internal/collector/nvidia_test.go b/audit/internal/collector/nvidia_test.go index 2ccde36..e845a2a 100644 --- a/audit/internal/collector/nvidia_test.go +++ b/audit/internal/collector/nvidia_test.go @@ -6,7 +6,7 @@ import ( ) func TestParseNVIDIASMIQuery(t *testing.T) { - raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n" + raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n" byBDF, err := parseNVIDIASMIQuery(raw) if err != nil { t.Fatalf("parse failed: %v", err) @@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) { if gpu.HWSlowdown == nil || *gpu.HWSlowdown { t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown) } + if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 { + t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent) + } + if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 { + t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax) + } } func TestNormalizePCIeBDF(t *testing.T) { diff --git a/bible-local/decisions/2026-04-01-memtest-build-strategy.md b/bible-local/decisions/2026-04-01-memtest-build-strategy.md index 49ff9d9..297c1ca 100644 --- a/bible-local/decisions/2026-04-01-memtest-build-strategy.md +++ b/bible-local/decisions/2026-04-01-memtest-build-strategy.md @@ -1,7 +1,7 @@ # Decision: Treat memtest as explicit ISO content, not as trusted live-build magic **Date:** 2026-04-01 -**Status:** active +**Status:** resolved ## Context @@ -160,3 +160,65 @@ Current implementation direction: - But validation 
output is only trustworthy if ISO reading itself succeeded. A "missing memtest" warning without a successful ISO read is not evidence. - If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change. + +## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be) + +This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso` +and validated again in subsequent builds. The final ISO contains all required memtest artifacts. + +### Components + +**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`** + +Runs inside the live-build binary phase. Does not patch bootloader files at hook time — +those files may not exist yet. Instead: + +- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first. +- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty. +- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time. + If they do not exist, the hook warns and continues (does not fail). + +Set the `BEE_REQUIRE_MEMTEST=1` env var to turn these warnings into hard errors when needed. + +**2. Post-`lb build` recovery step in `build.sh`** + +After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree +contains all required memtest artifacts. If not: + +- Copies/extracts memtest binaries into `binary/boot/`. +- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly. +- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild + the ISO with the patched tree. + +This is the deterministic safety net: even if the hook runs at the wrong time, the recovery +step handles the final `binary/` tree after live-build has written all bootloader configs. + +**3. 
ISO validation hardening** + +The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called +as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and +handled — it does not abort the build prematurely. + +ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite. +If the reader fails, the validator reports a reader error explicitly, not a memtest warning. +This prevents the false-negative loop that wasted builds v3.14–v3.19 on 2026-04-01. + +### Why this works when earlier attempts did not + +The earlier patterns all shared a single flaw: they assumed a single build-time point +(hook or source template) would be the last writer of bootloader configs and memtest payloads. +In live-build on Debian Bookworm that assumption is false — live-build continues writing +bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads. + +The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized +`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree. +There is no ordering dependency to get wrong. 
+ +### Do not revert + +Do not remove the recovery step or the hook without a fresh real ISO build proving +live-build alone produces all four required artifacts: +- `boot/memtest86+x64.bin` +- `boot/memtest86+x64.efi` +- memtest entry in `boot/grub/grub.cfg` +- memtest entry in `isolinux/live.cfg` diff --git a/iso/builder/auto/config b/iso/builder/auto/config index 662181c..214039c 100755 --- a/iso/builder/auto/config +++ b/iso/builder/auto/config @@ -32,7 +32,7 @@ lb config noauto \ --memtest memtest86+ \ --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ - --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ + --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \ --apt-recommends false \ --chroot-squashfs-compression-type zstd \ "${@}" diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 5e97e14..96087cf 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -862,7 +862,6 @@ rm -f \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ - "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \ "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ diff --git a/iso/builder/config/package-lists/bee-amd.list.chroot b/iso/builder/config/package-lists/bee-amd.list.chroot index f7cefca..8157879 100644 --- a/iso/builder/config/package-lists/bee-amd.list.chroot +++ b/iso/builder/config/package-lists/bee-amd.list.chroot @@ -1,3 +1,6 @@ +# AMD GPU firmware +firmware-amd-graphics + # AMD ROCm — GPU monitoring, 
bandwidth test, and compute stress (RVS GST) rocm-smi-lib=%%ROCM_SMI_VERSION%% rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%% diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index 4879eff..567737c 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -71,9 +71,7 @@ lightdm firmware-linux-free firmware-linux-nonfree firmware-misc-nonfree -firmware-amd-graphics firmware-realtek -firmware-intel-sound firmware-bnx2 firmware-bnx2x firmware-cavium diff --git a/iso/builder/smoketest.sh b/iso/builder/smoketest.sh index b458523..025249b 100644 --- a/iso/builder/smoketest.sh +++ b/iso/builder/smoketest.sh @@ -52,6 +52,14 @@ else fail "nvidia-smi: NOT FOUND" fi +for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do + if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then + ok "$tool found: $p" + else + fail "$tool: NOT FOUND" + fi +done + echo "" echo "-- NVIDIA modules --" KO_DIR="/usr/local/lib/nvidia" diff --git a/iso/overlay/usr/local/bin/netconf b/iso/overlay/usr/local/bin/netconf index 651caf8..7bd368f 100755 --- a/iso/overlay/usr/local/bin/netconf +++ b/iso/overlay/usr/local/bin/netconf @@ -3,6 +3,11 @@ # Type 'a' at any prompt to abort, 'b' to go back. set -e +# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed. +if [ "$(id -u)" -ne 0 ]; then + exec sudo "$0" "$@" +fi + abort() { echo "Aborted."; exit 0; } ask() {