fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup

- nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead
  of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state
- build: remove bee-nccl-gpu-stress from rm -f list so shell script from
  overlay is not silently dropped from the ISO
- smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress,
  bee-nccl-gpu-stress, all_reduce_perf
- netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors
- auto/config: reduce loglevel 7→3 to show clean systemd output on boot
- auto/config: blacklist snd_hda_intel and related audio modules (unused on servers)
- package-lists: remove firmware-intel-sound and firmware-amd-graphics from
  base list; move firmware-amd-graphics to bee-amd variant only
- bible-local: mark memtest ADR resolved, document working solution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-01 21:25:23 +03:00
parent 2baf3be640
commit eb60100297
9 changed files with 148 additions and 25 deletions

View File

@@ -13,14 +13,18 @@ import (
const nvidiaVendorID = 0x10de const nvidiaVendorID = 0x10de
type nvidiaGPUInfo struct { type nvidiaGPUInfo struct {
BDF string BDF string
Serial string Serial string
VBIOS string VBIOS string
TemperatureC *float64 TemperatureC *float64
PowerW *float64 PowerW *float64
ECCUncorrected *int64 ECCUncorrected *int64
ECCCorrected *int64 ECCCorrected *int64
HWSlowdown *bool HWSlowdown *bool
PCIeLinkGenCurrent *int
PCIeLinkGenMax *int
PCIeLinkWidthCur *int
PCIeLinkWidthMax *int
} }
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi. // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) { func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
out, err := exec.Command( out, err := exec.Command(
"nvidia-smi", "nvidia-smi",
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
"--format=csv,noheader,nounits", "--format=csv,noheader,nounits",
).Output() ).Output()
if err != nil { if err != nil {
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
if len(rec) == 0 { if len(rec) == 0 {
continue continue
} }
if len(rec) < 9 { if len(rec) < 13 {
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec)) return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
} }
bdf := normalizePCIeBDF(rec[1]) bdf := normalizePCIeBDF(rec[1])
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
} }
info := nvidiaGPUInfo{ info := nvidiaGPUInfo{
BDF: bdf, BDF: bdf,
Serial: strings.TrimSpace(rec[2]), Serial: strings.TrimSpace(rec[2]),
VBIOS: strings.TrimSpace(rec[3]), VBIOS: strings.TrimSpace(rec[3]),
TemperatureC: parseMaybeFloat(rec[4]), TemperatureC: parseMaybeFloat(rec[4]),
PowerW: parseMaybeFloat(rec[5]), PowerW: parseMaybeFloat(rec[5]),
ECCUncorrected: parseMaybeInt64(rec[6]), ECCUncorrected: parseMaybeInt64(rec[6]),
ECCCorrected: parseMaybeInt64(rec[7]), ECCCorrected: parseMaybeInt64(rec[7]),
HWSlowdown: parseMaybeBool(rec[8]), HWSlowdown: parseMaybeBool(rec[8]),
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
PCIeLinkGenMax: parseMaybeInt(rec[10]),
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
} }
result[bdf] = info result[bdf] = info
} }
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
return &n return &n
} }
func parseMaybeInt(v string) *int {
v = strings.TrimSpace(v)
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
return nil
}
n, err := strconv.Atoi(v)
if err != nil {
return nil
}
return &n
}
func pcieLinkGenLabel(gen int) string {
return fmt.Sprintf("Gen%d", gen)
}
func parseMaybeBool(v string) *bool { func parseMaybeBool(v string) *bool {
v = strings.TrimSpace(strings.ToLower(v)) v = strings.TrimSpace(strings.ToLower(v))
switch v { switch v {
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
if info.HWSlowdown != nil { if info.HWSlowdown != nil {
dev.HWSlowdown = info.HWSlowdown dev.HWSlowdown = info.HWSlowdown
} }
// Override PCIe link speed/width with nvidia-smi driver values.
// sysfs current_link_speed reflects the instantaneous physical link state and
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
// knows the negotiated speed regardless of the current power state.
if info.PCIeLinkGenCurrent != nil {
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
dev.LinkSpeed = &s
}
if info.PCIeLinkGenMax != nil {
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
dev.MaxLinkSpeed = &s
}
if info.PCIeLinkWidthCur != nil {
dev.LinkWidth = info.PCIeLinkWidthCur
}
if info.PCIeLinkWidthMax != nil {
dev.MaxLinkWidth = info.PCIeLinkWidthMax
}
} }

View File

@@ -6,7 +6,7 @@ import (
) )
func TestParseNVIDIASMIQuery(t *testing.T) { func TestParseNVIDIASMIQuery(t *testing.T) {
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n" raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
byBDF, err := parseNVIDIASMIQuery(raw) byBDF, err := parseNVIDIASMIQuery(raw)
if err != nil { if err != nil {
t.Fatalf("parse failed: %v", err) t.Fatalf("parse failed: %v", err)
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
if gpu.HWSlowdown == nil || *gpu.HWSlowdown { if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown) t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
} }
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
}
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
}
} }
func TestNormalizePCIeBDF(t *testing.T) { func TestNormalizePCIeBDF(t *testing.T) {

View File

@@ -1,7 +1,7 @@
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic # Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
**Date:** 2026-04-01 **Date:** 2026-04-01
**Status:** active **Status:** resolved
## Context ## Context
@@ -160,3 +160,65 @@ Current implementation direction:
- But validation output is only trustworthy if ISO reading itself succeeded. A - But validation output is only trustworthy if ISO reading itself succeeded. A
"missing memtest" warning without a successful ISO read is not evidence. "missing memtest" warning without a successful ISO read is not evidence.
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change. - If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
### Components
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
those files may not exist yet. Instead:
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
If they do not exist, the hook warns and continues (does not fail).
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
**2. Post-`lb build` recovery step in `build.sh`**
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
contains all required memtest artifacts. If not:
- Copies/extracts memtest binaries into `binary/boot/`.
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
the ISO with the patched tree.
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
step handles the final `binary/` tree after live-build has written all bootloader configs.
**3. ISO validation hardening**
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
handled — it does not abort the build prematurely.
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
This prevents the false-negative loop that burned 2026-04-01 v3.14v3.19.
### Why this works when earlier attempts did not
The earlier patterns all shared a single flaw: they assumed a single build-time point
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
In live-build on Debian Bookworm that assumption is false — live-build continues writing
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
There is no ordering dependency to get wrong.
### Do not revert
Do not remove the recovery step or the hook without a fresh real ISO build proving
live-build alone produces all four required artifacts:
- `boot/memtest86+x64.bin`
- `boot/memtest86+x64.efi`
- memtest entry in `boot/grub/grub.cfg`
- memtest entry in `isolinux/live.cfg`

View File

@@ -32,7 +32,7 @@ lb config noauto \
--memtest memtest86+ \ --memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
--apt-recommends false \ --apt-recommends false \
--chroot-squashfs-compression-type zstd \ --chroot-squashfs-compression-type zstd \
"${@}" "${@}"

View File

@@ -862,7 +862,6 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \ "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \

View File

@@ -1,3 +1,6 @@
# AMD GPU firmware
firmware-amd-graphics
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST) # AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%% rocm-smi-lib=%%ROCM_SMI_VERSION%%
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%% rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%

View File

@@ -71,9 +71,7 @@ lightdm
firmware-linux-free firmware-linux-free
firmware-linux-nonfree firmware-linux-nonfree
firmware-misc-nonfree firmware-misc-nonfree
firmware-amd-graphics
firmware-realtek firmware-realtek
firmware-intel-sound
firmware-bnx2 firmware-bnx2
firmware-bnx2x firmware-bnx2x
firmware-cavium firmware-cavium

View File

@@ -52,6 +52,14 @@ else
fail "nvidia-smi: NOT FOUND" fail "nvidia-smi: NOT FOUND"
fi fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
else
fail "$tool: NOT FOUND"
fi
done
echo "" echo ""
echo "-- NVIDIA modules --" echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia" KO_DIR="/usr/local/lib/nvidia"

View File

@@ -3,6 +3,11 @@
# Type 'a' at any prompt to abort, 'b' to go back. # Type 'a' at any prompt to abort, 'b' to go back.
set -e set -e
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
if [ "$(id -u)" -ne 0 ]; then
exec sudo "$0" "$@"
fi
abort() { echo "Aborted."; exit 0; } abort() { echo "Aborted."; exit 0; }
ask() { ask() {