fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup
- nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state - build: remove bee-nccl-gpu-stress from rm -f list so shell script from overlay is not silently dropped from the ISO - smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress, bee-nccl-gpu-stress, all_reduce_perf - netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors - auto/config: reduce loglevel 7→3 to show clean systemd output on boot - auto/config: blacklist snd_hda_intel and related audio modules (unused on servers) - package-lists: remove firmware-intel-sound and firmware-amd-graphics from base list; move firmware-amd-graphics to bee-amd variant only - bible-local: mark memtest ADR resolved, document working solution Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,14 +13,18 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt(v string) *int {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** active
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
@@ -160,3 +160,65 @@ Current implementation direction:
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-negative loop that burned 2026-04-01 v3.14–v3.19.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
|
||||
@@ -32,7 +32,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -862,7 +862,6 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
|
||||
@@ -71,9 +71,7 @@ lightdm
|
||||
firmware-linux-free
|
||||
firmware-linux-nonfree
|
||||
firmware-misc-nonfree
|
||||
firmware-amd-graphics
|
||||
firmware-realtek
|
||||
firmware-intel-sound
|
||||
firmware-bnx2
|
||||
firmware-bnx2x
|
||||
firmware-cavium
|
||||
|
||||
@@ -52,6 +52,14 @@ else
|
||||
fail "nvidia-smi: NOT FOUND"
|
||||
fi
|
||||
|
||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA modules --"
|
||||
KO_DIR="/usr/local/lib/nvidia"
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||
set -e
|
||||
|
||||
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
exec sudo "$0" "$@"
|
||||
fi
|
||||
|
||||
abort() { echo "Aborted."; exit 0; }
|
||||
|
||||
ask() {
|
||||
|
||||
Reference in New Issue
Block a user