fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup
- nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead of
  sysfs to avoid false Gen1 readings when GPU is in ASPM idle state
- build: remove bee-nccl-gpu-stress from rm -f list so shell script from
  overlay is not silently dropped from the ISO
- smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress,
  bee-nccl-gpu-stress, all_reduce_perf
- netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors
- auto/config: reduce loglevel 7→3 to show clean systemd output on boot
- auto/config: blacklist snd_hda_intel and related audio modules (unused on
  servers)
- package-lists: remove firmware-intel-sound and firmware-amd-graphics from
  base list; move firmware-amd-graphics to bee-amd variant only
- bible-local: mark memtest ADR resolved, document working solution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,14 +13,18 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
// nvidiaGPUInfo holds the per-GPU values parsed from one row of nvidia-smi CSV
// output. parseNVIDIASMIQuery builds one value per row and stores it in a map
// keyed by BDF.
//
// The PCIe pointer fields are nil when parseMaybeInt cannot parse the column;
// the other pointer fields presumably follow the same convention via their
// parseMaybe* helpers — verify against those helpers.
//
// NOTE(review): this is a diff rendering. The first eight field lines appear to
// be the removed (pre-realignment) lines and the second eight their
// replacements; only one set exists in the actual source file.
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
// BDF is the pci.bus_id column after normalizePCIeBDF; it is also the map key.
BDF string
|
||||
// Serial is the whitespace-trimmed serial column.
Serial string
|
||||
// VBIOS is the whitespace-trimmed vbios_version column.
VBIOS string
|
||||
// TemperatureC is parsed from the temperature.gpu column.
TemperatureC *float64
|
||||
// PowerW is parsed from the power.draw column.
PowerW *float64
|
||||
// ECCUncorrected is parsed from ecc.errors.uncorrected.aggregate.total.
ECCUncorrected *int64
|
||||
// ECCCorrected is parsed from ecc.errors.corrected.aggregate.total.
ECCCorrected *int64
|
||||
// HWSlowdown is parsed from clocks_throttle_reasons.hw_slowdown.
HWSlowdown *bool
|
||||
// PCIeLinkGenCurrent is parsed from pcie.link.gen.current.
PCIeLinkGenCurrent *int
|
||||
// PCIeLinkGenMax is parsed from pcie.link.gen.max.
PCIeLinkGenMax *int
|
||||
// PCIeLinkWidthCur is parsed from pcie.link.width.current.
PCIeLinkWidthCur *int
|
||||
// PCIeLinkWidthMax is parsed from pcie.link.width.max.
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
// parseMaybeInt converts a single nvidia-smi CSV field into an optional int.
//
// Surrounding whitespace is ignored. It returns nil when the field is empty,
// when it is one of the placeholder strings "n/a", "not supported" or
// "[not supported]" (compared case-insensitively), or when the remaining text
// is not a valid base-10 integer. Otherwise it returns a pointer to the
// parsed value.
func parseMaybeInt(v string) *int {
	field := strings.TrimSpace(v)
	if field == "" {
		return nil
	}

	// Placeholders emitted in place of a number; treated as "no value".
	for _, placeholder := range []string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(field, placeholder) {
			return nil
		}
	}

	if parsed, err := strconv.Atoi(field); err == nil {
		return &parsed
	}
	return nil
}
|
||||
|
||||
// pcieLinkGenLabel renders a numeric PCIe generation as the label "Gen<N>",
// for example 4 becomes "Gen4". The number is formatted as given; no range
// check is applied.
func pcieLinkGenLabel(gen int) string {
	const prefix = "Gen"
	return prefix + fmt.Sprint(gen)
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user