From bb1218ddd4ec09eb4ae98ca026b8cc7f502acb3d Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Fri, 10 Apr 2026 13:57:26 +0300 Subject: [PATCH] Fix GPU inventory: exclude BMC virtual VGA, show real NVIDIA model names Two issues: 1. BMC/management VGA chips (e.g. Huawei iBMC Hi171x, ASPEED) were included in GPU inventory because shouldIncludePCIeDevice only checked the PCI class, not the device name. Added a name-based filter for known BMC/management patterns when the class is VGA/display/3d. 2. New NVIDIA GPUs (e.g. RTX PRO 6000 Blackwell, device ID 2bb5) showed as "Device 2bb5" because lspci's database lags behind. Added "name" to the nvidia-smi query and use it to override dev.Model during enrichment. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/collector/nvidia.go | 33 +++++++++++--------- audit/internal/collector/nvidia_test.go | 5 ++- audit/internal/collector/pcie.go | 19 +++++++++++ audit/internal/collector/pcie_filter_test.go | 2 ++ 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/audit/internal/collector/nvidia.go b/audit/internal/collector/nvidia.go index 3a3ba31..246aafb 100644 --- a/audit/internal/collector/nvidia.go +++ b/audit/internal/collector/nvidia.go @@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de type nvidiaGPUInfo struct { Index int BDF string + Name string Serial string VBIOS string TemperatureC *float64 @@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str continue } + if v := strings.TrimSpace(info.Name); v != "" { + devs[i].Model = &v + } if v := strings.TrimSpace(info.Serial); v != "" { devs[i].SerialNumber = &v } @@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) { out, err := exec.Command( "nvidia-smi", - "--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max", + "--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max", "--format=csv,noheader,nounits", ).Output() if err != nil { @@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) { if len(rec) == 0 { continue } - if len(rec) < 13 { - return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec)) + if len(rec) < 14 { + return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec)) } bdf := normalizePCIeBDF(rec[1]) @@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) { info := nvidiaGPUInfo{ Index: parseRequiredInt(rec[0]), BDF: bdf, - Serial: strings.TrimSpace(rec[2]), - VBIOS: strings.TrimSpace(rec[3]), - TemperatureC: parseMaybeFloat(rec[4]), - PowerW: parseMaybeFloat(rec[5]), - ECCUncorrected: parseMaybeInt64(rec[6]), - ECCCorrected: parseMaybeInt64(rec[7]), - HWSlowdown: parseMaybeBool(rec[8]), - PCIeLinkGenCurrent: parseMaybeInt(rec[9]), - PCIeLinkGenMax: parseMaybeInt(rec[10]), - PCIeLinkWidthCur: parseMaybeInt(rec[11]), - PCIeLinkWidthMax: parseMaybeInt(rec[12]), + Name: strings.TrimSpace(rec[2]), + Serial: strings.TrimSpace(rec[3]), + VBIOS: strings.TrimSpace(rec[4]), + TemperatureC: parseMaybeFloat(rec[5]), + PowerW: parseMaybeFloat(rec[6]), + ECCUncorrected: parseMaybeInt64(rec[7]), + ECCCorrected: parseMaybeInt64(rec[8]), + HWSlowdown: parseMaybeBool(rec[9]), + PCIeLinkGenCurrent: parseMaybeInt(rec[10]), + PCIeLinkGenMax: parseMaybeInt(rec[11]), + PCIeLinkWidthCur: parseMaybeInt(rec[12]), + PCIeLinkWidthMax: parseMaybeInt(rec[13]), } result[bdf] = info } diff --git a/audit/internal/collector/nvidia_test.go b/audit/internal/collector/nvidia_test.go index 5c0e02b..320dc8f 100644 --- a/audit/internal/collector/nvidia_test.go +++ b/audit/internal/collector/nvidia_test.go @@ -6,7 +6,7 @@ import ( ) func TestParseNVIDIASMIQuery(t *testing.T) { - raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n" + raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n" byBDF, err := parseNVIDIASMIQuery(raw) if err != nil { t.Fatalf("parse failed: %v", err) @@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) { if !ok { t.Fatalf("gpu by normalized bdf not found") } + if gpu.Name != "NVIDIA H100 80GB HBM3" { + t.Fatalf("name: got %q", gpu.Name) + } if gpu.Serial != "GPU-SERIAL-1" { t.Fatalf("serial: got %q", gpu.Serial) } diff --git a/audit/internal/collector/pcie.go b/audit/internal/collector/pcie.go index 6d91db8..2db1510 100644 --- a/audit/internal/collector/pcie.go +++ b/audit/internal/collector/pcie.go @@ -79,6 +79,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool { } } + // Exclude BMC/management virtual VGA adapters — these are firmware video chips, + // not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA). + if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") { + bmcPatterns := []string{ + "management system chip", + "management controller", + "ibmc", + "idrac", + "ilo vga", + "aspeed", + "matrox", + } + for _, bad := range bmcPatterns { + if strings.Contains(d, bad) { + return false + } + } + } + if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") { internalAMDPatterns := []string{ "dummy function", diff --git a/audit/internal/collector/pcie_filter_test.go b/audit/internal/collector/pcie_filter_test.go index 8be8b02..8d2d898 100644 --- a/audit/internal/collector/pcie_filter_test.go +++ b/audit/internal/collector/pcie_filter_test.go @@ -29,6 +29,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) { {name: "raid", class: "RAID bus controller", want: true}, {name: "nvme", class: "Non-Volatile memory controller", want: true}, {name: "vga", class: "VGA compatible controller", want: true}, + {name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false}, + {name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false}, {name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true}, }