From 614b7cad618a3c331ee574c4107d8d9762eda6b1 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 25 Mar 2026 20:00:38 +0300 Subject: [PATCH] Improve PCIe inventory and hardware identity collection --- audit/internal/collector/collector.go | 1 + audit/internal/collector/finalize.go | 13 +- audit/internal/collector/finalize_test.go | 17 +++ audit/internal/collector/nic_telemetry.go | 5 + .../internal/collector/nic_telemetry_test.go | 74 ++++++++++- audit/internal/collector/pcie.go | 36 ++++- audit/internal/collector/pcie_filter_test.go | 71 +++++++--- audit/internal/collector/pcie_identity.go | 123 ++++++++++++++++++ .../internal/collector/pcie_identity_test.go | 47 +++++++ internal/chart | 2 +- 10 files changed, 367 insertions(+), 22 deletions(-) create mode 100644 audit/internal/collector/pcie_identity.go create mode 100644 audit/internal/collector/pcie_identity_test.go diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index 16f042e..c91e866 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -36,6 +36,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc) snap.Storage = collectStorage() snap.PCIeDevices = collectPCIe() + snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices) diff --git a/audit/internal/collector/finalize.go b/audit/internal/collector/finalize.go index b953fec..9d1353a 100644 --- a/audit/internal/collector/finalize.go +++ b/audit/internal/collector/finalize.go @@ -41,7 +41,18 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage { func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply { out := make([]schema.HardwarePowerSupply, 0, len(psus)) for _, psu := range psus { - if psu.SerialNumber == nil || *psu.SerialNumber == "" { + hasIdentity := false + switch { + case psu.SerialNumber != nil && *psu.SerialNumber != "": + hasIdentity = true + case psu.Slot != nil && *psu.Slot != "": + hasIdentity = true + case psu.Model != nil && *psu.Model != "": + hasIdentity = true + case psu.Vendor != nil && *psu.Vendor != "": + hasIdentity = true + } + if !hasIdentity { continue } out = append(out, psu) diff --git a/audit/internal/collector/finalize_test.go b/audit/internal/collector/finalize_test.go index 7f4d45c..2a5a156 100644 --- a/audit/internal/collector/finalize_test.go +++ b/audit/internal/collector/finalize_test.go @@ -61,3 +61,20 @@ func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) { t.Fatalf("duplicate serial should stay unchanged: %q", got) } } + +func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) { + slot := "0" + status := statusOK + + got := filterPSUs([]schema.HardwarePowerSupply{ + {Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}}, + {HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}}, + }) + + if len(got) != 1 { + t.Fatalf("len(got)=%d want 1", len(got)) + } + if got[0].Slot == nil || *got[0].Slot != "0" { + t.Fatalf("unexpected kept PSU: %+v", got[0]) + } +} diff --git a/audit/internal/collector/nic_telemetry.go b/audit/internal/collector/nic_telemetry.go index d951f89..2ddfabf 100644 --- a/audit/internal/collector/nic_telemetry.go +++ b/audit/internal/collector/nic_telemetry.go @@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw } iface := ifaces[0] devs[i].MacAddresses = collectInterfaceMACs(ifaces) + if devs[i].SerialNumber == nil { + if serial := queryPCIDeviceSerial(bdf); serial != "" { + devs[i].SerialNumber = &serial + } + } if devs[i].Firmware == nil { if out, err := ethtoolInfoQuery(iface); err == nil { diff --git a/audit/internal/collector/nic_telemetry_test.go b/audit/internal/collector/nic_telemetry_test.go index c5570f9..6873a49 100644 --- a/audit/internal/collector/nic_telemetry_test.go +++ b/audit/internal/collector/nic_telemetry_test.go @@ -1,6 +1,10 @@ package collector -import "testing" +import ( + "bee/audit/internal/schema" + "fmt" + "testing" +) func TestParseSFPDOM(t *testing.T) { raw := ` @@ -29,6 +33,74 @@ func TestParseSFPDOM(t *testing.T) { } } +func TestParseLSPCIDetailSerial(t *testing.T) { + raw := ` +05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] + Serial number: NIC-SN-12345 +` + if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" { + t.Fatalf("serial=%q want %q", got, "NIC-SN-12345") + } +} + +func TestParsePCIVPDSerial(t *testing.T) { + raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'} + if got := parsePCIVPDSerial(raw); got != "MT123456" { + t.Fatalf("serial=%q want %q", got, "MT123456") + } +} + +func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) { + origDetail := queryPCILSPCIDetail + origVPD := readPCIVPDFile + origIfaces := netIfacesByBDF + origReadMAC := readNetAddressFile + origEth := ethtoolInfoQuery + origModule := ethtoolModuleQuery + t.Cleanup(func() { + queryPCILSPCIDetail = origDetail + readPCIVPDFile = origVPD + netIfacesByBDF = origIfaces + readNetAddressFile = origReadMAC + ethtoolInfoQuery = origEth + ethtoolModuleQuery = origModule + }) + + queryPCILSPCIDetail = func(bdf string) (string, error) { + if bdf != "0000:18:00.0" { + t.Fatalf("unexpected bdf: %s", bdf) + } + return "Serial number: NIC-SN-98765\n", nil + } + readPCIVPDFile = func(string) ([]byte, error) { + return nil, fmt.Errorf("no vpd needed") + } + netIfacesByBDF = func(string) []string { return []string{"eth0"} } + readNetAddressFile = func(iface string) (string, error) { + if iface != "eth0" { + t.Fatalf("unexpected iface: %s", iface) + } + return "aa:bb:cc:dd:ee:ff", nil + } + ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") } + ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") } + + class := "EthernetController" + bdf := "0000:18:00.0" + devs := []schema.HardwarePCIeDevice{{ + DeviceClass: &class, + BDF: &bdf, + }} + + out := enrichPCIeWithNICTelemetry(devs) + if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" { + t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber) + } + if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" { + t.Fatalf("mac_addresses=%v", out[0].MacAddresses) + } +} + func TestDBMValue(t *testing.T) { tests := []struct { in string diff --git a/audit/internal/collector/pcie.go b/audit/internal/collector/pcie.go index 265c018..69e04ee 100644 --- a/audit/internal/collector/pcie.go +++ b/audit/internal/collector/pcie.go @@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice { val := strings.TrimSpace(line[idx+2:]) fields[key] = val } - if !shouldIncludePCIeDevice(fields["Class"]) { + if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) { continue } dev := parseLspciDevice(fields) @@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice { return devs } -func shouldIncludePCIeDevice(class string) bool { +func shouldIncludePCIeDevice(class, vendor, device string) bool { c := strings.ToLower(strings.TrimSpace(class)) + v := strings.ToLower(strings.TrimSpace(vendor)) + d := strings.ToLower(strings.TrimSpace(device)) if c == "" { return true } @@ -68,12 +70,28 @@ func shouldIncludePCIeDevice(class string) bool { "audio device", "serial bus controller", "unassigned class", + "non-essential instrumentation", } for _, bad := range excluded { if strings.Contains(c, bad) { return false } } + + if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") { + internalAMDPatterns := []string{ + "dummy function", + "reserved spp", + "ptdma", + "cryptographic coprocessor pspcpp", + "pspcpp", + } + for _, bad := range internalAMDPatterns { + if strings.Contains(d, bad) { + return false + } + } + } return true } @@ -98,6 +116,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice { } if numaNode, ok := readPCINumaNode(bdf); ok { dev.NUMANode = &numaNode + } else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok { + dev.NUMANode = &numaNode } if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok { dev.LinkWidth = &width @@ -165,6 +185,18 @@ func readPCINumaNode(bdf string) (int, bool) { return value, true } +func parsePCINumaNode(raw string) (int, bool) { + raw = strings.TrimSpace(raw) + if raw == "" { + return 0, false + } + value, err := strconv.Atoi(raw) + if err != nil || value < 0 { + return 0, false + } + return value, true +} + func readPCIIntAttribute(bdf, attribute string) (int, bool) { out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output() if err != nil { diff --git a/audit/internal/collector/pcie_filter_test.go b/audit/internal/collector/pcie_filter_test.go index 92f85d7..b32b7c2 100644 --- a/audit/internal/collector/pcie_filter_test.go +++ b/audit/internal/collector/pcie_filter_test.go @@ -8,32 +8,42 @@ import ( func TestShouldIncludePCIeDevice(t *testing.T) { tests := []struct { - class string - want bool + name string + class string + vendor string + device string + want bool }{ - {"USB controller", false}, - {"System peripheral", false}, - {"Audio device", false}, - {"Host bridge", false}, - {"PCI bridge", false}, - {"SMBus", false}, - {"Performance counters", false}, - {"Ethernet controller", true}, - {"RAID bus controller", true}, - {"Non-Volatile memory controller", true}, - {"VGA compatible controller", true}, + {name: "usb", class: "USB controller", want: false}, + {name: "system peripheral", class: "System peripheral", want: false}, + {name: "audio", class: "Audio device", want: false}, + {name: "host bridge", class: "Host bridge", want: false}, + {name: "pci bridge", class: "PCI bridge", want: false}, + {name: "smbus", class: "SMBus", want: false}, + {name: "perf", class: "Performance counters", want: false}, + {name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false}, + {name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false}, + {name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false}, + {name: "ethernet", class: "Ethernet controller", want: true}, + {name: "raid", class: "RAID bus controller", want: true}, + {name: "nvme", class: "Non-Volatile memory controller", want: true}, + {name: "vga", class: "VGA compatible controller", want: true}, + {name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true}, } for _, tt := range tests { - got := shouldIncludePCIeDevice(tt.class) - if got != tt.want { - t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want) - } + t.Run(tt.name, func(t *testing.T) { + got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device) + if got != tt.want { + t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want) + } + }) } } func TestParseLspci_filtersExcludedClasses(t *testing.T) { input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" + + "Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" + "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n" devs := parseLspci(input) @@ -51,6 +61,21 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) { } } +func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) { + input := "" + + "Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" + + "Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" + + "Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n" + + devs := parseLspci(input) + if len(devs) != 1 { + t.Fatalf("expected 1 remaining device, got %d", len(devs)) + } + if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" { + t.Fatalf("unexpected remaining device: %+v", devs[0]) + } +} + func TestPCIeJSONUsesSlotNotBDF(t *testing.T) { input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n" @@ -68,6 +93,18 @@ func TestPCIeJSONUsesSlotNotBDF(t *testing.T) { } } +func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) { + input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n" + + devs := parseLspci(input) + if len(devs) != 1 { + t.Fatalf("expected 1 device, got %d", len(devs)) + } + if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 { + t.Fatalf("numa_node=%v want 1", devs[0].NUMANode) + } +} + func TestNormalizePCILinkSpeed(t *testing.T) { tests := []struct { raw string diff --git a/audit/internal/collector/pcie_identity.go b/audit/internal/collector/pcie_identity.go new file mode 100644 index 0000000..12fae74 --- /dev/null +++ b/audit/internal/collector/pcie_identity.go @@ -0,0 +1,123 @@ +package collector + +import ( + "bee/audit/internal/schema" + "log/slog" + "os" + "os/exec" + "path/filepath" + "strings" +) + +var ( + queryPCILSPCIDetail = func(bdf string) (string, error) { + out, err := exec.Command("lspci", "-vv", "-s", bdf).Output() + if err != nil { + return "", err + } + return string(out), nil + } + readPCIVPDFile = func(bdf string) ([]byte, error) { + return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd")) + } +) + +func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice { + enriched := 0 + for i := range devs { + if !shouldProbePCIeSerial(devs[i]) { + continue + } + bdf := normalizePCIeBDF(*devs[i].BDF) + if bdf == "" { + continue + } + if serial := queryPCIDeviceSerial(bdf); serial != "" { + devs[i].SerialNumber = &serial + enriched++ + } + } + if enriched > 0 { + slog.Info("pcie: serials enriched", "count", enriched) + } + return devs +} + +func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool { + if dev.BDF == nil || dev.SerialNumber != nil { + return false + } + if dev.DeviceClass == nil { + return false + } + class := strings.TrimSpace(*dev.DeviceClass) + return isNICClass(class) || isGPUClass(class) +} + +func queryPCIDeviceSerial(bdf string) string { + if out, err := queryPCILSPCIDetail(bdf); err == nil { + if serial := parseLSPCIDetailSerial(out); serial != "" { + return serial + } + } + if raw, err := readPCIVPDFile(bdf); err == nil { + return parsePCIVPDSerial(raw) + } + return "" +} + +func parseLSPCIDetailSerial(raw string) string { + for _, line := range strings.Split(raw, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + lower := strings.ToLower(line) + if !strings.Contains(lower, "serial number:") { + continue + } + idx := strings.Index(line, ":") + if idx < 0 { + continue + } + if serial := strings.TrimSpace(line[idx+1:]); serial != "" { + return serial + } + } + return "" +} + +func parsePCIVPDSerial(raw []byte) string { + for i := 0; i+3 < len(raw); i++ { + if raw[i] != 'S' || raw[i+1] != 'N' { + continue + } + length := int(raw[i+2]) + if length <= 0 || length > 64 || i+3+length > len(raw) { + continue + } + value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00")) + if !looksLikeSerial(value) { + continue + } + return value + } + return "" +} + +func looksLikeSerial(value string) bool { + if len(value) < 4 { + return false + } + hasAlphaNum := false + for _, r := range value { + switch { + case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9': + hasAlphaNum = true + case strings.ContainsRune(" -_./:", r): + default: + return false + } + } + return hasAlphaNum +} diff --git a/audit/internal/collector/pcie_identity_test.go b/audit/internal/collector/pcie_identity_test.go new file mode 100644 index 0000000..4402d66 --- /dev/null +++ b/audit/internal/collector/pcie_identity_test.go @@ -0,0 +1,47 @@ +package collector + +import ( + "bee/audit/internal/schema" + "fmt" + "testing" +) + +func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) { + origDetail := queryPCILSPCIDetail + origVPD := readPCIVPDFile + t.Cleanup(func() { + queryPCILSPCIDetail = origDetail + readPCIVPDFile = origVPD + }) + + queryPCILSPCIDetail = func(bdf string) (string, error) { + if bdf != "0000:11:00.0" { + t.Fatalf("unexpected bdf: %s", bdf) + } + return "Serial number: GPU-SN-12345\n", nil + } + readPCIVPDFile = func(string) ([]byte, error) { + return nil, fmt.Errorf("no vpd needed") + } + + class := "DisplayController" + bdf := "0000:11:00.0" + devs := []schema.HardwarePCIeDevice{{ + DeviceClass: &class, + BDF: &bdf, + }} + + out := enrichPCIeWithPCISerials(devs) + if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" { + t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber) + } +} + +func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) { + class := "StorageController" + bdf := "0000:19:00.0" + dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf} + if shouldProbePCIeSerial(dev) { + t.Fatal("unexpected probe for storage controller") + } +} diff --git a/internal/chart b/internal/chart index 05db699..ac8120c 160000 --- a/internal/chart +++ b/internal/chart @@ -1 +1 @@ -Subproject commit 05db6994d4a77bc95cc9d96892f81875f2f9fa01 +Subproject commit ac8120c8ab800bb3067efcada50bc4272dc8f76a