package collector import ( "bee/audit/internal/schema" "fmt" "log/slog" "os" "os/exec" "path/filepath" "strconv" "strings" ) func collectPCIe() []schema.HardwarePCIeDevice { out, err := exec.Command("lspci", "-vmm", "-D").Output() if err != nil { slog.Warn("pcie: lspci failed", "err", err) return nil } devs := parseLspci(string(out)) slog.Info("pcie: collected", "count", len(devs)) return devs } func parseLspci(output string) []schema.HardwarePCIeDevice { // lspci -vmm -D outputs blank-line separated records, each field is "Key:\tValue" var devs []schema.HardwarePCIeDevice for _, block := range strings.Split(output, "\n\n") { block = strings.TrimSpace(block) if block == "" { continue } fields := map[string]string{} for _, line := range strings.Split(block, "\n") { idx := strings.Index(line, ":\t") if idx < 0 { continue } key := strings.TrimSpace(line[:idx]) val := strings.TrimSpace(line[idx+2:]) fields[key] = val } if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) { continue } dev := parseLspciDevice(fields) devs = append(devs, dev) } return devs } func shouldIncludePCIeDevice(class, vendor, device string) bool { c := strings.ToLower(strings.TrimSpace(class)) v := strings.ToLower(strings.TrimSpace(vendor)) d := strings.ToLower(strings.TrimSpace(device)) if c == "" { return true } // Keep inventory focused on useful replaceable components, not chipset/virtual noise. excluded := []string{ "host bridge", "isa bridge", "pci bridge", "co-processor", "performance counter", "performance counters", "ram memory", "system peripheral", "communication controller", "signal processing controller", "usb controller", "smbus", "audio device", "serial bus controller", "unassigned class", "non-essential instrumentation", } for _, bad := range excluded { if strings.Contains(c, bad) { return false } } // Exclude BMC/management virtual VGA adapters — these are firmware video chips, // not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA). if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") { bmcPatterns := []string{ "management system chip", "management controller", "ibmc", "idrac", "ilo vga", "aspeed", "matrox", } for _, bad := range bmcPatterns { if strings.Contains(d, bad) { return false } } } if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") { internalAMDPatterns := []string{ "dummy function", "reserved spp", "ptdma", "cryptographic coprocessor pspcpp", "pspcpp", } for _, bad := range internalAMDPatterns { if strings.Contains(d, bad) { return false } } } return true } func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice { dev := schema.HardwarePCIeDevice{} present := true dev.Present = &present status := statusOK dev.Status = &status // Slot is the BDF: "0000:00:02.0" bdfStr := fields["Slot"] if bdfStr != "" { dev.Slot = &bdfStr dev.BDF = &bdfStr // parse vendor_id and device_id from sysfs vendorID, deviceID := readPCIIDs(bdfStr) if vendorID != 0 { dev.VendorID = &vendorID } if deviceID != 0 { dev.DeviceID = &deviceID } if numaNode, ok := readPCINumaNode(bdfStr); ok { dev.NUMANode = &numaNode } else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok { dev.NUMANode = &numaNode } if group, ok := readPCIIOMMUGroup(bdfStr); ok { dev.IOMMUGroup = &group } if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok { dev.LinkWidth = &width } if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok { dev.MaxLinkWidth = &width } if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok { linkSpeed := normalizePCILinkSpeed(speed) if linkSpeed != "" { dev.LinkSpeed = &linkSpeed } } if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok { linkSpeed := normalizePCILinkSpeed(speed) if linkSpeed != "" { dev.MaxLinkSpeed = &linkSpeed } } } if v := fields["Class"]; v != "" { class := mapPCIeDeviceClass(v) dev.DeviceClass = &class } if v := fields["Vendor"]; v != "" { dev.Manufacturer = &v } if v := fields["Device"]; v != "" { dev.Model = &v } // SVendor/SDevice available but not in schema — skip // Detect NVLink bridge mezzanine cards (CPU→HGX internal link). // These are Mellanox x2 devices with no host net interfaces and a DeviceName // containing "NVLINK". The targeted lspci call is only executed for the small // number of narrow-link Mellanox cards that pass the cheap pre-filter. if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) { markNVLinkBridge(&dev) } // Warn (or Critical for NVLink bridges) if PCIe link is running below max. applyPCIeLinkSpeedWarning(&dev) return dev } // readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the // iommu_group symlink in sysfs: .../devices//iommu_group -> .../kernel/iommu_groups/ func readPCIIOMMUGroup(bdf string) (int, bool) { link := "/sys/bus/pci/devices/" + bdf + "/iommu_group" target, err := os.Readlink(link) if err != nil { return 0, false } n, err := strconv.Atoi(filepath.Base(target)) if err != nil { return 0, false } return n, true } // readPCIIDs reads vendor and device IDs from sysfs for a given BDF. func readPCIIDs(bdf string) (vendorID, deviceID int) { base := "/sys/bus/pci/devices/" + bdf if v, err := readHexFile(base + "/vendor"); err == nil { vendorID = v } if v, err := readHexFile(base + "/device"); err == nil { deviceID = v } return } func readHexFile(path string) (int, error) { out, err := exec.Command("cat", path).Output() if err != nil { return 0, err } s := strings.TrimSpace(strings.TrimPrefix(string(out), "0x")) n, err := strconv.ParseInt(s, 16, 64) return int(n), err } func readPCINumaNode(bdf string) (int, bool) { value, ok := readPCIIntAttribute(bdf, "numa_node") if !ok || value < 0 { return 0, false } return value, true } func parsePCINumaNode(raw string) (int, bool) { raw = strings.TrimSpace(raw) if raw == "" { return 0, false } value, err := strconv.Atoi(raw) if err != nil || value < 0 { return 0, false } return value, true } func readPCIIntAttribute(bdf, attribute string) (int, bool) { out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output() if err != nil { return 0, false } value, err := strconv.Atoi(strings.TrimSpace(string(out))) if err != nil || value < 0 { return 0, false } return value, true } func readPCIStringAttribute(bdf, attribute string) (string, bool) { out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output() if err != nil { return "", false } value := strings.TrimSpace(string(out)) if value == "" { return "", false } return value, true } // applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is // below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards // get Critical because they are fixed internal connectors that must always train // to max speed — any downgrade signals a hardware fault. // // Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and // their link state has no operational impact. This covers management endpoints // (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never // activates but that lspci still reports with link stats. func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) { if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil { return } if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) { return } if dev.BDF != nil { if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 { return } } desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed) dev.ErrorDescription = &desc isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge" if isNVLinkBridge { crit := statusCritical dev.Status = &crit } else { warn := statusWarning dev.Status = &warn } } // pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4). // Returns 0 for unrecognised values so comparisons fail safe. func pcieLinkSpeedRank(gen string) int { switch gen { case "Gen1": return 1 case "Gen2": return 2 case "Gen3": return 3 case "Gen4": return 4 case "Gen5": return 5 case "Gen6": return 6 default: return 0 } } func normalizePCILinkSpeed(raw string) string { raw = strings.TrimSpace(strings.ToLower(raw)) switch { case strings.Contains(raw, "2.5"): return "Gen1" case strings.Contains(raw, "5.0"): return "Gen2" case strings.Contains(raw, "8.0"): return "Gen3" case strings.Contains(raw, "16.0"): return "Gen4" case strings.Contains(raw, "32.0"): return "Gen5" case strings.Contains(raw, "64.0"): return "Gen6" default: return "" } }