Disabled PCIe devices (sysfs enable==0) carry no data traffic; their link state has no operational impact. Switchtec PCIe switch management endpoints on NVIDIA HGX H100 baseboards (and similar fabric controllers) train at reduced speed intentionally and were producing spurious warnings. Check is vendor-agnostic: reads enable attribute via existing helper, no vendor/device ID hardcoding. Documented in bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
351 lines
9.0 KiB
Go
351 lines
9.0 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
func collectPCIe() []schema.HardwarePCIeDevice {
|
|
out, err := exec.Command("lspci", "-vmm", "-D").Output()
|
|
if err != nil {
|
|
slog.Warn("pcie: lspci failed", "err", err)
|
|
return nil
|
|
}
|
|
devs := parseLspci(string(out))
|
|
slog.Info("pcie: collected", "count", len(devs))
|
|
return devs
|
|
}
|
|
|
|
func parseLspci(output string) []schema.HardwarePCIeDevice {
|
|
// lspci -vmm -D outputs blank-line separated records, each field is "Key:\tValue"
|
|
var devs []schema.HardwarePCIeDevice
|
|
for _, block := range strings.Split(output, "\n\n") {
|
|
block = strings.TrimSpace(block)
|
|
if block == "" {
|
|
continue
|
|
}
|
|
fields := map[string]string{}
|
|
for _, line := range strings.Split(block, "\n") {
|
|
idx := strings.Index(line, ":\t")
|
|
if idx < 0 {
|
|
continue
|
|
}
|
|
key := strings.TrimSpace(line[:idx])
|
|
val := strings.TrimSpace(line[idx+2:])
|
|
fields[key] = val
|
|
}
|
|
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
|
continue
|
|
}
|
|
dev := parseLspciDevice(fields)
|
|
devs = append(devs, dev)
|
|
}
|
|
return devs
|
|
}
|
|
|
|
// shouldIncludePCIeDevice reports whether a device, described by its lspci
// Class, Vendor and Device strings, belongs in the inventory.
//
// All comparisons are case-insensitive substring matches on the trimmed
// inputs. A device with an empty class is always kept.
func shouldIncludePCIeDevice(class, vendor, device string) bool {
	classLC := strings.ToLower(strings.TrimSpace(class))
	if classLC == "" {
		return true
	}
	vendorLC := strings.ToLower(strings.TrimSpace(vendor))
	deviceLC := strings.ToLower(strings.TrimSpace(device))

	// containsAny reports whether haystack contains at least one needle.
	containsAny := func(haystack string, needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(haystack, needle) {
				return true
			}
		}
		return false
	}

	// Keep inventory focused on useful replaceable components, not
	// chipset/virtual noise.
	if containsAny(classLC,
		"host bridge",
		"isa bridge",
		"pci bridge",
		"co-processor",
		"performance counter",
		"performance counters",
		"ram memory",
		"system peripheral",
		"communication controller",
		"signal processing controller",
		"usb controller",
		"smbus",
		"audio device",
		"serial bus controller",
		"unassigned class",
		"non-essential instrumentation",
	) {
		return false
	}

	// Exclude BMC/management virtual VGA adapters: these are firmware video
	// chips, not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC,
	// iLO VGA). The device-name patterns only apply to graphics classes.
	isGraphics := containsAny(classLC, "vga", "display", "3d")
	if isGraphics && containsAny(deviceLC,
		"management system chip",
		"management controller",
		"ibmc",
		"idrac",
		"ilo vga",
		"aspeed",
		"matrox",
	) {
		return false
	}

	// Drop AMD functions whose device name matches one of the patterns the
	// file treats as internal (dummy functions, PTDMA, PSP co-processor, ...).
	isAMD := containsAny(vendorLC, "advanced micro devices", "[amd]")
	if isAMD && containsAny(deviceLC,
		"dummy function",
		"reserved spp",
		"ptdma",
		"cryptographic coprocessor pspcpp",
		"pspcpp",
	) {
		return false
	}

	return true
}
|
|
|
|
func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|
dev := schema.HardwarePCIeDevice{}
|
|
present := true
|
|
dev.Present = &present
|
|
status := statusOK
|
|
dev.Status = &status
|
|
|
|
// Slot is the BDF: "0000:00:02.0"
|
|
bdfStr := fields["Slot"]
|
|
if bdfStr != "" {
|
|
dev.Slot = &bdfStr
|
|
dev.BDF = &bdfStr
|
|
// parse vendor_id and device_id from sysfs
|
|
vendorID, deviceID := readPCIIDs(bdfStr)
|
|
if vendorID != 0 {
|
|
dev.VendorID = &vendorID
|
|
}
|
|
if deviceID != 0 {
|
|
dev.DeviceID = &deviceID
|
|
}
|
|
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
|
dev.NUMANode = &numaNode
|
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
|
dev.NUMANode = &numaNode
|
|
}
|
|
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
|
dev.IOMMUGroup = &group
|
|
}
|
|
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
|
dev.LinkWidth = &width
|
|
}
|
|
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
|
dev.MaxLinkWidth = &width
|
|
}
|
|
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
|
linkSpeed := normalizePCILinkSpeed(speed)
|
|
if linkSpeed != "" {
|
|
dev.LinkSpeed = &linkSpeed
|
|
}
|
|
}
|
|
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
|
linkSpeed := normalizePCILinkSpeed(speed)
|
|
if linkSpeed != "" {
|
|
dev.MaxLinkSpeed = &linkSpeed
|
|
}
|
|
}
|
|
}
|
|
|
|
if v := fields["Class"]; v != "" {
|
|
class := mapPCIeDeviceClass(v)
|
|
dev.DeviceClass = &class
|
|
}
|
|
if v := fields["Vendor"]; v != "" {
|
|
dev.Manufacturer = &v
|
|
}
|
|
if v := fields["Device"]; v != "" {
|
|
dev.Model = &v
|
|
}
|
|
|
|
// SVendor/SDevice available but not in schema — skip
|
|
|
|
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
|
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
|
// containing "NVLINK". The targeted lspci call is only executed for the small
|
|
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
|
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
|
markNVLinkBridge(&dev)
|
|
}
|
|
|
|
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
|
applyPCIeLinkSpeedWarning(&dev)
|
|
|
|
return dev
|
|
}
|
|
|
|
// readPCIIOMMUGroup returns the IOMMU group number of the device at bdf.
//
// The number is the last path element of the target of the sysfs symlink
// /sys/bus/pci/devices/<bdf>/iommu_group (which points at
// .../kernel/iommu_groups/<N>). The second result is false when the symlink
// cannot be read or its last element is not an integer.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err != nil {
		return 0, false
	}

	if group, convErr := strconv.Atoi(filepath.Base(target)); convErr == nil {
		return group, true
	}
	return 0, false
}
|
|
|
|
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
|
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
|
base := "/sys/bus/pci/devices/" + bdf
|
|
if v, err := readHexFile(base + "/vendor"); err == nil {
|
|
vendorID = v
|
|
}
|
|
if v, err := readHexFile(base + "/device"); err == nil {
|
|
deviceID = v
|
|
}
|
|
return
|
|
}
|
|
|
|
// readHexFile reads the file at path and parses its content as a single
// hexadecimal integer. Surrounding whitespace is ignored and a leading "0x"
// is optional, so both "0x10de\n" and "10de" yield 0x10de.
//
// It returns 0 and a non-nil error when the file cannot be read or its
// content is not a valid hexadecimal number.
func readHexFile(path string) (int, error) {
	// Read the file directly rather than forking `cat`: this is called for
	// every device and must not depend on an external binary being in PATH.
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}

	// Trim whitespace first so leading blanks cannot hide the "0x" prefix.
	text := strings.TrimPrefix(strings.TrimSpace(string(data)), "0x")

	n, err := strconv.ParseInt(text, 16, 64)
	if err != nil {
		// ParseInt may return a clamped non-zero value on range errors;
		// never hand that to the caller together with an error.
		return 0, fmt.Errorf("parsing hex value %q from %s: %w", text, path, err)
	}
	return int(n), nil
}
|
|
|
|
func readPCINumaNode(bdf string) (int, bool) {
|
|
value, ok := readPCIIntAttribute(bdf, "numa_node")
|
|
if !ok || value < 0 {
|
|
return 0, false
|
|
}
|
|
return value, true
|
|
}
|
|
|
|
// parsePCINumaNode parses a NUMA node number from raw, ignoring surrounding
// whitespace. The second result is false when raw is empty, is not a decimal
// integer, or is negative.
func parsePCINumaNode(raw string) (int, bool) {
	// Atoi rejects the empty string, so blank input needs no separate check.
	node, err := strconv.Atoi(strings.TrimSpace(raw))
	switch {
	case err != nil:
		return 0, false
	case node < 0:
		return 0, false
	}
	return node, true
}
|
|
|
|
// readPCIIntAttribute reads the sysfs attribute
// /sys/bus/pci/devices/<bdf>/<attribute> and parses it as a decimal integer.
//
// The second result is false when the file cannot be read, its trimmed
// content is not an integer, or the value is negative.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	// Read the file directly rather than forking `cat`: several attributes
	// are read per device, and the collector should not depend on an
	// external binary for plain file reads.
	data, err := os.ReadFile("/sys/bus/pci/devices/" + bdf + "/" + attribute)
	if err != nil {
		return 0, false
	}

	value, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil || value < 0 {
		return 0, false
	}
	return value, true
}
|
|
|
|
// readPCIStringAttribute reads the sysfs attribute
// /sys/bus/pci/devices/<bdf>/<attribute> and returns its content with
// surrounding whitespace removed.
//
// The second result is false when the file cannot be read or is empty after
// trimming.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	// Read the file directly rather than forking `cat`: several attributes
	// are read per device, and the collector should not depend on an
	// external binary for plain file reads.
	data, err := os.ReadFile("/sys/bus/pci/devices/" + bdf + "/" + attribute)
	if err != nil {
		return "", false
	}

	value := strings.TrimSpace(string(data))
	if value == "" {
		return "", false
	}
	return value, true
}
|
|
|
|
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
|
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
|
// get Critical because they are fixed internal connectors that must always train
|
|
// to max speed — any downgrade signals a hardware fault.
|
|
//
|
|
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
|
// their link state has no operational impact. This covers management endpoints
|
|
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
|
// activates but that lspci still reports with link stats.
|
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
|
return
|
|
}
|
|
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
|
return
|
|
}
|
|
if dev.BDF != nil {
|
|
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
|
return
|
|
}
|
|
}
|
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
|
dev.ErrorDescription = &desc
|
|
|
|
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
|
if isNVLinkBridge {
|
|
crit := statusCritical
|
|
dev.Status = &crit
|
|
} else {
|
|
warn := statusWarning
|
|
dev.Status = &warn
|
|
}
|
|
}
|
|
|
|
// pcieLinkSpeedRank maps a normalized generation string to its number:
// "Gen1" through "Gen6" yield 1 through 6. Any other input, including
// different casing or extra characters, yields 0.
func pcieLinkSpeedRank(gen string) int {
	const prefix = "Gen"

	// Exactly the prefix followed by a single digit is accepted.
	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
		return 0
	}
	digit := gen[len(prefix)]
	if digit < '1' || digit > '6' {
		return 0
	}
	return int(digit - '0')
}
|
|
|
|
// normalizePCILinkSpeed converts a raw PCIe link speed string (as found in
// the sysfs current_link_speed / max_link_speed attributes) into a
// generation label "Gen1" … "Gen6". It returns "" when the speed is not
// recognised.
//
// Matching is case-insensitive. The leading transfer-rate token is matched
// exactly first, accepting both the "8.0 GT/s PCIe" spelling and the shorter
// "8 GT/s" / "8GT/s" spellings that omit the fractional part (older kernels
// appear to print the latter; confirm against the target kernels). If that
// yields nothing, the string is searched for the "N.0" rate anywhere, which
// preserves the previous behaviour for every input it already handled.
func normalizePCILinkSpeed(raw string) string {
	raw = strings.TrimSpace(strings.ToLower(raw))

	// Exact match on the first whitespace-separated token, with an optional
	// glued-on unit removed ("16gt/s" -> "16").
	if tokens := strings.Fields(raw); len(tokens) > 0 {
		switch strings.TrimSuffix(tokens[0], "gt/s") {
		case "2.5":
			return "Gen1"
		case "5", "5.0":
			return "Gen2"
		case "8", "8.0":
			return "Gen3"
		case "16", "16.0":
			return "Gen4"
		case "32", "32.0":
			return "Gen5"
		case "64", "64.0":
			return "Gen6"
		}
	}

	// Fallback: substring search for strings that do not start with the rate.
	switch {
	case strings.Contains(raw, "2.5"):
		return "Gen1"
	case strings.Contains(raw, "5.0"):
		return "Gen2"
	case strings.Contains(raw, "8.0"):
		return "Gen3"
	case strings.Contains(raw, "16.0"):
		return "Gen4"
	case strings.Contains(raw, "32.0"):
		return "Gen5"
	case strings.Contains(raw, "64.0"):
		return "Gen6"
	default:
		return ""
	}
}
|