Files
bee/audit/internal/collector/pcie.go
Michael Chus 963bc960ca Fix SATA discovery, add NVLink bridge detection, add infiniband-diags
- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output
  change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON
  integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host
  net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them
  with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to
  Critical for these cards (Gen3 on a fixed internal connector = hardware
  fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and
  active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by
  nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch
  (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 20:57:04 +03:00

341 lines
8.6 KiB
Go

package collector
import (
"bee/audit/internal/schema"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// collectPCIe runs `lspci -vmm -D` and turns its output into the PCIe device
// inventory. If the command cannot be run or exits with an error, a warning
// is logged and nil is returned.
func collectPCIe() []schema.HardwarePCIeDevice {
	cmd := exec.Command("lspci", "-vmm", "-D")
	raw, runErr := cmd.Output()
	if runErr != nil {
		slog.Warn("pcie: lspci failed", "err", runErr)
		return nil
	}

	devices := parseLspci(string(raw))
	slog.Info("pcie: collected", "count", len(devices))
	return devices
}
// parseLspci converts the text produced by `lspci -vmm -D` into inventory
// entries. The output consists of records separated by a blank line; every
// line inside a record has the form "Key:\tValue". Lines without that
// separator are ignored, and records rejected by shouldIncludePCIeDevice are
// dropped.
func parseLspci(output string) []schema.HardwarePCIeDevice {
	var devices []schema.HardwarePCIeDevice
	for _, record := range strings.Split(output, "\n\n") {
		record = strings.TrimSpace(record)
		if record == "" {
			continue
		}

		// Collect the record's key/value pairs; a repeated key keeps its last value.
		attrs := make(map[string]string)
		for _, row := range strings.Split(record, "\n") {
			name, value, found := strings.Cut(row, ":\t")
			if !found {
				continue
			}
			attrs[strings.TrimSpace(name)] = strings.TrimSpace(value)
		}

		if shouldIncludePCIeDevice(attrs["Class"], attrs["Vendor"], attrs["Device"]) {
			devices = append(devices, parseLspciDevice(attrs))
		}
	}
	return devices
}
// shouldIncludePCIeDevice decides whether a device described by its lspci
// class, vendor and device strings belongs in the inventory. All matching is
// case-insensitive substring matching on the trimmed inputs.
//
// A device with an empty class is always kept. Otherwise it is rejected when:
//   - its class is one of the chipset/virtual classes listed below,
//   - it is a VGA/display/3D class device whose name looks like a BMC video chip,
//   - its vendor is AMD and its name matches a known internal function.
func shouldIncludePCIeDevice(class, vendor, device string) bool {
	normClass := strings.ToLower(strings.TrimSpace(class))
	if normClass == "" {
		return true
	}
	normVendor := strings.ToLower(strings.TrimSpace(vendor))
	normDevice := strings.ToLower(strings.TrimSpace(device))

	// matchesAny reports whether haystack contains at least one of the needles.
	matchesAny := func(haystack string, needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(haystack, needle) {
				return true
			}
		}
		return false
	}

	// Keep inventory focused on useful replaceable components, not chipset/virtual noise.
	if matchesAny(normClass,
		"host bridge",
		"isa bridge",
		"pci bridge",
		"co-processor",
		"performance counter",
		"performance counters",
		"ram memory",
		"system peripheral",
		"communication controller",
		"signal processing controller",
		"usb controller",
		"smbus",
		"audio device",
		"serial bus controller",
		"unassigned class",
		"non-essential instrumentation",
	) {
		return false
	}

	// BMC/management virtual VGA adapters are firmware video chips, not real
	// GPUs, and would pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
	isVideoClass := matchesAny(normClass, "vga", "display", "3d")
	if isVideoClass && matchesAny(normDevice,
		"management system chip",
		"management controller",
		"ibmc",
		"idrac",
		"ilo vga",
		"aspeed",
		"matrox",
	) {
		return false
	}

	// AMD exposes a number of internal functions as PCI devices; skip those.
	isAMD := matchesAny(normVendor, "advanced micro devices", "[amd]")
	if isAMD && matchesAny(normDevice,
		"dummy function",
		"reserved spp",
		"ptdma",
		"cryptographic coprocessor pspcpp",
		"pspcpp",
	) {
		return false
	}

	return true
}
// parseLspciDevice builds one inventory entry from the key/value fields of a
// single `lspci -vmm -D` record.
//
// The entry starts as present with status OK. When the record has a Slot
// (the BDF), sysfs is consulted for vendor/device IDs, NUMA node, IOMMU group
// and link width/speed; values that cannot be read are simply left unset.
// Class, Vendor and Device are taken from the lspci fields. Finally the
// NVLink-bridge detection and the link-speed check are applied, in that
// order, because the link-speed check looks at the device class.
func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
	isPresent := true
	initialStatus := statusOK
	dev := schema.HardwarePCIeDevice{
		Present: &isPresent,
		Status:  &initialStatus,
	}

	// With -D the Slot field carries the domain-qualified BDF, e.g. "0000:00:02.0".
	slot := fields["Slot"]
	if slot != "" {
		dev.Slot = &slot
		dev.BDF = &slot

		// Numeric IDs come from sysfs; a zero ID is treated as "unknown" and left unset.
		vendorID, deviceID := readPCIIDs(slot)
		if vendorID != 0 {
			dev.VendorID = &vendorID
		}
		if deviceID != 0 {
			dev.DeviceID = &deviceID
		}

		// Prefer the NUMA node from sysfs; fall back to the lspci NUMANode field.
		numa, haveNUMA := readPCINumaNode(slot)
		if !haveNUMA {
			numa, haveNUMA = parsePCINumaNode(fields["NUMANode"])
		}
		if haveNUMA {
			dev.NUMANode = &numa
		}

		if iommuGroup, ok := readPCIIOMMUGroup(slot); ok {
			dev.IOMMUGroup = &iommuGroup
		}

		if lanes, ok := readPCIIntAttribute(slot, "current_link_width"); ok {
			dev.LinkWidth = &lanes
		}
		if lanes, ok := readPCIIntAttribute(slot, "max_link_width"); ok {
			dev.MaxLinkWidth = &lanes
		}

		// Link speeds are stored in normalized "GenN" form; unrecognised values are dropped.
		if rawSpeed, ok := readPCIStringAttribute(slot, "current_link_speed"); ok {
			if gen := normalizePCILinkSpeed(rawSpeed); gen != "" {
				dev.LinkSpeed = &gen
			}
		}
		if rawSpeed, ok := readPCIStringAttribute(slot, "max_link_speed"); ok {
			if gen := normalizePCILinkSpeed(rawSpeed); gen != "" {
				dev.MaxLinkSpeed = &gen
			}
		}
	}

	if rawClass := fields["Class"]; rawClass != "" {
		deviceClass := mapPCIeDeviceClass(rawClass)
		dev.DeviceClass = &deviceClass
	}
	if vendor := fields["Vendor"]; vendor != "" {
		dev.Manufacturer = &vendor
	}
	if model := fields["Device"]; model != "" {
		dev.Model = &model
	}
	// SVendor/SDevice are available in the record but have no schema field, so they are skipped.

	// Detect NVLink bridge mezzanine cards (CPU→HGX internal link): Mellanox x2
	// devices with no host net interfaces and a DeviceName containing "NVLINK".
	// The cheap candidate pre-filter runs first so that the targeted lspci call
	// is only made for the few narrow-link Mellanox cards that pass it.
	if slot != "" && isNVLinkBridgeCandidate(slot, dev) && confirmNVLinkBridgeDeviceName(slot) {
		markNVLinkBridge(&dev)
	}

	// Warn (or go Critical for NVLink bridges) if the PCIe link runs below its maximum.
	applyPCIeLinkSpeedWarning(&dev)
	return dev
}
// readPCIIOMMUGroup returns the IOMMU group number of the device at bdf.
// The group is the final path element of the sysfs symlink
// .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>.
// The boolean is false when the link cannot be read or its target does not
// end in an integer.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	target, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err != nil {
		return 0, false
	}

	groupName := filepath.Base(target)
	if group, convErr := strconv.Atoi(groupName); convErr == nil {
		return group, true
	}
	return 0, false
}
// readPCIIDs returns the numeric vendor and device IDs of the device at bdf,
// read from the "vendor" and "device" files in its sysfs directory. An ID
// that cannot be read or parsed is reported as 0.
func readPCIIDs(bdf string) (vendorID, deviceID int) {
	sysfsDir := "/sys/bus/pci/devices/" + bdf

	targets := []struct {
		file string
		dst  *int
	}{
		{"/vendor", &vendorID},
		{"/device", &deviceID},
	}
	for _, target := range targets {
		id, err := readHexFile(sysfsDir + target.file)
		if err != nil {
			continue
		}
		*target.dst = id
	}
	return vendorID, deviceID
}
// readHexFile reads the file at path and parses its content as a single
// hexadecimal integer. Surrounding whitespace and an optional "0x" prefix
// (the form sysfs uses, e.g. "0x10de\n") are ignored.
//
// The file is read directly rather than through a `cat` subprocess, so no
// process is forked per read and nothing depends on PATH. On any read or
// parse error the returned value is 0 and the error is returned as-is.
func readHexFile(path string) (int, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}

	// Trim whitespace first so that a leading blank cannot hide the "0x" prefix.
	text := strings.TrimPrefix(strings.TrimSpace(string(data)), "0x")

	// Parse at the platform's int width so the conversion below cannot truncate.
	n, err := strconv.ParseInt(text, 16, strconv.IntSize)
	if err != nil {
		return 0, err
	}
	return int(n), nil
}
// readPCINumaNode returns the NUMA node from the device's sysfs "numa_node"
// attribute. The boolean is false when the attribute cannot be read or does
// not hold a non-negative integer.
func readPCINumaNode(bdf string) (int, bool) {
	node, ok := readPCIIntAttribute(bdf, "numa_node")
	if ok && node >= 0 {
		return node, true
	}
	return 0, false
}
// parsePCINumaNode parses a NUMA node number from text, ignoring surrounding
// whitespace. The boolean is false for empty input, non-integer input and
// negative numbers.
func parsePCINumaNode(raw string) (int, bool) {
	// strconv.Atoi rejects the empty string, so no separate emptiness check is needed.
	node, err := strconv.Atoi(strings.TrimSpace(raw))
	switch {
	case err != nil:
		return 0, false
	case node < 0:
		return 0, false
	}
	return node, true
}
// readPCIIntAttribute reads the sysfs attribute
// /sys/bus/pci/devices/<bdf>/<attribute> and parses it as a decimal integer.
// The boolean is false when the file cannot be read, its content is not an
// integer, or the value is negative.
//
// The file is read directly with os.ReadFile instead of forking `cat`: this
// function runs several times per device, and a subprocess per read is slow
// and makes the collector depend on `cat` being in PATH.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	data, err := os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, attribute))
	if err != nil {
		return 0, false
	}
	value, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil || value < 0 {
		return 0, false
	}
	return value, true
}
// readPCIStringAttribute reads the sysfs attribute
// /sys/bus/pci/devices/<bdf>/<attribute> and returns its content with
// surrounding whitespace removed. The boolean is false when the file cannot
// be read or is empty after trimming.
//
// The file is read directly with os.ReadFile instead of forking `cat`, which
// avoids one subprocess per attribute and any dependency on PATH.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	data, err := os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, attribute))
	if err != nil {
		return "", false
	}
	value := strings.TrimSpace(string(data))
	if value == "" {
		return "", false
	}
	return value, true
}
// applyPCIeLinkSpeedWarning flags a device whose current PCIe link speed ranks
// below its maximum link speed. It does nothing when either speed is unset or
// when the current rank is at least the maximum rank.
//
// For a degraded link it records an error description and overrides the
// status: Critical when the device class is "NVLinkBridge", Warning for
// everything else. NVLink bridge cards are treated more severely because they
// sit on fixed internal connectors that must always train to max speed, so
// any downgrade signals a hardware fault.
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
	current, maximum := dev.LinkSpeed, dev.MaxLinkSpeed
	if current == nil || maximum == nil {
		return
	}
	if pcieLinkSpeedRank(*current) >= pcieLinkSpeedRank(*maximum) {
		return
	}

	message := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *current, *maximum)
	dev.ErrorDescription = &message

	// Default to Warning and escalate only for NVLink bridge cards.
	severity := statusWarning
	if dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge" {
		severity = statusCritical
	}
	dev.Status = &severity
}
// pcieLinkSpeedRank maps a normalized generation string to its number:
// "Gen1" → 1 through "Gen6" → 6. Matching is exact and case-sensitive; every
// other input, including the empty string, yields 0.
func pcieLinkSpeedRank(gen string) int {
	const prefix = "Gen"

	// A valid value is the prefix followed by exactly one digit.
	if len(gen) != len(prefix)+1 || gen[:len(prefix)] != prefix {
		return 0
	}
	digit := gen[len(prefix)]
	if digit < '1' || digit > '6' {
		return 0
	}
	return int(digit - '0')
}
// normalizePCILinkSpeed converts a PCIe link speed string, as found in the
// sysfs current_link_speed / max_link_speed attributes, into a generation
// label "Gen1".."Gen6". It returns "" when no known transfer rate is found.
//
// The numeric rate is parsed instead of substring-matched, so that:
//   - both "8.0 GT/s PCIe" and the shorter "8 GT/s" spelling (no ".0", which
//     older kernels are understood to emit) are recognised;
//   - unrelated rates are not misread (substring matching would classify
//     "128.0 GT/s" as Gen3 because it contains "8.0").
func normalizePCILinkSpeed(raw string) string {
	text := strings.ToLower(strings.TrimSpace(raw))

	// Drop the unit and everything after it, so that "8GT/s" and
	// "8.0 GT/s PCIe" both reduce to text ending in the bare number.
	if unitAt := strings.Index(text, "gt/s"); unitAt >= 0 {
		text = text[:unitAt]
	}

	// Return the generation for the first token that is a known rate in GT/s.
	for _, token := range strings.Fields(text) {
		rate, err := strconv.ParseFloat(token, 64)
		if err != nil {
			continue
		}
		// All known rates are exactly representable as float64, so == is safe.
		switch rate {
		case 2.5:
			return "Gen1"
		case 5:
			return "Gen2"
		case 8:
			return "Gen3"
		case 16:
			return "Gen4"
		case 32:
			return "Gen5"
		case 64:
			return "Gen6"
		}
	}
	return ""
}