package collector import ( "bee/audit/internal/schema" "log/slog" "os/exec" "regexp" "strconv" "strings" ) var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`) // isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like // NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces. // These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems. func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool { if !isMellanoxDevice(dev) { return false } if dev.LinkWidth == nil || *dev.LinkWidth > 2 { return false } if len(netIfacesByBDF(bdf)) > 0 { return false } return true } // confirmNVLinkBridgeDeviceName checks if the lspci DeviceName for bdf contains // "NVLINK". This is a targeted single-device call, only executed for candidates // already pre-filtered by isNVLinkBridgeCandidate. func confirmNVLinkBridgeDeviceName(bdf string) bool { out, err := exec.Command("lspci", "-s", bdf, "-v").Output() if err != nil { return false } for _, line := range strings.Split(string(out), "\n") { if strings.Contains(strings.ToUpper(strings.TrimSpace(line)), "NVLINK") { return true } } return false } // markNVLinkBridge overwrites device_class and adds telemetry flags on a detected // NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the // correct severity (Critical) is applied. func markNVLinkBridge(dev *schema.HardwarePCIeDevice) { class := "NVLinkBridge" dev.DeviceClass = &class if dev.Telemetry == nil { dev.Telemetry = map[string]any{} } dev.Telemetry["nvlink_bridge"] = true } // enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with // the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it // adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and // upgrades a degraded-link Warning to Critical when the fabric is also affected. func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice { hasBridge := false for _, d := range devs { if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" { hasBridge = true break } } if !hasBridge { return devs } topo, err := queryNVIDIANVLinkTopo() if err != nil { slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err) return devs } for i := range devs { if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" { continue } if devs[i].Telemetry == nil { devs[i].Telemetry = map[string]any{} } devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount // If the bridge PCIe is already degraded AND the fabric is also degraded // (missing NVLink connections), escalate to Critical. if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive { devs[i].Telemetry["nvlink_fabric_affected"] = true } } slog.Info("nvlink-bridge: topo cross-reference applied", "gpu_count", topo.GPUCount, "all_active", topo.AllActive, "min_links", topo.MinNVLinks, ) return devs } // nvlinkTopoResult summarises the GPU NVLink connectivity matrix. type nvlinkTopoResult struct { GPUCount int AllActive bool // true if every GPU pair has at least one NVLink bond MinNVLinks int // minimum NVLink bonds seen across any GPU pair (0 = some pair disconnected) } // queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix. func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) { out, err := exec.Command("nvidia-smi", "topo", "-m").Output() if err != nil { return nvlinkTopoResult{}, err } return parseNVIDIATopologyMatrix(string(out)), nil } // parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the // nvidia-smi topo -m matrix. // // Format (abbreviated): // // GPU0 GPU1 ... NIC0 NIC1 // GPU0 X NV18 ... NODE NODE // GPU1 NV18 X ... NODE NODE // NIC0 NODE NODE... X PIX // // The header row starts with "GPU0"; its columns may include non-GPU entries // (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are // counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped. func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { lines := strings.Split(raw, "\n") // Locate the header line and record which column indices are GPU columns. headerIdx := -1 var gpuColIndices []int // 0-based indices within fields (excluding the row label) var gpuCount int for i, line := range lines { trimmed := strings.TrimSpace(line) if strings.HasPrefix(trimmed, "GPU0") { parts := strings.Fields(trimmed) for j, col := range parts { if strings.HasPrefix(col, "GPU") { gpuColIndices = append(gpuColIndices, j) } } gpuCount = len(gpuColIndices) if gpuCount >= 2 { headerIdx = i } break } } if headerIdx < 0 || gpuCount == 0 { return nvlinkTopoResult{} } minLinks := -1 // -1 = no NV pair seen yet allActive := true for _, line := range lines[headerIdx+1:] { trimmed := strings.TrimSpace(line) if !strings.HasPrefix(trimmed, "GPU") { continue } cells := strings.Fields(trimmed) // cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values. // gpuColIndices are 0-based within the header fields, so they map to // cells[idx+1] in the data rows (shift by 1 for the row label). for _, colIdx := range gpuColIndices { dataIdx := colIdx + 1 if dataIdx >= len(cells) { continue } cell := cells[dataIdx] m := nv5re.FindStringSubmatch(cell) if len(m) != 2 { continue } n, err := strconv.Atoi(m[1]) if err != nil { continue } if n == 0 { allActive = false } if minLinks < 0 || n < minLinks { minLinks = n } } } if minLinks < 0 { minLinks = 0 } return nvlinkTopoResult{ GPUCount: gpuCount, AllActive: allActive && minLinks > 0, MinNVLinks: minLinks, } }