Files
bee/audit/internal/collector/pcie_nvlink_bridge.go
Michael Chus 963bc960ca Fix SATA discovery, add NVLink bridge detection, add infiniband-diags
- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output
  change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON
  integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host
  net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them
  with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to
  Critical for these cards (Gen3 on a fixed internal connector = hardware
  fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and
  active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by
  nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch
  (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 20:57:04 +03:00

207 lines
5.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package collector
import (
"bee/audit/internal/schema"
"log/slog"
"os/exec"
"regexp"
"strconv"
"strings"
)
// nv5re matches a whole token of the form "NV<digits>" (case-insensitive) and
// captures the digit run as its only submatch.
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
// isNVLinkBridgeCandidate is the cheap pre-filter for NVLink bridge mezzanine
// cards. A device qualifies only when all three checks pass, evaluated in this
// order:
//
//  1. isMellanoxDevice(dev) accepts it;
//  2. its link width is known and is at most x2;
//  3. netIfacesByBDF(bdf) reports no network interfaces for the address.
//
// Passing this filter does not confirm the device is a bridge; it only marks
// it as worth a closer look.
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
	switch {
	case !isMellanoxDevice(dev):
		return false
	case dev.LinkWidth == nil:
		// Unknown width: cannot tell whether the link is narrow.
		return false
	case *dev.LinkWidth > 2:
		return false
	default:
		// The interface lookup runs last, only for devices that passed the
		// vendor and width checks.
		return len(netIfacesByBDF(bdf)) == 0
	}
}
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` and reports whether
// the text "NVLINK" appears anywhere in its output, ignoring case. It returns
// false when the command cannot be run or exits with an error.
//
// This spawns one process per call, so it is meant for the few devices that
// already passed isNVLinkBridgeCandidate.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	cmd := exec.Command("lspci", "-s", bdf, "-v")
	report, err := cmd.Output()
	if err != nil {
		return false
	}
	// The needle contains neither newlines nor spaces, so searching the whole
	// upper-cased output is the same as searching it line by line.
	return strings.Contains(strings.ToUpper(string(report)), "NVLINK")
}
// markNVLinkBridge tags dev as an NVLink bridge card: it sets the
// "nvlink_bridge" telemetry flag (creating the telemetry map if needed) and
// replaces the device class with "NVLinkBridge".
//
// Call it before applyPCIeLinkSpeedWarning so that function sees the bridge
// class when it picks a severity.
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
	if dev.Telemetry == nil {
		dev.Telemetry = make(map[string]any)
	}
	dev.Telemetry["nvlink_bridge"] = true

	// A fresh local per call gives each device its own string to point at.
	bridgeClass := "NVLinkBridge"
	dev.DeviceClass = &bridgeClass
}
// enrichNVLinkBridgesWithGPUTopo annotates every device whose class is
// "NVLinkBridge" with the GPU-side NVLink topology summary from
// queryNVIDIANVLinkTopo. Each bridge receives these telemetry keys:
//
//   - nvlink_topo_all_active
//   - nvlink_topo_min_links
//   - nvlink_topo_gpu_count
//
// A bridge whose status is already statusCritical additionally gets
// nvlink_fabric_affected=true when the topology is not fully active. The
// Status field itself is never modified here.
//
// The topology query is skipped when devs holds no bridge. If the query fails,
// the failure is logged and devs is returned unchanged. The returned slice is
// the input slice, modified in place.
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	// Record the bridge positions once so the topology query only runs when
	// at least one bridge exists.
	var bridgeIdx []int
	for i := range devs {
		if cls := devs[i].DeviceClass; cls != nil && *cls == "NVLinkBridge" {
			bridgeIdx = append(bridgeIdx, i)
		}
	}
	if len(bridgeIdx) == 0 {
		return devs
	}

	topo, err := queryNVIDIANVLinkTopo()
	if err != nil {
		slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
		return devs
	}

	for _, i := range bridgeIdx {
		bridge := &devs[i]
		if bridge.Telemetry == nil {
			bridge.Telemetry = map[string]any{}
		}
		bridge.Telemetry["nvlink_topo_all_active"] = topo.AllActive
		bridge.Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
		bridge.Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount

		// Flag the case where the bridge is already Critical and the
		// GPU fabric is also missing NVLink connections.
		if !topo.AllActive && bridge.Status != nil && *bridge.Status == statusCritical {
			bridge.Telemetry["nvlink_fabric_affected"] = true
		}
	}

	slog.Info("nvlink-bridge: topo cross-reference applied",
		"gpu_count", topo.GPUCount,
		"all_active", topo.AllActive,
		"min_links", topo.MinNVLinks,
	)
	return devs
}
// nvlinkTopoResult is the condensed view of the GPU-to-GPU NVLink
// connectivity matrix.
type nvlinkTopoResult struct {
	// GPUCount is the number of GPUs found in the matrix.
	GPUCount int

	// AllActive is true if every GPU pair has at least one NVLink bond.
	AllActive bool

	// MinNVLinks is the minimum NVLink bond count seen across any GPU pair;
	// 0 means some pair is disconnected.
	MinNVLinks int
}
// queryNVIDIANVLinkTopo executes `nvidia-smi topo -m` and passes its standard
// output to parseNVIDIATopologyMatrix. If the command cannot be run or exits
// with an error, that error is returned as-is together with a zero result.
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
	var summary nvlinkTopoResult

	matrix, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		return summary, err
	}

	summary = parseNVIDIATopologyMatrix(string(matrix))
	return summary, nil
}
// nvTopoANSIRe matches ANSI CSI escape sequences such as ESC[4m / ESC[0m.
// NOTE(review): nvidia-smi has been seen to wrap the matrix header in underline
// codes even when its output is captured; stripping is a no-op otherwise.
var nvTopoANSIRe = regexp.MustCompile(`\x1b\[[0-9;]*[A-Za-z]`)

// isNVTopoGPULabel reports whether tok is a GPU row/column label of the form
// "GPU<digits>". A bare "GPU" (the first word of the "GPU NUMA ID" column
// title) is rejected.
func isNVTopoGPULabel(tok string) bool {
	const prefix = "GPU"
	if !strings.HasPrefix(tok, prefix) || len(tok) == len(prefix) {
		return false
	}
	for _, r := range tok[len(prefix):] {
		if r < '0' || r > '9' {
			return false
		}
	}
	return true
}

// parseNVIDIATopologyMatrix summarises the GPU×GPU part of the matrix printed
// by `nvidia-smi topo -m`.
//
// Format (abbreviated):
//
//	      GPU0  GPU1  ...  NIC0  NIC1  CPU Affinity  NUMA Affinity  GPU NUMA ID
//	GPU0   X    NV18  ...  NODE  NODE  0-31          0              N/A
//	GPU1  NV18   X    ...  NODE  NODE  0-31          0              N/A
//	NIC0  NODE  NODE  ...   X    PIX
//
// The header is the first line whose first token is "GPU0". Only header tokens
// of the form GPU<digits> are treated as GPU columns; NIC columns and the
// words of the affinity/NUMA column titles are ignored. Only rows whose label
// is GPU<digits> are read.
//
// For each GPU×GPU cell:
//   - "X" (the diagonal) is ignored;
//   - "NV<n>" contributes n bonds;
//   - any other token (SYS, NODE, PHB, PXB, PIX, ...) means the pair has no
//     NVLink and contributes 0 bonds.
//
// MinNVLinks is the smallest contribution seen (0 if no pair was seen at all)
// and AllActive is true only when MinNVLinks > 0. GPUCount is the number of GPU
// columns in the header. If no header with at least two GPU columns is found,
// the zero value is returned.
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
	// Drop terminal escape codes first so they cannot hide the header or glue
	// themselves onto a token.
	lines := strings.Split(nvTopoANSIRe.ReplaceAllString(raw, ""), "\n")

	// Locate the header and remember which of its fields are GPU columns.
	headerIdx := -1
	var gpuCols []int // 0-based positions within the header fields
	for i, line := range lines {
		fields := strings.Fields(line)
		if len(fields) == 0 || fields[0] != "GPU0" {
			continue
		}
		for j, tok := range fields {
			if isNVTopoGPULabel(tok) {
				gpuCols = append(gpuCols, j)
			}
		}
		// A matrix with fewer than two GPUs has no GPU pair to evaluate.
		if len(gpuCols) >= 2 {
			headerIdx = i
		}
		break
	}
	if headerIdx < 0 {
		return nvlinkTopoResult{}
	}

	minLinks := -1 // -1 = no GPU pair seen yet
	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		if len(cells) == 0 || !isNVTopoGPULabel(cells[0]) {
			continue
		}
		// cells[0] is the row label, so header field j lines up with
		// cells[j+1] in a data row.
		for _, col := range gpuCols {
			idx := col + 1
			if idx >= len(cells) {
				continue // truncated row: nothing known about this pair
			}
			cell := cells[idx]
			if strings.EqualFold(cell, "X") {
				continue // a GPU paired with itself
			}

			bonds := 0 // non-NV token: the pair is only reachable over PCIe/SMP
			if m := nv5re.FindStringSubmatch(cell); len(m) == 2 {
				n, err := strconv.Atoi(m[1])
				if err != nil {
					continue // unparseable count: ignore rather than guess
				}
				bonds = n
			}
			if minLinks < 0 || bonds < minLinks {
				minLinks = bonds
			}
		}
	}
	if minLinks < 0 {
		minLinks = 0
	}

	return nvlinkTopoResult{
		GPUCount:   len(gpuCols),
		AllActive:  minLinks > 0,
		MinNVLinks: minLinks,
	}
}