- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
207 lines
5.9 KiB
Go
207 lines
5.9 KiB
Go
package collector
|
||
|
||
import (
|
||
"bee/audit/internal/schema"
|
||
"log/slog"
|
||
"os/exec"
|
||
"regexp"
|
||
"strconv"
|
||
"strings"
|
||
)
|
||
|
||
// nv5re matches a whole token of the form "NV<digits>" (case-insensitive) and
// captures the digits as submatch 1. It is used to read NVLink bond counts out
// of nvidia-smi topology matrix cells such as "NV18".
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||
|
||
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||
if !isMellanoxDevice(dev) {
|
||
return false
|
||
}
|
||
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||
return false
|
||
}
|
||
if len(netIfacesByBDF(bdf)) > 0 {
|
||
return false
|
||
}
|
||
return true
|
||
}
|
||
|
||
// confirmNVLinkBridgeDeviceName reports whether the verbose lspci output for
// the device at bdf mentions "NVLINK" (case-insensitive). The match is made
// against the whole `lspci -s <bdf> -v` output, not only its DeviceName line.
//
// This is a targeted single-device call, meant to be executed only for
// candidates already pre-filtered by isNVLinkBridgeCandidate. It returns false
// when lspci cannot be started or exits with an error.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	out, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		// Best effort: without lspci output the card cannot be confirmed.
		return false
	}
	// "NVLINK" contains no newline, so scanning the output as a whole is
	// equivalent to checking it line by line.
	return strings.Contains(strings.ToUpper(string(out)), "NVLINK")
}
|
||
|
||
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||
// correct severity (Critical) is applied.
|
||
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||
class := "NVLinkBridge"
|
||
dev.DeviceClass = &class
|
||
if dev.Telemetry == nil {
|
||
dev.Telemetry = map[string]any{}
|
||
}
|
||
dev.Telemetry["nvlink_bridge"] = true
|
||
}
|
||
|
||
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||
hasBridge := false
|
||
for _, d := range devs {
|
||
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||
hasBridge = true
|
||
break
|
||
}
|
||
}
|
||
if !hasBridge {
|
||
return devs
|
||
}
|
||
|
||
topo, err := queryNVIDIANVLinkTopo()
|
||
if err != nil {
|
||
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||
return devs
|
||
}
|
||
|
||
for i := range devs {
|
||
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||
continue
|
||
}
|
||
if devs[i].Telemetry == nil {
|
||
devs[i].Telemetry = map[string]any{}
|
||
}
|
||
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||
|
||
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||
// (missing NVLink connections), escalate to Critical.
|
||
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||
}
|
||
}
|
||
|
||
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||
"gpu_count", topo.GPUCount,
|
||
"all_active", topo.AllActive,
|
||
"min_links", topo.MinNVLinks,
|
||
)
|
||
return devs
|
||
}
|
||
|
||
// nvlinkTopoResult summarises the GPU NVLink connectivity matrix printed by
// `nvidia-smi topo -m`. The zero value means "no usable matrix was found".
type nvlinkTopoResult struct {
	// GPUCount is the number of GPU columns recognised in the matrix header.
	GPUCount int
	// AllActive is true only when NVLink bonds were found and no GPU pair was
	// recorded with zero bonds.
	AllActive bool
	// MinNVLinks is the smallest NVLink bond count recorded for any GPU pair.
	// It is 0 when some pair was recorded as disconnected or when no NV# cell
	// was found at all.
	MinNVLinks int
}
|
||
|
||
// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix.
|
||
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
|
||
out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
|
||
if err != nil {
|
||
return nvlinkTopoResult{}, err
|
||
}
|
||
return parseNVIDIATopologyMatrix(string(out)), nil
|
||
}
|
||
|
||
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
||
// nvidia-smi topo -m matrix.
|
||
//
|
||
// Format (abbreviated):
|
||
//
|
||
// GPU0 GPU1 ... NIC0 NIC1
|
||
// GPU0 X NV18 ... NODE NODE
|
||
// GPU1 NV18 X ... NODE NODE
|
||
// NIC0 NODE NODE... X PIX
|
||
//
|
||
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
||
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
||
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
||
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
||
lines := strings.Split(raw, "\n")
|
||
|
||
// Locate the header line and record which column indices are GPU columns.
|
||
headerIdx := -1
|
||
var gpuColIndices []int // 0-based indices within fields (excluding the row label)
|
||
var gpuCount int
|
||
for i, line := range lines {
|
||
trimmed := strings.TrimSpace(line)
|
||
if strings.HasPrefix(trimmed, "GPU0") {
|
||
parts := strings.Fields(trimmed)
|
||
for j, col := range parts {
|
||
if strings.HasPrefix(col, "GPU") {
|
||
gpuColIndices = append(gpuColIndices, j)
|
||
}
|
||
}
|
||
gpuCount = len(gpuColIndices)
|
||
if gpuCount >= 2 {
|
||
headerIdx = i
|
||
}
|
||
break
|
||
}
|
||
}
|
||
if headerIdx < 0 || gpuCount == 0 {
|
||
return nvlinkTopoResult{}
|
||
}
|
||
|
||
minLinks := -1 // -1 = no NV pair seen yet
|
||
allActive := true
|
||
|
||
for _, line := range lines[headerIdx+1:] {
|
||
trimmed := strings.TrimSpace(line)
|
||
if !strings.HasPrefix(trimmed, "GPU") {
|
||
continue
|
||
}
|
||
cells := strings.Fields(trimmed)
|
||
// cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values.
|
||
// gpuColIndices are 0-based within the header fields, so they map to
|
||
// cells[idx+1] in the data rows (shift by 1 for the row label).
|
||
for _, colIdx := range gpuColIndices {
|
||
dataIdx := colIdx + 1
|
||
if dataIdx >= len(cells) {
|
||
continue
|
||
}
|
||
cell := cells[dataIdx]
|
||
m := nv5re.FindStringSubmatch(cell)
|
||
if len(m) != 2 {
|
||
continue
|
||
}
|
||
n, err := strconv.Atoi(m[1])
|
||
if err != nil {
|
||
continue
|
||
}
|
||
if n == 0 {
|
||
allActive = false
|
||
}
|
||
if minLinks < 0 || n < minLinks {
|
||
minLinks = n
|
||
}
|
||
}
|
||
}
|
||
|
||
if minLinks < 0 {
|
||
minLinks = 0
|
||
}
|
||
|
||
return nvlinkTopoResult{
|
||
GPUCount: gpuCount,
|
||
AllActive: allActive && minLinks > 0,
|
||
MinNVLinks: minLinks,
|
||
}
|
||
}
|