diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index c551054..809d5be 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -41,6 +41,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices) snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices) + snap.PCIeDevices = enrichGPUCrossNUMATopology(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices) diff --git a/audit/internal/collector/pcie_nvlink_bridge.go b/audit/internal/collector/pcie_nvlink_bridge.go index 26057c6..4316bea 100644 --- a/audit/internal/collector/pcie_nvlink_bridge.go +++ b/audit/internal/collector/pcie_nvlink_bridge.go @@ -2,6 +2,7 @@ package collector import ( "bee/audit/internal/schema" + "fmt" "log/slog" "os/exec" "regexp" @@ -119,26 +120,12 @@ func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) { return parseNVIDIATopologyMatrix(string(out)), nil } -// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the -// nvidia-smi topo -m matrix. -// -// Format (abbreviated): -// -// GPU0 GPU1 ... NIC0 NIC1 -// GPU0 X NV18 ... NODE NODE -// GPU1 NV18 X ... NODE NODE -// NIC0 NODE NODE... X PIX -// -// The header row starts with "GPU0"; its columns may include non-GPU entries -// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are -// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped. -func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { - lines := strings.Split(raw, "\n") - - // Locate the header line and record which column indices are GPU columns. - headerIdx := -1 - var gpuColIndices []int // 0-based indices within fields (excluding the row label) - var gpuCount int +// locateGPUTopologyColumns finds the header line of a nvidia-smi topo -m +// matrix and the 0-based field indices (excluding the row label) that +// correspond to GPU columns. Returns headerIdx=-1 if fewer than 2 GPU columns +// are found. +func locateGPUTopologyColumns(lines []string) (headerIdx int, gpuColIndices []int, gpuCount int) { + headerIdx = -1 for i, line := range lines { trimmed := strings.TrimSpace(line) if strings.HasPrefix(trimmed, "GPU0") { @@ -155,7 +142,30 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { break } } - if headerIdx < 0 || gpuCount == 0 { + if headerIdx < 0 { + gpuColIndices = nil + gpuCount = 0 + } + return headerIdx, gpuColIndices, gpuCount +} + +// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the +// nvidia-smi topo -m matrix. +// +// Format (abbreviated): +// +// GPU0 GPU1 ... NIC0 NIC1 +// GPU0 X NV18 ... NODE NODE +// GPU1 NV18 X ... NODE NODE +// NIC0 NODE NODE... X PIX +// +// The header row starts with "GPU0"; its columns may include non-GPU entries +// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are +// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped. +func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { + lines := strings.Split(raw, "\n") + headerIdx, gpuColIndices, gpuCount := locateGPUTopologyColumns(lines) + if headerIdx < 0 { return nvlinkTopoResult{} } @@ -204,3 +214,110 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { MinNVLinks: minLinks, } } + +// parseCrossNUMAPeers scans a nvidia-smi topo -m matrix for GPU pairs whose +// only path is "SYS" — traversing PCIe as well as the SMP interconnect +// between NUMA nodes (e.g. QPI/UPI). This is the slowest possible GPU-GPU +// path and, on servers where GPUs are only bridged pairwise via NVLink +// bridge (no switched NVLink fabric), it is exactly the hop that traffic +// between different bridge pairs has to cross. Returns a map from GPU index +// to the peer GPU indices reachable only via this cross-NUMA path. +func parseCrossNUMAPeers(raw string) map[int][]int { + lines := strings.Split(raw, "\n") + headerIdx, gpuColIndices, _ := locateGPUTopologyColumns(lines) + if headerIdx < 0 { + return nil + } + + // colIdx (0-based within header fields) -> GPU index, in header order. + colIdxToGPU := make(map[int]int, len(gpuColIndices)) + for gpuIdx, colIdx := range gpuColIndices { + colIdxToGPU[colIdx] = gpuIdx + } + + peers := make(map[int][]int) + rowGPU := -1 + for _, line := range lines[headerIdx+1:] { + trimmed := strings.TrimSpace(line) + if !strings.HasPrefix(trimmed, "GPU") { + continue + } + rowGPU++ + cells := strings.Fields(trimmed) + for _, colIdx := range gpuColIndices { + dataIdx := colIdx + 1 + if dataIdx >= len(cells) { + continue + } + colGPU := colIdxToGPU[colIdx] + if colGPU == rowGPU { + continue + } + if strings.EqualFold(cells[dataIdx], "SYS") { + peers[rowGPU] = append(peers[rowGPU], colGPU) + } + } + } + if len(peers) == 0 { + return nil + } + return peers +} + +// enrichGPUCrossNUMATopology flags GPUs that reach one or more peer GPUs only +// via a cross-NUMA-node PCIe hop ("SYS" in nvidia-smi topo -m). Unlike +// enrichNVLinkBridgesWithGPUTopo, this does not require an NVLink bridge PCIe +// device to be present: it applies to any multi-GPU box, since the weak point +// it detects is the path *between* NVLink-bridged pairs (or between GPUs with +// no NVLink at all), not the bridge itself. +func enrichGPUCrossNUMATopology(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice { + gpuByBDF, err := queryNVIDIAGPUs() + if err != nil || len(gpuByBDF) < 2 { + return devs + } + + out, err := exec.Command("nvidia-smi", "topo", "-m").Output() + if err != nil { + slog.Info("gpu-topology: nvidia-smi topo unavailable, skipping cross-NUMA check", "err", err) + return devs + } + peers := parseCrossNUMAPeers(string(out)) + if len(peers) == 0 { + return devs + } + + bdfToIndex := make(map[string]int, len(gpuByBDF)) + for bdf, info := range gpuByBDF { + bdfToIndex[bdf] = info.Index + } + + for i := range devs { + if devs[i].BDF == nil { + continue + } + idx, ok := bdfToIndex[normalizePCIeBDF(*devs[i].BDF)] + if !ok { + continue + } + peerList, ok := peers[idx] + if !ok { + continue + } + if devs[i].Telemetry == nil { + devs[i].Telemetry = map[string]any{} + } + devs[i].Telemetry["nvlink_cross_numa_peers"] = peerList + if devs[i].Status == nil || *devs[i].Status == statusOK { + warn := statusWarning + devs[i].Status = &warn + } + if devs[i].ErrorDescription == nil { + devs[i].ErrorDescription = stringPtr(fmt.Sprintf( + "GPU %d reaches GPU(s) %v only via a cross-NUMA-node PCIe path (SYS) — expect reduced bandwidth/increased latency for tensor-parallel workloads spanning these GPUs", + idx, peerList)) + } + } + + slog.Info("gpu-topology: cross-NUMA peers detected", "affected_gpus", len(peers)) + return devs +} diff --git a/audit/internal/collector/pcie_nvlink_bridge_test.go b/audit/internal/collector/pcie_nvlink_bridge_test.go index 126a08b..1d7ecba 100644 --- a/audit/internal/collector/pcie_nvlink_bridge_test.go +++ b/audit/internal/collector/pcie_nvlink_bridge_test.go @@ -80,6 +80,42 @@ func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) { } } +func TestParseCrossNUMAPeersDetectsSYS(t *testing.T) { + t.Parallel() + + // 4-GPU box, two NVLink-bridged pairs (GPU0-GPU1, GPU2-GPU3); the pairs + // themselves only reach each other via SYS (cross-NUMA PCIe hop) — the + // exact topology of a server using pairwise NVLink bridge cards instead + // of a switched NVLink fabric. + input := ` GPU0 GPU1 GPU2 GPU3 +GPU0 X NV4 SYS SYS +GPU1 NV4 X SYS SYS +GPU2 SYS SYS X NV4 +GPU3 SYS SYS NV4 X +` + peers := parseCrossNUMAPeers(input) + + if len(peers[0]) != 2 || peers[0][0] != 2 || peers[0][1] != 3 { + t.Fatalf("peers[0]=%v want [2 3]", peers[0]) + } + if len(peers[2]) != 2 { + t.Fatalf("peers[2]=%v want 2 entries", peers[2]) + } +} + +func TestParseCrossNUMAPeersNoSYS(t *testing.T) { + t.Parallel() + + // Full NVSwitch fabric: every GPU pair connects via NVLink, no SYS hops. + input := ` GPU0 GPU1 +GPU0 X NV18 +GPU1 NV18 X +` + if peers := parseCrossNUMAPeers(input); peers != nil { + t.Fatalf("peers=%v want nil (no SYS pairs)", peers) + } +} + func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) { t.Parallel()