From 9db651c75a45587374f7c66c48b61feee9060dc8 Mon Sep 17 00:00:00 2001
From: Mikhail Chusavitin <mchusavitin@mchusmbp.local>
Date: Fri, 3 Jul 2026 12:22:04 +0300
Subject: [PATCH] gpu topology: detect cross-NUMA-only paths between GPU pairs

Flags GPUs whose path to one or more peer GPUs is reported as SYS by
`nvidia-smi topo -m`, i.e. a PCIe hop that crosses the CPU/NUMA-node
boundary. On servers where GPUs are only bridged pairwise via NVLink
bridges (no switched NVLink fabric), this is exactly the path that
traffic between different bridge pairs has to cross, and it can cut
multi-GPU throughput by 2x or more for workloads spanning more than one
pair.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
---
 audit/internal/collector/collector.go         |   1 +
 .../internal/collector/pcie_nvlink_bridge.go  | 159 +++++++++++++++---
 .../collector/pcie_nvlink_bridge_test.go      |  36 ++++
 3 files changed, 175 insertions(+), 21 deletions(-)

diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go
index c551054..809d5be 100644
--- a/audit/internal/collector/collector.go
+++ b/audit/internal/collector/collector.go
@@ -41,6 +41,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
 	snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
 	snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
+	snap.PCIeDevices = enrichGPUCrossNUMATopology(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
diff --git a/audit/internal/collector/pcie_nvlink_bridge.go b/audit/internal/collector/pcie_nvlink_bridge.go
index 26057c6..4316bea 100644
--- a/audit/internal/collector/pcie_nvlink_bridge.go
+++ b/audit/internal/collector/pcie_nvlink_bridge.go
@@ -2,6 +2,7 @@ package collector
 
 import (
 	"bee/audit/internal/schema"
+	"fmt"
 	"log/slog"
 	"os/exec"
 	"regexp"
@@ -119,26 +120,12 @@ func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
 	return parseNVIDIATopologyMatrix(string(out)), nil
 }
 
-// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
-// nvidia-smi topo -m matrix.
-//
-// Format (abbreviated):
-//
-//	         GPU0  GPU1 ... NIC0 NIC1
-//	GPU0      X   NV18 ... NODE NODE
-//	GPU1     NV18   X  ... NODE NODE
-//	NIC0     NODE  NODE...   X   PIX
-//
-// The header row starts with "GPU0"; its columns may include non-GPU entries
-// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
-// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
-func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
-	lines := strings.Split(raw, "\n")
-
-	// Locate the header line and record which column indices are GPU columns.
-	headerIdx := -1
-	var gpuColIndices []int // 0-based indices within fields (excluding the row label)
-	var gpuCount int
+// locateGPUTopologyColumns finds the header line of an nvidia-smi topo -m
+// matrix and the 0-based field indices (excluding the row label) that
+// correspond to GPU columns. Returns headerIdx=-1 if fewer than 2 GPU columns
+// are found.
+func locateGPUTopologyColumns(lines []string) (headerIdx int, gpuColIndices []int, gpuCount int) {
+	headerIdx = -1
 	for i, line := range lines {
 		trimmed := strings.TrimSpace(line)
 		if strings.HasPrefix(trimmed, "GPU0") {
@@ -155,7 +142,30 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
 			break
 		}
 	}
-	if headerIdx < 0 || gpuCount == 0 {
+	if headerIdx < 0 {
+		gpuColIndices = nil
+		gpuCount = 0
+	}
+	return headerIdx, gpuColIndices, gpuCount
+}
+
+// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
+// nvidia-smi topo -m matrix.
+//
+// Format (abbreviated):
+//
+//	         GPU0  GPU1 ... NIC0 NIC1
+//	GPU0      X   NV18 ... NODE NODE
+//	GPU1     NV18   X  ... NODE NODE
+//	NIC0     NODE  NODE...   X   PIX
+//
+// The header row starts with "GPU0"; its columns may include non-GPU entries
+// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
+// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
+func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
+	lines := strings.Split(raw, "\n")
+	headerIdx, gpuColIndices, gpuCount := locateGPUTopologyColumns(lines)
+	if headerIdx < 0 {
 		return nvlinkTopoResult{}
 	}
 
@@ -204,3 +214,110 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
 		MinNVLinks: minLinks,
 	}
 }
+
+// parseCrossNUMAPeers collects, from an nvidia-smi topo -m matrix, the GPU
+// pairs whose cell is "SYS": a path over PCIe plus the SMP interconnect
+// between NUMA nodes (e.g. QPI/UPI), which is what traffic between separate
+// NVLink-bridged pairs has to cross when there is no switched NVLink fabric.
+// GPU indices are positional (n-th GPU row / n-th GPU header column = GPU n).
+// The result maps a GPU index to its SYS-only peers in column order and is
+// nil when no header is found or no GPU pair is connected via SYS.
+func parseCrossNUMAPeers(raw string) map[int][]int {
+	rows := strings.Split(raw, "\n")
+	hdr, gpuCols, _ := locateGPUTopologyColumns(rows)
+	if hdr < 0 {
+		return nil
+	}
+
+	// Header field position -> ordinal of that GPU among the GPU columns.
+	gpuOfCol := make(map[int]int, len(gpuCols))
+	for ordinal, col := range gpuCols {
+		gpuOfCol[col] = ordinal
+	}
+
+	found := make(map[int][]int)
+	self := -1 // ordinal of the GPU row being scanned, bumped once per GPU row
+	for _, row := range rows[hdr+1:] {
+		text := strings.TrimSpace(row)
+		if !strings.HasPrefix(text, "GPU") {
+			continue // not a GPU row (e.g. a NIC row or a blank line)
+		}
+		self++
+		fields := strings.Fields(text)
+		for _, col := range gpuCols {
+			// Data rows start with a label, so header column col is found at
+			// fields[col+1]; cells missing from a short row are skipped.
+			pos, peer := col+1, gpuOfCol[col]
+			if pos >= len(fields) || peer == self {
+				continue
+			}
+			if strings.EqualFold(fields[pos], "SYS") {
+				found[self] = append(found[self], peer)
+			}
+		}
+	}
+
+	// Normalize "nothing found" to nil rather than returning an empty map.
+	if len(found) > 0 {
+		return found
+	}
+	return nil
+}
+
+// enrichGPUCrossNUMATopology marks every GPU in devs that nvidia-smi topo -m
+// reports as connected to at least one peer GPU via "SYS", a cross-NUMA-node
+// PCIe hop. It does not depend on an NVLink bridge PCIe device being present
+// (unlike enrichNVLinkBridgesWithGPUTopo): the weak link it looks for sits
+// between NVLink-bridged pairs, or between GPUs without any NVLink. devs is
+// returned untouched if a query fails, under two GPUs exist, or no SYS pair.
+func enrichGPUCrossNUMATopology(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	gpus, err := queryNVIDIAGPUs()
+	if err != nil || len(gpus) < 2 {
+		return devs // query failed or fewer than two GPUs: nothing to compare
+	}
+
+	raw, err := exec.Command("nvidia-smi", "topo", "-m").Output()
+	if err != nil {
+		slog.Info("gpu-topology: nvidia-smi topo unavailable, skipping cross-NUMA check", "err", err)
+		return devs
+	}
+	sysPeers := parseCrossNUMAPeers(string(raw))
+	if len(sysPeers) == 0 {
+		return devs
+	}
+
+	gpuIndexOf := make(map[string]int, len(gpus))
+	for bdf, gpu := range gpus {
+		gpuIndexOf[bdf] = gpu.Index
+	}
+	for n := range devs {
+		dev := &devs[n]
+		if dev.BDF == nil {
+			continue
+		}
+		gpuIdx, isGPU := gpuIndexOf[normalizePCIeBDF(*dev.BDF)]
+		if !isGPU {
+			continue
+		}
+		remote, affected := sysPeers[gpuIdx]
+		if !affected {
+			continue
+		}
+		if dev.Telemetry == nil {
+			dev.Telemetry = map[string]any{}
+		}
+		dev.Telemetry["nvlink_cross_numa_peers"] = remote
+		// Escalate only an unset or OK status; any other status is kept.
+		if dev.Status == nil || *dev.Status == statusOK {
+			level := statusWarning
+			dev.Status = &level
+		}
+		if dev.ErrorDescription == nil {
+			dev.ErrorDescription = stringPtr(fmt.Sprintf(
+				"GPU %d reaches GPU(s) %v only via a cross-NUMA-node PCIe path (SYS) — expect reduced bandwidth/increased latency for tensor-parallel workloads spanning these GPUs",
+				gpuIdx, remote))
+		}
+	}
+	slog.Info("gpu-topology: cross-NUMA peers detected", "affected_gpus", len(sysPeers))
+	return devs
+}
diff --git a/audit/internal/collector/pcie_nvlink_bridge_test.go b/audit/internal/collector/pcie_nvlink_bridge_test.go
index 126a08b..1d7ecba 100644
--- a/audit/internal/collector/pcie_nvlink_bridge_test.go
+++ b/audit/internal/collector/pcie_nvlink_bridge_test.go
@@ -80,6 +80,42 @@ func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
 	}
 }
 
+func TestParseCrossNUMAPeersDetectsSYS(t *testing.T) {
+	t.Parallel()
+
+	// 4-GPU box, two NVLink-bridged pairs (GPU0-GPU1, GPU2-GPU3); the pairs
+	// only reach each other via SYS (cross-NUMA PCIe hop) — the topology of
+	// a server using pairwise NVLink bridge cards instead of a switched
+	// NVLink fabric. Every GPU must list both GPUs of the other pair.
+	input := `	GPU0	GPU1	GPU2	GPU3
+GPU0	 X 	NV4	SYS	SYS
+GPU1	NV4	 X 	SYS	SYS
+GPU2	SYS	SYS	 X 	NV4
+GPU3	SYS	SYS	NV4	 X
+`
+	peers := parseCrossNUMAPeers(input)
+
+	want := map[int][]int{0: {2, 3}, 1: {2, 3}, 2: {0, 1}, 3: {0, 1}}
+	for gpu, exp := range want {
+		if got := peers[gpu]; len(got) != 2 || got[0] != exp[0] || got[1] != exp[1] {
+			t.Fatalf("peers[%d]=%v want %v", gpu, got, exp)
+		}
+	}
+}
+
+func TestParseCrossNUMAPeersNoSYS(t *testing.T) {
+	t.Parallel()
+	// Both GPUs are linked by NVLink (NV18) in each direction, so the matrix
+	// has no SYS cell and the parser must return nil, not an empty map.
+	matrix := `	GPU0	GPU1
+GPU0	 X 	NV18
+GPU1	NV18	 X
+`
+	if got := parseCrossNUMAPeers(matrix); got != nil {
+		t.Fatalf("peers=%v want nil (no SYS pairs)", got)
+	}
+}
+
 func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
 	t.Parallel()