@@ -2,6 +2,7 @@ package collector
import (
"bee/audit/internal/schema"
"fmt"
"log/slog"
"os/exec"
"regexp"
@@ -119,26 +120,12 @@ func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
return parseNVIDIATopologyMatrix ( string ( out ) ) , nil
}
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
// nvidia-smi topo -m matrix.
//
// Format (abbreviated):
//
// GPU0 GPU1 ... NIC0 NIC1
// GPU0 X NV18 ... NODE NODE
// GPU1 NV18 X ... NODE NODE
// NIC0 NODE NODE... X PIX
//
// The header row starts with "GPU0"; its columns may include non-GPU entries
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
func parseNVIDIATopologyMatrix ( raw string ) nvlinkTopoResult {
lines := strings . Split ( raw , "\n" )
// Locate the header line and record which column indices are GPU columns.
headerIdx := - 1
var gpuColIndices [ ] int // 0-based indices within fields (excluding the row label)
var gpuCount int
// locateGPUTopologyColumns finds the header line of a nvidia-smi topo -m
// matrix and the 0-based field indices (excluding the row label) that
// correspond to GPU columns. Returns headerIdx=-1 if fewer than 2 GPU columns
// are found.
func locateGPUTopologyColumns ( lines [ ] string ) ( headerIdx int , gpuColIndices [ ] int , gpuCount int ) {
headerIdx = - 1
for i , line := range lines {
trimmed := strings . TrimSpace ( line )
if strings . HasPrefix ( trimmed , "GPU0" ) {
@@ -155,7 +142,30 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
break
}
}
if headerIdx < 0 || gpuCount == 0 {
if headerIdx < 0 {
gpuColIndices = nil
gpuCount = 0
}
return headerIdx , gpuColIndices , gpuCount
}
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
// nvidia-smi topo -m matrix.
//
// Format (abbreviated):
//
// GPU0 GPU1 ... NIC0 NIC1
// GPU0 X NV18 ... NODE NODE
// GPU1 NV18 X ... NODE NODE
// NIC0 NODE NODE... X PIX
//
// The header row starts with "GPU0"; its columns may include non-GPU entries
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
func parseNVIDIATopologyMatrix ( raw string ) nvlinkTopoResult {
lines := strings . Split ( raw , "\n" )
headerIdx , gpuColIndices , gpuCount := locateGPUTopologyColumns ( lines )
if headerIdx < 0 {
return nvlinkTopoResult { }
}
@@ -204,3 +214,110 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
MinNVLinks : minLinks ,
}
}
// parseCrossNUMAPeers reads a nvidia-smi topo -m matrix and collects, per GPU,
// the peer GPUs whose matrix cell is "SYS" (compared case-insensitively).
//
// "SYS" marks a path that traverses PCIe as well as the SMP interconnect
// between NUMA nodes (e.g. QPI/UPI) — the slowest possible GPU-GPU path. On
// servers where GPUs are only bridged pairwise via NVLink bridge (no switched
// NVLink fabric), that is exactly the hop traffic between different bridge
// pairs has to cross.
//
// GPU indices are positional: a column's index is its position within the GPU
// columns reported by locateGPUTopologyColumns, and a row's index is its
// position among the lines after the header whose label starts with "GPU".
// The numeric suffix of the label itself is not parsed.
//
// The result maps a GPU index to the peer indices reachable only via the
// cross-NUMA path, in column order. It is nil when no header is located or
// when no SYS pair exists.
func parseCrossNUMAPeers(raw string) map[int][]int {
	lines := strings.Split(raw, "\n")
	headerIdx, gpuColIndices, _ := locateGPUTopologyColumns(lines)
	if headerIdx < 0 {
		return nil
	}

	// Header field position -> GPU ordinal, following header order.
	gpuByCol := make(map[int]int, len(gpuColIndices))
	for ordinal, col := range gpuColIndices {
		gpuByCol[col] = ordinal
	}

	result := make(map[int][]int)
	nextRow := 0
	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		// Only matrix rows labelled GPU<n> count; NIC rows, legend lines and
		// blank lines are passed over without consuming a row index.
		if len(cells) == 0 || !strings.HasPrefix(cells[0], "GPU") {
			continue
		}
		row := nextRow
		nextRow++

		for _, col := range gpuColIndices {
			// Data rows carry a leading row label, so every header column is
			// shifted one field to the right.
			pos := col + 1
			if pos >= len(cells) {
				continue
			}
			peer := gpuByCol[col]
			// The diagonal (a GPU against itself) is never a peer.
			if peer != row && strings.EqualFold(cells[pos], "SYS") {
				result[row] = append(result[row], peer)
			}
		}
	}

	if len(result) == 0 {
		return nil
	}
	return result
}
// enrichGPUCrossNUMATopology marks every GPU in devs that can reach at least
// one peer GPU only through a cross-NUMA-node PCIe hop ("SYS" in
// nvidia-smi topo -m).
//
// In contrast to enrichNVLinkBridgesWithGPUTopo, no NVLink bridge PCIe device
// has to be present: the check applies to any multi-GPU box, because the weak
// point it detects is the path *between* NVLink-bridged pairs (or between GPUs
// with no NVLink at all), not the bridge itself.
//
// For each affected device the function:
//   - stores the peer index list under Telemetry["nvlink_cross_numa_peers"],
//     creating the Telemetry map when it is nil;
//   - sets Status to statusWarning when Status is nil or statusOK, leaving any
//     other status untouched;
//   - fills ErrorDescription only when none is set yet.
//
// devs is modified in place and returned. It is returned unchanged when the
// GPU query fails or reports fewer than two GPUs, when nvidia-smi topo -m
// cannot be run, or when no cross-NUMA peers are found.
func enrichGPUCrossNUMATopology(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	gpuByBDF, err := queryNVIDIAGPUs()
	if err != nil || len(gpuByBDF) < 2 {
		return devs
	}

	topo, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		slog.Info("gpu-topology: nvidia-smi topo unavailable, skipping cross-NUMA check", "err", err)
		return devs
	}

	peers := parseCrossNUMAPeers(string(topo))
	if len(peers) == 0 {
		return devs
	}

	// BDF -> GPU index, so PCIe devices can be matched against the peers map.
	gpuIndexByBDF := make(map[string]int, len(gpuByBDF))
	for bdf, gpu := range gpuByBDF {
		gpuIndexByBDF[bdf] = gpu.Index
	}

	for i := range devs {
		dev := &devs[i]
		if dev.BDF == nil {
			continue
		}
		gpuIdx, known := gpuIndexByBDF[normalizePCIeBDF(*dev.BDF)]
		if !known {
			continue
		}
		peerList, affected := peers[gpuIdx]
		if !affected {
			continue
		}

		if dev.Telemetry == nil {
			dev.Telemetry = map[string]any{}
		}
		dev.Telemetry["nvlink_cross_numa_peers"] = peerList

		// Only escalate a healthy or unset status; a status other than
		// statusOK is left as it is.
		if dev.Status == nil || *dev.Status == statusOK {
			warn := statusWarning
			dev.Status = &warn
		}

		// An existing error description is never overwritten.
		if dev.ErrorDescription == nil {
			dev.ErrorDescription = stringPtr(fmt.Sprintf(
				"GPU %d reaches GPU(s) %v only via a cross-NUMA-node PCIe path (SYS) — expect reduced bandwidth/increased latency for tensor-parallel workloads spanning these GPUs",
				gpuIdx, peerList))
		}
	}

	slog.Info("gpu-topology: cross-NUMA peers detected", "affected_gpus", len(peers))
	return devs
}