Files
bee/audit/internal/collector/pcie_nvlink_bridge_test.go
Michael Chus 963bc960ca Fix SATA discovery, add NVLink bridge detection, add infiniband-diags
- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output
  change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON
  integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host
  net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them
  with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to
  Critical for these cards (Gen3 on a fixed internal connector = hardware
  fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and
  active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by
  nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch
  (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 20:57:04 +03:00

125 lines
3.1 KiB
Go

package collector
import (
"bee/audit/internal/schema"
"testing"
)
// TestParseNVIDIATopologyMatrix runs parseNVIDIATopologyMatrix over a full
// 8-GPU matrix that also carries NIC columns/rows, and expects the result to
// report 8 GPUs, all links active, and a minimum NVLink count of 18.
func TestParseNVIDIATopologyMatrix(t *testing.T) {
	t.Parallel()

	// Fixture captured from a real B200 HGX system: 8 GPUs, every GPU pair linked via NV18.
	matrix := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
`

	topo := parseNVIDIATopologyMatrix(matrix)

	// Cases are evaluated top to bottom, so only the first failing
	// expectation is reported before the test aborts.
	switch {
	case topo.GPUCount != 8:
		t.Fatalf("GPUCount=%d want 8", topo.GPUCount)
	case !topo.AllActive:
		t.Fatalf("AllActive=false want true")
	case topo.MinNVLinks != 18:
		t.Fatalf("MinNVLinks=%d want 18", topo.MinNVLinks)
	}
}
// TestParseNVIDIATopologyMatrixPartialDegradation checks that a single GPU
// pair with fewer links lowers MinNVLinks to that pair's count while
// AllActive stays true.
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
	t.Parallel()

	// The GPU1-GPU3 pair reports NV12 (reduced); every other pair reports NV18.
	matrix := ` GPU0 GPU1 GPU2 GPU3
GPU0 X NV18 NV18 NV18
GPU1 NV18 X NV18 NV12
GPU2 NV18 NV18 X NV18
GPU3 NV18 NV12 NV18 X
`

	topo := parseNVIDIATopologyMatrix(matrix)

	// The link-count expectation is checked first; only the first failure is reported.
	switch {
	case topo.MinNVLinks != 12:
		t.Fatalf("MinNVLinks=%d want 12", topo.MinNVLinks)
	case !topo.AllActive:
		t.Fatalf("AllActive=false want true (12 links is still active)")
	}
}
// TestParseNVIDIATopologyMatrixDisconnected checks that an NV0 entry between
// two GPUs yields AllActive == false and MinNVLinks == 0.
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
	t.Parallel()

	// The only GPU pair (GPU0-GPU1) is fully disconnected: NV0.
	matrix := ` GPU0 GPU1
GPU0 X NV0
GPU1 NV0 X
`

	topo := parseNVIDIATopologyMatrix(matrix)

	// The activity flag is checked before the link count; only the first failure is reported.
	switch {
	case topo.AllActive:
		t.Fatalf("AllActive=true want false (NV0 means no links)")
	case topo.MinNVLinks != 0:
		t.Fatalf("MinNVLinks=%d want 0", topo.MinNVLinks)
	}
}
// TestParseNVIDIATopologyMatrixEmpty checks that text containing no topology
// matrix produces a result whose GPUCount is zero.
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
	t.Parallel()

	if n := parseNVIDIATopologyMatrix("no gpus here").GPUCount; n != 0 {
		t.Fatalf("GPUCount=%d want 0", n)
	}
}
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
t.Parallel()
bridgeClass := "NVLinkBridge"
linkSpeed := "Gen3"
maxLinkSpeed := "Gen4"
dev := schema.HardwarePCIeDevice{}
dev.DeviceClass = &bridgeClass
dev.LinkSpeed = &linkSpeed
dev.MaxLinkSpeed = &maxLinkSpeed
s := statusOK
dev.Status = &s
applyPCIeLinkSpeedWarning(&dev)
if dev.Status == nil || *dev.Status != statusCritical {
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
}
if dev.ErrorDescription == nil {
t.Fatal("ErrorDescription nil, want degradation message")
}
}
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
t.Parallel()
regularClass := "NetworkController"
linkSpeed := "Gen3"
maxLinkSpeed := "Gen4"
dev := schema.HardwarePCIeDevice{}
dev.DeviceClass = &regularClass
dev.LinkSpeed = &linkSpeed
dev.MaxLinkSpeed = &maxLinkSpeed
s := statusOK
dev.Status = &s
applyPCIeLinkSpeedWarning(&dev)
if dev.Status == nil || *dev.Status != statusWarning {
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
}
}