- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
125 lines
3.1 KiB
Go
125 lines
3.1 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"testing"
|
|
)
|
|
|
|
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
|
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
|
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
|
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
|
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
|
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
|
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
|
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
|
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
|
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
|
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
|
`
|
|
got := parseNVIDIATopologyMatrix(input)
|
|
|
|
if got.GPUCount != 8 {
|
|
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
|
}
|
|
if !got.AllActive {
|
|
t.Fatalf("AllActive=false want true")
|
|
}
|
|
if got.MinNVLinks != 18 {
|
|
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
|
}
|
|
}
|
|
|
|
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
|
input := ` GPU0 GPU1 GPU2 GPU3
|
|
GPU0 X NV18 NV18 NV18
|
|
GPU1 NV18 X NV18 NV12
|
|
GPU2 NV18 NV18 X NV18
|
|
GPU3 NV18 NV12 NV18 X
|
|
`
|
|
got := parseNVIDIATopologyMatrix(input)
|
|
|
|
if got.MinNVLinks != 12 {
|
|
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
|
}
|
|
if !got.AllActive {
|
|
t.Fatalf("AllActive=false want true (12 links is still active)")
|
|
}
|
|
}
|
|
|
|
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// GPU0-GPU1 pair fully disconnected (NV0).
|
|
input := ` GPU0 GPU1
|
|
GPU0 X NV0
|
|
GPU1 NV0 X
|
|
`
|
|
got := parseNVIDIATopologyMatrix(input)
|
|
|
|
if got.AllActive {
|
|
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
|
}
|
|
if got.MinNVLinks != 0 {
|
|
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
|
}
|
|
}
|
|
|
|
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
got := parseNVIDIATopologyMatrix("no gpus here")
|
|
if got.GPUCount != 0 {
|
|
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
|
}
|
|
}
|
|
|
|
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
bridgeClass := "NVLinkBridge"
|
|
linkSpeed := "Gen3"
|
|
maxLinkSpeed := "Gen4"
|
|
dev := schema.HardwarePCIeDevice{}
|
|
dev.DeviceClass = &bridgeClass
|
|
dev.LinkSpeed = &linkSpeed
|
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
|
s := statusOK
|
|
dev.Status = &s
|
|
|
|
applyPCIeLinkSpeedWarning(&dev)
|
|
|
|
if dev.Status == nil || *dev.Status != statusCritical {
|
|
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
|
}
|
|
if dev.ErrorDescription == nil {
|
|
t.Fatal("ErrorDescription nil, want degradation message")
|
|
}
|
|
}
|
|
|
|
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
regularClass := "NetworkController"
|
|
linkSpeed := "Gen3"
|
|
maxLinkSpeed := "Gen4"
|
|
dev := schema.HardwarePCIeDevice{}
|
|
dev.DeviceClass = ®ularClass
|
|
dev.LinkSpeed = &linkSpeed
|
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
|
s := statusOK
|
|
dev.Status = &s
|
|
|
|
applyPCIeLinkSpeedWarning(&dev)
|
|
|
|
if dev.Status == nil || *dev.Status != statusWarning {
|
|
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
|
}
|
|
}
|