Fix SATA discovery, add NVLink bridge detection, add infiniband-diags
- storage: add jsonInt64 dual-format unmarshaler to handle the lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC are now emitted as JSON integers, not quoted strings); fixes SATA disks being invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate a PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards
- packages: add infiniband-diags to the ISO; it provides ibstat, which nvidia-fabricmanager-start.sh requires to enumerate IB devices before FM launch (its absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
|
|||||||
@@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
|
|
||||||
// Slot is the BDF: "0000:00:02.0"
|
// Slot is the BDF: "0000:00:02.0"
|
||||||
if bdf := fields["Slot"]; bdf != "" {
|
bdfStr := fields["Slot"]
|
||||||
dev.Slot = &bdf
|
if bdfStr != "" {
|
||||||
dev.BDF = &bdf
|
dev.Slot = &bdfStr
|
||||||
|
dev.BDF = &bdfStr
|
||||||
// parse vendor_id and device_id from sysfs
|
// parse vendor_id and device_id from sysfs
|
||||||
vendorID, deviceID := readPCIIDs(bdf)
|
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||||
if vendorID != 0 {
|
if vendorID != 0 {
|
||||||
dev.VendorID = &vendorID
|
dev.VendorID = &vendorID
|
||||||
}
|
}
|
||||||
if deviceID != 0 {
|
if deviceID != 0 {
|
||||||
dev.DeviceID = &deviceID
|
dev.DeviceID = &deviceID
|
||||||
}
|
}
|
||||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
}
|
}
|
||||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||||
dev.IOMMUGroup = &group
|
dev.IOMMUGroup = &group
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||||
dev.LinkWidth = &width
|
dev.LinkWidth = &width
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||||
dev.MaxLinkWidth = &width
|
dev.MaxLinkWidth = &width
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.LinkSpeed = &linkSpeed
|
dev.LinkSpeed = &linkSpeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.MaxLinkSpeed = &linkSpeed
|
dev.MaxLinkSpeed = &linkSpeed
|
||||||
@@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||||
|
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||||
|
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||||
|
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||||
|
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||||
|
markNVLinkBridge(&dev)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||||
applyPCIeLinkSpeedWarning(&dev)
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
@@ -265,17 +274,27 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||||
// speed is below the maximum negotiated speed supported by both ends.
|
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||||
|
// get Critical because they are fixed internal connectors that must always train
|
||||||
|
// to max speed — any downgrade signals a hardware fault.
|
||||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
|
||||||
|
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||||
|
if isNVLinkBridge {
|
||||||
|
crit := statusCritical
|
||||||
|
dev.Status = &crit
|
||||||
|
} else {
|
||||||
warn := statusWarning
|
warn := statusWarning
|
||||||
dev.Status = &warn
|
dev.Status = &warn
|
||||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
|
||||||
dev.ErrorDescription = &desc
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nv5re matches one whole topology-matrix cell of the form "NV<digits>"
// (case-insensitive) and captures the digits as submatch 1.
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||||
|
|
||||||
|
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||||
|
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||||
|
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||||
|
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||||
|
if !isMellanoxDevice(dev) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(netIfacesByBDF(bdf)) > 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` and reports whether
// the output mentions "NVLINK" (case-insensitive). The whole output is
// searched, not only the DeviceName line. Any failure to run lspci yields
// false.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	raw, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// "NVLINK" contains no newline, so searching the upper-cased output as a
	// whole is equivalent to searching it line by line.
	return strings.Contains(strings.ToUpper(string(raw)), "NVLINK")
}
|
||||||
|
|
||||||
|
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||||
|
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||||
|
// correct severity (Critical) is applied.
|
||||||
|
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||||
|
class := "NVLinkBridge"
|
||||||
|
dev.DeviceClass = &class
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvlink_bridge"] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||||
|
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||||
|
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||||
|
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||||
|
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
hasBridge := false
|
||||||
|
for _, d := range devs {
|
||||||
|
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||||
|
hasBridge = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasBridge {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
topo, err := queryNVIDIANVLinkTopo()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
devs[i].Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||||
|
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||||
|
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||||
|
|
||||||
|
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||||
|
// (missing NVLink connections), escalate to Critical.
|
||||||
|
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||||
|
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||||
|
"gpu_count", topo.GPUCount,
|
||||||
|
"all_active", topo.AllActive,
|
||||||
|
"min_links", topo.MinNVLinks,
|
||||||
|
)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvlinkTopoResult is the condensed view of the GPU-to-GPU NVLink matrix
// produced by parseNVIDIATopologyMatrix.
type nvlinkTopoResult struct {
	// GPUCount is the number of GPU columns found in the matrix header.
	GPUCount int
	// AllActive is true if every GPU pair has at least one NVLink bond.
	AllActive bool
	// MinNVLinks is the minimum NVLink bond count seen across any GPU pair
	// (0 = some pair disconnected).
	MinNVLinks int
}
|
||||||
|
|
||||||
|
// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix.
|
||||||
|
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
|
||||||
|
out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nvlinkTopoResult{}, err
|
||||||
|
}
|
||||||
|
return parseNVIDIATopologyMatrix(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
||||||
|
// nvidia-smi topo -m matrix.
|
||||||
|
//
|
||||||
|
// Format (abbreviated):
|
||||||
|
//
|
||||||
|
// GPU0 GPU1 ... NIC0 NIC1
|
||||||
|
// GPU0 X NV18 ... NODE NODE
|
||||||
|
// GPU1 NV18 X ... NODE NODE
|
||||||
|
// NIC0 NODE NODE... X PIX
|
||||||
|
//
|
||||||
|
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
||||||
|
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
||||||
|
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
||||||
|
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
|
||||||
|
// Locate the header line and record which column indices are GPU columns.
|
||||||
|
headerIdx := -1
|
||||||
|
var gpuColIndices []int // 0-based indices within fields (excluding the row label)
|
||||||
|
var gpuCount int
|
||||||
|
for i, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if strings.HasPrefix(trimmed, "GPU0") {
|
||||||
|
parts := strings.Fields(trimmed)
|
||||||
|
for j, col := range parts {
|
||||||
|
if strings.HasPrefix(col, "GPU") {
|
||||||
|
gpuColIndices = append(gpuColIndices, j)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gpuCount = len(gpuColIndices)
|
||||||
|
if gpuCount >= 2 {
|
||||||
|
headerIdx = i
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if headerIdx < 0 || gpuCount == 0 {
|
||||||
|
return nvlinkTopoResult{}
|
||||||
|
}
|
||||||
|
|
||||||
|
minLinks := -1 // -1 = no NV pair seen yet
|
||||||
|
allActive := true
|
||||||
|
|
||||||
|
for _, line := range lines[headerIdx+1:] {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(trimmed, "GPU") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cells := strings.Fields(trimmed)
|
||||||
|
// cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values.
|
||||||
|
// gpuColIndices are 0-based within the header fields, so they map to
|
||||||
|
// cells[idx+1] in the data rows (shift by 1 for the row label).
|
||||||
|
for _, colIdx := range gpuColIndices {
|
||||||
|
dataIdx := colIdx + 1
|
||||||
|
if dataIdx >= len(cells) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cell := cells[dataIdx]
|
||||||
|
m := nv5re.FindStringSubmatch(cell)
|
||||||
|
if len(m) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(m[1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if n == 0 {
|
||||||
|
allActive = false
|
||||||
|
}
|
||||||
|
if minLinks < 0 || n < minLinks {
|
||||||
|
minLinks = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minLinks < 0 {
|
||||||
|
minLinks = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
return nvlinkTopoResult{
|
||||||
|
GPUCount: gpuCount,
|
||||||
|
AllActive: allActive && minLinks > 0,
|
||||||
|
MinNVLinks: minLinks,
|
||||||
|
}
|
||||||
|
}
|
||||||
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||||
|
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||||
|
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||||
|
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||||
|
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||||
|
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.GPUCount != 8 {
|
||||||
|
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 18 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3
|
||||||
|
GPU0 X NV18 NV18 NV18
|
||||||
|
GPU1 NV18 X NV18 NV12
|
||||||
|
GPU2 NV18 NV18 X NV18
|
||||||
|
GPU3 NV18 NV12 NV18 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.MinNVLinks != 12 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||||
|
input := ` GPU0 GPU1
|
||||||
|
GPU0 X NV0
|
||||||
|
GPU1 NV0 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.AllActive {
|
||||||
|
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 0 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||||
|
if got.GPUCount != 0 {
|
||||||
|
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
bridgeClass := "NVLinkBridge"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = &bridgeClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusCritical {
|
||||||
|
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||||
|
}
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("ErrorDescription nil, want degradation message")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
regularClass := "NetworkController"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = ®ularClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusWarning {
|
||||||
|
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -66,17 +66,41 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// jsonInt64 is an int64 that decodes from either a bare JSON number (512) or a
// JSON string holding a decimal number ("512"). It exists because lsblk -J
// emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12), while
// older versions emit them as strings.
type jsonInt64 int64

// UnmarshalJSON implements json.Unmarshaler. It never returns an error: a
// value that is neither an integer nor a string containing one (null, floats,
// booleans, non-numeric strings, ...) is ignored rather than failing the
// decode of the surrounding document.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	// Bare number, e.g. 512. JSON null also lands here and stores 0.
	var num int64
	if json.Unmarshal(data, &num) == nil {
		*j = jsonInt64(num)
		return nil
	}

	// Quoted number, e.g. "512"; anything that is not a string is dropped.
	var quoted string
	if json.Unmarshal(data, &quoted) != nil {
		return nil
	}
	if parsed, err := strconv.ParseInt(strings.TrimSpace(quoted), 10, 64); err == nil {
		*j = jsonInt64(parsed)
	}
	return nil
}
|
||||||
|
|
||||||
// lsblkDevice is a minimal lsblk JSON record.
|
// lsblkDevice is a minimal lsblk JSON record.
|
||||||
type lsblkDevice struct {
|
type lsblkDevice struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
Serial string `json:"serial"`
|
Serial string `json:"serial"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
LogSec string `json:"log-sec"`
|
LogSec jsonInt64 `json:"log-sec"`
|
||||||
PhySec string `json:"phy-sec"`
|
PhySec jsonInt64 `json:"phy-sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -620,8 +644,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
|||||||
if s == nil {
|
if s == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
logical := parseStorageBytes(dev.LogSec)
|
logical := int64(dev.LogSec)
|
||||||
physical := parseStorageBytes(dev.PhySec)
|
physical := int64(dev.PhySec)
|
||||||
if logical <= 0 && physical <= 0 {
|
if logical <= 0 && physical <= 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||||
|
// Older versions emit quoted strings. Both must parse without error
|
||||||
|
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||||
|
cases := []struct {
|
||||||
|
json string
|
||||||
|
want int64
|
||||||
|
}{
|
||||||
|
{`512`, 512},
|
||||||
|
{`4096`, 4096},
|
||||||
|
{`"512"`, 512},
|
||||||
|
{`"4096"`, 4096},
|
||||||
|
{`null`, 0},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
var v jsonInt64
|
||||||
|
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||||
|
}
|
||||||
|
if int64(v) != tc.want {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||||
|
input := []byte(`{
|
||||||
|
"blockdevices": [
|
||||||
|
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||||
|
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
var root lsblkRoot
|
||||||
|
if err := json.Unmarshal(input, &root); err != nil {
|
||||||
|
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||||
|
}
|
||||||
|
if len(root.Blockdevices) != 2 {
|
||||||
|
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||||
|
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||||
|
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ exfat-fuse
|
|||||||
ntfs-3g
|
ntfs-3g
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
|
infiniband-diags
|
||||||
bash
|
bash
|
||||||
procps
|
procps
|
||||||
lsof
|
lsof
|
||||||
|
|||||||
Reference in New Issue
Block a user