Fix SATA discovery, add NVLink bridge detection, add infiniband-diags

- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output
  change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON
  integers, not quoted strings); fixes SATA disks invisible on Debian 12
- pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host
  net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them
  with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to
  Critical for these cards (Gen3 on a fixed internal connector = hardware
  fault, not a transient warning)
- pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and
  active status for all NVLink bridge cards
- packages: add infiniband-diags to ISO; provides ibstat required by
  nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch
  (absence causes CUDA_ERROR_SYSTEM_NOT_READY)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 20:57:04 +03:00
parent 4f6579e040
commit 963bc960ca
7 changed files with 451 additions and 27 deletions

View File

@@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)

View File

@@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
dev.Status = &status dev.Status = &status
// Slot is the BDF: "0000:00:02.0" // Slot is the BDF: "0000:00:02.0"
if bdf := fields["Slot"]; bdf != "" { bdfStr := fields["Slot"]
dev.Slot = &bdf if bdfStr != "" {
dev.BDF = &bdf dev.Slot = &bdfStr
dev.BDF = &bdfStr
// parse vendor_id and device_id from sysfs // parse vendor_id and device_id from sysfs
vendorID, deviceID := readPCIIDs(bdf) vendorID, deviceID := readPCIIDs(bdfStr)
if vendorID != 0 { if vendorID != 0 {
dev.VendorID = &vendorID dev.VendorID = &vendorID
} }
if deviceID != 0 { if deviceID != 0 {
dev.DeviceID = &deviceID dev.DeviceID = &deviceID
} }
if numaNode, ok := readPCINumaNode(bdf); ok { if numaNode, ok := readPCINumaNode(bdfStr); ok {
dev.NUMANode = &numaNode dev.NUMANode = &numaNode
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok { } else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
dev.NUMANode = &numaNode dev.NUMANode = &numaNode
} }
if group, ok := readPCIIOMMUGroup(bdf); ok { if group, ok := readPCIIOMMUGroup(bdfStr); ok {
dev.IOMMUGroup = &group dev.IOMMUGroup = &group
} }
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok { if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
dev.LinkWidth = &width dev.LinkWidth = &width
} }
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok { if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
dev.MaxLinkWidth = &width dev.MaxLinkWidth = &width
} }
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok { if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed) linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" { if linkSpeed != "" {
dev.LinkSpeed = &linkSpeed dev.LinkSpeed = &linkSpeed
} }
} }
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok { if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed) linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" { if linkSpeed != "" {
dev.MaxLinkSpeed = &linkSpeed dev.MaxLinkSpeed = &linkSpeed
@@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
// SVendor/SDevice available but not in schema — skip // SVendor/SDevice available but not in schema — skip
// Warn if PCIe link is running below its maximum negotiated speed. // Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
// containing "NVLINK". The targeted lspci call is only executed for the small
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
markNVLinkBridge(&dev)
}
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
applyPCIeLinkSpeedWarning(&dev) applyPCIeLinkSpeedWarning(&dev)
return dev return dev
@@ -265,17 +274,27 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
return value, true return value, true
} }
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link // applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
// speed is below the maximum negotiated speed supported by both ends. // below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
// get Critical because they are fixed internal connectors that must always train
// to max speed — any downgrade signals a hardware fault.
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) { func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil { if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
return return
} }
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) { if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
return
}
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
dev.ErrorDescription = &desc
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
if isNVLinkBridge {
crit := statusCritical
dev.Status = &crit
} else {
warn := statusWarning warn := statusWarning
dev.Status = &warn dev.Status = &warn
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
dev.ErrorDescription = &desc
} }
} }

View File

@@ -0,0 +1,206 @@
package collector
import (
"bee/audit/internal/schema"
"log/slog"
"os/exec"
"regexp"
"strconv"
"strings"
)
// nv5re matches one whole topology-matrix cell of the form "NV<digits>"
// (case-insensitive) and captures the digits, i.e. the NVLink bond count
// printed for that cell.
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
// isNVLinkBridgeCandidate is the cheap pre-filter used before the more
// expensive per-device lspci confirmation. It reports true only when all of
// the following hold:
//   - dev is a Mellanox device (as decided by isMellanoxDevice),
//   - dev.LinkWidth is known and is at most 2 lanes,
//   - netIfacesByBDF returns no host network interfaces for bdf.
//
// Such devices look like the NVLink bridge mezzanine cards that form the
// CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
	narrowLink := dev.LinkWidth != nil && *dev.LinkWidth <= 2
	if !isMellanoxDevice(dev) || !narrowLink {
		return false
	}
	// Only look up network interfaces once the other checks have passed.
	return len(netIfacesByBDF(bdf)) == 0
}
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` for a single device
// and reports whether its output mentions "NVLINK" anywhere, ignoring case.
// A failing lspci invocation is treated as "not confirmed" and yields false.
// It is meant to be called only for devices that already passed
// isNVLinkBridgeCandidate, so the extra process spawn stays rare.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	verbose, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// The needle contains no whitespace or newline, so searching the whole
	// output is equivalent to searching it line by line.
	return strings.Contains(strings.ToUpper(string(verbose)), "NVLINK")
}
// markNVLinkBridge tags dev as an NVLink bridge card: it sets the
// "nvlink_bridge" telemetry flag (allocating the telemetry map when needed)
// and replaces DeviceClass with "NVLinkBridge". Call it before
// applyPCIeLinkSpeedWarning, which keys its Critical severity off that class.
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
	if dev.Telemetry == nil {
		dev.Telemetry = make(map[string]any)
	}
	dev.Telemetry["nvlink_bridge"] = true

	bridgeClass := "NVLinkBridge"
	dev.DeviceClass = &bridgeClass
}
// enrichNVLinkBridgesWithGPUTopo annotates every device whose DeviceClass is
// "NVLinkBridge" with a summary of the GPU-side NVLink topology reported by
// `nvidia-smi topo -m`.
//
// Telemetry keys written on each bridge device:
//   - nvlink_topo_all_active: topo.AllActive
//   - nvlink_topo_min_links:  topo.MinNVLinks
//   - nvlink_topo_gpu_count:  topo.GPUCount
//   - nvlink_fabric_affected: true, set only when the device status is
//     already Critical and the topology is not fully active
//
// Device status is never changed here; severity is decided earlier by
// applyPCIeLinkSpeedWarning. The slice is modified in place and returned.
//
// nvidia-smi is not invoked when the list holds no bridge device. When the
// query fails, or when no GPU matrix could be recognised in its output
// (GPUCount == 0), the devices are returned untouched: a zero-value result
// carries no information and must not be reported as a disconnected fabric.
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	const bridgeClass = "NVLinkBridge"
	isBridge := func(d *schema.HardwarePCIeDevice) bool {
		return d.DeviceClass != nil && *d.DeviceClass == bridgeClass
	}

	// Avoid spawning nvidia-smi on hosts without any bridge card.
	hasBridge := false
	for i := range devs {
		if isBridge(&devs[i]) {
			hasBridge = true
			break
		}
	}
	if !hasBridge {
		return devs
	}

	topo, err := queryNVIDIANVLinkTopo()
	if err != nil {
		slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
		return devs
	}
	if topo.GPUCount == 0 {
		// Nothing was parsed; writing all_active=false here would be a
		// fabricated fault report rather than a measurement.
		slog.Info("nvlink-bridge: nvidia-smi topo matrix not recognised, skipping cross-reference")
		return devs
	}

	for i := range devs {
		dev := &devs[i]
		if !isBridge(dev) {
			continue
		}
		if dev.Telemetry == nil {
			dev.Telemetry = map[string]any{}
		}
		dev.Telemetry["nvlink_topo_all_active"] = topo.AllActive
		dev.Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
		dev.Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
		// The bridge is already Critical (degraded PCIe link) and the GPU
		// fabric is not fully active either: record the correlation so the
		// consumer can tell the two symptoms belong together.
		if dev.Status != nil && *dev.Status == statusCritical && !topo.AllActive {
			dev.Telemetry["nvlink_fabric_affected"] = true
		}
	}

	slog.Info("nvlink-bridge: topo cross-reference applied",
		"gpu_count", topo.GPUCount,
		"all_active", topo.AllActive,
		"min_links", topo.MinNVLinks,
	)
	return devs
}
// nvlinkTopoResult summarises the GPU×GPU part of the `nvidia-smi topo -m`
// connectivity matrix as computed by parseNVIDIATopologyMatrix. The zero value
// means that no usable matrix was parsed.
type nvlinkTopoResult struct {
GPUCount int // number of GPU columns identified in the matrix header; 0 when no usable matrix was parsed
AllActive bool // true when at least one NV# cell was seen and none of them was NV0; GPU pairs without an NV# cell are ignored
MinNVLinks int // smallest # among the NV# cells of GPU×GPU pairs; 0 when an NV0 cell was seen or no NV# cell was found
}
// queryNVIDIANVLinkTopo executes `nvidia-smi topo -m` and hands its standard
// output to parseNVIDIATopologyMatrix. If the command cannot be run or exits
// with an error, that error is returned together with a zero result.
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
	matrix, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		return nvlinkTopoResult{}, err
	}
	summary := parseNVIDIATopologyMatrix(string(matrix))
	return summary, nil
}
// parseNVIDIATopologyMatrix extracts the GPU count and the minimum NVLink bond
// count from the matrix printed by `nvidia-smi topo -m`.
//
// Format (abbreviated):
//
//	        GPU0  GPU1  ...  NIC0  NIC1  CPU Affinity  NUMA Affinity  GPU NUMA ID
//	GPU0     X    NV18  ...  NODE  NODE  0-47          0              N/A
//	GPU1    NV18   X    ...  NODE  NODE  0-47          0              N/A
//	NIC0    NODE  NODE  ...   X    PIX
//
// The header is the first line that starts with "GPU0". Only header tokens of
// the exact form GPU<digits> are GPU columns; in particular the "GPU" token of
// a trailing "GPU NUMA ID" column is not a GPU. At least two GPU columns are
// required, otherwise the zero result is returned.
//
// Only rows labelled GPU<digits> are read, and within them only the GPU
// columns. Cells matching NV<n> contribute n; every other cell (X for self,
// NODE/SYS/PHB/PIX, missing cells on short rows) is skipped, so a GPU pair
// that is connected only over PCIe neither lowers MinNVLinks nor clears
// AllActive.
//
// Result:
//   - GPUCount:   number of GPU columns in the header.
//   - MinNVLinks: smallest n seen, or 0 when no NV<n> cell was found.
//   - AllActive:  MinNVLinks > 0.
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
	// Escape sequences around the header (if the tool emits them) would
	// otherwise hide the "GPU0" prefix and glue onto the last header token.
	lines := strings.Split(nvTopoStripANSI(raw), "\n")

	// Locate the header and remember which of its fields are GPU columns.
	headerIdx := -1
	var gpuCols []int // 0-based positions within the header fields
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(trimmed, "GPU0") {
			continue
		}
		for j, col := range strings.Fields(trimmed) {
			if nvTopoIsGPULabel(col) {
				gpuCols = append(gpuCols, j)
			}
		}
		if len(gpuCols) >= 2 {
			headerIdx = i
		}
		break // only the first candidate line is considered
	}
	if headerIdx < 0 {
		return nvlinkTopoResult{}
	}

	minLinks := -1 // -1 = no NV<n> cell seen yet
	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		if len(cells) == 0 || !nvTopoIsGPULabel(cells[0]) {
			continue
		}
		// cells[0] is the row label, so header position p maps to cells[p+1].
		for _, p := range gpuCols {
			if p+1 >= len(cells) {
				continue
			}
			m := nv5re.FindStringSubmatch(cells[p+1])
			if len(m) != 2 {
				continue
			}
			n, err := strconv.Atoi(m[1])
			if err != nil {
				continue
			}
			if minLinks < 0 || n < minLinks {
				minLinks = n
			}
		}
	}
	if minLinks < 0 {
		minLinks = 0
	}

	return nvlinkTopoResult{
		GPUCount: len(gpuCols),
		// An NV0 cell drives the minimum to 0, so this single comparison also
		// covers the "some pair reports zero links" case.
		AllActive:  minLinks > 0,
		MinNVLinks: minLinks,
	}
}

// nvTopoIsGPULabel reports whether s is exactly "GPU" followed by one or more
// decimal digits, e.g. "GPU0" or "GPU15".
func nvTopoIsGPULabel(s string) bool {
	digits, ok := strings.CutPrefix(s, "GPU")
	if !ok || digits == "" {
		return false
	}
	for i := 0; i < len(digits); i++ {
		if digits[i] < '0' || digits[i] > '9' {
			return false
		}
	}
	return true
}

// nvTopoStripANSI removes ANSI CSI escape sequences (ESC '[' ... final byte)
// from s and returns s unchanged when it contains none.
func nvTopoStripANSI(s string) string {
	if !strings.Contains(s, "\x1b[") {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		if s[i] == 0x1b && i+1 < len(s) && s[i+1] == '[' {
			// Skip parameter/intermediate bytes up to the final byte
			// (0x40-0x7e); the loop increment then steps past that byte.
			j := i + 2
			for j < len(s) && (s[j] < 0x40 || s[j] > 0x7e) {
				j++
			}
			i = j
			continue
		}
		b.WriteByte(s[i])
	}
	return b.String()
}

View File

@@ -0,0 +1,124 @@
package collector
import (
"bee/audit/internal/schema"
"testing"
)
// TestParseNVIDIATopologyMatrix parses a fully connected 8-GPU matrix (B200
// HGX style output, every GPU pair at NV18, plus NIC columns and a NIC row)
// and expects 8 GPUs, AllActive and a minimum of 18 links.
func TestParseNVIDIATopologyMatrix(t *testing.T) {
	t.Parallel()

	const matrix = ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
`

	res := parseNVIDIATopologyMatrix(matrix)
	switch {
	case res.GPUCount != 8:
		t.Fatalf("GPUCount=%d want 8", res.GPUCount)
	case !res.AllActive:
		t.Fatalf("AllActive=false want true")
	case res.MinNVLinks != 18:
		t.Fatalf("MinNVLinks=%d want 18", res.MinNVLinks)
	}
}
// TestParseNVIDIATopologyMatrixPartialDegradation covers a matrix in which the
// GPU1-GPU3 pair reports NV12 instead of NV18: the minimum must drop to 12
// while the topology still counts as active.
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
	t.Parallel()

	const matrix = ` GPU0 GPU1 GPU2 GPU3
GPU0 X NV18 NV18 NV18
GPU1 NV18 X NV18 NV12
GPU2 NV18 NV18 X NV18
GPU3 NV18 NV12 NV18 X
`

	res := parseNVIDIATopologyMatrix(matrix)
	switch {
	case res.MinNVLinks != 12:
		t.Fatalf("MinNVLinks=%d want 12", res.MinNVLinks)
	case !res.AllActive:
		t.Fatalf("AllActive=false want true (12 links is still active)")
	}
}
// TestParseNVIDIATopologyMatrixDisconnected covers a two-GPU matrix whose only
// pair reports NV0: the result must be inactive with a minimum of 0 links.
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
	t.Parallel()

	const matrix = ` GPU0 GPU1
GPU0 X NV0
GPU1 NV0 X
`

	res := parseNVIDIATopologyMatrix(matrix)
	switch {
	case res.AllActive:
		t.Fatalf("AllActive=true want false (NV0 means no links)")
	case res.MinNVLinks != 0:
		t.Fatalf("MinNVLinks=%d want 0", res.MinNVLinks)
	}
}
// TestParseNVIDIATopologyMatrixEmpty checks that text without any topology
// header yields a GPU count of zero.
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
	t.Parallel()

	if count := parseNVIDIATopologyMatrix("no gpus here").GPUCount; count != 0 {
		t.Fatalf("GPUCount=%d want 0", count)
	}
}
// TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates verifies that a device
// classed "NVLinkBridge" running at Gen3 with a Gen4 maximum ends up Critical
// and carries an error description.
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
	t.Parallel()

	class, current, ceiling := "NVLinkBridge", "Gen3", "Gen4"
	initial := statusOK

	var dev schema.HardwarePCIeDevice
	dev.Status = &initial
	dev.DeviceClass = &class
	dev.LinkSpeed = &current
	dev.MaxLinkSpeed = &ceiling

	applyPCIeLinkSpeedWarning(&dev)

	if dev.Status == nil || *dev.Status != statusCritical {
		t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
	}
	if dev.ErrorDescription == nil {
		t.Fatal("ErrorDescription nil, want degradation message")
	}
}
// TestApplyPCIeLinkSpeedWarningRegularCardIsWarning verifies that a device of
// class "NetworkController" running at Gen3 with a Gen4 maximum is reported
// as Warning.
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
	t.Parallel()

	class, current, ceiling := "NetworkController", "Gen3", "Gen4"
	initial := statusOK

	var dev schema.HardwarePCIeDevice
	dev.Status = &initial
	dev.DeviceClass = &class
	dev.LinkSpeed = &current
	dev.MaxLinkSpeed = &ceiling

	applyPCIeLinkSpeedWarning(&dev)

	if dev.Status == nil || *dev.Status != statusWarning {
		t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
	}
}

View File

@@ -66,17 +66,41 @@ func collectStorage() []schema.HardwareStorage {
return result return result
} }
// jsonInt64 is an int64 that can be decoded from either a bare JSON number
// (512) or a JSON string holding a decimal number ("512"). lsblk -J emits
// LOG-SEC / PHY-SEC as integers on newer util-linux (as shipped with
// Debian 12) and as quoted strings on older versions; this type accepts both.
type jsonInt64 int64

// UnmarshalJSON decodes data on a best-effort basis and never returns an
// error, so that one odd field cannot fail the decoding of a whole document.
//
//   - A JSON integer is stored as is; JSON null stores 0.
//   - A JSON string is trimmed of surrounding whitespace and parsed as a
//     base-10 integer; when that fails the receiver is left unchanged.
//   - Any other input leaves the receiver unchanged.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	var num int64
	if json.Unmarshal(data, &num) == nil {
		*j = jsonInt64(num)
		return nil
	}

	var quoted string
	if json.Unmarshal(data, &quoted) != nil {
		// Neither a number nor a string: keep whatever the receiver holds.
		return nil
	}
	if parsed, err := strconv.ParseInt(strings.TrimSpace(quoted), 10, 64); err == nil {
		*j = jsonInt64(parsed)
	}
	return nil
}
// lsblkDevice is a minimal lsblk JSON record. // lsblkDevice is a minimal lsblk JSON record.
type lsblkDevice struct { type lsblkDevice struct {
Name string `json:"name"` Name string `json:"name"`
Type string `json:"type"` Type string `json:"type"`
Size string `json:"size"` Size string `json:"size"`
Serial string `json:"serial"` Serial string `json:"serial"`
Model string `json:"model"` Model string `json:"model"`
Tran string `json:"tran"` Tran string `json:"tran"`
Hctl string `json:"hctl"` Hctl string `json:"hctl"`
LogSec string `json:"log-sec"` LogSec jsonInt64 `json:"log-sec"`
PhySec string `json:"phy-sec"` PhySec jsonInt64 `json:"phy-sec"`
} }
type lsblkRoot struct { type lsblkRoot struct {
@@ -620,8 +644,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
if s == nil { if s == nil {
return return
} }
logical := parseStorageBytes(dev.LogSec) logical := int64(dev.LogSec)
physical := parseStorageBytes(dev.PhySec) physical := int64(dev.PhySec)
if logical <= 0 && physical <= 0 { if logical <= 0 && physical <= 0 {
return return
} }

View File

@@ -1,6 +1,7 @@
package collector package collector
import ( import (
"encoding/json"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) {
} }
} }
// TestJsonInt64UnmarshalBothFormats checks that jsonInt64 decodes bare JSON
// numbers, quoted numbers and null, and that a complete lsblk document with
// integer log-sec / phy-sec fields unmarshals into lsblkRoot. A decode error
// on those fields would otherwise make the whole device list unavailable.
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
	t.Parallel()

	// Scalar forms: bare numbers, quoted numbers, and null.
	scalars := []struct {
		json string
		want int64
	}{
		{`512`, 512},
		{`4096`, 4096},
		{`"512"`, 512},
		{`"4096"`, 4096},
		{`null`, 0},
	}
	for _, sc := range scalars {
		var got jsonInt64
		if err := got.UnmarshalJSON([]byte(sc.json)); err != nil {
			t.Fatalf("UnmarshalJSON(%s): unexpected error %v", sc.json, err)
		}
		if int64(got) != sc.want {
			t.Fatalf("UnmarshalJSON(%s)=%d want %d", sc.json, int64(got), sc.want)
		}
	}

	// Whole-document form: the integer-typed sector sizes that triggered the
	// original failure.
	doc := []byte(`{
"blockdevices": [
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
]
}`)
	var root lsblkRoot
	if err := json.Unmarshal(doc, &root); err != nil {
		t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
	}
	if len(root.Blockdevices) != 2 {
		t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
	}
	first := root.Blockdevices[0]
	if int64(first.LogSec) != 512 {
		t.Fatalf("LogSec=%d want 512", first.LogSec)
	}
	if int64(first.PhySec) != 4096 {
		t.Fatalf("PhySec=%d want 4096", first.PhySec)
	}
}
func TestBestEffortRescanHotplugStorage(t *testing.T) { func TestBestEffortRescanHotplugStorage(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -38,6 +38,7 @@ exfat-fuse
ntfs-3g ntfs-3g
# Utilities # Utilities
infiniband-diags
bash bash
procps procps
lsof lsof