- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON integers, not quoted strings); fixes SATA disks invisible on Debian 12 - pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning) - pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards - packages: add infiniband-diags to ISO; provides ibstat required by nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch (absence causes CUDA_ERROR_SYSTEM_NOT_READY) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
168 lines
4.8 KiB
Go
168 lines
4.8 KiB
Go
package collector
|
|
|
|
import (
|
|
"encoding/json"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
got := mergeStorageDevice(
|
|
lsblkDevice{Name: "nvme0n1", Type: "disk", Tran: "nvme"},
|
|
lsblkDevice{Name: "nvme0n1", Type: "disk", Size: "1024", Serial: "SN123", Model: "Kioxia"},
|
|
)
|
|
|
|
if got.Serial != "SN123" {
|
|
t.Fatalf("serial=%q want SN123", got.Serial)
|
|
}
|
|
if got.Model != "Kioxia" {
|
|
t.Fatalf("model=%q want Kioxia", got.Model)
|
|
}
|
|
if got.Size != "1024" {
|
|
t.Fatalf("size=%q want 1024", got.Size)
|
|
}
|
|
}
|
|
|
|
func TestParseStorageBytes(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
if got := parseStorageBytes(" 2048 "); got != 2048 {
|
|
t.Fatalf("parseStorageBytes=%d want 2048", got)
|
|
}
|
|
if got := parseStorageBytes("1.92 TB"); got != 0 {
|
|
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
|
}
|
|
}
|
|
|
|
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
|
// Older versions emit quoted strings. Both must parse without error
|
|
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
|
cases := []struct {
|
|
json string
|
|
want int64
|
|
}{
|
|
{`512`, 512},
|
|
{`4096`, 4096},
|
|
{`"512"`, 512},
|
|
{`"4096"`, 4096},
|
|
{`null`, 0},
|
|
}
|
|
for _, tc := range cases {
|
|
var v jsonInt64
|
|
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
|
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
|
}
|
|
if int64(v) != tc.want {
|
|
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
|
}
|
|
}
|
|
|
|
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
|
input := []byte(`{
|
|
"blockdevices": [
|
|
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
|
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
|
]
|
|
}`)
|
|
var root lsblkRoot
|
|
if err := json.Unmarshal(input, &root); err != nil {
|
|
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
|
}
|
|
if len(root.Blockdevices) != 2 {
|
|
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
|
}
|
|
if int64(root.Blockdevices[0].LogSec) != 512 {
|
|
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
|
}
|
|
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
|
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
|
}
|
|
}
|
|
|
|
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tmp := t.TempDir()
|
|
rescanPath := filepath.Join(tmp, "pci-rescan")
|
|
scanDir := filepath.Join(tmp, "scsi_host")
|
|
host0Path := filepath.Join(scanDir, "host0", "scan")
|
|
host1Path := filepath.Join(scanDir, "host1", "scan")
|
|
argsPath := filepath.Join(tmp, "udevadm-args")
|
|
toolPath := filepath.Join(tmp, "udevadm")
|
|
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
|
t.Fatalf("mkdir host0: %v", err)
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
|
t.Fatalf("mkdir host1: %v", err)
|
|
}
|
|
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
|
t.Fatalf("touch host0 scan: %v", err)
|
|
}
|
|
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
|
t.Fatalf("touch host1 scan: %v", err)
|
|
}
|
|
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
|
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
|
t.Fatalf("write udevadm stub: %v", err)
|
|
}
|
|
|
|
oldPath := os.Getenv("PATH")
|
|
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
|
t.Fatalf("set PATH: %v", err)
|
|
}
|
|
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
|
|
|
oldRescanPath := pciRescanPath
|
|
oldSCSIGlob := scsiHostScanGlob
|
|
oldWriteFile := hotplugWriteFile
|
|
oldExecCommand := hotplugExecCommand
|
|
oldGlob := hotplugGlob
|
|
pciRescanPath = rescanPath
|
|
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
|
hotplugWriteFile = os.WriteFile
|
|
hotplugExecCommand = exec.Command
|
|
hotplugGlob = filepath.Glob
|
|
defer func() {
|
|
pciRescanPath = oldRescanPath
|
|
scsiHostScanGlob = oldSCSIGlob
|
|
hotplugWriteFile = oldWriteFile
|
|
hotplugExecCommand = oldExecCommand
|
|
hotplugGlob = oldGlob
|
|
}()
|
|
|
|
bestEffortRescanHotplugStorage()
|
|
|
|
raw, err := os.ReadFile(rescanPath)
|
|
if err != nil {
|
|
t.Fatalf("read rescan file: %v", err)
|
|
}
|
|
if string(raw) != "1\n" {
|
|
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
|
}
|
|
for _, path := range []string{host0Path, host1Path} {
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
t.Fatalf("read scsi scan file %s: %v", path, err)
|
|
}
|
|
if string(raw) != "- - -\n" {
|
|
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
|
}
|
|
}
|
|
|
|
args, err := os.ReadFile(argsPath)
|
|
if err != nil {
|
|
t.Fatalf("read udevadm args: %v", err)
|
|
}
|
|
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
|
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
|
}
|
|
}
|