From 963bc960ca5332d6604cd4f79bedf496f7afd4a8 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 28 May 2026 20:57:04 +0300 Subject: [PATCH] Fix SATA discovery, add NVLink bridge detection, add infiniband-diags - storage: add jsonInt64 dual-format unmarshaler to handle lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON integers, not quoted strings); fixes SATA disks invisible on Debian 12 - pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning) - pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards - packages: add infiniband-diags to ISO; provides ibstat required by nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch (absence causes CUDA_ERROR_SYSTEM_NOT_READY) Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/collector/collector.go | 1 + audit/internal/collector/pcie.go | 51 +++-- .../internal/collector/pcie_nvlink_bridge.go | 206 ++++++++++++++++++ .../collector/pcie_nvlink_bridge_test.go | 124 +++++++++++ audit/internal/collector/storage.go | 46 +++- .../collector/storage_discovery_test.go | 49 +++++ .../config/package-lists/bee.list.chroot | 1 + 7 files changed, 451 insertions(+), 27 deletions(-) create mode 100644 audit/internal/collector/pcie_nvlink_bridge.go create mode 100644 audit/internal/collector/pcie_nvlink_bridge_test.go diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index b906e25..e91434e 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices) snap.PCIeDevices = 
enrichPCIeWithPCISerials(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices) + snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices) diff --git a/audit/internal/collector/pcie.go b/audit/internal/collector/pcie.go index 1e0b324..b766cd9 100644 --- a/audit/internal/collector/pcie.go +++ b/audit/internal/collector/pcie.go @@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice { dev.Status = &status // Slot is the BDF: "0000:00:02.0" - if bdf := fields["Slot"]; bdf != "" { - dev.Slot = &bdf - dev.BDF = &bdf + bdfStr := fields["Slot"] + if bdfStr != "" { + dev.Slot = &bdfStr + dev.BDF = &bdfStr // parse vendor_id and device_id from sysfs - vendorID, deviceID := readPCIIDs(bdf) + vendorID, deviceID := readPCIIDs(bdfStr) if vendorID != 0 { dev.VendorID = &vendorID } if deviceID != 0 { dev.DeviceID = &deviceID } - if numaNode, ok := readPCINumaNode(bdf); ok { + if numaNode, ok := readPCINumaNode(bdfStr); ok { dev.NUMANode = &numaNode } else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok { dev.NUMANode = &numaNode } - if group, ok := readPCIIOMMUGroup(bdf); ok { + if group, ok := readPCIIOMMUGroup(bdfStr); ok { dev.IOMMUGroup = &group } - if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok { + if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok { dev.LinkWidth = &width } - if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok { + if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok { dev.MaxLinkWidth = &width } - if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok { + if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok { linkSpeed := normalizePCILinkSpeed(speed) if linkSpeed != "" { dev.LinkSpeed = 
&linkSpeed } } - if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok { + if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok { linkSpeed := normalizePCILinkSpeed(speed) if linkSpeed != "" { dev.MaxLinkSpeed = &linkSpeed @@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice { // SVendor/SDevice available but not in schema — skip - // Warn if PCIe link is running below its maximum negotiated speed. + // Detect NVLink bridge mezzanine cards (CPU→HGX internal link). + // These are Mellanox x2 devices with no host net interfaces and a DeviceName + // containing "NVLINK". The targeted lspci call is only executed for the small + // number of narrow-link Mellanox cards that pass the cheap pre-filter. + if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) { + markNVLinkBridge(&dev) + } + + // Warn (or Critical for NVLink bridges) if PCIe link is running below max. applyPCIeLinkSpeedWarning(&dev) return dev @@ -265,17 +274,27 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) { return value, true } -// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link -// speed is below the maximum negotiated speed supported by both ends. +// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is +// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards +// get Critical because they are fixed internal connectors that must always train +// to max speed — any downgrade signals a hardware fault. 
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) { if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil { return } - if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) { + if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) { + return + } + desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed) + dev.ErrorDescription = &desc + + isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge" + if isNVLinkBridge { + crit := statusCritical + dev.Status = &crit + } else { warn := statusWarning dev.Status = &warn - desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed) - dev.ErrorDescription = &desc } } diff --git a/audit/internal/collector/pcie_nvlink_bridge.go b/audit/internal/collector/pcie_nvlink_bridge.go new file mode 100644 index 0000000..26057c6 --- /dev/null +++ b/audit/internal/collector/pcie_nvlink_bridge.go @@ -0,0 +1,206 @@ +package collector + +import ( + "bee/audit/internal/schema" + "log/slog" + "os/exec" + "regexp" + "strconv" + "strings" +) + +var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`) + +// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like +// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces. +// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems. +func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool { + if !isMellanoxDevice(dev) { + return false + } + if dev.LinkWidth == nil || *dev.LinkWidth > 2 { + return false + } + if len(netIfacesByBDF(bdf)) > 0 { + return false + } + return true +} + +// confirmNVLinkBridgeDeviceName checks if the lspci DeviceName for bdf contains +// "NVLINK". This is a targeted single-device call, only executed for candidates +// already pre-filtered by isNVLinkBridgeCandidate. 
+func confirmNVLinkBridgeDeviceName(bdf string) bool { + out, err := exec.Command("lspci", "-s", bdf, "-v").Output() + if err != nil { + return false + } + for _, line := range strings.Split(string(out), "\n") { + if strings.Contains(strings.ToUpper(strings.TrimSpace(line)), "NVLINK") { + return true + } + } + return false +} + +// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected +// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the +// correct severity (Critical) is applied. +func markNVLinkBridge(dev *schema.HardwarePCIeDevice) { + class := "NVLinkBridge" + dev.DeviceClass = &class + if dev.Telemetry == nil { + dev.Telemetry = map[string]any{} + } + dev.Telemetry["nvlink_bridge"] = true +} + +// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with +// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it +// adds nvlink_topo_all_active, nvlink_topo_min_links and nvlink_topo_gpu_count to the +// telemetry, and sets nvlink_fabric_affected when a bridge that is already Critical coincides with a topology that is not fully active. Device status itself is not modified here.
+func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice { + hasBridge := false + for _, d := range devs { + if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" { + hasBridge = true + break + } + } + if !hasBridge { + return devs + } + + topo, err := queryNVIDIANVLinkTopo() + if err != nil { + slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err) + return devs + } + + for i := range devs { + if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" { + continue + } + if devs[i].Telemetry == nil { + devs[i].Telemetry = map[string]any{} + } + devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive + devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks + devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount + + // If the bridge status is already Critical (e.g. PCIe link speed downgrade) AND the + // GPU topology is not fully active, record nvlink_fabric_affected in telemetry. + if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive { + devs[i].Telemetry["nvlink_fabric_affected"] = true + } + } + + slog.Info("nvlink-bridge: topo cross-reference applied", + "gpu_count", topo.GPUCount, + "all_active", topo.AllActive, + "min_links", topo.MinNVLinks, + ) + return devs +} + +// nvlinkTopoResult summarises the GPU NVLink connectivity matrix. +type nvlinkTopoResult struct { + GPUCount int + AllActive bool // true if at least one NV# cell was parsed and none was NV0; non-NV GPU cells are ignored + MinNVLinks int // minimum NV# count across GPU×GPU cells (0 = an NV0 cell was seen, or no NV# cell at all) +} + +// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix. 
+func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) { + out, err := exec.Command("nvidia-smi", "topo", "-m").Output() + if err != nil { + return nvlinkTopoResult{}, err + } + return parseNVIDIATopologyMatrix(string(out)), nil +} + +// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the +// nvidia-smi topo -m matrix. +// +// Format (abbreviated): +// +// GPU0 GPU1 ... NIC0 NIC1 +// GPU0 X NV18 ... NODE NODE +// GPU1 NV18 X ... NODE NODE +// NIC0 NODE NODE... X PIX +// +// The header row starts with "GPU0"; its columns may include non-GPU entries +// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are +// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped. +func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult { + lines := strings.Split(raw, "\n") + + // Locate the header line and record which column indices are GPU columns. + headerIdx := -1 + var gpuColIndices []int // 0-based indices within fields (excluding the row label) + var gpuCount int + for i, line := range lines { + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "GPU0") { + parts := strings.Fields(trimmed) + for j, col := range parts { + if strings.HasPrefix(col, "GPU") { + gpuColIndices = append(gpuColIndices, j) + } + } + gpuCount = len(gpuColIndices) + if gpuCount >= 2 { + headerIdx = i + } + break + } + } + if headerIdx < 0 || gpuCount == 0 { + return nvlinkTopoResult{} + } + + minLinks := -1 // -1 = no NV pair seen yet + allActive := true + + for _, line := range lines[headerIdx+1:] { + trimmed := strings.TrimSpace(line) + if !strings.HasPrefix(trimmed, "GPU") { + continue + } + cells := strings.Fields(trimmed) + // cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values. + // gpuColIndices are 0-based within the header fields, so they map to + // cells[idx+1] in the data rows (shift by 1 for the row label). 
+ for _, colIdx := range gpuColIndices { + dataIdx := colIdx + 1 + if dataIdx >= len(cells) { + continue + } + cell := cells[dataIdx] + m := nv5re.FindStringSubmatch(cell) + if len(m) != 2 { + continue + } + n, err := strconv.Atoi(m[1]) + if err != nil { + continue + } + if n == 0 { + allActive = false + } + if minLinks < 0 || n < minLinks { + minLinks = n + } + } + } + + if minLinks < 0 { + minLinks = 0 + } + + return nvlinkTopoResult{ + GPUCount: gpuCount, + AllActive: allActive && minLinks > 0, + MinNVLinks: minLinks, + } +} diff --git a/audit/internal/collector/pcie_nvlink_bridge_test.go b/audit/internal/collector/pcie_nvlink_bridge_test.go new file mode 100644 index 0000000..126a08b --- /dev/null +++ b/audit/internal/collector/pcie_nvlink_bridge_test.go @@ -0,0 +1,124 @@ +package collector + +import ( + "bee/audit/internal/schema" + "testing" +) + +func TestParseNVIDIATopologyMatrix(t *testing.T) { + t.Parallel() + + // Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18. + input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 +GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE +GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE +GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE +GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE +GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS +GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS +GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS +GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS +NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX +` + got := parseNVIDIATopologyMatrix(input) + + if got.GPUCount != 8 { + t.Fatalf("GPUCount=%d want 8", got.GPUCount) + } + if !got.AllActive { + t.Fatalf("AllActive=false want true") + } + if got.MinNVLinks != 18 { + t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks) + } +} + +func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) { + t.Parallel() + + // GPU1-GPU3 pair shows NV12 (reduced) instead of NV18. 
+ input := ` GPU0 GPU1 GPU2 GPU3 +GPU0 X NV18 NV18 NV18 +GPU1 NV18 X NV18 NV12 +GPU2 NV18 NV18 X NV18 +GPU3 NV18 NV12 NV18 X +` + got := parseNVIDIATopologyMatrix(input) + + if got.MinNVLinks != 12 { + t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks) + } + if !got.AllActive { + t.Fatalf("AllActive=false want true (12 links is still active)") + } +} + +func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) { + t.Parallel() + + // GPU0-GPU1 pair fully disconnected (NV0). + input := ` GPU0 GPU1 +GPU0 X NV0 +GPU1 NV0 X +` + got := parseNVIDIATopologyMatrix(input) + + if got.AllActive { + t.Fatalf("AllActive=true want false (NV0 means no links)") + } + if got.MinNVLinks != 0 { + t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks) + } +} + +func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) { + t.Parallel() + + got := parseNVIDIATopologyMatrix("no gpus here") + if got.GPUCount != 0 { + t.Fatalf("GPUCount=%d want 0", got.GPUCount) + } +} + +func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) { + t.Parallel() + + bridgeClass := "NVLinkBridge" + linkSpeed := "Gen3" + maxLinkSpeed := "Gen4" + dev := schema.HardwarePCIeDevice{} + dev.DeviceClass = &bridgeClass + dev.LinkSpeed = &linkSpeed + dev.MaxLinkSpeed = &maxLinkSpeed + s := statusOK + dev.Status = &s + + applyPCIeLinkSpeedWarning(&dev) + + if dev.Status == nil || *dev.Status != statusCritical { + t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status) + } + if dev.ErrorDescription == nil { + t.Fatal("ErrorDescription nil, want degradation message") + } +} + +func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) { + t.Parallel() + + regularClass := "NetworkController" + linkSpeed := "Gen3" + maxLinkSpeed := "Gen4" + dev := schema.HardwarePCIeDevice{} + dev.DeviceClass = &regularClass + dev.LinkSpeed = &linkSpeed + dev.MaxLinkSpeed = &maxLinkSpeed + s := statusOK + dev.Status = &s + + applyPCIeLinkSpeedWarning(&dev) + + if dev.Status == nil || *dev.Status != 
statusWarning { + t.Fatalf("status=%v want Warning for regular card degradation", dev.Status) + } +} diff --git a/audit/internal/collector/storage.go b/audit/internal/collector/storage.go index 202c3ff..aec8d54 100644 --- a/audit/internal/collector/storage.go +++ b/audit/internal/collector/storage.go @@ -66,17 +66,41 @@ func collectStorage() []schema.HardwareStorage { return result } +// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string. +// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12) +// but older versions emit them as strings. This type handles both. +type jsonInt64 int64 + +func (j *jsonInt64) UnmarshalJSON(data []byte) error { + // bare number: 512 + var n int64 + if err := json.Unmarshal(data, &n); err == nil { + *j = jsonInt64(n) + return nil + } + // quoted string: "512" + var s string + if err := json.Unmarshal(data, &s); err == nil { + n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64) + if err == nil { + *j = jsonInt64(n) + } + return nil + } + return nil // null or unexpected type — leave zero +} + // lsblkDevice is a minimal lsblk JSON record. 
type lsblkDevice struct { - Name string `json:"name"` - Type string `json:"type"` - Size string `json:"size"` - Serial string `json:"serial"` - Model string `json:"model"` - Tran string `json:"tran"` - Hctl string `json:"hctl"` - LogSec string `json:"log-sec"` - PhySec string `json:"phy-sec"` + Name string `json:"name"` + Type string `json:"type"` + Size string `json:"size"` + Serial string `json:"serial"` + Model string `json:"model"` + Tran string `json:"tran"` + Hctl string `json:"hctl"` + LogSec jsonInt64 `json:"log-sec"` + PhySec jsonInt64 `json:"phy-sec"` } type lsblkRoot struct { @@ -620,8 +644,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) { if s == nil { return } - logical := parseStorageBytes(dev.LogSec) - physical := parseStorageBytes(dev.PhySec) + logical := int64(dev.LogSec) + physical := int64(dev.PhySec) if logical <= 0 && physical <= 0 { return } diff --git a/audit/internal/collector/storage_discovery_test.go b/audit/internal/collector/storage_discovery_test.go index 46d78d2..bb259da 100644 --- a/audit/internal/collector/storage_discovery_test.go +++ b/audit/internal/collector/storage_discovery_test.go @@ -1,6 +1,7 @@ package collector import ( + "encoding/json" "os" "os/exec" "path/filepath" @@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) { } } +func TestJsonInt64UnmarshalBothFormats(t *testing.T) { + t.Parallel() + + // util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers. + // Older versions emit quoted strings. Both must parse without error + // so that the entire lsblkDevices() call does not return nil on Debian 12. 
+ cases := []struct { + json string + want int64 + }{ + {`512`, 512}, + {`4096`, 4096}, + {`"512"`, 512}, + {`"4096"`, 4096}, + {`null`, 0}, + } + for _, tc := range cases { + var v jsonInt64 + if err := v.UnmarshalJSON([]byte(tc.json)); err != nil { + t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err) + } + if int64(v) != tc.want { + t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want) + } + } + + // Simulate the exact JSON shape that triggered the bug on Debian 12. + input := []byte(`{ + "blockdevices": [ + {"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096}, + {"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096} + ] + }`) + var root lsblkRoot + if err := json.Unmarshal(input, &root); err != nil { + t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err) + } + if len(root.Blockdevices) != 2 { + t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices)) + } + if int64(root.Blockdevices[0].LogSec) != 512 { + t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec) + } + if int64(root.Blockdevices[0].PhySec) != 4096 { + t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec) + } +} + func TestBestEffortRescanHotplugStorage(t *testing.T) { t.Parallel() diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index 7d4fd5d..3c043a2 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -38,6 +38,7 @@ exfat-fuse ntfs-3g # Utilities +infiniband-diags bash procps lsof