diff --git a/audit/internal/collector/pcie.go b/audit/internal/collector/pcie.go index b766cd9..d2384b5 100644 --- a/audit/internal/collector/pcie.go +++ b/audit/internal/collector/pcie.go @@ -278,6 +278,11 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) { // below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards // get Critical because they are fixed internal connectors that must always train // to max speed — any downgrade signals a hardware fault. +// +// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and +// their link state has no operational impact. This covers management endpoints +// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never +// activates but that lspci still reports with link stats. func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) { if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil { return @@ -285,6 +290,11 @@ func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) { if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) { return } + if dev.BDF != nil { + if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 { + return + } + } desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed) dev.ErrorDescription = &desc diff --git a/bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md b/bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md new file mode 100644 index 0000000..72f5fb5 --- /dev/null +++ b/bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md @@ -0,0 +1,41 @@ +# Decision: Skip PCIe link-speed warnings for disabled devices + +**Date:** 2026-06-12 +**Status:** active + +## Context + +On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch +(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a +"Memory controller". 
Its upstream link trains at Gen3 x2 while the device is +capable of Gen4 x16. The device is permanently in a disabled state: memory access +and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<BDF>/enable` +reads `0`. + +This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it +carries only management traffic at low bandwidth and is intentionally not activated +by any Linux driver. The bee audit was reporting a `statusWarning` with message +"PCIe link speed degraded" for this device, which is misleading because the device +is not in the data path. + +## Decision + +`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<BDF>/enable` via the +existing `readPCIIntAttribute` helper. If the value is `0` the function returns +early without setting any warning status. + +The check is vendor-agnostic: it applies to any PCIe device that Linux has not +activated, regardless of make or model. This is consistent with the +`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is +used as a condition. + +## Consequences + +- PCIe fabric management endpoints, IPMI virtual devices, and other permanently + disabled PCIe functions no longer produce spurious link-degradation warnings. +- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges) + continues to be detected and reported as before. +- NVLink bridge cards retain their existing `statusCritical` path (they are always + enabled, so the early return is never taken for them). +- The Switchtec device on HGX H100 boards shows `statusOK` with no + `error_description` in the audit JSON. diff --git a/bible-local/decisions/README.md b/bible-local/decisions/README.md index a9251e0..98a0c0a 100644 --- a/bible-local/decisions/README.md +++ b/bible-local/decisions/README.md @@ -7,3 +7,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`. 
| 2026-03-05 | Use NVIDIA proprietary driver | active | | 2026-04-01 | Treat memtest as explicit ISO content | active | | 2026-04-29 | Treat embedded submodules as read-only | active | +| 2026-06-12 | Skip PCIe link-speed warnings for disabled devices | active |