Skip PCIe link-speed warnings for disabled devices
Disabled PCIe devices (sysfs enable==0) carry no data traffic; their link state has no operational impact. Switchtec PCIe switch management endpoints on NVIDIA HGX H100 baseboards (and similar fabric controllers) train at reduced speed intentionally and were producing spurious warnings. Check is vendor-agnostic: reads enable attribute via existing helper, no vendor/device ID hardcoding. Documented in bible-local/decisions/2026-06-12-pcie-disabled-device-link-warning.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -278,6 +278,11 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||
// get Critical because they are fixed internal connectors that must always train
|
||||
// to max speed — any downgrade signals a hardware fault.
|
||||
//
|
||||
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||
// their link state has no operational impact. This covers management endpoints
|
||||
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||
// activates but that lspci still reports with link stats.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
@@ -285,6 +290,11 @@ func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
return
|
||||
}
|
||||
if dev.BDF != nil {
|
||||
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||
return
|
||||
}
|
||||
}
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
|
||||
|
||||
Reference in New Issue
Block a user