Add dedicated "correctable" PCIe AER patterns and split GPU/PCIe kmsg keys.

Review notes:
  * The correctable patterns anchor on `\bcorrect` so that "Uncorrected" /
    "Uncorrectable" AER reports are not classified as correctable warnings.
  * This patch was restored from a copy whose line breaks and indentation had
    been collapsed. Leading whitespace and field alignment are reconstructed;
    apply with whitespace-tolerant tooling (`git apply --ignore-whitespace`
    or `patch -l`) and run gofmt afterwards.
  * The post-image blob hash on the error_patterns.go `index` line predates
    the regex change and is informational only.

diff --git a/audit/internal/platform/error_patterns.go b/audit/internal/platform/error_patterns.go
index 21bfdb6..5f53259 100644
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -38,6 +38,17 @@ var HardwareErrorPatterns = []ErrorPattern{
 		Category: "gpu",
 		Severity: "warning",
 	},
+	// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
+	// Severity is warning (not critical): correctable errors are hardware-recovered.
+	// \b keeps "Uncorrected"/"Uncorrectable" from matching here; those lines
+	// should only match the generic nvidia-aer pattern below.
+	{
+		Name:     "nvidia-aer-correctable",
+		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*\bcorrect`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
 	{
 		Name:     "nvidia-aer",
 		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
@@ -54,6 +65,17 @@ var HardwareErrorPatterns = []ErrorPattern{
 	},
 
 	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
+	// PCIe AER correctable from the root port — captures the reported device BDF
+	// (second BDF in "pcieport X: AER: Correctable error received: Y").
+	// \b keeps "Uncorrected"/"Uncorrectable" reports from being classified as
+	// correctable; those should only match the generic pcie-aer pattern below.
+	{
+		Name:     "pcie-aer-correctable",
+		Re:       mustPat(`(?i)pcieport.*AER:.*\bcorrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
 	{
 		Name:     "pcie-aer",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go
index 0132956..cfc8903 100644
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -165,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 		for _, id := range evt.ids {
 			var key string
 			switch evt.category {
-			case "gpu", "pcie":
+			case "gpu":
+				key = "pcie:gpu:" + normalizeBDF(id)
+			case "pcie":
 				key = "pcie:" + normalizeBDF(id)
 			case "storage":
 				key = "storage:" + id
@@ -218,7 +220,9 @@ func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
 	for _, id := range evt.ids {
 		var key string
 		switch evt.category {
-		case "gpu", "pcie":
+		case "gpu":
+			key = "pcie:gpu:" + normalizeBDF(id)
+		case "pcie":
 			key = "pcie:" + normalizeBDF(id)
 		case "storage":
 			key = "storage:" + id