From 805a3b277d3676736ef4a4b0f6f685238172d7ee Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Fri, 8 May 2026 12:50:14 +0300 Subject: [PATCH] Track PCIe AER correctable errors; fix GPU status key routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add nvidia-aer-correctable and pcie-aer-correctable patterns to catch "bus correctable error" events seen in the SEL (Critical Interrupt / offset 7). Both patterns carry severity "warning" — correctable errors are hardware-recovered and should not flag a card as failed. Fix kmsg_watcher routing: GPU-category events were keyed as pcie:<BDF>, but the UI queries for the pcie:gpu: prefix. Split the switch so "gpu" → pcie:gpu:<BDF> and "pcie" → pcie:<BDF>. This applies to both flushWindow (SAT-window path) and flushImmediate (always-on path). Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/error_patterns.go | 18 ++++++++++++++++++ audit/internal/webui/kmsg_watcher.go | 8 ++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/audit/internal/platform/error_patterns.go b/audit/internal/platform/error_patterns.go index 21bfdb6..5f53259 100644 --- a/audit/internal/platform/error_patterns.go +++ b/audit/internal/platform/error_patterns.go @@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{ Category: "gpu", Severity: "warning", }, + // PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL. + // Severity is warning (not critical): correctable errors are hardware-recovered. 
+ { + Name: "nvidia-aer-correctable", + Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`), + Category: "gpu", + Severity: "warning", + BDFGroup: 1, + }, { Name: "nvidia-aer", Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), @@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{ }, // ── PCIe AER (generic) ────────────────────────────────────────────────────── + // PCIe AER correctable from the root port — captures the reported device BDF + // (second BDF in "pcieport X: AER: Correctable error received: Y"). + { + Name: "pcie-aer-correctable", + Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`), + Category: "pcie", + Severity: "warning", + BDFGroup: 1, + }, { Name: "pcie-aer", Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go index 0132956..cfc8903 100644 --- a/audit/internal/webui/kmsg_watcher.go +++ b/audit/internal/webui/kmsg_watcher.go @@ -165,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) { for _, id := range evt.ids { var key string switch evt.category { - case "gpu", "pcie": + case "gpu": + key = "pcie:gpu:" + normalizeBDF(id) + case "pcie": key = "pcie:" + normalizeBDF(id) case "storage": key = "storage:" + id @@ -218,7 +220,9 @@ func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) { for _, id := range evt.ids { var key string switch evt.category { - case "gpu", "pcie": + case "gpu": + key = "pcie:gpu:" + normalizeBDF(id) + case "pcie": key = "pcie:" + normalizeBDF(id) case "storage": key = "storage:" + id