Track PCIe AER correctable errors; fix GPU status key routing

Add nvidia-aer-correctable and pcie-aer-correctable patterns to catch
"bus correctable error" events seen in SEL (Critical Interrupt / offset 7).
Both patterns carry severity "warning" — correctable errors are
hardware-recovered and should not flag a card as failed.

Fix kmsg_watcher routing: GPU-category events were keyed as pcie:<BDF>,
but the UI queries for the pcie:gpu: prefix. Split the switch case so that
"gpu" → pcie:gpu:<BDF> and "pcie" → pcie:<BDF>. This applies to both
flushWindow (SAT-window path) and flushImmediate (always-on path).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-05-08 12:50:14 +03:00
parent 5bc9bd7fb3
commit 805a3b277d
2 changed files with 24 additions and 2 deletions

View File

@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
Category: "gpu",
Severity: "warning",
},
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
// Severity is warning (not critical): correctable errors are hardware-recovered.
// The \b word boundary requires "correct" to start a word, so lines that say
// "Uncorrectable"/"Uncorrected" are NOT matched here and are left to the generic
// nvidia-aer pattern that follows. (?i) already covers case, so no [Cc] class.
{
Name: "nvidia-aer-correctable",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*\bcorrect`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-aer",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
},
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
// PCIe AER correctable from the root port — captures the reported device BDF
// (second BDF in "pcieport X: AER: Correctable error received: Y").
// The \b word boundary requires "correct" to start a word, so
// "AER: Uncorrected (Fatal) error received: Y" and "Uncorrectable" lines are
// NOT downgraded to a warning by this pattern; they are left to the generic
// pcie-aer pattern that follows. (?i) already covers case, so no [Cc] class.
{
Name: "pcie-aer-correctable",
Re: mustPat(`(?i)pcieport.*AER:.*\bcorrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-aer",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),

View File

@@ -165,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
case "gpu":
key = "pcie:gpu:" + normalizeBDF(id)
case "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id
@@ -218,7 +220,9 @@ func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
case "gpu":
key = "pcie:gpu:" + normalizeBDF(id)
case "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id