Track PCIe AER correctable errors; fix GPU status key routing
Add nvidia-aer-correctable and pcie-aer-correctable patterns to catch the "bus correctable error" events seen in the SEL (Critical Interrupt / offset 7). Both patterns carry severity "warning" — correctable errors are hardware-recovered and should not flag a card as failed. Fix kmsg_watcher routing: GPU-category events were keyed as pcie:<BDF>, but the UI queries for the pcie:gpu: prefix. Split the combined switch case so that "gpu" → pcie:gpu:<BDF> and "pcie" → pcie:<BDF>. This applies to both flushWindow (SAT-window path) and flushImmediate (always-on path). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
// PCIe AER correctable error reported by the NVIDIA driver — shows up as "bus correctable error" in the SEL.
|
||||
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||
{
|
||||
Name: "nvidia-aer-correctable",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
// PCIe AER correctable error reported by the root port — captures the reported device BDF
|
||||
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||
{
|
||||
Name: "pcie-aer-correctable",
|
||||
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
|
||||
@@ -165,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
@@ -218,7 +220,9 @@ func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
|
||||
Reference in New Issue
Block a user