- Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct
- Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; an OK status never downgrades an existing Warning or Critical
- Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks and writes Warning to the DB for matched hardware errors
- Fix task status: overall_status=FAILED in summary.txt now marks the task as failed
- Audit routine overlays component DB statuses into bee-audit.json on every read

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
140 lines
4.5 KiB
Go
140 lines
4.5 KiB
Go
package platform
|
|
|
|
import "regexp"
|
|
|
|
// ErrorPattern is one entry in the table of kernel log signatures that are
// treated as evidence of a hardware error.
// New signatures are registered by appending to HardwareErrorPatterns; no
// other code has to change.
type ErrorPattern struct {
	// Name is a short, machine-readable label used for logging and
	// deduplication.
	Name string

	// Re is the compiled expression applied to one kmsg line at a time.
	Re *regexp.Regexp

	// Category groups related errors; one of "gpu", "pcie", "storage",
	// "mce", "memory" or "cpu".
	Category string

	// Severity is "warning" for recoverable or uncertain faults and
	// "critical" for definitive failures.
	Severity string

	// BDFGroup is the 1-based index of the capture group in Re holding a PCIe
	// BDF address such as "0000:c8:00.0". Zero means Re captures no BDF.
	BDFGroup int

	// DevGroup is the 1-based index of the capture group in Re holding a
	// device name such as "sda" or "nvme0". Zero means Re captures no device
	// name.
	DevGroup int
}
|
|
|
|
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
|
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
|
var HardwareErrorPatterns = []ErrorPattern{
|
|
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
|
{
|
|
Name: "nvidia-rminitadapter",
|
|
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
|
Category: "gpu",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
{
|
|
Name: "nvidia-msi-fail",
|
|
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
|
Category: "gpu",
|
|
Severity: "warning",
|
|
},
|
|
{
|
|
Name: "nvidia-aer",
|
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
|
Category: "gpu",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
{
|
|
Name: "nvidia-xid",
|
|
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
|
Category: "gpu",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
|
|
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
|
{
|
|
Name: "pcie-aer",
|
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
|
Category: "pcie",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
{
|
|
Name: "pcie-uncorrectable",
|
|
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
|
Category: "pcie",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
{
|
|
Name: "pcie-link-down",
|
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
|
Category: "pcie",
|
|
Severity: "warning",
|
|
BDFGroup: 1,
|
|
},
|
|
|
|
// ── Storage ─────────────────────────────────────────────────────────────────
|
|
{
|
|
Name: "blk-io-error",
|
|
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
|
Category: "storage",
|
|
Severity: "warning",
|
|
DevGroup: 1,
|
|
},
|
|
{
|
|
Name: "nvme-timeout",
|
|
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
|
Category: "storage",
|
|
Severity: "warning",
|
|
DevGroup: 1,
|
|
},
|
|
{
|
|
Name: "scsi-failed",
|
|
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
|
Category: "storage",
|
|
Severity: "warning",
|
|
},
|
|
{
|
|
Name: "nvme-reset",
|
|
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
|
Category: "storage",
|
|
Severity: "warning",
|
|
DevGroup: 1,
|
|
},
|
|
|
|
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
|
{
|
|
Name: "mce-hardware-error",
|
|
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
|
Category: "mce",
|
|
Severity: "warning",
|
|
},
|
|
{
|
|
Name: "mce-corrected",
|
|
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
|
Category: "mce",
|
|
Severity: "warning",
|
|
},
|
|
|
|
// ── Memory ─────────────────────────────────────────────────────────────────
|
|
{
|
|
Name: "edac-ue",
|
|
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
|
Category: "memory",
|
|
Severity: "warning",
|
|
},
|
|
{
|
|
Name: "edac-ce",
|
|
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
|
Category: "memory",
|
|
Severity: "warning",
|
|
},
|
|
}
|
|
|
|
// mustPat compiles expr into a regular expression and panics if expr is not
// valid syntax. It is used by the package-level pattern table, so a malformed
// expression surfaces as soon as the package is initialised.
func mustPat(expr string) *regexp.Regexp {
	compiled := regexp.MustCompile(expr)
	return compiled
}
|