Files
bee/audit/internal/platform/error_patterns.go
Mikhail Chusavitin fd722692a4 feat(watchdog): hardware error monitor + unified component status store
- Add platform/error_patterns.go: pluggable table of kernel log patterns
  (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct
- Add app/component_status_db.go: persistent JSON store (component-status.json)
  keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never
  downgrades Warning or Critical
- Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks,
  writes Warning to DB for matched hardware errors
- Fix task status: overall_status=FAILED in summary.txt now marks task failed
- Audit routine overlays component DB statuses into bee-audit.json on every read

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 19:20:59 +03:00

140 lines
4.5 KiB
Go

package platform
import "regexp"
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
// Name is a short machine-readable label for logging and deduplication.
Name string
// Re is the compiled regular expression matched against a single kmsg line.
Re *regexp.Regexp
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
Category string
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
Severity string
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
BDFGroup int
// DevGroup is the capture group index (1-based) that contains a device name
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
DevGroup int
}
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
var HardwareErrorPatterns = []ErrorPattern{
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
{
Name: "nvidia-rminitadapter",
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-msi-fail",
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
Category: "gpu",
Severity: "warning",
},
{
Name: "nvidia-aer",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-xid",
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
{
Name: "pcie-aer",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-uncorrectable",
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-link-down",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
// ── Storage ─────────────────────────────────────────────────────────────────
{
Name: "blk-io-error",
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
{
Name: "nvme-timeout",
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
{
Name: "scsi-failed",
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
Category: "storage",
Severity: "warning",
},
{
Name: "nvme-reset",
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
// ── Machine Check Exceptions ────────────────────────────────────────────────
{
Name: "mce-hardware-error",
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
Category: "mce",
Severity: "warning",
},
{
Name: "mce-corrected",
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
Category: "mce",
Severity: "warning",
},
// ── Memory ─────────────────────────────────────────────────────────────────
{
Name: "edac-ue",
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
Category: "memory",
Severity: "warning",
},
{
Name: "edac-ce",
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
Category: "memory",
Severity: "warning",
},
}
func mustPat(s string) *regexp.Regexp {
return regexp.MustCompile(s)
}