package platform

import "regexp"

// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
	// Name is a short machine-readable label for logging and deduplication.
	Name string
	// Re is the compiled regular expression matched against a single kmsg line.
	Re *regexp.Regexp
	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
	Category string
	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
	Severity string
	// BDFGroup is the 1-based index of the capture group in Re that contains a
	// PCIe BDF address (e.g. "0000:c8:00.0"). 0 means this pattern captures no BDF.
	BDFGroup int
	// DevGroup is the 1-based index of the capture group in Re that contains a
	// device name (e.g. "sda", "nvme0"). 0 means this pattern captures no device name.
	DevGroup int
}

// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
var HardwareErrorPatterns = []ErrorPattern{ // ── GPU / NVIDIA ──────────────────────────────────────────────────────────── { Name: "nvidia-rminitadapter", Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`), Category: "gpu", Severity: "warning", BDFGroup: 1, }, { Name: "nvidia-msi-fail", Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`), Category: "gpu", Severity: "warning", }, { Name: "nvidia-aer", Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), Category: "gpu", Severity: "warning", BDFGroup: 1, }, { Name: "nvidia-xid", Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`), Category: "gpu", Severity: "warning", BDFGroup: 1, }, // ── PCIe AER (generic) ────────────────────────────────────────────────────── { Name: "pcie-aer", Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), Category: "pcie", Severity: "warning", BDFGroup: 1, }, { Name: "pcie-uncorrectable", Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`), Category: "pcie", Severity: "warning", BDFGroup: 1, }, { Name: "pcie-link-down", Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`), Category: "pcie", Severity: "warning", BDFGroup: 1, }, // ── Storage ───────────────────────────────────────────────────────────────── { Name: "blk-io-error", Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`), Category: "storage", Severity: "warning", DevGroup: 1, }, { Name: "nvme-timeout", Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`), Category: "storage", Severity: "warning", DevGroup: 1, }, { Name: "scsi-failed", Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`), Category: "storage", Severity: "warning", }, { Name: "nvme-reset", Re: mustPat(`(?i)nvme\s+(\w+):.*reset`), Category: "storage", Severity: "warning", DevGroup: 1, }, // ── Machine Check Exceptions ──────────────────────────────────────────────── { Name: "mce-hardware-error", Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`), 
Category: "mce", Severity: "warning", }, { Name: "mce-corrected", Re: mustPat(`(?i)mce:.*[Cc]orrected`), Category: "mce", Severity: "warning", }, // ── Memory ───────────────────────────────────────────────────────────────── { Name: "edac-ue", Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`), Category: "memory", Severity: "warning", }, { Name: "edac-ce", Re: mustPat(`(?i)EDAC.*[Cc]orrectable`), Category: "memory", Severity: "warning", }, } func mustPat(s string) *regexp.Regexp { return regexp.MustCompile(s) }