feat(watchdog): hardware error monitor + unified component status store
- Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -136,7 +138,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
|
||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
|
||||
type ComponentStatusEntry struct {
|
||||
At time.Time `json:"at"`
|
||||
Status string `json:"status"`
|
||||
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity returns a numeric severity so higher values win.
|
||||
func componentSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func satStatusToDBStatus(overall string) string {
|
||||
switch overall {
|
||||
case "OK":
|
||||
return "OK"
|
||||
case "FAILED":
|
||||
return "Warning"
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return "Unknown"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
func extractArchivePath(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if strings.HasSuffix(s, ".tar.gz") {
|
||||
parts := strings.Fields(s)
|
||||
if len(parts) > 0 {
|
||||
return parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func parseSATKV(raw string) map[string]string {
|
||||
kv := make(map[string]string)
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||
if ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return kv
|
||||
}
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||
func dbStatusToSATStatus(s string) string {
|
||||
switch strings.TrimSpace(s) {
|
||||
case "OK", "Warning", "Critical", "Unknown":
|
||||
return s
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||
func sanitizeBDFForLookup(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||
return ""
|
||||
}
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
bdf = "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
230
audit/internal/webui/kmsg_watcher.go
Normal file
230
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,230 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// During an active SAT task window it records matching lines; on task finish
|
||||
// it writes Warning status records to the component status DB.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeWindow *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
taskID string
|
||||
target string
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
type kmsgEventKey struct {
|
||||
id string // BDF or device name
|
||||
category string
|
||||
}
|
||||
|
||||
type kmsgEvent struct {
|
||||
timestamp time.Time
|
||||
raw string
|
||||
ids []string // BDF addresses or device names extracted
|
||||
category string
|
||||
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
go w.run()
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.activeWindow != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
// Events without a device ID (e.g. MCE) — deduplicate by category.
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted opens a new event window for the given SAT task.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.activeWindow = &kmsgWindow{
|
||||
taskID: taskID,
|
||||
target: target,
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished closes the event window and asynchronously writes status records.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
window := w.activeWindow
|
||||
if window != nil && window.taskID == taskID {
|
||||
w.activeWindow = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
go w.flushWindow(window)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||
func normalizeBDF(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
return "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -164,6 +164,8 @@ type handler struct {
|
||||
// pending network change (rollback on timeout)
|
||||
pendingNet *pendingNetChange
|
||||
pendingNetMu sync.Mutex
|
||||
// kmsg hardware error watcher
|
||||
kmsg *kmsgWatcher
|
||||
}
|
||||
|
||||
// NewHandler creates the HTTP mux with all routes.
|
||||
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||
if opts.App != nil {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
|
||||
|
||||
@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
kmsgWatcher *kmsgWatcher
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
@@ -411,8 +412,16 @@ func (q *taskQueue) worker() {
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
@@ -618,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil {
|
||||
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
}
|
||||
if db := q.statusDB(); db != nil {
|
||||
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
@@ -634,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
return nil
|
||||
}
|
||||
return q.opts.App.StatusDB
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
|
||||
Reference in New Issue
Block a user