diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index f8b7578..11f6c73 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -40,6 +40,8 @@ type App struct { sat satRunner runtime runtimeChecker installer installer + // StatusDB is the unified component health store (nil if unavailable). + StatusDB *ComponentStatusDB } type ActionResult struct { @@ -136,7 +138,7 @@ type runtimeChecker interface { } func New(platform *platform.System) *App { - return &App{ + a := &App{ network: platform, services: platform, exports: platform, @@ -145,6 +147,10 @@ func New(platform *platform.System) *App { runtime: platform, installer: platform, } + if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil { + a.StatusDB = db + } + return a } // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results, @@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) { if err != nil { return nil, err } - applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir) + applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil) return json.MarshalIndent(snap, "", " ") } @@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro } } result := collector.Run(runtimeMode) - applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir) + applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB) if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil { result.Runtime = &health } diff --git a/audit/internal/app/component_status_db.go b/audit/internal/app/component_status_db.go new file mode 100644 index 0000000..661f64d --- /dev/null +++ b/audit/internal/app/component_status_db.go @@ -0,0 +1,266 @@ +package app + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + "sync" + "time" +) + +// ComponentStatusDB is a persistent, append-only store of hardware component health records. +// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1"). +// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it — +// the component stays at the highest observed severity until explicitly reset. +type ComponentStatusDB struct { + path string + mu sync.Mutex + records map[string]*ComponentStatusRecord +} + +// ComponentStatusRecord holds the current and historical health of one hardware component. +type ComponentStatusRecord struct { + ComponentKey string `json:"component_key"` + Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown" + LastCheckedAt time.Time `json:"last_checked_at"` + LastChangedAt time.Time `json:"last_changed_at"` + ErrorSummary string `json:"error_summary,omitempty"` + History []ComponentStatusEntry `json:"history"` +} + +// ComponentStatusEntry is one observation written to a component's history. +type ComponentStatusEntry struct { + At time.Time `json:"at"` + Status string `json:"status"` + Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg" + Detail string `json:"detail,omitempty"` +} + +// OpenComponentStatusDB opens (or creates) the JSON status DB at path. +func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) { + db := &ComponentStatusDB{ + path: path, + records: make(map[string]*ComponentStatusRecord), + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return nil, err + } + data, err := os.ReadFile(path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + if len(data) > 0 { + var records []ComponentStatusRecord + if err := json.Unmarshal(data, &records); err == nil { + for i := range records { + db.records[records[i].ComponentKey] = &records[i] + } + } + } + return db, nil +} + +// Record writes one observation for the given component key. +// source is a short label like "sat:nvidia" or "watchdog:kmsg". +// status is "OK", "Warning", "Critical", or "Unknown". +// OK never downgrades an existing Warning or Critical status. +func (db *ComponentStatusDB) Record(key, source, status, detail string) { + if db == nil || strings.TrimSpace(key) == "" { + return + } + db.mu.Lock() + defer db.mu.Unlock() + + now := time.Now().UTC() + rec, exists := db.records[key] + if !exists { + rec = &ComponentStatusRecord{ComponentKey: key} + db.records[key] = rec + } + rec.LastCheckedAt = now + + entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail} + rec.History = append(rec.History, entry) + + // Status merge: OK never downgrades Warning/Critical. + newSev := componentSeverity(status) + curSev := componentSeverity(rec.Status) + if newSev > curSev { + rec.Status = status + rec.LastChangedAt = now + rec.ErrorSummary = detail + } else if rec.Status == "" { + rec.Status = status + rec.LastChangedAt = now + } + + _ = db.saveLocked() +} + +// Get returns the current record for a component key. +func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) { + if db == nil { + return ComponentStatusRecord{}, false + } + db.mu.Lock() + defer db.mu.Unlock() + r, ok := db.records[key] + if !ok { + return ComponentStatusRecord{}, false + } + return *r, true +} + +// All returns a snapshot of all records. +func (db *ComponentStatusDB) All() []ComponentStatusRecord { + if db == nil { + return nil + } + db.mu.Lock() + defer db.mu.Unlock() + out := make([]ComponentStatusRecord, 0, len(db.records)) + for _, r := range db.records { + out = append(out, *r) + } + return out +} + +func (db *ComponentStatusDB) saveLocked() error { + records := make([]ComponentStatusRecord, 0, len(db.records)) + for _, r := range db.records { + records = append(records, *r) + } + data, err := json.MarshalIndent(records, "", " ") + if err != nil { + return err + } + return os.WriteFile(db.path, data, 0644) +} + +// componentSeverity returns a numeric severity so higher values win. +func componentSeverity(status string) int { + switch strings.TrimSpace(status) { + case "Critical": + return 3 + case "Warning": + return 2 + case "OK": + return 1 + default: + return 0 + } +} + +// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath +// and writes component status records to db for the given SAT target. +// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz". +func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) { + if db == nil || strings.TrimSpace(archivePath) == "" { + return + } + archivePath = extractArchivePath(archivePath) + if archivePath == "" { + return + } + runDir := strings.TrimSuffix(archivePath, ".tar.gz") + data, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) + if err != nil { + return + } + kv := parseSATKV(string(data)) + overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"])) + if overall == "" { + return + } + + source := "sat:" + target + dbStatus := satStatusToDBStatus(overall) + + // Map SAT target to component keys. + switch target { + case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth": + db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) + case "memory", "memory-stress", "sat-stress": + db.Record("memory:all", source, dbStatus, target+" SAT: "+overall) + case "cpu", "platform-stress": + db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall) + case "storage": + // Try to record per-device if available in summary. + recordedAny := false + for key, val := range kv { + if !strings.HasSuffix(key, "_status") || key == "overall_status" { + continue + } + base := strings.TrimSuffix(key, "_status") + idx := strings.Index(base, "_") + if idx <= 0 { + continue + } + devName := base[:idx] + devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val))) + db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val) + recordedAny = true + } + if !recordedAny { + db.Record("storage:all", source, dbStatus, "storage SAT: "+overall) + } + } +} + +func satStatusToDBStatus(overall string) string { + switch overall { + case "OK": + return "OK" + case "FAILED": + return "Warning" + case "PARTIAL", "UNSUPPORTED": + return "Unknown" + default: + return "Unknown" + } +} + +// ExtractArchivePath extracts a bare .tar.gz path from a string that may be +// "Archive written to /path/foo.tar.gz" or already a bare path. +func ExtractArchivePath(s string) string { + return extractArchivePath(s) +} + +// ReadSATOverallStatus reads the overall_status value from the summary.txt +// file located in the run directory alongside archivePath. +// Returns "" if the file cannot be read. +func ReadSATOverallStatus(archivePath string) string { + if strings.TrimSpace(archivePath) == "" { + return "" + } + runDir := strings.TrimSuffix(archivePath, ".tar.gz") + data, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) + if err != nil { + return "" + } + kv := parseSATKV(string(data)) + return strings.ToUpper(strings.TrimSpace(kv["overall_status"])) +} + +func extractArchivePath(s string) string { + s = strings.TrimSpace(s) + if strings.HasSuffix(s, ".tar.gz") { + parts := strings.Fields(s) + if len(parts) > 0 { + return parts[len(parts)-1] + } + } + return s +} + +func parseSATKV(raw string) map[string]string { + kv := make(map[string]string) + for _, line := range strings.Split(raw, "\n") { + k, v, ok := strings.Cut(strings.TrimSpace(line), "=") + if ok { + kv[strings.TrimSpace(k)] = strings.TrimSpace(v) + } + } + return kv +} diff --git a/audit/internal/app/sat_overlay.go b/audit/internal/app/sat_overlay.go index 79abadc..cc981e0 100644 --- a/audit/internal/app/sat_overlay.go +++ b/audit/internal/app/sat_overlay.go @@ -9,7 +9,7 @@ import ( "bee/audit/internal/schema" ) -func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) { +func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) { if snap == nil || strings.TrimSpace(baseDir) == "" { return } @@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) { if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok { applyStorageSAT(snap.Storage, summary) } + // Apply unified component status DB — overlaid last so it can only upgrade severity. + applyComponentStatusDB(snap, db) } type satSummary struct { @@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool { } } +func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) { + if snap == nil || db == nil { + return + } + for _, rec := range db.All() { + key := rec.ComponentKey + status := dbStatusToSATStatus(rec.Status) + if status == "" { + continue + } + detail := rec.ErrorSummary + ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z") + + switch { + case strings.HasPrefix(key, "pcie:"): + bdf := strings.TrimPrefix(key, "pcie:") + bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present + // bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching + if sanitizeBDFForLookup(bdf) == "" { + break + } + normalized := sanitizeBDFForLookup(bdf) + for i := range snap.PCIeDevices { + if snap.PCIeDevices[i].BDF == nil { + continue + } + if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized { + mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail) + } + } + case strings.HasPrefix(key, "storage:"): + devName := strings.TrimPrefix(key, "storage:") + if devName == "all" { + for i := range snap.Storage { + mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail) + } + } else { + for i := range snap.Storage { + linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string) + if filepath.Base(strings.TrimSpace(linuxDev)) == devName { + mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail) + } + } + } + case strings.HasPrefix(key, "memory:"): + for i := range snap.Memory { + mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail) + } + case strings.HasPrefix(key, "cpu:"): + for i := range snap.CPUs { + mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail) + } + } + } +} + +// dbStatusToSATStatus converts ComponentStatusDB status strings to the format +// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown"). +func dbStatusToSATStatus(s string) string { + switch strings.TrimSpace(s) { + case "OK", "Warning", "Critical", "Unknown": + return s + default: + return "" + } +} + +// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form +// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is. +func sanitizeBDFForLookup(bdf string) string { + bdf = strings.ToLower(strings.TrimSpace(bdf)) + if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") { + return "" + } + if strings.Count(bdf, ":") == 1 { + bdf = "0000:" + bdf + } + return bdf +} + func ptrString(v *string) string { if v == nil { return "" diff --git a/audit/internal/app/sat_overlay_test.go b/audit/internal/app/sat_overlay_test.go index defe09a..b2bde36 100644 --- a/audit/internal/app/sat_overlay_test.go +++ b/audit/internal/app/sat_overlay_test.go @@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) { usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}} snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}} - applyLatestSATStatuses(&snap, baseDir) + applyLatestSATStatuses(&snap, baseDir, nil) if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" { t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status) @@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) { }}, } - applyLatestSATStatuses(&snap, baseDir) + applyLatestSATStatuses(&snap, baseDir, nil) if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" { t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status) diff --git a/audit/internal/platform/error_patterns.go b/audit/internal/platform/error_patterns.go new file mode 100644 index 0000000..21bfdb6 --- /dev/null +++ b/audit/internal/platform/error_patterns.go @@ -0,0 +1,139 @@ +package platform + +import "regexp" + +// ErrorPattern describes a kernel log pattern that indicates a hardware error. +// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed. +type ErrorPattern struct { + // Name is a short machine-readable label for logging and deduplication. + Name string + // Re is the compiled regular expression matched against a single kmsg line. + Re *regexp.Regexp + // Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu". + Category string + // Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures. + Severity string + // BDFGroup is the capture group index (1-based) that contains a PCIe BDF address + // (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern. + BDFGroup int + // DevGroup is the capture group index (1-based) that contains a device name + // (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern. + DevGroup int +} + +// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults. +// To add a new pattern: append a new ErrorPattern struct to this slice. +var HardwareErrorPatterns = []ErrorPattern{ + // ── GPU / NVIDIA ──────────────────────────────────────────────────────────── + { + Name: "nvidia-rminitadapter", + Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`), + Category: "gpu", + Severity: "warning", + BDFGroup: 1, + }, + { + Name: "nvidia-msi-fail", + Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`), + Category: "gpu", + Severity: "warning", + }, + { + Name: "nvidia-aer", + Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), + Category: "gpu", + Severity: "warning", + BDFGroup: 1, + }, + { + Name: "nvidia-xid", + Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`), + Category: "gpu", + Severity: "warning", + BDFGroup: 1, + }, + + // ── PCIe AER (generic) ────────────────────────────────────────────────────── + { + Name: "pcie-aer", + Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`), + Category: "pcie", + Severity: "warning", + BDFGroup: 1, + }, + { + Name: "pcie-uncorrectable", + Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`), + Category: "pcie", + Severity: "warning", + BDFGroup: 1, + }, + { + Name: "pcie-link-down", + Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`), + Category: "pcie", + Severity: "warning", + BDFGroup: 1, + }, + + // ── Storage ───────────────────────────────────────────────────────────────── + { + Name: "blk-io-error", + Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`), + Category: "storage", + Severity: "warning", + DevGroup: 1, + }, + { + Name: "nvme-timeout", + Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`), + Category: "storage", + Severity: "warning", + DevGroup: 1, + }, + { + Name: "scsi-failed", + Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`), + Category: "storage", + Severity: "warning", + }, + { + Name: "nvme-reset", + Re: mustPat(`(?i)nvme\s+(\w+):.*reset`), + Category: "storage", + Severity: "warning", + DevGroup: 1, + }, + + // ── Machine Check Exceptions ──────────────────────────────────────────────── + { + Name: "mce-hardware-error", + Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`), + Category: "mce", + Severity: "warning", + }, + { + Name: "mce-corrected", + Re: mustPat(`(?i)mce:.*[Cc]orrected`), + Category: "mce", + Severity: "warning", + }, + + // ── Memory ───────────────────────────────────────────────────────────────── + { + Name: "edac-ue", + Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`), + Category: "memory", + Severity: "warning", + }, + { + Name: "edac-ce", + Re: mustPat(`(?i)EDAC.*[Cc]orrectable`), + Category: "memory", + Severity: "warning", + }, +} + +func mustPat(s string) *regexp.Regexp { + return regexp.MustCompile(s) +} diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go new file mode 100644 index 0000000..4e4c41d --- /dev/null +++ b/audit/internal/webui/kmsg_watcher.go @@ -0,0 +1,230 @@ +package webui + +import ( + "bufio" + "io" + "log/slog" + "os" + "strings" + "sync" + "time" + + "bee/audit/internal/app" + "bee/audit/internal/platform" +) + +// kmsgWatcher reads /dev/kmsg and accumulates hardware error events. +// During an active SAT task window it records matching lines; on task finish +// it writes Warning status records to the component status DB. +type kmsgWatcher struct { + mu sync.Mutex + activeWindow *kmsgWindow + statusDB *app.ComponentStatusDB +} + +type kmsgWindow struct { + taskID string + target string + startedAt time.Time + seen map[kmsgEventKey]bool + events []kmsgEvent +} + +type kmsgEventKey struct { + id string // BDF or device name + category string +} + +type kmsgEvent struct { + timestamp time.Time + raw string + ids []string // BDF addresses or device names extracted + category string +} + +func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher { + return &kmsgWatcher{statusDB: statusDB} +} + +// start launches the background kmsg reading goroutine. +func (w *kmsgWatcher) start() { + go w.run() +} + +func (w *kmsgWatcher) run() { + f, err := os.Open("/dev/kmsg") + if err != nil { + slog.Warn("kmsg watcher unavailable", "err", err) + return + } + defer f.Close() + + // Best-effort seek to end so we only capture events from now forward. + _, _ = f.Seek(0, io.SeekEnd) + + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 64*1024), 64*1024) + for scanner.Scan() { + line := scanner.Text() + evt, ok := parseKmsgLine(line) + if !ok { + continue + } + w.mu.Lock() + if w.activeWindow != nil { + w.recordEvent(evt) + } + w.mu.Unlock() + } + if err := scanner.Err(); err != nil { + slog.Warn("kmsg watcher stopped", "err", err) + } +} + +// recordEvent appends evt to the active window, deduplicating by (id, category). +// Must be called with w.mu held. +func (w *kmsgWatcher) recordEvent(evt kmsgEvent) { + if len(evt.ids) == 0 { + // Events without a device ID (e.g. MCE) — deduplicate by category. + key := kmsgEventKey{id: "", category: evt.category} + if !w.activeWindow.seen[key] { + w.activeWindow.seen[key] = true + w.activeWindow.events = append(w.activeWindow.events, evt) + } + return + } + for _, id := range evt.ids { + key := kmsgEventKey{id: id, category: evt.category} + if !w.activeWindow.seen[key] { + w.activeWindow.seen[key] = true + w.activeWindow.events = append(w.activeWindow.events, evt) + } + } +} + +// NotifyTaskStarted opens a new event window for the given SAT task. +func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) { + w.mu.Lock() + defer w.mu.Unlock() + w.activeWindow = &kmsgWindow{ + taskID: taskID, + target: target, + startedAt: time.Now(), + seen: make(map[kmsgEventKey]bool), + } +} + +// NotifyTaskFinished closes the event window and asynchronously writes status records. +func (w *kmsgWatcher) NotifyTaskFinished(taskID string) { + w.mu.Lock() + window := w.activeWindow + if window != nil && window.taskID == taskID { + w.activeWindow = nil + } + w.mu.Unlock() + + if window == nil || len(window.events) == 0 { + return + } + go w.flushWindow(window) +} + +func (w *kmsgWatcher) flushWindow(window *kmsgWindow) { + if w.statusDB == nil { + return + } + source := "watchdog:kmsg" + // Collect unique component keys from events. + seen := map[string]string{} // componentKey → first raw line + for _, evt := range window.events { + if len(evt.ids) == 0 { + // MCE or un-identified error. + key := "cpu:all" + if evt.category == "memory" { + key = "memory:all" + } + if _, exists := seen[key]; !exists { + seen[key] = evt.raw + } + continue + } + for _, id := range evt.ids { + var key string + switch evt.category { + case "gpu", "pcie": + key = "pcie:" + normalizeBDF(id) + case "storage": + key = "storage:" + id + default: + key = "pcie:" + normalizeBDF(id) + } + if _, exists := seen[key]; !exists { + seen[key] = evt.raw + } + } + } + for key, detail := range seen { + detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120) + w.statusDB.Record(key, source, "Warning", detail) + } +} + +// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches +// any pattern in platform.HardwareErrorPatterns. +// kmsg format: ",,,-;message text" +func parseKmsgLine(raw string) (kmsgEvent, bool) { + msg := raw + if idx := strings.Index(raw, ";"); idx >= 0 { + msg = strings.TrimSpace(raw[idx+1:]) + } + if msg == "" { + return kmsgEvent{}, false + } + + for _, p := range platform.HardwareErrorPatterns { + m := p.Re.FindStringSubmatch(msg) + if m == nil { + continue + } + evt := kmsgEvent{ + timestamp: time.Now(), + raw: msg, + category: p.Category, + } + if p.BDFGroup > 0 && p.BDFGroup < len(m) { + evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup])) + } + if p.DevGroup > 0 && p.DevGroup < len(m) { + evt.ids = append(evt.ids, m[p.DevGroup]) + } + return evt, true + } + return kmsgEvent{}, false +} + +// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0". +func normalizeBDF(bdf string) string { + bdf = strings.ToLower(strings.TrimSpace(bdf)) + if strings.Count(bdf, ":") == 1 { + return "0000:" + bdf + } + return bdf +} + +func truncate(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max] + "..." +} + +// isSATTarget returns true for task targets that run hardware acceptance tests. +func isSATTarget(target string) bool { + switch target { + case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage", + "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", + "platform-stress": + return true + } + return false +} diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index d33d38d..b437c49 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -164,6 +164,8 @@ type handler struct { // pending network change (rollback on timeout) pendingNet *pendingNetChange pendingNetMu sync.Mutex + // kmsg hardware error watcher + kmsg *kmsgWatcher } // NewHandler creates the HTTP mux with all routes. @@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler { } h.startMetricsCollector() + // Start kmsg hardware error watcher if the app (and its status DB) is available. + if opts.App != nil { + h.kmsg = newKmsgWatcher(opts.App.StatusDB) + h.kmsg.start() + globalQueue.kmsgWatcher = h.kmsg + } + globalQueue.startWorker(&opts) mux := http.NewServeMux() diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index a11dd4d..7745501 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions // taskQueue manages a priority-ordered list of tasks and runs them one at a time. type taskQueue struct { - mu sync.Mutex - tasks []*Task - trigger chan struct{} - opts *HandlerOptions // set by startWorker - statePath string - logsDir string - started bool + mu sync.Mutex + tasks []*Task + trigger chan struct{} + opts *HandlerOptions // set by startWorker + statePath string + logsDir string + started bool + kmsgWatcher *kmsgWatcher } var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)} @@ -411,8 +412,16 @@ func (q *taskQueue) worker() { q.persistLocked() q.mu.Unlock() + if q.kmsgWatcher != nil && isSATTarget(t.Target) { + q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target) + } + q.runTask(t, j, ctx) + if q.kmsgWatcher != nil { + q.kmsgWatcher.NotifyTaskFinished(t.ID) + } + q.mu.Lock() now2 := time.Now() t.DoneAt = &now2 @@ -618,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { return } + // If the SAT archive was produced, check overall_status and write to component DB. + if archive != "" { + archivePath := app.ExtractArchivePath(archive) + if err == nil { + if app.ReadSATOverallStatus(archivePath) == "FAILED" { + err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)") + } + } + if db := q.statusDB(); db != nil { + app.ApplySATResultToDB(db, t.Target, archivePath) + } + } + if err != nil { if ctx.Err() != nil { j.append("Aborted.") @@ -634,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { j.finish("") } +func (q *taskQueue) statusDB() *app.ComponentStatusDB { + if q.opts == nil || q.opts.App == nil { + return nil + } + return q.opts.App.StatusDB +} + func splitLines(s string) []string { var out []string for _, l := range splitNL(s) {