feat(watchdog): hardware error monitor + unified component status store

- Add platform/error_patterns.go: pluggable table of kernel log patterns
  (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct
- Add app/component_status_db.go: persistent JSON store (component-status.json)
  keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never
  downgrades Warning or Critical
- Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks,
  writes Warning to DB for matched hardware errors
- Fix task status: overall_status=FAILED in summary.txt now marks task failed
- Audit routine overlays component DB statuses into bee-audit.json on every read

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-04-02 19:20:59 +03:00
parent 99cece524c
commit fd722692a4
8 changed files with 774 additions and 13 deletions

View File

@@ -40,6 +40,8 @@ type App struct {
sat satRunner
runtime runtimeChecker
installer installer
// StatusDB is the unified component health store (nil if unavailable).
StatusDB *ComponentStatusDB
}
type ActionResult struct {
@@ -136,7 +138,7 @@ type runtimeChecker interface {
}
func New(platform *platform.System) *App {
return &App{
a := &App{
network: platform,
services: platform,
exports: platform,
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
runtime: platform,
installer: platform,
}
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
a.StatusDB = db
}
return a
}
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
if err != nil {
return nil, err
}
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
return json.MarshalIndent(snap, "", " ")
}
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
}
}
result := collector.Run(runtimeMode)
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
result.Runtime = &health
}

View File

@@ -0,0 +1,266 @@
package app
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"sync"
"time"
)
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
// the component stays at the highest observed severity until explicitly reset.
type ComponentStatusDB struct {
path string
mu sync.Mutex
records map[string]*ComponentStatusRecord
}
// ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"`
}
// ComponentStatusEntry is one observation written to a component's history.
type ComponentStatusEntry struct {
At time.Time `json:"at"`
Status string `json:"status"`
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
Detail string `json:"detail,omitempty"`
}
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
db := &ComponentStatusDB{
path: path,
records: make(map[string]*ComponentStatusRecord),
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return nil, err
}
data, err := os.ReadFile(path)
if err != nil && !os.IsNotExist(err) {
return nil, err
}
if len(data) > 0 {
var records []ComponentStatusRecord
if err := json.Unmarshal(data, &records); err == nil {
for i := range records {
db.records[records[i].ComponentKey] = &records[i]
}
}
}
return db, nil
}
// Record writes one observation for the given component key.
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
// status is "OK", "Warning", "Critical", or "Unknown".
// OK never downgrades an existing Warning or Critical status.
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
if db == nil || strings.TrimSpace(key) == "" {
return
}
db.mu.Lock()
defer db.mu.Unlock()
now := time.Now().UTC()
rec, exists := db.records[key]
if !exists {
rec = &ComponentStatusRecord{ComponentKey: key}
db.records[key] = rec
}
rec.LastCheckedAt = now
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
rec.History = append(rec.History, entry)
// Status merge: OK never downgrades Warning/Critical.
newSev := componentSeverity(status)
curSev := componentSeverity(rec.Status)
if newSev > curSev {
rec.Status = status
rec.LastChangedAt = now
rec.ErrorSummary = detail
} else if rec.Status == "" {
rec.Status = status
rec.LastChangedAt = now
}
_ = db.saveLocked()
}
// Get returns the current record for a component key.
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
if db == nil {
return ComponentStatusRecord{}, false
}
db.mu.Lock()
defer db.mu.Unlock()
r, ok := db.records[key]
if !ok {
return ComponentStatusRecord{}, false
}
return *r, true
}
// All returns a snapshot of all records.
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
if db == nil {
return nil
}
db.mu.Lock()
defer db.mu.Unlock()
out := make([]ComponentStatusRecord, 0, len(db.records))
for _, r := range db.records {
out = append(out, *r)
}
return out
}
func (db *ComponentStatusDB) saveLocked() error {
records := make([]ComponentStatusRecord, 0, len(db.records))
for _, r := range db.records {
records = append(records, *r)
}
data, err := json.MarshalIndent(records, "", " ")
if err != nil {
return err
}
return os.WriteFile(db.path, data, 0644)
}
// componentSeverity returns a numeric severity so higher values win.
func componentSeverity(status string) int {
switch strings.TrimSpace(status) {
case "Critical":
return 3
case "Warning":
return 2
case "OK":
return 1
default:
return 0
}
}
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
// and writes component status records to db for the given SAT target.
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
if db == nil || strings.TrimSpace(archivePath) == "" {
return
}
archivePath = extractArchivePath(archivePath)
if archivePath == "" {
return
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return
}
kv := parseSATKV(string(data))
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
if overall == "" {
return
}
source := "sat:" + target
dbStatus := satStatusToDBStatus(overall)
// Map SAT target to component keys.
switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
case "cpu", "platform-stress":
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
case "storage":
// Try to record per-device if available in summary.
recordedAny := false
for key, val := range kv {
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
continue
}
base := strings.TrimSuffix(key, "_status")
idx := strings.Index(base, "_")
if idx <= 0 {
continue
}
devName := base[:idx]
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
recordedAny = true
}
if !recordedAny {
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
}
}
}
func satStatusToDBStatus(overall string) string {
switch overall {
case "OK":
return "OK"
case "FAILED":
return "Warning"
case "PARTIAL", "UNSUPPORTED":
return "Unknown"
default:
return "Unknown"
}
}
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
// "Archive written to /path/foo.tar.gz" or already a bare path.
func ExtractArchivePath(s string) string {
return extractArchivePath(s)
}
// ReadSATOverallStatus reads the overall_status value from the summary.txt
// file located in the run directory alongside archivePath.
// Returns "" if the file cannot be read.
func ReadSATOverallStatus(archivePath string) string {
if strings.TrimSpace(archivePath) == "" {
return ""
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return ""
}
kv := parseSATKV(string(data))
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
}
func extractArchivePath(s string) string {
s = strings.TrimSpace(s)
if strings.HasSuffix(s, ".tar.gz") {
parts := strings.Fields(s)
if len(parts) > 0 {
return parts[len(parts)-1]
}
}
return s
}
func parseSATKV(raw string) map[string]string {
kv := make(map[string]string)
for _, line := range strings.Split(raw, "\n") {
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
if ok {
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
}
}
return kv
}

View File

@@ -9,7 +9,7 @@ import (
"bee/audit/internal/schema"
)
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
if snap == nil || strings.TrimSpace(baseDir) == "" {
return
}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
applyStorageSAT(snap.Storage, summary)
}
// Apply unified component status DB — overlaid last so it can only upgrade severity.
applyComponentStatusDB(snap, db)
}
type satSummary struct {
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
}
}
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
if snap == nil || db == nil {
return
}
for _, rec := range db.All() {
key := rec.ComponentKey
status := dbStatusToSATStatus(rec.Status)
if status == "" {
continue
}
detail := rec.ErrorSummary
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
switch {
case strings.HasPrefix(key, "pcie:"):
bdf := strings.TrimPrefix(key, "pcie:")
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
if sanitizeBDFForLookup(bdf) == "" {
break
}
normalized := sanitizeBDFForLookup(bdf)
for i := range snap.PCIeDevices {
if snap.PCIeDevices[i].BDF == nil {
continue
}
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
}
}
case strings.HasPrefix(key, "storage:"):
devName := strings.TrimPrefix(key, "storage:")
if devName == "all" {
for i := range snap.Storage {
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
}
} else {
for i := range snap.Storage {
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
}
}
}
case strings.HasPrefix(key, "memory:"):
for i := range snap.Memory {
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
}
case strings.HasPrefix(key, "cpu:"):
for i := range snap.CPUs {
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
}
}
}
}
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
func dbStatusToSATStatus(s string) string {
switch strings.TrimSpace(s) {
case "OK", "Warning", "Critical", "Unknown":
return s
default:
return ""
}
}
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
func sanitizeBDFForLookup(bdf string) string {
bdf = strings.ToLower(strings.TrimSpace(bdf))
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
return ""
}
if strings.Count(bdf, ":") == 1 {
bdf = "0000:" + bdf
}
return bdf
}
func ptrString(v *string) string {
if v == nil {
return ""

View File

@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
applyLatestSATStatuses(&snap, baseDir)
applyLatestSATStatuses(&snap, baseDir, nil)
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
}},
}
applyLatestSATStatuses(&snap, baseDir)
applyLatestSATStatuses(&snap, baseDir, nil)
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)

View File

@@ -0,0 +1,139 @@
package platform
import "regexp"
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
// Name is a short machine-readable label for logging and deduplication.
Name string
// Re is the compiled regular expression matched against a single kmsg line.
Re *regexp.Regexp
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
Category string
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
Severity string
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
BDFGroup int
// DevGroup is the capture group index (1-based) that contains a device name
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
DevGroup int
}
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
var HardwareErrorPatterns = []ErrorPattern{
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
{
Name: "nvidia-rminitadapter",
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-msi-fail",
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
Category: "gpu",
Severity: "warning",
},
{
Name: "nvidia-aer",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-xid",
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
{
Name: "pcie-aer",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-uncorrectable",
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-link-down",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
// ── Storage ─────────────────────────────────────────────────────────────────
{
Name: "blk-io-error",
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
{
Name: "nvme-timeout",
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
{
Name: "scsi-failed",
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
Category: "storage",
Severity: "warning",
},
{
Name: "nvme-reset",
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
Category: "storage",
Severity: "warning",
DevGroup: 1,
},
// ── Machine Check Exceptions ────────────────────────────────────────────────
{
Name: "mce-hardware-error",
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
Category: "mce",
Severity: "warning",
},
{
Name: "mce-corrected",
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
Category: "mce",
Severity: "warning",
},
// ── Memory ─────────────────────────────────────────────────────────────────
{
Name: "edac-ue",
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
Category: "memory",
Severity: "warning",
},
{
Name: "edac-ce",
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
Category: "memory",
Severity: "warning",
},
}
func mustPat(s string) *regexp.Regexp {
return regexp.MustCompile(s)
}

View File

@@ -0,0 +1,230 @@
package webui
import (
"bufio"
"io"
"log/slog"
"os"
"strings"
"sync"
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
)
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
// During an active SAT task window it records matching lines; on task finish
// it writes Warning status records to the component status DB.
type kmsgWatcher struct {
mu sync.Mutex
activeWindow *kmsgWindow
statusDB *app.ComponentStatusDB
}
type kmsgWindow struct {
taskID string
target string
startedAt time.Time
seen map[kmsgEventKey]bool
events []kmsgEvent
}
type kmsgEventKey struct {
id string // BDF or device name
category string
}
type kmsgEvent struct {
timestamp time.Time
raw string
ids []string // BDF addresses or device names extracted
category string
}
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
return &kmsgWatcher{statusDB: statusDB}
}
// start launches the background kmsg reading goroutine.
func (w *kmsgWatcher) start() {
go w.run()
}
func (w *kmsgWatcher) run() {
f, err := os.Open("/dev/kmsg")
if err != nil {
slog.Warn("kmsg watcher unavailable", "err", err)
return
}
defer f.Close()
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
continue
}
w.mu.Lock()
if w.activeWindow != nil {
w.recordEvent(evt)
}
w.mu.Unlock()
}
if err := scanner.Err(); err != nil {
slog.Warn("kmsg watcher stopped", "err", err)
}
}
// recordEvent appends evt to the active window, deduplicating by (id, category).
// Must be called with w.mu held.
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
if len(evt.ids) == 0 {
// Events without a device ID (e.g. MCE) — deduplicate by category.
key := kmsgEventKey{id: "", category: evt.category}
if !w.activeWindow.seen[key] {
w.activeWindow.seen[key] = true
w.activeWindow.events = append(w.activeWindow.events, evt)
}
return
}
for _, id := range evt.ids {
key := kmsgEventKey{id: id, category: evt.category}
if !w.activeWindow.seen[key] {
w.activeWindow.seen[key] = true
w.activeWindow.events = append(w.activeWindow.events, evt)
}
}
}
// NotifyTaskStarted opens a new event window for the given SAT task.
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
w.mu.Lock()
defer w.mu.Unlock()
w.activeWindow = &kmsgWindow{
taskID: taskID,
target: target,
startedAt: time.Now(),
seen: make(map[kmsgEventKey]bool),
}
}
// NotifyTaskFinished closes the event window and asynchronously writes status records.
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
w.mu.Lock()
window := w.activeWindow
if window != nil && window.taskID == taskID {
w.activeWindow = nil
}
w.mu.Unlock()
if window == nil || len(window.events) == 0 {
return
}
go w.flushWindow(window)
}
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
if w.statusDB == nil {
return
}
source := "watchdog:kmsg"
// Collect unique component keys from events.
seen := map[string]string{} // componentKey → first raw line
for _, evt := range window.events {
if len(evt.ids) == 0 {
// MCE or un-identified error.
key := "cpu:all"
if evt.category == "memory" {
key = "memory:all"
}
if _, exists := seen[key]; !exists {
seen[key] = evt.raw
}
continue
}
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id
default:
key = "pcie:" + normalizeBDF(id)
}
if _, exists := seen[key]; !exists {
seen[key] = evt.raw
}
}
}
for key, detail := range seen {
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
w.statusDB.Record(key, source, "Warning", detail)
}
}
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
// any pattern in platform.HardwareErrorPatterns.
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
func parseKmsgLine(raw string) (kmsgEvent, bool) {
msg := raw
if idx := strings.Index(raw, ";"); idx >= 0 {
msg = strings.TrimSpace(raw[idx+1:])
}
if msg == "" {
return kmsgEvent{}, false
}
for _, p := range platform.HardwareErrorPatterns {
m := p.Re.FindStringSubmatch(msg)
if m == nil {
continue
}
evt := kmsgEvent{
timestamp: time.Now(),
raw: msg,
category: p.Category,
}
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
}
if p.DevGroup > 0 && p.DevGroup < len(m) {
evt.ids = append(evt.ids, m[p.DevGroup])
}
return evt, true
}
return kmsgEvent{}, false
}
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
func normalizeBDF(bdf string) string {
bdf = strings.ToLower(strings.TrimSpace(bdf))
if strings.Count(bdf, ":") == 1 {
return "0000:" + bdf
}
return bdf
}
func truncate(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max] + "..."
}
// isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool {
switch target {
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress":
return true
}
return false
}

View File

@@ -164,6 +164,8 @@ type handler struct {
// pending network change (rollback on timeout)
pendingNet *pendingNetChange
pendingNetMu sync.Mutex
// kmsg hardware error watcher
kmsg *kmsgWatcher
}
// NewHandler creates the HTTP mux with all routes.
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
}
h.startMetricsCollector()
// Start kmsg hardware error watcher if the app (and its status DB) is available.
if opts.App != nil {
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
h.kmsg.start()
globalQueue.kmsgWatcher = h.kmsg
}
globalQueue.startWorker(&opts)
mux := http.NewServeMux()

View File

@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
type taskQueue struct {
mu sync.Mutex
tasks []*Task
trigger chan struct{}
opts *HandlerOptions // set by startWorker
statePath string
logsDir string
started bool
mu sync.Mutex
tasks []*Task
trigger chan struct{}
opts *HandlerOptions // set by startWorker
statePath string
logsDir string
started bool
kmsgWatcher *kmsgWatcher
}
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
@@ -411,8 +412,16 @@ func (q *taskQueue) worker() {
q.persistLocked()
q.mu.Unlock()
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
}
q.runTask(t, j, ctx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
q.mu.Lock()
now2 := time.Now()
t.DoneAt = &now2
@@ -618,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
return
}
// If the SAT archive was produced, check overall_status and write to component DB.
if archive != "" {
archivePath := app.ExtractArchivePath(archive)
if err == nil {
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
}
}
if db := q.statusDB(); db != nil {
app.ApplySATResultToDB(db, t.Target, archivePath)
}
}
if err != nil {
if ctx.Err() != nil {
j.append("Aborted.")
@@ -634,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
j.finish("")
}
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
if q.opts == nil || q.opts.App == nil {
return nil
}
return q.opts.App.StatusDB
}
func splitLines(s string) []string {
var out []string
for _, l := range splitNL(s) {