feat(watchdog): hardware error monitor + unified component status store
- Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -136,7 +138,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user