Add continuous hardware health monitoring and component detail view

- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times,
  not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal
  with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-05-06 09:56:39 +03:00
parent ca78b9df65
commit ae80d7711e
5 changed files with 272 additions and 12 deletions

View File

@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
} else {
evtCopy := evt
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
}
w.mu.Unlock()
}
@@ -180,6 +183,52 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
}
}
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
// Called when an error is detected outside of any SAT task (always-on watching).
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
if w.statusDB == nil {
return
}
const source = "watchdog:kmsg"
detail := "kernel: " + truncate(evt.raw, 120)
var severity string
for _, p := range platform.HardwareErrorPatterns {
if p.Re.MatchString(evt.raw) {
if p.Severity == "critical" {
severity = "Critical"
} else {
severity = "Warning"
}
break
}
}
if severity == "" {
severity = "Warning"
}
if len(evt.ids) == 0 {
key := "cpu:all"
if evt.category == "memory" {
key = "memory:all"
}
w.statusDB.Record(key, source, severity, detail)
return
}
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id
default:
key = "pcie:" + normalizeBDF(id)
}
w.statusDB.Record(key, source, severity, detail)
}
}
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
// any pattern in platform.HardwareErrorPatterns.
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"