Add continuous hardware health monitoring and component detail view
- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times, not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- each opens a modal with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
} else {
|
||||
evtCopy := evt
|
||||
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
@@ -180,6 +183,52 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
}
|
||||
}
|
||||
|
||||
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
const source = "watchdog:kmsg"
|
||||
detail := "kernel: " + truncate(evt.raw, 120)
|
||||
|
||||
var severity string
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
if p.Re.MatchString(evt.raw) {
|
||||
if p.Severity == "critical" {
|
||||
severity = "Critical"
|
||||
} else {
|
||||
severity = "Warning"
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if severity == "" {
|
||||
severity = "Warning"
|
||||
}
|
||||
|
||||
if len(evt.ids) == 0 {
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
|
||||
Reference in New Issue
Block a user