Add continuous hardware health monitoring and component detail view
- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times, not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,76 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/collector"
|
||||
)
|
||||
|
||||
// Timing parameters for the background hardware health poller.
const (
	// healthPollInterval is the period of the ticker that drives each polling round.
	healthPollInterval = 60 * time.Second
	// psuIPMITimeout bounds a single "ipmitool sdr" invocation.
	psuIPMITimeout = 15 * time.Second
)
|
||||
|
||||
// healthPoller runs periodic health checks for hardware components that do not
|
||||
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||
type healthPoller struct {
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||
return &healthPoller{statusDB: statusDB}
|
||||
}
|
||||
|
||||
func (p *healthPoller) start() {
|
||||
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||
}
|
||||
|
||||
func (p *healthPoller) run() {
|
||||
ticker := time.NewTicker(healthPollInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
p.pollPSU()
|
||||
}
|
||||
}
|
||||
|
||||
func (p *healthPoller) pollPSU() {
|
||||
if p.statusDB == nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||
var out bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
if err := cmd.Run(); err != nil {
|
||||
// IPMI not available or not a server — skip silently.
|
||||
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
slots := collector.PSUSlotsFromSDR(out.String())
|
||||
if len(slots) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
const source = "watchdog:psu"
|
||||
for slot, psu := range slots {
|
||||
key := "psu:" + slot
|
||||
status := psu.Status
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
detail := ""
|
||||
switch status {
|
||||
case "Critical":
|
||||
detail = "PSU sensor reported non-OK state"
|
||||
case "Warning":
|
||||
detail = "PSU sensor in warning state"
|
||||
}
|
||||
p.statusDB.Record(key, source, status, detail)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user