- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times, not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB - New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source) - Hardware Summary card auto-refreshes every 30s via htmx without page reload - Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal with per-component status, source, timestamp and last 20 history entries Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
77 lines
1.6 KiB
Go
77 lines
1.6 KiB
Go
package webui
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"log/slog"
|
|
"os/exec"
|
|
"time"
|
|
|
|
"bee/audit/internal/app"
|
|
"bee/audit/internal/collector"
|
|
)
|
|
|
|
// Timing parameters for the periodic hardware health checks.
const (
	// healthPollInterval is the period between two consecutive polls.
	healthPollInterval = 60 * time.Second
	// psuIPMITimeout is the upper bound on a single ipmitool invocation.
	psuIPMITimeout = 15 * time.Second
)
|
|
|
|
// healthPoller runs periodic health checks for hardware components that do not
|
|
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
|
type healthPoller struct {
|
|
statusDB *app.ComponentStatusDB
|
|
}
|
|
|
|
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
|
return &healthPoller{statusDB: statusDB}
|
|
}
|
|
|
|
func (p *healthPoller) start() {
|
|
goRecoverLoop("health poller", 5*time.Second, p.run)
|
|
}
|
|
|
|
func (p *healthPoller) run() {
|
|
ticker := time.NewTicker(healthPollInterval)
|
|
defer ticker.Stop()
|
|
for range ticker.C {
|
|
p.pollPSU()
|
|
}
|
|
}
|
|
|
|
func (p *healthPoller) pollPSU() {
|
|
if p.statusDB == nil {
|
|
return
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
|
var out bytes.Buffer
|
|
cmd.Stdout = &out
|
|
if err := cmd.Run(); err != nil {
|
|
// IPMI not available or not a server — skip silently.
|
|
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
|
return
|
|
}
|
|
|
|
slots := collector.PSUSlotsFromSDR(out.String())
|
|
if len(slots) == 0 {
|
|
return
|
|
}
|
|
|
|
const source = "watchdog:psu"
|
|
for slot, psu := range slots {
|
|
key := "psu:" + slot
|
|
status := psu.Status
|
|
if status == "" {
|
|
status = "Unknown"
|
|
}
|
|
detail := ""
|
|
switch status {
|
|
case "Critical":
|
|
detail = "PSU sensor reported non-OK state"
|
|
case "Warning":
|
|
detail = "PSU sensor in warning state"
|
|
}
|
|
p.statusDB.Record(key, source, status, detail)
|
|
}
|
|
}
|