Files
bee/audit/internal/webui/health_poller.go
Mikhail Chusavitin ae80d7711e Add continuous hardware health monitoring and component detail view
- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times,
  not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal
  with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:56:39 +03:00

77 lines
1.6 KiB
Go

package webui
import (
"bytes"
"context"
"log/slog"
"os/exec"
"time"
"bee/audit/internal/app"
"bee/audit/internal/collector"
)
const healthPollInterval = 60 * time.Second
const psuIPMITimeout = 15 * time.Second
// healthPoller runs periodic health checks for hardware components that do not
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
type healthPoller struct {
statusDB *app.ComponentStatusDB
}
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
return &healthPoller{statusDB: statusDB}
}
func (p *healthPoller) start() {
goRecoverLoop("health poller", 5*time.Second, p.run)
}
func (p *healthPoller) run() {
ticker := time.NewTicker(healthPollInterval)
defer ticker.Stop()
for range ticker.C {
p.pollPSU()
}
}
func (p *healthPoller) pollPSU() {
if p.statusDB == nil {
return
}
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
var out bytes.Buffer
cmd.Stdout = &out
if err := cmd.Run(); err != nil {
// IPMI not available or not a server — skip silently.
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
return
}
slots := collector.PSUSlotsFromSDR(out.String())
if len(slots) == 0 {
return
}
const source = "watchdog:psu"
for slot, psu := range slots {
key := "psu:" + slot
status := psu.Status
if status == "" {
status = "Unknown"
}
detail := ""
switch status {
case "Critical":
detail = "PSU sensor reported non-OK state"
case "Warning":
detail = "PSU sensor in warning state"
}
p.statusDB.Record(key, source, status, detail)
}
}