From ae80d7711e570c95cb743fdf1269559f9101a6e6 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 6 May 2026 09:56:39 +0300 Subject: [PATCH] Add continuous hardware health monitoring and component detail view - kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times, not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB - New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source) - Hardware Summary card auto-refreshes every 30s via htmx without page reload - Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal with per-component status, source, timestamp and last 20 history entries Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/webui/api.go | 50 +++++++++++++ audit/internal/webui/health_poller.go | 76 ++++++++++++++++++++ audit/internal/webui/kmsg_watcher.go | 49 +++++++++++++ audit/internal/webui/pages.go | 100 ++++++++++++++++++++++---- audit/internal/webui/server.go | 9 +++ 5 files changed, 272 insertions(+), 12 deletions(-) create mode 100644 audit/internal/webui/health_poller.go diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 23dbe8d..5d2504d 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -1679,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir)) } +// ── Hardware summary / component detail ────────────────────────────────────── + +// handleAPIHardwareSummary returns the hardware summary card HTML fragment for +// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML"). +func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + fmt.Fprint(w, renderHardwareSummaryCard(h.opts)) +} + +// handleAPIComponentDetail returns an HTML fragment describing the current and +// historical status for one component type (cpu, memory, storage, gpu, psu). +func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) { + compType := r.PathValue("type") + var exact, prefixes []string + var title string + switch compType { + case "cpu": + title = "CPU" + exact = []string{"cpu:all"} + case "memory": + title = "Memory" + exact = []string{"memory:all"} + prefixes = []string{"memory:"} + case "storage": + title = "Storage" + exact = []string{"storage:all"} + prefixes = []string{"storage:"} + case "gpu": + title = "GPU" + prefixes = []string{"pcie:gpu:"} + case "psu": + title = "PSU" + prefixes = []string{"psu:"} + default: + http.NotFound(w, r) + return + } + + var records []app.ComponentStatusRecord + if h.opts.App != nil && h.opts.App.StatusDB != nil { + all := h.opts.App.StatusDB.All() + records = matchedRecords(all, exact, prefixes) + } + + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.Header().Set("Cache-Control", "no-store") + fmt.Fprint(w, renderComponentDetail(title, records)) +} + func (h *handler) rollbackPendingNetworkChange() error { h.pendingNetMu.Lock() pnc := h.pendingNet diff --git a/audit/internal/webui/health_poller.go b/audit/internal/webui/health_poller.go new file mode 100644 index 0000000..1cdce0e --- /dev/null +++ b/audit/internal/webui/health_poller.go @@ -0,0 +1,76 @@ +package webui + +import ( + "bytes" + "context" + "log/slog" + "os/exec" + "time" + + "bee/audit/internal/app" + "bee/audit/internal/collector" +) + +const healthPollInterval = 60 * time.Second +const psuIPMITimeout = 15 * time.Second + +// healthPoller runs periodic health checks for hardware components that do not +// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB. +type healthPoller struct { + statusDB *app.ComponentStatusDB +} + +func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller { + return &healthPoller{statusDB: statusDB} +} + +func (p *healthPoller) start() { + goRecoverLoop("health poller", 5*time.Second, p.run) +} + +func (p *healthPoller) run() { + ticker := time.NewTicker(healthPollInterval) + defer ticker.Stop() + for range ticker.C { + p.pollPSU() + } +} + +func (p *healthPoller) pollPSU() { + if p.statusDB == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout) + defer cancel() + + cmd := exec.CommandContext(ctx, "ipmitool", "sdr") + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + // IPMI not available or not a server — skip silently. + slog.Debug("health poller: ipmitool sdr unavailable", "err", err) + return + } + + slots := collector.PSUSlotsFromSDR(out.String()) + if len(slots) == 0 { + return + } + + const source = "watchdog:psu" + for slot, psu := range slots { + key := "psu:" + slot + status := psu.Status + if status == "" { + status = "Unknown" + } + detail := "" + switch status { + case "Critical": + detail = "PSU sensor reported non-OK state" + case "Warning": + detail = "PSU sensor in warning state" + } + p.statusDB.Record(key, source, status, detail) + } +} diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go index 42201c5..0132956 100644 --- a/audit/internal/webui/kmsg_watcher.go +++ b/audit/internal/webui/kmsg_watcher.go @@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() { w.mu.Lock() if w.window != nil { w.recordEvent(evt) + } else { + evtCopy := evt + goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) }) } w.mu.Unlock() } @@ -180,6 +183,52 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) { } } +// flushImmediate writes a single kmsg event directly to the status DB without a SAT window. +// Called when an error is detected outside of any SAT task (always-on watching). +func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) { + if w.statusDB == nil { + return + } + const source = "watchdog:kmsg" + detail := "kernel: " + truncate(evt.raw, 120) + + var severity string + for _, p := range platform.HardwareErrorPatterns { + if p.Re.MatchString(evt.raw) { + if p.Severity == "critical" { + severity = "Critical" + } else { + severity = "Warning" + } + break + } + } + if severity == "" { + severity = "Warning" + } + + if len(evt.ids) == 0 { + key := "cpu:all" + if evt.category == "memory" { + key = "memory:all" + } + w.statusDB.Record(key, source, severity, detail) + return + } + for _, id := range evt.ids { + var key string + switch evt.category { + case "gpu", "pcie": + key = "pcie:" + normalizeBDF(id) + case "storage": + key = "storage:" + id + default: + key = "pcie:" + normalizeBDF(id) + } + w.statusDB.Record(key, source, severity, detail) + } +} + // parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches // any pattern in platform.HardwareErrorPatterns. // kmsg format: ",,,-;message text" diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index a868792..45c0152 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -85,6 +85,7 @@ func renderPage(page string, opts HandlerOptions) string { body + `` + renderAuditModal() + + `
` + `