Add continuous hardware health monitoring and component detail view
- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times, not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1679,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||
|
||||
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||
}
|
||||
|
||||
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||
compType := r.PathValue("type")
|
||||
var exact, prefixes []string
|
||||
var title string
|
||||
switch compType {
|
||||
case "cpu":
|
||||
title = "CPU"
|
||||
exact = []string{"cpu:all"}
|
||||
case "memory":
|
||||
title = "Memory"
|
||||
exact = []string{"memory:all"}
|
||||
prefixes = []string{"memory:"}
|
||||
case "storage":
|
||||
title = "Storage"
|
||||
exact = []string{"storage:all"}
|
||||
prefixes = []string{"storage:"}
|
||||
case "gpu":
|
||||
title = "GPU"
|
||||
prefixes = []string{"pcie:gpu:"}
|
||||
case "psu":
|
||||
title = "PSU"
|
||||
prefixes = []string{"psu:"}
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
var records []app.ComponentStatusRecord
|
||||
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||
all := h.opts.App.StatusDB.All()
|
||||
records = matchedRecords(all, exact, prefixes)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
|
||||
Reference in New Issue
Block a user