Refine validate UI and runtime health table

This commit is contained in:
2026-04-05 16:24:45 +03:00
parent 38e79143eb
commit 33e0a5bef2
2 changed files with 719 additions and 56 deletions

View File

@@ -9,6 +9,9 @@ import (
"path/filepath" "path/filepath"
"sort" "sort"
"strings" "strings"
"bee/audit/internal/app"
"bee/audit/internal/schema"
) )
// ── Layout ──────────────────────────────────────────────────────────────────── // ── Layout ────────────────────────────────────────────────────────────────────
@@ -42,6 +45,8 @@ a{color:var(--accent);text-decoration:none}
/* Cards */ /* Cards */
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden} .card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px} .card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
.card-head-actions{justify-content:space-between}
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
.card-body{padding:16px} .card-body{padding:16px}
/* Buttons */ /* Buttons */
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit} .btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
@@ -72,7 +77,7 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
/* Grid */ /* Grid */
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px} .grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px} .grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}} @media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
/* iframe viewer */ /* iframe viewer */
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)} .viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
/* Alerts */ /* Alerts */
@@ -136,7 +141,7 @@ func renderPage(page string, opts HandlerOptions) string {
case "validate": case "validate":
pageID = "validate" pageID = "validate"
title = "Validate" title = "Validate"
body = renderValidate() body = renderValidate(opts)
case "burn": case "burn":
pageID = "burn" pageID = "burn"
title = "Burn" title = "Burn"
@@ -161,7 +166,7 @@ func renderPage(page string, opts HandlerOptions) string {
case "tests": case "tests":
pageID = "validate" pageID = "validate"
title = "Acceptance Tests" title = "Acceptance Tests"
body = renderValidate() body = renderValidate(opts)
case "burn-in": case "burn-in":
pageID = "burn" pageID = "burn"
title = "Burn-in Tests" title = "Burn-in Tests"
@@ -295,7 +300,7 @@ func renderAudit() string {
func renderHardwareSummaryCard(opts HandlerOptions) string { func renderHardwareSummaryCard(opts HandlerOptions) string {
data, err := loadSnapshot(opts.AuditPath) data, err := loadSnapshot(opts.AuditPath)
if err != nil { if err != nil {
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">&#9654; Run Audit</button></div></div>` return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
} }
// Parse just enough fields for the summary banner // Parse just enough fields for the summary banner
var snap struct { var snap struct {
@@ -434,11 +439,14 @@ func renderHealthCard(opts HandlerOptions) string {
if err != nil { if err != nil {
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-unknown">No data</span></div></div>` return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-unknown">No data</span></div></div>`
} }
var health map[string]any var health schema.RuntimeHealth
if err := json.Unmarshal(data, &health); err != nil { if err := json.Unmarshal(data, &health); err != nil {
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>` return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
} }
status := fmt.Sprintf("%v", health["status"]) status := strings.TrimSpace(health.Status)
if status == "" {
status = "UNKNOWN"
}
badge := "badge-ok" badge := "badge-ok"
if status == "PARTIAL" { if status == "PARTIAL" {
badge = "badge-warn" badge = "badge-warn"
@@ -448,19 +456,312 @@ func renderHealthCard(opts HandlerOptions) string {
var b strings.Builder var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Runtime Health</div><div class="card-body">`) b.WriteString(`<div class="card"><div class="card-head">Runtime Health</div><div class="card-body">`)
b.WriteString(fmt.Sprintf(`<div style="margin-bottom:10px"><span class="badge %s">%s</span></div>`, badge, html.EscapeString(status))) b.WriteString(fmt.Sprintf(`<div style="margin-bottom:10px"><span class="badge %s">%s</span></div>`, badge, html.EscapeString(status)))
if issues, ok := health["issues"].([]any); ok && len(issues) > 0 { if checkedAt := strings.TrimSpace(health.CheckedAt); checkedAt != "" {
b.WriteString(`<div style="font-size:12px;color:#f87171">Issues:<br>`) b.WriteString(`<div style="font-size:12px;color:var(--muted);margin-bottom:12px">Checked at: ` + html.EscapeString(checkedAt) + `</div>`)
for _, issue := range issues {
if m, ok := issue.(map[string]any); ok {
b.WriteString(html.EscapeString(fmt.Sprintf("%v: %v", m["code"], m["message"])) + "<br>")
}
}
b.WriteString(`</div>`)
} }
rows := []runtimeHealthRow{
buildRuntimeExportRow(health),
buildRuntimeNetworkRow(health),
buildRuntimeDriverRow(health),
buildRuntimeAccelerationRow(health),
buildRuntimeToolsRow(health),
buildRuntimeServicesRow(health),
}
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
for _, row := range rows {
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
}
b.WriteString(`</tbody></table>`)
b.WriteString(`</div></div>`) b.WriteString(`</div></div>`)
return b.String() return b.String()
} }
// runtimeHealthRow is a single line of the Runtime Health table. The four
// fields map, in order, onto the table's Check, Status, Source and Issue
// columns.
type runtimeHealthRow struct {
	Title, Status, Source, Issue string
}
// buildRuntimeExportRow produces the "Export Directory" row.
//
// The status is FAILED when an "export_dir_unavailable" issue is present, OK
// when no such issue exists and an export directory is recorded, and UNKNOWN
// otherwise. The source column shows "os.MkdirAll", followed by the trimmed
// directory path when one is recorded.
func buildRuntimeExportRow(health schema.RuntimeHealth) runtimeHealthRow {
	dir := strings.TrimSpace(health.ExportDir)
	problem := runtimeIssueDescriptions(health.Issues, "export_dir_unavailable")

	state := "UNKNOWN"
	if problem != "" {
		// A reported issue wins over the mere presence of a path.
		state = "FAILED"
	} else if dir != "" {
		state = "OK"
	}

	origin := "os.MkdirAll"
	if dir != "" {
		origin = origin + " " + dir
	}
	return runtimeHealthRow{Title: "Export Directory", Status: state, Source: origin, Issue: problem}
}
// buildRuntimeNetworkRow produces the "Network" row. The status is the trimmed
// NetworkStatus from the health record, or UNKNOWN when that is blank; the
// issue column collects the "dhcp_partial" and "dhcp_failed" issues.
func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{
		Title:  "Network",
		Status: strings.TrimSpace(health.NetworkStatus),
		Source: "ListInterfaces / DHCP",
		Issue:  runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed"),
	}
	if row.Status == "" {
		row.Status = "UNKNOWN"
	}
	return row
}
// buildRuntimeDriverRow produces the "NVIDIA/AMD Driver" row.
//
// Status matrix (DriverReady x driver issue present):
//
//	ready,     no issue -> OK
//	ready,     issue    -> PARTIAL
//	not ready, issue    -> FAILED
//	not ready, no issue -> UNKNOWN
func buildRuntimeDriverRow(health schema.RuntimeHealth) runtimeHealthRow {
	problem := runtimeIssueDescriptions(health.Issues, "nvidia_kernel_module_missing", "nvidia_modeset_failed", "amdgpu_kernel_module_missing")
	hasProblem := problem != ""

	var state string
	if health.DriverReady {
		state = "OK"
		if hasProblem {
			state = "PARTIAL"
		}
	} else if hasProblem {
		state = "FAILED"
	} else {
		state = "UNKNOWN"
	}
	return runtimeHealthRow{Title: "NVIDIA/AMD Driver", Status: state, Source: "lsmod / vendor probe", Issue: problem}
}
// buildRuntimeAccelerationRow produces the "CUDA / ROCm" row.
//
// Status matrix (CUDAReady x acceleration issue present):
//
//	ready,     no issue -> OK
//	ready,     issue    -> PARTIAL
//	not ready, issue    -> FAILED
//	not ready, no issue -> UNKNOWN
//
// NOTE(review): only CUDAReady is consulted here even though the row is also
// labelled ROCm; ROCm state influences the row solely through the
// "rocm_smi_unavailable" issue — confirm this is intended.
func buildRuntimeAccelerationRow(health schema.RuntimeHealth) runtimeHealthRow {
	problem := runtimeIssueDescriptions(health.Issues, "cuda_runtime_not_ready", "rocm_smi_unavailable")
	hasProblem := problem != ""

	var state string
	if health.CUDAReady {
		state = "OK"
		if hasProblem {
			state = "PARTIAL"
		}
	} else if hasProblem {
		state = "FAILED"
	} else {
		state = "UNKNOWN"
	}
	return runtimeHealthRow{Title: "CUDA / ROCm", Status: state, Source: "bee-gpu-burn / rocm-smi", Issue: problem}
}
// buildRuntimeToolsRow produces the "Required Utilities" row. With no tool
// entries the row is UNKNOWN; otherwise it is OK when every entry has OK set
// and PARTIAL with a "Missing: ..." list naming the entries that do not.
func buildRuntimeToolsRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{Title: "Required Utilities", Source: "CheckTools"}
	if len(health.Tools) == 0 {
		row.Status = "UNKNOWN"
		row.Issue = "No tool status data."
		return row
	}

	var absent []string
	for _, entry := range health.Tools {
		if entry.OK {
			continue
		}
		absent = append(absent, entry.Name)
	}
	if len(absent) == 0 {
		row.Status = "OK"
		return row
	}
	row.Status = "PARTIAL"
	row.Issue = "Missing: " + strings.Join(absent, ", ")
	return row
}
// buildRuntimeServicesRow produces the "Bee Services" row. With no service
// entries the row is UNKNOWN; otherwise it is OK when every service status
// equals "active" (case-insensitive, surrounding whitespace ignored) and
// PARTIAL with a "name=status" list of the others.
//
// NOTE(review): the Source text is "systemctl is-active" for the no-data row
// but "ServiceState" for the populated row — confirm which label is intended.
func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
	const title = "Bee Services"
	if len(health.Services) == 0 {
		return runtimeHealthRow{Title: title, Status: "UNKNOWN", Source: "systemctl is-active", Issue: "No service status data."}
	}

	var inactive []string
	for _, svc := range health.Services {
		normalized := strings.ToLower(svc.Status)
		if strings.TrimSpace(normalized) == "active" {
			continue
		}
		// Report the status exactly as recorded, not the normalised form.
		inactive = append(inactive, svc.Name+"="+svc.Status)
	}

	row := runtimeHealthRow{Title: title, Status: "OK", Source: "ServiceState"}
	if len(inactive) > 0 {
		row.Status = "PARTIAL"
		row.Issue = strings.Join(inactive, ", ")
	}
	return row
}
// buildHardwareComponentRows returns one health row each for CPU, Memory,
// Storage, GPU and PSU, derived from component-status.json inside exportDir.
//
// When the status DB cannot be opened, five UNKNOWN placeholder rows are
// returned instead. Otherwise each row is aggregated from the DB records whose
// component key matches the exact keys / key prefixes listed below.
//
// NOTE(review): placeholder rows are titled "<name> Component Health" while
// aggregated rows use the bare "<name>", and the PSU placeholder carries a
// different issue text from the other four — confirm both are intended.
func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
	const fileName = "component-status.json"

	db, err := app.OpenComponentStatusDB(filepath.Join(exportDir, fileName))
	if err != nil {
		placeholders := make([]runtimeHealthRow, 0, 5)
		for _, name := range []string{"CPU", "Memory", "Storage", "GPU"} {
			placeholders = append(placeholders, runtimeHealthRow{
				Title:  name + " Component Health",
				Status: "UNKNOWN",
				Source: fileName,
				Issue:  "Component status DB not available.",
			})
		}
		return append(placeholders, runtimeHealthRow{
			Title:  "PSU Component Health",
			Status: "UNKNOWN",
			Source: fileName,
			Issue:  "No PSU component checks recorded.",
		})
	}

	all := db.All()
	rows := make([]runtimeHealthRow, 0, 5)
	rows = append(rows, aggregateComponentStatus("CPU", all, []string{"cpu:all"}, nil))
	rows = append(rows, aggregateComponentStatus("Memory", all, []string{"memory:all"}, []string{"memory:"}))
	rows = append(rows, aggregateComponentStatus("Storage", all, []string{"storage:all"}, []string{"storage:"}))
	rows = append(rows, aggregateComponentStatus("GPU", all, nil, []string{"pcie:gpu:"}))
	rows = append(rows, aggregateComponentStatus("PSU", all, nil, []string{"psu:"}))
	return rows
}
// aggregateComponentStatus folds every record belonging to one component
// family into a single table row.
//
// A record belongs to the family when its trimmed ComponentKey is non-empty
// and either equals one of exact or starts with one of prefixes. The row's
// status is the worst severity among those records (CRITICAL > WARNING > OK >
// UNKNOWN). Source and Issue are collected, de-duplicated and in record order,
// only from the records that sit at that worst severity. With no matching
// records the row is UNKNOWN with a "No component status data." issue.
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
	const defaultSource = "component-status.json"

	// Pass 1: pick the family's records and track the worst severity seen.
	var selected []app.ComponentStatusRecord
	worst := -1
	for _, rec := range records {
		key := strings.TrimSpace(rec.ComponentKey)
		if key == "" {
			continue
		}
		if !containsExactKey(key, exact) && !hasAnyPrefix(key, prefixes) {
			continue
		}
		selected = append(selected, rec)
		if sev := runtimeComponentSeverity(rec.Status); sev > worst {
			worst = sev
		}
	}
	if len(selected) == 0 {
		return runtimeHealthRow{Title: title, Status: "UNKNOWN", Source: defaultSource, Issue: "No component status data."}
	}

	// Severity value -> label; anything outside 1..3 stays UNKNOWN.
	labels := [...]string{"UNKNOWN", "OK", "WARNING", "CRITICAL"}
	label := "UNKNOWN"
	if worst >= 0 && worst < len(labels) {
		label = labels[worst]
	}

	// Pass 2: gather sources and issue texts from the worst-severity records.
	var origins, problems []string
	knownOrigins := make(map[string]struct{})
	knownProblems := make(map[string]struct{})
	for _, rec := range selected {
		if runtimeComponentSeverity(rec.Status) != worst {
			continue
		}

		origin := latestComponentSource(rec)
		if origin == "" {
			origin = defaultSource
		}
		if _, dup := knownOrigins[origin]; !dup {
			knownOrigins[origin] = struct{}{}
			origins = append(origins, origin)
		}

		// Prefer the record's error summary; fall back to the latest detail.
		problem := strings.TrimSpace(rec.ErrorSummary)
		if problem == "" {
			problem = latestComponentDetail(rec)
		}
		if problem == "" {
			continue
		}
		if _, dup := knownProblems[problem]; !dup {
			knownProblems[problem] = struct{}{}
			problems = append(problems, problem)
		}
	}
	if len(origins) == 0 {
		origins = append(origins, defaultSource)
	}

	summary := strings.Join(problems, "; ")
	if summary == "" {
		summary = "—"
	}
	return runtimeHealthRow{
		Title:  title,
		Status: label,
		Source: strings.Join(origins, ", "),
		Issue:  summary,
	}
}
// containsExactKey reports whether key is equal to any entry of exact.
// A nil or empty list never matches.
func containsExactKey(key string, exact []string) bool {
	for i := range exact {
		if exact[i] == key {
			return true
		}
	}
	return false
}
// hasAnyPrefix reports whether key starts with at least one of prefixes.
// A nil or empty list never matches.
func hasAnyPrefix(key string, prefixes []string) bool {
	found := false
	for i := 0; i < len(prefixes) && !found; i++ {
		found = strings.HasPrefix(key, prefixes[i])
	}
	return found
}
// runtimeComponentSeverity maps a component status string onto a numeric
// rank so statuses can be compared: "critical" is 3, "warning" is 2, "ok" is
// 1, and anything else is 0. Matching ignores case and surrounding whitespace.
func runtimeComponentSeverity(status string) int {
	normalized := strings.TrimSpace(strings.ToLower(status))
	// The name at index i has severity i+1.
	ranked := [...]string{"ok", "warning", "critical"}
	for i, name := range ranked {
		if normalized == name {
			return i + 1
		}
	}
	return 0
}
// latestComponentSource returns the trimmed Source of the final entry in the
// record's History, or "" when the history is empty.
func latestComponentSource(rec app.ComponentStatusRecord) string {
	if n := len(rec.History); n > 0 {
		return strings.TrimSpace(rec.History[n-1].Source)
	}
	return ""
}
// latestComponentDetail returns the trimmed Detail of the final entry in the
// record's History, or "" when the history is empty.
func latestComponentDetail(rec app.ComponentStatusRecord) string {
	if n := len(rec.History); n > 0 {
		return strings.TrimSpace(rec.History[n-1].Detail)
	}
	return ""
}
// runtimeIssueDescriptions joins, with "; ", the descriptions of every issue
// whose Code is one of codes, preserving the order of issues. An issue with a
// blank description contributes its code instead. The result is "" when
// either list is empty or nothing matches.
func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) string {
	if len(issues) == 0 || len(codes) == 0 {
		return ""
	}

	var texts []string
	for _, issue := range issues {
		wanted := false
		for _, code := range codes {
			if issue.Code == code {
				wanted = true
				break
			}
		}
		if !wanted {
			continue
		}
		text := strings.TrimSpace(issue.Description)
		if text == "" {
			text = issue.Code
		}
		texts = append(texts, text)
	}
	return strings.Join(texts, "; ")
}
// runtimeStatusBadge renders status as an HTML badge <span>. The status is
// trimmed and upper-cased, then mapped to a CSS class: OK -> badge-ok;
// PARTIAL/WARNING/WARN -> badge-warn; FAIL/FAILED/CRITICAL -> badge-err;
// everything else -> badge-unknown. The label text is HTML-escaped.
func runtimeStatusBadge(status string) string {
	label := strings.ToUpper(strings.TrimSpace(status))

	class := "badge-unknown"
	if label == "OK" {
		class = "badge-ok"
	} else if label == "PARTIAL" || label == "WARNING" || label == "WARN" {
		class = "badge-warn"
	} else if label == "FAIL" || label == "FAILED" || label == "CRITICAL" {
		class = "badge-err"
	}

	open := `<span class="badge ` + class + `">`
	return open + html.EscapeString(label) + `</span>`
}
// rowIssueHTML renders the Issue cell of a health row: the trimmed,
// HTML-escaped issue text, or a muted em dash placeholder when the text is
// blank.
func rowIssueHTML(issue string) string {
	if text := strings.TrimSpace(issue); text != "" {
		return html.EscapeString(text)
	}
	return `<span style="color:var(--muted)">—</span>`
}
// ── Metrics ─────────────────────────────────────────────────────────────────── // ── Metrics ───────────────────────────────────────────────────────────────────
func renderMetrics() string { func renderMetrics() string {
@@ -675,50 +976,137 @@ setInterval(loadMetricsLayout, 5000);
// ── Validate (Acceptance Tests) ─────────────────────────────────────────────── // ── Validate (Acceptance Tests) ───────────────────────────────────────────────
func renderValidate() string { type validateInventory struct {
CPU string
Memory string
Storage string
NVIDIA string
AMD string
}
func renderValidate(opts HandlerOptions) string {
inv := loadValidateInventory(opts)
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div> return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">
<div class="card-head">Run All Tests</div> <div class="card-head">Validate Profile</div>
<div class="card-body" style="display:flex;align-items:center;gap:12px;flex-wrap:wrap"> <div class="card-body validate-profile-body">
<div class="form-row" style="margin:0"><label style="margin-right:6px">Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:70px;display:inline-block"></div> <div class="validate-profile-col">
<button class="btn btn-primary" onclick="runAllSAT()">&#9654; Run All</button> <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
</div>
<div class="validate-profile-col validate-profile-action">
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
</div>
<div class="validate-profile-col"></div>
</div>
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span> <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
</div> </div>
</div> </div>
<div class="grid3"> <div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button id="sat-btn-nvidia-targeted-stress" class="btn" type="button" onclick="runSAT('nvidia-targeted-stress')">Targeted Stress (dcgmi diag targeted_stress)</button></div><p style="color:var(--muted);font-size:12px;margin:0">Official DCGM `+"targeted_stress"+` stays in Validate as a controlled diagnostic load, not a max-burn recipe.</p>`) + ` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
renderSATCard("memory", "Memory", "") + inv.CPU,
renderSATCard("storage", "Storage", "") + `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) + `<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px"> `Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button> )) +
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button> renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
</div> inv.Memory,
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) + `Runs a short RAM validation pass and records memory state around the test.`,
`<code>free</code>, <code>memtester</code>`,
`No extra settings.`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
`No extra settings.`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runSAT('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Diag level is taken from Validate Profile.`,
)) +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runSAT('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
`Uses the fixed DCGM targeted stress recipe.`,
)) +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
)) +
`</div> `</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card"> <div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div> <div class="card-head">Test Output <span id="sat-title"></span></div>
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div> <div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
</div> </div>
<style>
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.validate-profile-col { min-width:0; }
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
.validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style>
<script> <script>
let satES = null; let satES = null;
function satDiagLevel() {
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
}
function satCPUDurationFromDiagLevel() {
const level = satDiagLevel();
if (level === 1) return 60;
if (level === 2) return 5 * 60;
return 60 * 60;
}
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
function satRequestBody(target) {
const body = {};
const labels = satLabels();
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = satDiagLevel();
if (target === 'nvidia-targeted-stress') body.duration = 300;
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
return body;
}
function enqueueSATTarget(target) {
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target))})
.then(r => r.json());
}
function selectedAMDValidateTargets() {
const targets = [];
const gpu = document.getElementById('sat-amd-target');
const mem = document.getElementById('sat-amd-mem-target');
const bw = document.getElementById('sat-amd-bandwidth-target');
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
return targets;
}
function runSAT(target) { function runSAT(target) {
if (satES) { satES.close(); satES = null; } if (satES) { satES.close(); satES = null; }
const body = {};
const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'nvidia-targeted-stress') body.duration = 300;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target; document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' test...\n'; term.textContent = 'Enqueuing ' + target + ' test...\n';
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)}) return enqueueSATTarget(target)
.then(r => r.json())
.then(d => { .then(d => {
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n'; term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
satES = new EventSource('/api/tasks/'+d.task_id+'/stream'); satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
@@ -726,9 +1114,35 @@ function runSAT(target) {
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; }); satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}); });
} }
function runAMDValidateSet() {
const targets = selectedAMDValidateTargets();
if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]);
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— amd';
const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing AMD validate set...\n';
const labels = satLabels();
const enqueueNext = (idx) => {
if (idx >= targets.length) return;
const target = targets[idx];
enqueueSATTarget(target)
.then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n';
if (idx === targets.length - 1) {
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
});
};
enqueueNext(0);
}
function runAllSAT() { function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu','amd','amd-mem','amd-bandwidth']; const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const total = targets.length * cycles; const total = targets.length * cycles;
let enqueued = 0; let enqueued = 0;
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
@@ -739,14 +1153,8 @@ function runAllSAT() {
const target = targets[idx]; const target = targets[idx];
const btn = document.getElementById('sat-btn-' + target); const btn = document.getElementById('sat-btn-' + target);
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; } if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
const body = {}; enqueueSATTarget(target)
const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; .then(()=>{
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'nvidia-targeted-stress') body.duration = 300;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
.then(r=>r.json()).then(()=>{
enqueued++; enqueued++;
status.textContent = 'Enqueued '+enqueued+'/'+total+'...'; status.textContent = 'Enqueued '+enqueued+'/'+total+'...';
enqueueNext(cycle, idx+1); enqueueNext(cycle, idx+1);
@@ -760,9 +1168,17 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected'); if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected'); if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected'); if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
}); });
function disableSATAMDOptions(reason) {
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
const cb = document.getElementById(id);
if (!cb) return;
cb.disabled = true;
cb.checked = false;
cb.title = reason;
});
}
function disableSATCard(id, reason) { function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id); const btn = document.getElementById('sat-btn-' + id);
if (!btn) return; if (!btn) return;
@@ -775,8 +1191,9 @@ function disableSATCard(id, reason) {
if (!note) { if (!note) {
note = document.createElement('p'); note = document.createElement('p');
note.className = 'sat-unavail'; note.className = 'sat-unavail';
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px'; note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
btn.parentNode.insertBefore(note, btn.nextSibling); const body = card.querySelector('.card-body');
if (body) body.insertBefore(note, body.firstChild);
} }
note.textContent = reason; note.textContent = reason;
} }
@@ -784,9 +1201,159 @@ function disableSATCard(id, reason) {
</script>` </script>`
} }
// loadValidateInventory builds the per-category device summaries (CPU,
// memory, storage, NVIDIA GPU, AMD GPU) from the audit snapshot at
// opts.AuditPath. Every field starts as "Audit snapshot not loaded." and that
// placeholder is returned unchanged when the snapshot cannot be loaded or
// decoded.
func renderSATCard(id, label, extra string) string { func loadValidateInventory(opts HandlerOptions) validateInventory {
return fmt.Sprintf(`<div class="card"><div class="card-head">%s</div><div class="card-body">%s<button id="sat-btn-%s" class="btn btn-primary" onclick="runSAT('%s')">▶ Run Test</button></div></div>`, unknown := "Audit snapshot not loaded."
label, extra, id, id) out := validateInventory{
CPU: unknown,
Memory: unknown,
Storage: unknown,
NVIDIA: unknown,
AMD: unknown,
}
data, err := loadSnapshot(opts.AuditPath)
if err != nil {
return out
}
var snap schema.HardwareIngestRequest
if err := json.Unmarshal(data, &snap); err != nil {
return out
}
// CPUs: skip entries explicitly marked not present; group by model, falling
// back to manufacturer, then "unknown".
cpuCounts := map[string]int{}
cpuTotal := 0
for _, cpu := range snap.Hardware.CPUs {
if cpu.Present != nil && !*cpu.Present {
continue
}
cpuTotal++
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
}
// Memory modules: group by part number, then type, then manufacturer.
memCounts := map[string]int{}
memTotal := 0
for _, dimm := range snap.Hardware.Memory {
if dimm.Present != nil && !*dimm.Present {
continue
}
memTotal++
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
}
// Storage devices: group by model, then manufacturer.
storageCounts := map[string]int{}
storageTotal := 0
for _, dev := range snap.Hardware.Storage {
if dev.Present != nil && !*dev.Present {
continue
}
storageTotal++
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
// GPUs: each present PCIe device is tested against both vendor predicates
// independently, so a device matching both is counted in both tallies.
nvidiaCounts := map[string]int{}
nvidiaTotal := 0
amdCounts := map[string]int{}
amdTotal := 0
for _, dev := range snap.Hardware.PCIeDevices {
if dev.Present != nil && !*dev.Present {
continue
}
if validateIsVendorGPU(dev, "nvidia") {
nvidiaTotal++
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
if validateIsVendorGPU(dev, "amd") {
amdTotal++
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
}
// Replace the placeholders with formatted summaries now that decoding succeeded.
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
return out
}
// renderValidateCardBody assembles the four stacked sections of a validate
// card body, in order: devices, description, commands and settings. The
// devices and settings sections are rendered in the muted text colour.
// Arguments are inserted verbatim; no escaping is applied here.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		sectionOpen  = `<div class="validate-card-section"><div style="font-size:13px`
		mutedColour  = `;color:var(--muted)`
		sectionClose = `</div></div>`
	)
	muted := func(content string) string {
		return sectionOpen + mutedColour + `">` + content + sectionClose
	}
	plain := func(content string) string {
		return sectionOpen + `">` + content + sectionClose
	}
	return muted(devices) + plain(description) + plain(commands) + muted(settings)
}
// formatValidateDeviceSummary renders a one-line inventory summary such as
// "3 modules: 2 x A, 1 x B".
//
// total is the device count, models maps a model name to its count, and unit
// is the singular noun ("s" is appended when total is not 1). Model names are
// listed in sorted order and HTML-escaped. A total of zero yields
// "0 <unit>s detected." regardless of models.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return fmt.Sprintf("0 %ss detected.", unit)
	}

	// Map iteration order is random; sort the names for stable output.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	var list strings.Builder
	for i, name := range names {
		if i > 0 {
			list.WriteString(", ")
		}
		fmt.Fprintf(&list, "%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = unit + "s"
	}
	return fmt.Sprintf("%d %s: %s", total, noun, list.String())
}
// addValidateModel increments the tally for name in counts. The name is
// whitespace-trimmed first; a blank name is recorded under "unknown".
func addValidateModel(counts map[string]int, name string) {
	key := "unknown"
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		key = trimmed
	}
	counts[key]++
}
// validateTrimPtr dereferences an optional string and trims surrounding
// whitespace. A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	var raw string
	if value != nil {
		raw = *value
	}
	return strings.TrimSpace(raw)
}
// validateFirstNonEmpty returns the first argument that is non-blank after
// trimming whitespace, in its trimmed form. It returns "" when every argument
// is blank or none are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
// validateIsVendorGPU reports whether dev looks like a GPU from the given
// vendor. Supported vendor values are "nvidia" and "amd"; any other value
// yields false.
//
// Matching is case-insensitive on the trimmed Model, Manufacturer and
// DeviceClass fields. Devices whose model or manufacturer mentions "aspeed"
// are rejected for every vendor. NVIDIA devices are recognised by name alone;
// AMD devices must also have a GPU-like device class.
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
	model := strings.ToLower(validateTrimPtr(dev.Model))
	manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
	class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
	if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
		return false
	}
	switch vendor {
	case "nvidia":
		return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
	case "amd":
		isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"

		// "ati" must match as a whole word. As a plain substring it also occurs
		// in "corporation", which would classify display controllers from
		// "NVIDIA Corporation" or "Intel Corporation" as AMD GPUs. The input is
		// already lower-cased, so splitting on anything outside [a-z0-9] is
		// enough to isolate words such as the "ati" in "[AMD/ATI]".
		isATI := false
		words := strings.FieldsFunc(manufacturer, func(r rune) bool {
			return (r < 'a' || r > 'z') && (r < '0' || r > '9')
		})
		for _, word := range words {
			if word == "ati" {
				isATI = true
				break
			}
		}

		isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || isATI
		isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
		return isGPUClass && (isAMDVendor || isAMDModel)
	default:
		return false
	}
}
// renderSATCard renders one validate card as HTML.
//
// The card header shows label plus a "Run" button whose element id is
// "sat-btn-<id>" and whose onclick handler is runAction; any non-blank
// headerActions markup is appended after that button. body is placed inside
// the card body. All arguments are inserted verbatim, without escaping.
func renderSATCard(id, label, runAction, headerActions, body string) string {
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
// Whitespace-only extra actions are treated as absent.
if strings.TrimSpace(headerActions) != "" {
actions += headerActions
}
return fmt.Sprintf(`<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`,
label, actions, body)
} }
// ── Benchmark ───────────────────────────────────────────────────────────────── // ── Benchmark ─────────────────────────────────────────────────────────────────

View File

@@ -543,7 +543,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) { if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
t.Fatalf("dashboard missing run audit button: %s", body) t.Fatalf("dashboard missing run audit button: %s", body)
} }
if strings.Contains(body, `No audit data`) { if strings.Contains(body, `No audit data`) {
@@ -650,7 +650,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
} }
} }
func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) { func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
@@ -659,9 +659,10 @@ func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
} }
body := rec.Body.String() body := rec.Body.String()
for _, needle := range []string{ for _, needle := range []string{
`Targeted Stress`, `NVIDIA GPU Targeted Stress`,
`nvidia-targeted-stress`, `nvidia-targeted-stress`,
`Official DCGM`, `controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body) t.Fatalf("validate page missing %q: %s", needle, body)
@@ -845,3 +846,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body) t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
} }
} }
// TestDashboardRendersRuntimeHealthTable writes an audit snapshot, a
// runtime-health.json and a component-status.json into a temporary export
// directory, requests the dashboard root, and checks that the Runtime Health
// table headings, check rows and issue details appear in the response.
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
	root := t.TempDir()
	auditPath := filepath.Join(root, "audit.json")
	exportDir := filepath.Join(root, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}

	const auditJSON = `{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`
	const healthJSON = `{
"status":"PARTIAL",
"checked_at":"2026-03-16T10:00:00Z",
"export_dir":"/tmp/export",
"driver_ready":true,
"cuda_ready":false,
"network_status":"PARTIAL",
"issues":[
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
],
"tools":[
{"name":"dmidecode","ok":true},
{"name":"nvidia-smi","ok":false}
],
"services":[
{"name":"bee-web","status":"active"},
{"name":"bee-nvidia","status":"inactive"}
]
}`
	const componentStatusJSON = `[
{
"component_key":"cpu:all",
"status":"Warning",
"error_summary":"cpu SAT: FAILED",
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
},
{
"component_key":"memory:all",
"status":"OK",
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
},
{
"component_key":"storage:nvme0n1",
"status":"Critical",
"error_summary":"storage SAT: FAILED",
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
},
{
"component_key":"pcie:gpu:nvidia",
"status":"Warning",
"error_summary":"nvidia SAT: FAILED",
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
}
]`

	// Write the fixtures in the same order as before: snapshot, health, components.
	fixtures := []struct {
		path    string
		content string
	}{
		{auditPath, auditJSON},
		{filepath.Join(exportDir, "runtime-health.json"), healthJSON},
		{filepath.Join(exportDir, "component-status.json"), componentStatusJSON},
	}
	for _, fixture := range fixtures {
		if err := os.WriteFile(fixture.path, []byte(fixture.content), 0644); err != nil {
			t.Fatal(err)
		}
	}

	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/", nil)
	NewHandler(HandlerOptions{AuditPath: auditPath, ExportDir: exportDir}).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}

	page := rec.Body.String()
	expected := []string{
		`Runtime Health`,
		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
		`Export Directory`,
		`Network`,
		`NVIDIA/AMD Driver`,
		`CUDA / ROCm`,
		`Required Utilities`,
		`Bee Services`,
		`<td>CPU</td>`,
		`<td>Memory</td>`,
		`<td>Storage</td>`,
		`<td>GPU</td>`,
		`CUDA runtime is not ready for GPU SAT.`,
		`Missing: nvidia-smi`,
		`bee-nvidia=inactive`,
		`cpu SAT: FAILED`,
		`storage SAT: FAILED`,
		`sat:nvidia`,
	}
	for i := range expected {
		if !strings.Contains(page, expected[i]) {
			t.Fatalf("dashboard missing %q: %s", expected[i], page)
		}
	}
}