Refine validate UI and runtime health table
This commit is contained in:
@@ -9,6 +9,9 @@ import (
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// ── Layout ────────────────────────────────────────────────────────────────────
|
||||
@@ -42,6 +45,8 @@ a{color:var(--accent);text-decoration:none}
|
||||
/* Cards */
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||
.card-head-actions{justify-content:space-between}
|
||||
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||
.card-body{padding:16px}
|
||||
/* Buttons */
|
||||
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||
@@ -72,7 +77,7 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
/* Grid */
|
||||
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}}
|
||||
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||
/* iframe viewer */
|
||||
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||
/* Alerts */
|
||||
@@ -136,7 +141,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
case "validate":
|
||||
pageID = "validate"
|
||||
title = "Validate"
|
||||
body = renderValidate()
|
||||
body = renderValidate(opts)
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
@@ -161,7 +166,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
case "tests":
|
||||
pageID = "validate"
|
||||
title = "Acceptance Tests"
|
||||
body = renderValidate()
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "Burn-in Tests"
|
||||
@@ -295,7 +300,7 @@ func renderAudit() string {
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">▶ Run Audit</button></div></div>`
|
||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
@@ -434,11 +439,14 @@ func renderHealthCard(opts HandlerOptions) string {
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-unknown">No data</span></div></div>`
|
||||
}
|
||||
var health map[string]any
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(data, &health); err != nil {
|
||||
return `<div class="card"><div class="card-head">Runtime Health</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
status := fmt.Sprintf("%v", health["status"])
|
||||
status := strings.TrimSpace(health.Status)
|
||||
if status == "" {
|
||||
status = "UNKNOWN"
|
||||
}
|
||||
badge := "badge-ok"
|
||||
if status == "PARTIAL" {
|
||||
badge = "badge-warn"
|
||||
@@ -448,19 +456,312 @@ func renderHealthCard(opts HandlerOptions) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Runtime Health</div><div class="card-body">`)
|
||||
b.WriteString(fmt.Sprintf(`<div style="margin-bottom:10px"><span class="badge %s">%s</span></div>`, badge, html.EscapeString(status)))
|
||||
if issues, ok := health["issues"].([]any); ok && len(issues) > 0 {
|
||||
b.WriteString(`<div style="font-size:12px;color:#f87171">Issues:<br>`)
|
||||
for _, issue := range issues {
|
||||
if m, ok := issue.(map[string]any); ok {
|
||||
b.WriteString(html.EscapeString(fmt.Sprintf("%v: %v", m["code"], m["message"])) + "<br>")
|
||||
}
|
||||
}
|
||||
b.WriteString(`</div>`)
|
||||
if checkedAt := strings.TrimSpace(health.CheckedAt); checkedAt != "" {
|
||||
b.WriteString(`<div style="font-size:12px;color:var(--muted);margin-bottom:12px">Checked at: ` + html.EscapeString(checkedAt) + `</div>`)
|
||||
}
|
||||
rows := []runtimeHealthRow{
|
||||
buildRuntimeExportRow(health),
|
||||
buildRuntimeNetworkRow(health),
|
||||
buildRuntimeDriverRow(health),
|
||||
buildRuntimeAccelerationRow(health),
|
||||
buildRuntimeToolsRow(health),
|
||||
buildRuntimeServicesRow(health),
|
||||
}
|
||||
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
|
||||
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
|
||||
for _, row := range rows {
|
||||
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// runtimeHealthRow is a single line of the Runtime Health table. The four
// fields map onto the table's Check, Status, Source and Issue columns.
type runtimeHealthRow struct {
	Title, Status, Source, Issue string
}
|
||||
|
||||
func buildRuntimeExportRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
issue := runtimeIssueDescriptions(health.Issues, "export_dir_unavailable")
|
||||
status := "UNKNOWN"
|
||||
switch {
|
||||
case issue != "":
|
||||
status = "FAILED"
|
||||
case strings.TrimSpace(health.ExportDir) != "":
|
||||
status = "OK"
|
||||
}
|
||||
source := "os.MkdirAll"
|
||||
if dir := strings.TrimSpace(health.ExportDir); dir != "" {
|
||||
source += " " + dir
|
||||
}
|
||||
return runtimeHealthRow{Title: "Export Directory", Status: status, Source: source, Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
status := strings.TrimSpace(health.NetworkStatus)
|
||||
if status == "" {
|
||||
status = "UNKNOWN"
|
||||
}
|
||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeDriverRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
issue := runtimeIssueDescriptions(health.Issues, "nvidia_kernel_module_missing", "nvidia_modeset_failed", "amdgpu_kernel_module_missing")
|
||||
status := "UNKNOWN"
|
||||
switch {
|
||||
case health.DriverReady && issue == "":
|
||||
status = "OK"
|
||||
case health.DriverReady:
|
||||
status = "PARTIAL"
|
||||
case issue != "":
|
||||
status = "FAILED"
|
||||
}
|
||||
return runtimeHealthRow{Title: "NVIDIA/AMD Driver", Status: status, Source: "lsmod / vendor probe", Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeAccelerationRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
issue := runtimeIssueDescriptions(health.Issues, "cuda_runtime_not_ready", "rocm_smi_unavailable")
|
||||
status := "UNKNOWN"
|
||||
switch {
|
||||
case health.CUDAReady && issue == "":
|
||||
status = "OK"
|
||||
case health.CUDAReady:
|
||||
status = "PARTIAL"
|
||||
case issue != "":
|
||||
status = "FAILED"
|
||||
}
|
||||
return runtimeHealthRow{Title: "CUDA / ROCm", Status: status, Source: "bee-gpu-burn / rocm-smi", Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeToolsRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
if len(health.Tools) == 0 {
|
||||
return runtimeHealthRow{Title: "Required Utilities", Status: "UNKNOWN", Source: "CheckTools", Issue: "No tool status data."}
|
||||
}
|
||||
missing := make([]string, 0)
|
||||
for _, tool := range health.Tools {
|
||||
if !tool.OK {
|
||||
missing = append(missing, tool.Name)
|
||||
}
|
||||
}
|
||||
status := "OK"
|
||||
issue := ""
|
||||
if len(missing) > 0 {
|
||||
status = "PARTIAL"
|
||||
issue = "Missing: " + strings.Join(missing, ", ")
|
||||
}
|
||||
return runtimeHealthRow{Title: "Required Utilities", Status: status, Source: "CheckTools", Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
if len(health.Services) == 0 {
|
||||
return runtimeHealthRow{Title: "Bee Services", Status: "UNKNOWN", Source: "systemctl is-active", Issue: "No service status data."}
|
||||
}
|
||||
nonActive := make([]string, 0)
|
||||
for _, svc := range health.Services {
|
||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||
if state != "active" {
|
||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||
}
|
||||
}
|
||||
status := "OK"
|
||||
issue := ""
|
||||
if len(nonActive) > 0 {
|
||||
status = "PARTIAL"
|
||||
issue = strings.Join(nonActive, ", ")
|
||||
}
|
||||
return runtimeHealthRow{Title: "Bee Services", Status: status, Source: "ServiceState", Issue: issue}
|
||||
}
|
||||
|
||||
func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
|
||||
path := filepath.Join(exportDir, "component-status.json")
|
||||
db, err := app.OpenComponentStatusDB(path)
|
||||
if err != nil {
|
||||
return []runtimeHealthRow{
|
||||
{Title: "CPU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."},
|
||||
{Title: "Memory Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."},
|
||||
{Title: "Storage Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."},
|
||||
{Title: "GPU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."},
|
||||
{Title: "PSU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "No PSU component checks recorded."},
|
||||
}
|
||||
}
|
||||
records := db.All()
|
||||
return []runtimeHealthRow{
|
||||
aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil),
|
||||
aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"}),
|
||||
aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"}),
|
||||
aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"}),
|
||||
aggregateComponentStatus("PSU", records, nil, []string{"psu:"}),
|
||||
}
|
||||
}
|
||||
|
||||
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
|
||||
matched := make([]app.ComponentStatusRecord, 0)
|
||||
for _, rec := range records {
|
||||
key := strings.TrimSpace(rec.ComponentKey)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) {
|
||||
matched = append(matched, rec)
|
||||
}
|
||||
}
|
||||
if len(matched) == 0 {
|
||||
return runtimeHealthRow{Title: title, Status: "UNKNOWN", Source: "component-status.json", Issue: "No component status data."}
|
||||
}
|
||||
|
||||
maxSev := -1
|
||||
for _, rec := range matched {
|
||||
if sev := runtimeComponentSeverity(rec.Status); sev > maxSev {
|
||||
maxSev = sev
|
||||
}
|
||||
}
|
||||
status := "UNKNOWN"
|
||||
switch maxSev {
|
||||
case 3:
|
||||
status = "CRITICAL"
|
||||
case 2:
|
||||
status = "WARNING"
|
||||
case 1:
|
||||
status = "OK"
|
||||
}
|
||||
|
||||
sources := make([]string, 0)
|
||||
sourceSeen := map[string]struct{}{}
|
||||
issues := make([]string, 0)
|
||||
issueSeen := map[string]struct{}{}
|
||||
for _, rec := range matched {
|
||||
if runtimeComponentSeverity(rec.Status) != maxSev {
|
||||
continue
|
||||
}
|
||||
source := latestComponentSource(rec)
|
||||
if source == "" {
|
||||
source = "component-status.json"
|
||||
}
|
||||
if _, ok := sourceSeen[source]; !ok {
|
||||
sourceSeen[source] = struct{}{}
|
||||
sources = append(sources, source)
|
||||
}
|
||||
issue := strings.TrimSpace(rec.ErrorSummary)
|
||||
if issue == "" {
|
||||
issue = latestComponentDetail(rec)
|
||||
}
|
||||
if issue == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := issueSeen[issue]; ok {
|
||||
continue
|
||||
}
|
||||
issueSeen[issue] = struct{}{}
|
||||
issues = append(issues, issue)
|
||||
}
|
||||
if len(sources) == 0 {
|
||||
sources = append(sources, "component-status.json")
|
||||
}
|
||||
issue := strings.Join(issues, "; ")
|
||||
if issue == "" {
|
||||
issue = "—"
|
||||
}
|
||||
return runtimeHealthRow{
|
||||
Title: title,
|
||||
Status: status,
|
||||
Source: strings.Join(sources, ", "),
|
||||
Issue: issue,
|
||||
}
|
||||
}
|
||||
|
||||
// containsExactKey reports whether key is equal to any entry of exact.
// A nil or empty exact list never matches.
func containsExactKey(key string, exact []string) bool {
	for i := 0; i < len(exact); i++ {
		if exact[i] == key {
			return true
		}
	}
	return false
}
|
||||
|
||||
// hasAnyPrefix reports whether key starts with at least one of prefixes.
// A nil or empty prefix list never matches.
func hasAnyPrefix(key string, prefixes []string) bool {
	found := false
	for i := 0; i < len(prefixes) && !found; i++ {
		found = strings.HasPrefix(key, prefixes[i])
	}
	return found
}
|
||||
|
||||
// runtimeComponentSeverity ranks a component status string so statuses can be
// compared numerically: "critical" is 3, "warning" is 2, "ok" is 1 and
// anything else is 0. Matching ignores case and surrounding whitespace.
func runtimeComponentSeverity(status string) int {
	normalized := strings.TrimSpace(strings.ToLower(status))

	// Index is the severity; slot 0 is deliberately unused so unrecognized
	// input falls through to 0.
	ranked := [...]string{1: "ok", 2: "warning", 3: "critical"}
	for sev := len(ranked) - 1; sev > 0; sev-- {
		if ranked[sev] == normalized {
			return sev
		}
	}
	return 0
}
|
||||
|
||||
func latestComponentSource(rec app.ComponentStatusRecord) string {
|
||||
if len(rec.History) == 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(rec.History[len(rec.History)-1].Source)
|
||||
}
|
||||
|
||||
func latestComponentDetail(rec app.ComponentStatusRecord) string {
|
||||
if len(rec.History) == 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(rec.History[len(rec.History)-1].Detail)
|
||||
}
|
||||
|
||||
func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) string {
|
||||
if len(issues) == 0 || len(codes) == 0 {
|
||||
return ""
|
||||
}
|
||||
allowed := make(map[string]struct{}, len(codes))
|
||||
for _, code := range codes {
|
||||
allowed[code] = struct{}{}
|
||||
}
|
||||
messages := make([]string, 0)
|
||||
for _, issue := range issues {
|
||||
if _, ok := allowed[issue.Code]; !ok {
|
||||
continue
|
||||
}
|
||||
desc := strings.TrimSpace(issue.Description)
|
||||
if desc == "" {
|
||||
desc = issue.Code
|
||||
}
|
||||
messages = append(messages, desc)
|
||||
}
|
||||
return strings.Join(messages, "; ")
|
||||
}
|
||||
|
||||
// runtimeStatusBadge renders status as an HTML badge <span>.
//
// The status is trimmed and upper-cased before display. The CSS class is
// chosen from the normalized value:
//
//	OK                       -> badge-ok
//	PARTIAL, WARNING, WARN   -> badge-warn
//	FAIL, FAILED, CRITICAL   -> badge-err
//	anything else            -> badge-unknown
//
// A blank status is shown as "UNKNOWN" rather than as a badge with no text.
// The label is HTML-escaped.
func runtimeStatusBadge(status string) string {
	status = strings.ToUpper(strings.TrimSpace(status))
	if status == "" {
		status = "UNKNOWN"
	}

	badge := "badge-unknown"
	switch status {
	case "OK":
		badge = "badge-ok"
	case "PARTIAL", "WARNING", "WARN":
		badge = "badge-warn"
	case "FAIL", "FAILED", "CRITICAL":
		badge = "badge-err"
	}
	return `<span class="badge ` + badge + `">` + html.EscapeString(status) + `</span>`
}
|
||||
|
||||
// rowIssueHTML renders the Issue cell of a table row: the trimmed,
// HTML-escaped issue text, or a muted em dash when there is nothing to show.
func rowIssueHTML(issue string) string {
	if text := strings.TrimSpace(issue); text != "" {
		return html.EscapeString(text)
	}
	return `<span style="color:var(--muted)">—</span>`
}
|
||||
|
||||
// ── Metrics ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderMetrics() string {
|
||||
@@ -675,50 +976,137 @@ setInterval(loadMetricsLayout, 5000);
|
||||
|
||||
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
|
||||
|
||||
func renderValidate() string {
|
||||
// validateInventory carries one piece of inventory text per hardware category
// shown on the Validate page. NOTE(review): the text appears to be a summary
// derived from the audit snapshot — confirm against loadValidateInventory.
type validateInventory struct {
	CPU, Memory, Storage, NVIDIA, AMD string
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Run All Tests</div>
|
||||
<div class="card-body" style="display:flex;align-items:center;gap:12px;flex-wrap:wrap">
|
||||
<div class="form-row" style="margin:0"><label style="margin-right:6px">Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:70px;display:inline-block"></div>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">▶ Run All</button>
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
</div>
|
||||
<div class="validate-profile-col"></div>
|
||||
</div>
|
||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button id="sat-btn-nvidia-targeted-stress" class="btn" type="button" onclick="runSAT('nvidia-targeted-stress')">Targeted Stress (dcgmi diag targeted_stress)</button></div><p style="color:var(--muted);font-size:12px;margin:0">Official DCGM `+"targeted_stress"+` stays in Validate as a controlled diagnostic load, not a max-burn recipe.</p>`) +
|
||||
renderSATCard("memory", "Memory", "") +
|
||||
renderSATCard("storage", "Storage", "") +
|
||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
|
||||
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
|
||||
</div>
|
||||
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a short RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
`No extra settings.`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`No extra settings.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runSAT('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Diag level is taken from Validate Profile.`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runSAT('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`Uses the fixed DCGM targeted stress recipe.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satDiagLevel() {
|
||||
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
|
||||
}
|
||||
function satCPUDurationFromDiagLevel() {
|
||||
const level = satDiagLevel();
|
||||
if (level === 1) return 60;
|
||||
if (level === 2) return 5 * 60;
|
||||
return 60 * 60;
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
function satRequestBody(target) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target) {
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target))})
|
||||
.then(r => r.json());
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r => r.json())
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
@@ -726,9 +1114,35 @@ function runSAT(target) {
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing AMD validate set...\n';
|
||||
const labels = satLabels();
|
||||
const enqueueNext = (idx) => {
|
||||
if (idx >= targets.length) return;
|
||||
const target = targets[idx];
|
||||
enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n';
|
||||
if (idx === targets.length - 1) {
|
||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
}
|
||||
enqueueNext(idx + 1);
|
||||
});
|
||||
};
|
||||
enqueueNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
|
||||
const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const total = targets.length * cycles;
|
||||
let enqueued = 0;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
@@ -739,14 +1153,8 @@ function runAllSAT() {
|
||||
const target = targets[idx];
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r=>r.json()).then(()=>{
|
||||
enqueueSATTarget(target)
|
||||
.then(()=>{
|
||||
enqueued++;
|
||||
status.textContent = 'Enqueued '+enqueued+'/'+total+'...';
|
||||
enqueueNext(cycle, idx+1);
|
||||
@@ -760,9 +1168,17 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
if (!cb) return;
|
||||
cb.disabled = true;
|
||||
cb.checked = false;
|
||||
cb.title = reason;
|
||||
});
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
@@ -775,8 +1191,9 @@ function disableSATCard(id, reason) {
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
|
||||
btn.parentNode.insertBefore(note, btn.nextSibling);
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
@@ -784,9 +1201,159 @@ function disableSATCard(id, reason) {
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderSATCard(id, label, extra string) string {
|
||||
return fmt.Sprintf(`<div class="card"><div class="card-head">%s</div><div class="card-body">%s<button id="sat-btn-%s" class="btn btn-primary" onclick="runSAT('%s')">▶ Run Test</button></div></div>`,
|
||||
label, extra, id, id)
|
||||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
unknown := "Audit snapshot not loaded."
|
||||
out := validateInventory{
|
||||
CPU: unknown,
|
||||
Memory: unknown,
|
||||
Storage: unknown,
|
||||
NVIDIA: unknown,
|
||||
AMD: unknown,
|
||||
}
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &snap); err != nil {
|
||||
return out
|
||||
}
|
||||
|
||||
cpuCounts := map[string]int{}
|
||||
cpuTotal := 0
|
||||
for _, cpu := range snap.Hardware.CPUs {
|
||||
if cpu.Present != nil && !*cpu.Present {
|
||||
continue
|
||||
}
|
||||
cpuTotal++
|
||||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
memCounts := map[string]int{}
|
||||
memTotal := 0
|
||||
for _, dimm := range snap.Hardware.Memory {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
memTotal++
|
||||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
storageCounts := map[string]int{}
|
||||
storageTotal := 0
|
||||
for _, dev := range snap.Hardware.Storage {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
storageTotal++
|
||||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
nvidiaCounts := map[string]int{}
|
||||
nvidiaTotal := 0
|
||||
amdCounts := map[string]int{}
|
||||
amdTotal := 0
|
||||
for _, dev := range snap.Hardware.PCIeDevices {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
if validateIsVendorGPU(dev, "nvidia") {
|
||||
nvidiaTotal++
|
||||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
if validateIsVendorGPU(dev, "amd") {
|
||||
amdTotal++
|
||||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
}
|
||||
|
||||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
return out
|
||||
}
|
||||
|
||||
// renderValidateCardBody returns the four stacked sections of a validate card
// body, in order: devices, description, commands, settings. The first and last
// sections use the muted text colour. All arguments are inserted verbatim, so
// callers must supply HTML-safe markup.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		mutedStyle = "font-size:13px;color:var(--muted)"
		plainStyle = "font-size:13px"
	)
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(mutedStyle, devices) +
		section(plainStyle, description) +
		section(plainStyle, commands) +
		section(mutedStyle, settings)
}
|
||||
|
||||
// formatValidateDeviceSummary renders a one-line inventory summary such as
// "3 modules: 2 x A, 1 x B".
//
// total is the device count, models maps a model name to its count, and unit
// is the singular noun, pluralised by appending "s" unless total is 1. Model
// names are listed in sorted order and HTML-escaped. A zero total yields
// "0 <unit>s detected.".
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	plural := ""
	if total != 1 {
		plural = "s"
	}

	var b strings.Builder
	fmt.Fprintf(&b, "%d %s%s: ", total, unit, plural)
	for i, name := range names {
		if i > 0 {
			b.WriteString(", ")
		}
		fmt.Fprintf(&b, "%d x %s", models[name], html.EscapeString(name))
	}
	return b.String()
}
|
||||
|
||||
// addValidateModel increments the counter for name in counts. The name is
// trimmed first; a blank name is tallied under "unknown". counts must be a
// non-nil map.
func addValidateModel(counts map[string]int, name string) {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		counts[trimmed]++
		return
	}
	counts["unknown"]++
}
|
||||
|
||||
// validateTrimPtr dereferences value and strips surrounding whitespace.
// A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||||
|
||||
// validateFirstNonEmpty returns the first argument that is non-blank after
// trimming whitespace, in its trimmed form. It returns "" when every argument
// is blank or when called with no arguments.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
return false
|
||||
}
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderSATCard renders one validate card: a header holding the label and the
// action buttons, followed by the card body.
//
// The header always contains a "Run" button with id "sat-btn-<id>" whose
// onclick handler is runAction. headerActions is appended after it unless it
// is blank. All arguments are inserted verbatim, so callers must pass trusted,
// HTML-safe strings.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if extra := headerActions; strings.TrimSpace(extra) != "" {
		buttons += extra
	}
	head := `<div class="card-head card-head-actions"><span>` + label + `</span><div class="card-head-buttons">` + buttons + `</div></div>`
	content := `<div class="card-body validate-card-body">` + body + `</div>`
	return `<div class="card">` + head + content + `</div>`
}
|
||||
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -543,7 +543,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
@@ -650,7 +650,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
@@ -659,9 +659,10 @@ func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`Targeted Stress`,
|
||||
`NVIDIA GPU Targeted Stress`,
|
||||
`nvidia-targeted-stress`,
|
||||
`Official DCGM`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
@@ -845,3 +846,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
health := `{
|
||||
"status":"PARTIAL",
|
||||
"checked_at":"2026-03-16T10:00:00Z",
|
||||
"export_dir":"/tmp/export",
|
||||
"driver_ready":true,
|
||||
"cuda_ready":false,
|
||||
"network_status":"PARTIAL",
|
||||
"issues":[
|
||||
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
|
||||
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
|
||||
],
|
||||
"tools":[
|
||||
{"name":"dmidecode","ok":true},
|
||||
{"name":"nvidia-smi","ok":false}
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
componentStatus := `[
|
||||
{
|
||||
"component_key":"cpu:all",
|
||||
"status":"Warning",
|
||||
"error_summary":"cpu SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
|
||||
},
|
||||
{
|
||||
"component_key":"memory:all",
|
||||
"status":"OK",
|
||||
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
|
||||
},
|
||||
{
|
||||
"component_key":"storage:nvme0n1",
|
||||
"status":"Critical",
|
||||
"error_summary":"storage SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
|
||||
},
|
||||
{
|
||||
"component_key":"pcie:gpu:nvidia",
|
||||
"status":"Warning",
|
||||
"error_summary":"nvidia SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
|
||||
}
|
||||
]`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`Runtime Health`,
|
||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||
`Export Directory`,
|
||||
`Network`,
|
||||
`NVIDIA/AMD Driver`,
|
||||
`CUDA / ROCm`,
|
||||
`Required Utilities`,
|
||||
`Bee Services`,
|
||||
`<td>CPU</td>`,
|
||||
`<td>Memory</td>`,
|
||||
`<td>Storage</td>`,
|
||||
`<td>GPU</td>`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`cpu SAT: FAILED`,
|
||||
`storage SAT: FAILED`,
|
||||
`sat:nvidia`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user