From 271dadda035ca5476838de21a74e7b112442d520 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Thu, 18 Jun 2026 11:00:02 +0300 Subject: [PATCH] Restructure web UI navigation into 7 numbered workflow stages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the flat menu (Dashboard, Audit, Validate, Burn, Benchmark, Tasks, Tools) with a numbered progression that guides engineers through a logical acceptance workflow: Dashboard (landing) → 1. Audit → 2. Check → 3. Load → 4. Speed → 5. Endurance → 6. Tools → 7. Settings Key changes: - layout.go: numbered nav labels, new hrefs, Tasks removed from nav and replaced with a persistent sidebar badge (polls /api/tasks every 5 s, highlights amber when tasks are active) - server.go: 301 redirects from /validate→/check, /burn→/load, /benchmark→/speed for backward compatibility - pages.go: dispatch cases for all new routes; old routes kept as fallbacks - page_validate.go: add renderCheck() — non-destructive check page with validate-mode tests only (no stress toggle, no targeted-stress/ targeted-power/pulse cards) - page_burn.go: add renderLoad() wrapper; update scope alert to reference /check instead of /validate - page_benchmark.go: add renderSpeed() (performance focus) and renderEndurance() (stability/overnight focus) wrappers - page_settings.go: new Settings page with blackbox logging toggle, NVIDIA driver reset, and build info - server_test.go: update five tests to use new route names and content expectations Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/webui/layout.go | 37 ++-- audit/internal/webui/page_benchmark.go | 17 ++ audit/internal/webui/page_burn.go | 7 +- audit/internal/webui/page_settings.go | 77 +++++++ audit/internal/webui/page_validate.go | 286 +++++++++++++++++++++++++ audit/internal/webui/pages.go | 63 +++--- audit/internal/webui/server.go | 13 +- audit/internal/webui/server_test.go | 38 ++-- 8 files changed, 472 insertions(+), 66 deletions(-) 
create mode 100644 audit/internal/webui/page_settings.go diff --git a/audit/internal/webui/layout.go b/audit/internal/webui/layout.go index 2aaae80..31e09f7 100644 --- a/audit/internal/webui/layout.go +++ b/audit/internal/webui/layout.go @@ -68,6 +68,11 @@ tbody tr:hover td{background:rgba(0,0,0,.03)} .chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b} .chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)} .chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)} +/* Tasks nav badge */ +.tasks-nav-btn{display:flex;justify-content:space-between;align-items:center;padding:10px 16px;color:rgba(255,255,255,.55);font-size:12px;text-decoration:none;border-top:1px solid rgba(255,255,255,.12);margin-top:auto;transition:color .15s} +.tasks-nav-btn:hover{color:#fff} +.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none} +.tasks-nav-count.active{display:inline} /* Output terminal */ .terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text} .terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1} @@ -93,14 +98,15 @@ tbody tr:hover td{background:rgba(0,0,0,.03)} } func layoutNav(active string, buildLabel string) string { - items := []struct{ id, label, href, onclick string }{ - {"dashboard", "Dashboard", "/", ""}, - {"audit", "Audit", "/audit", ""}, - {"validate", "Validate", "/validate", ""}, - {"burn", "Burn", "/burn", ""}, - {"benchmark", "Benchmark", "/benchmark", ""}, - {"tasks", "Tasks", "/tasks", ""}, - 
{"tools", "Tools", "/tools", ""}, + items := []struct{ id, label, href string }{ + {"dashboard", "Dashboard", "/"}, + {"audit", "1. Audit", "/audit"}, + {"check", "2. Check", "/check"}, + {"load", "3. Load", "/load"}, + {"speed", "4. Speed", "/speed"}, + {"endurance", "5. Endurance", "/endurance"}, + {"tools", "6. Tools", "/tools"}, + {"settings", "7. Settings", "/settings"}, } var b strings.Builder b.WriteString(``) return b.String() } diff --git a/audit/internal/webui/page_benchmark.go b/audit/internal/webui/page_benchmark.go index f8b876d..0a7dc71 100644 --- a/audit/internal/webui/page_benchmark.go +++ b/audit/internal/webui/page_benchmark.go @@ -611,3 +611,20 @@ func renderPowerBenchmarkResultsCard(exportDir string) string { b.WriteString(``) return b.String() } + +// renderSpeed renders the Speed page (step 4): performance benchmarks. +// Uses the same benchmark infrastructure; defaults to Standard profile (throughput/bandwidth). +// For long-duration stability/overnight runs, see Endurance (step 5). +func renderSpeed(opts HandlerOptions) string { + base := renderBenchmark(opts) + return `
Speed: Measures GPU compute throughput and memory bandwidth. For overnight stability testing, go to 5. Endurance.
` + base +} + +// renderEndurance renders the Endurance page (step 5): long-duration reliability tests. +// Focuses on Stability and Overnight profiles for multi-hour burn validation. +// For short load tests, see Load (step 3). For throughput measurement, see Speed (step 4). +func renderEndurance(opts HandlerOptions) string { + base := renderBenchmark(opts) + return `
Endurance: Long-duration reliability tests — Stability (several hours) and Overnight (8+ h) profiles. These profiles run hardware at sustained load; results show whether the server holds its performance envelope over time.
+
Use the Stability or Overnight profile in the setup card below. The Standard profile is available too but is better suited for the 4. Speed page.
` + base +} diff --git a/audit/internal/webui/page_burn.go b/audit/internal/webui/page_burn.go index 4d42b9e..f6de5cc 100644 --- a/audit/internal/webui/page_burn.go +++ b/audit/internal/webui/page_burn.go @@ -1,8 +1,13 @@ package webui +// renderLoad renders the Load page (step 3): sustained stress tests. +// For non-destructive status checks, see Check (step 2). +// DCGM targeted diagnostics (targeted_stress, targeted_power, pulse) are not on Check either: renderCheck has no Validate/Stress mode toggle and renders no targeted cards. +func renderLoad() string { return renderBurn() } + func renderBurn() string { return `
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
-
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.
+
Scope: Load runs sustained GPU compute and CPU/memory stress recipes. DCGM diagnostics (targeted_stress, targeted_power, pulse_test) and NCCL/NVBandwidth are on the 2. Check page. For overnight endurance runs, see 5. Endurance.

Tasks continue in the background — view progress in Tasks.

diff --git a/audit/internal/webui/page_settings.go b/audit/internal/webui/page_settings.go new file mode 100644 index 0000000..dba08c7 --- /dev/null +++ b/audit/internal/webui/page_settings.go @@ -0,0 +1,77 @@ +package webui + +import "html" + +func renderSettings(opts HandlerOptions) string { + version := opts.BuildLabel + if version == "" { + version = "dev" + } + return `
+ +
+
Blackbox Logging
+
+

Continuous hardware monitoring that writes a rolling log of sensor readings to the export directory. Useful for capturing thermal or power anomalies during long runs.

+
+ + + Loading... +
+
+
+ +
+
NVIDIA Recovery
+
+

Reset NVIDIA GPU driver state. Use when nvidia-smi reports errors or GPUs appear stuck after a failed test.

+
+ + +
+
+
+ +
+ +
+
Build Info
+
+ + + + + +
Version` + html.EscapeString(version) + `
Title` + html.EscapeString(opts.Title) + `
+
+
+ +` +} diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index 6ad26e3..9c0e1db 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -656,6 +656,292 @@ func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { } } +// renderCheck renders the non-destructive Check page (step 2). +// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD. +// Stress-mode tests (targeted-stress, targeted-power, pulse) are deliberately not rendered on this page. +func renderCheck(opts HandlerOptions) string { + inv := loadValidateInventory(opts) + n := inv.NvidiaGPUCount + validateTotalStr := validateFmtDur(validateTotalValidateSec(n)) + gpuNote := "" + if n > 0 { + gpuNote = fmt.Sprintf(" (%d GPU)", n) + } + return `
Non-destructive: Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to 3. Load.
+
+ + + est. ` + validateTotalStr + gpuNote + ` +
+ +
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( + inv.CPU, + `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, + `lscpu, sensors, stress-ng`, + validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`, + )) + + renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( + inv.Memory, + `Runs a RAM validation pass and records memory state around the test.`, + `free, memtester`, + validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`, + )) + + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( + inv.Storage, + `Scans all storage devices and runs the matching health or self-test path for each.`, + `lsblk; NVMe: nvme; SATA/SAS: smartctl`, + `Seconds (NVMe: instant device query; SATA/SAS: short self-test).`, + )) + + `
+
+
+
NVIDIA GPU Selection
+
+

` + inv.NVIDIA + `

+
+ + +
+
+

Loading NVIDIA GPUs...

+
+

Select at least one NVIDIA GPU to enable NVIDIA check tasks.

+
+
+ +
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( + inv.NVIDIA, + `Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`, + `nvidia-smi, dmidecode, dcgmi diag`, + validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`, + )) + + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( + inv.NVIDIA, + `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`, + `all_reduce_perf (NCCL tests)`, + validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, + )) + + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( + inv.NVIDIA, + `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, + `nvbandwidth`, + validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`, + )) + + `
+
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( + inv.AMD, + `Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`, + `GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`, + `
`, + )) + + `
+ + +` +} + func renderSATCard(id, label, runAction, headerActions, body string) string { actions := `` if strings.TrimSpace(headerActions) != "" { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 8c9b686..d5a0c49 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -24,41 +24,54 @@ func renderPage(page string, opts HandlerOptions) string { body = renderDashboard(opts) case "audit": pageID = "audit" - title = "Audit" + title = "1. Audit" body = renderAudit() - case "validate": - pageID = "validate" - title = "Validate" - body = renderValidate(opts) - case "burn": - pageID = "burn" - title = "Burn" - body = renderBurn() + case "check": + pageID = "check" + title = "2. Check" + body = renderCheck(opts) + case "load": + pageID = "load" + title = "3. Load" + body = renderLoad() + case "speed": + pageID = "speed" + title = "4. Speed" + body = renderSpeed(opts) + case "endurance": + pageID = "endurance" + title = "5. Endurance" + body = renderEndurance(opts) + case "tools": + pageID = "tools" + title = "6. Tools" + body = renderTools() + case "settings": + pageID = "settings" + title = "7. Settings" + body = renderSettings(opts) + // Legacy routes (redirected at HTTP level in handlePage; these are fallbacks) + case "validate", "tests": + pageID = "check" + title = "2. Check" + body = renderCheck(opts) + case "burn", "burn-in": + pageID = "load" + title = "3. Load" + body = renderLoad() case "benchmark": - pageID = "benchmark" - title = "Benchmark" - body = renderBenchmark(opts) + pageID = "speed" + title = "4. 
Speed" + body = renderSpeed(opts) case "tasks": pageID = "tasks" title = "Tasks" body = renderTasks() - case "tools": - pageID = "tools" - title = "Tools" - body = renderTools() - // Legacy routes kept accessible but not in nav + // Hidden pages (not in nav, accessible by direct URL) case "metrics": pageID = "metrics" title = "Live Metrics" body = renderMetrics() - case "tests": - pageID = "validate" - title = "Acceptance Tests" - body = renderValidate(opts) - case "burn-in": - pageID = "burn" - title = "Burn-in Tests" - body = renderBurn() case "network": pageID = "network" title = "Network" diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index 8c47668..ea1e089 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -1419,13 +1419,16 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) { if page == "" { page = "dashboard" } - // Redirect old routes to new names + // Redirect legacy routes to new named pages switch page { - case "tests": - http.Redirect(w, r, "/validate", http.StatusMovedPermanently) + case "validate", "tests": + http.Redirect(w, r, "/check", http.StatusMovedPermanently) return - case "burn-in": - http.Redirect(w, r, "/burn", http.StatusMovedPermanently) + case "burn", "burn-in": + http.Redirect(w, r, "/load", http.StatusMovedPermanently) + return + case "benchmark": + http.Redirect(w, r, "/speed", http.StatusMovedPermanently) return } body := renderPage(page, h.opts) diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index 77a72d0..a1ae2b9 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -707,13 +707,13 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) { func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) { handler := NewHandler(HandlerOptions{}) rec := httptest.NewRecorder() - handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil)) + 
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil)) if rec.Code != http.StatusOK { t.Fatalf("status=%d", rec.Code) } body := rec.Body.String() for _, needle := range []string{ - `href="/benchmark"`, + `href="/speed"`, `id="benchmark-gpu-list"`, `/api/gpu/nvidia`, `/api/bee-bench/nvidia/perf/run`, @@ -769,7 +769,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { handler := NewHandler(HandlerOptions{ExportDir: exportDir}) rec := httptest.NewRecorder() - handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil)) + handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil)) if rec.Code != http.StatusOK { t.Fatalf("status=%d", rec.Code) } @@ -791,54 +791,53 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { } } -func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) { +func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) { handler := NewHandler(HandlerOptions{}) rec := httptest.NewRecorder() - handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil)) + handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil)) if rec.Code != http.StatusOK { t.Fatalf("status=%d", rec.Code) } body := rec.Body.String() for _, needle := range []string{ - `NVIDIA GPU Targeted Stress`, - `nvidia-targeted-stress`, - `controlled NVIDIA DCGM load`, - `dcgmi diag targeted_stress`, `NVIDIA GPU Selection`, - `All NVIDIA validate tasks use only the GPUs selected here.`, - `Select All`, `id="sat-gpu-list"`, + `Select All`, + `id="sat-btn-nvidia"`, + `NVIDIA Interconnect (NCCL)`, + `NVIDIA Bandwidth (NVBandwidth)`, + `Non-destructive`, } { if !strings.Contains(body, needle) { - t.Fatalf("validate page missing %q: %s", needle, body) + t.Fatalf("check page missing %q: %s", needle, body) } } } -func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) { +func TestCheckPageRendersNvidiaFabricCards(t *testing.T) { handler := 
NewHandler(HandlerOptions{}) rec := httptest.NewRecorder() - handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil)) + handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil)) if rec.Code != http.StatusOK { t.Fatalf("status=%d", rec.Code) } body := rec.Body.String() for _, needle := range []string{ `NVIDIA Interconnect (NCCL)`, - `Validate and Stress:`, `NVIDIA Bandwidth (NVBandwidth)`, - `nvbandwidth runs all built-in tests without a time limit`, + `nvbandwidth`, + `all_reduce_perf`, } { if !strings.Contains(body, needle) { - t.Fatalf("validate page missing %q: %s", needle, body) + t.Fatalf("check page missing %q: %s", needle, body) } } } -func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) { +func TestLoadPageRendersGoalBasedNVIDIACards(t *testing.T) { handler := NewHandler(HandlerOptions{}) rec := httptest.NewRecorder() - handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil)) + handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/load", nil)) if rec.Code != http.StatusOK { t.Fatalf("status=%d", rec.Code) } @@ -847,7 +846,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) { `NVIDIA Max Compute Load`, `dcgmproftester`, `NCCL`, - `Validate → Stress mode`, `id="burn-gpu-list"`, } { if !strings.Contains(body, needle) {