From 65bcc9ce81e2bb54eb9ca8cdf9b82817d34b831c Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Mon, 20 Apr 2026 06:56:52 +0300 Subject: [PATCH] refactor(webui): split pages into task modules --- audit/internal/webui/layout.go | 137 + audit/internal/webui/page_benchmark.go | 612 ++++ audit/internal/webui/page_burn.go | 383 +++ audit/internal/webui/page_export_tools.go | 434 +++ audit/internal/webui/page_install_tasks.go | 314 ++ audit/internal/webui/page_metrics.go | 238 ++ audit/internal/webui/page_network_services.go | 213 ++ audit/internal/webui/page_validate.go | 716 ++++ audit/internal/webui/pages.go | 2993 ----------------- 9 files changed, 3047 insertions(+), 2993 deletions(-) create mode 100644 audit/internal/webui/layout.go create mode 100644 audit/internal/webui/page_benchmark.go create mode 100644 audit/internal/webui/page_burn.go create mode 100644 audit/internal/webui/page_export_tools.go create mode 100644 audit/internal/webui/page_install_tasks.go create mode 100644 audit/internal/webui/page_metrics.go create mode 100644 audit/internal/webui/page_network_services.go create mode 100644 audit/internal/webui/page_validate.go diff --git a/audit/internal/webui/layout.go b/audit/internal/webui/layout.go new file mode 100644 index 0000000..73acea3 --- /dev/null +++ b/audit/internal/webui/layout.go @@ -0,0 +1,137 @@ +package webui + +import ( + "fmt" + "html" + "os" + "strings" +) + +func layoutHead(title string) string { + return ` + + + + +` + html.EscapeString(title) + ` + + + +` +} + +func layoutNav(active string, buildLabel string) string { + items := []struct{ id, label, href, onclick string }{ + {"dashboard", "Dashboard", "/", ""}, + {"audit", "Audit", "/audit", ""}, + {"validate", "Validate", "/validate", ""}, + {"burn", "Burn", "/burn", ""}, + {"benchmark", "Benchmark", "/benchmark", ""}, + {"tasks", "Tasks", "/tasks", ""}, + {"tools", "Tools", "/tools", ""}, + } + var b strings.Builder + b.WriteString(``) + return b.String() +} diff --git a/audit/internal/webui/page_benchmark.go b/audit/internal/webui/page_benchmark.go new file mode 100644 index 0000000..f118805 --- /dev/null +++ b/audit/internal/webui/page_benchmark.go @@ -0,0 +1,612 @@ +package webui + +import ( + "encoding/json" + "fmt" + "html" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "bee/audit/internal/app" + "bee/audit/internal/platform" +) + +type benchmarkHistoryRun struct { + generatedAt time.Time + displayTime string + gpuScores map[int]float64 + gpuStatuses map[int]string + overallStatus string +} + +func renderBenchmark(opts HandlerOptions) string { + return `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

+ +
+
+
Benchmark Setup
+
+
+ + +
+
+ +
+ + +
+
+

Loading NVIDIA GPUs...

+
+
+ + + +

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

+
+ + + +
+ + +
Autotune status: loading…
+
+
+ +
+
Method Split
+
+

The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.

+ + + + +
Run TypeEngineQuestionStandardStability
Performance Benchmarkbee-gpu-burnHow much isolated compute performance does the GPU realize in this server?` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `
Power / Thermal Fitdcgmproftester + nvidia-smi -plHow much power per GPU can this server sustain as GPU count ramps up?` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `
+

Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.

+
+
+
+ +` + `
` + renderBenchmarkResultsCard(opts.ExportDir) + `
` + ` + + + + + +` +} + +func renderBenchmarkResultsCard(exportDir string) string { + maxIdx, runs := loadBenchmarkHistory(exportDir) + perf := renderBenchmarkResultsCardFromRuns( + "Perf Results", + "Composite score by saved benchmark run and GPU.", + "No saved performance benchmark runs yet.", + maxIdx, + runs, + ) + power := renderPowerBenchmarkResultsCard(exportDir) + return perf + "\n" + power +} + +func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string { + if len(runs) == 0 { + return `
` + html.EscapeString(title) + `

` + html.EscapeString(emptyMessage) + `

` + } + var b strings.Builder + b.WriteString(`
` + html.EscapeString(title) + `
`) + if strings.TrimSpace(description) != "" { + b.WriteString(`

` + html.EscapeString(description) + `

`) + } + b.WriteString(`
`) + b.WriteString(``) + for i := 0; i <= maxGPUIndex; i++ { + b.WriteString(``) + } + b.WriteString(``) + for i, run := range runs { + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + overallColor := "var(--ok)" + overallLabel := run.overallStatus + if overallLabel == "" { + overallLabel = "OK" + } + if overallLabel == "FAILED" { + overallColor = "var(--crit-fg,#9f3a38)" + } else if overallLabel != "OK" { + overallColor = "var(--warn)" + } + b.WriteString(``) + for idx := 0; idx <= maxGPUIndex; idx++ { + score, ok := run.gpuScores[idx] + if !ok { + b.WriteString(``) + continue + } + gpuStatus := run.gpuStatuses[idx] + scoreColor := "" + switch gpuStatus { + case "FAILED": + scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"` + case "WARNING", "PARTIAL": + scoreColor = ` style="color:var(--warn);font-weight:600"` + case "", "OK": + default: + scoreColor = ` style="color:var(--warn);font-weight:600"` + } + b.WriteString(`` + fmt.Sprintf("%.2f", score) + ``) + } + b.WriteString(``) + } + b.WriteString(`
RunTimeStatusGPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + html.EscapeString(overallLabel) + `-
`) + return b.String() +} + +func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { + baseDir := app.DefaultBeeBenchPerfDir + if strings.TrimSpace(exportDir) != "" { + baseDir = filepath.Join(exportDir, "bee-bench", "perf") + } + paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json")) + if err != nil || len(paths) == 0 { + return -1, nil + } + sort.Strings(paths) + return loadBenchmarkHistoryFromPaths(paths) +} + +func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) { + runs := make([]benchmarkHistoryRun, 0, len(paths)) + maxGPUIndex := -1 + for _, path := range paths { + raw, err := os.ReadFile(path) + if err != nil { + continue + } + var result platform.NvidiaBenchmarkResult + if err := json.Unmarshal(raw, &result); err != nil { + continue + } + run := benchmarkHistoryRun{ + generatedAt: result.GeneratedAt, + displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), + gpuScores: make(map[int]float64), + gpuStatuses: make(map[int]string), + overallStatus: result.OverallStatus, + } + for _, gpu := range result.GPUs { + run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore + run.gpuStatuses[gpu.Index] = gpu.Status + if gpu.Index > maxGPUIndex { + maxGPUIndex = gpu.Index + } + } + runs = append(runs, run) + } + sort.Slice(runs, func(i, j int) bool { + return runs[i].generatedAt.After(runs[j].generatedAt) + }) + return maxGPUIndex, runs +} + +func renderPowerBenchmarkResultsCard(exportDir string) string { + baseDir := app.DefaultBeeBenchPowerDir + if strings.TrimSpace(exportDir) != "" { + baseDir = filepath.Join(exportDir, "bee-bench", "power") + } + paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json")) + if err != nil || len(paths) == 0 { + return `
Power / Thermal Fit Results

No saved power benchmark runs yet.

` + } + sort.Strings(paths) + + type powerRun struct { + generatedAt time.Time + displayTime string + result platform.NvidiaPowerBenchResult + } + var runs []powerRun + for _, path := range paths { + raw, err := os.ReadFile(path) + if err != nil { + continue + } + var r platform.NvidiaPowerBenchResult + if err := json.Unmarshal(raw, &r); err != nil { + continue + } + runs = append(runs, powerRun{ + generatedAt: r.GeneratedAt, + displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"), + result: r, + }) + } + sort.Slice(runs, func(i, j int) bool { + return runs[i].generatedAt.After(runs[j].generatedAt) + }) + + var b strings.Builder + b.WriteString(`
Power / Thermal Fit Results
`) + + latest := runs[0].result + b.WriteString(`

Latest run: ` + html.EscapeString(runs[0].displayTime)) + if latest.Hostname != "" { + b.WriteString(` — ` + html.EscapeString(latest.Hostname)) + } + if latest.OverallStatus != "" { + statusColor := "var(--ok)" + if latest.OverallStatus != "OK" { + statusColor = "var(--warn)" + } + b.WriteString(` — ` + html.EscapeString(latest.OverallStatus) + ``) + } + b.WriteString(`

`) + + if len(latest.GPUs) > 0 { + b.WriteString(`
`) + b.WriteString(``) + b.WriteString(``) + for _, gpu := range latest.GPUs { + finalLimitW := gpu.StablePowerLimitW + if finalLimitW <= 0 { + finalLimitW = gpu.AppliedPowerLimitW + } + derated := gpu.Derated || + (gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1) + rowStyle := "" + finalStyle := "" + if derated { + rowStyle = ` style="background:rgba(255,180,0,0.08)"` + finalStyle = ` style="color:#e6a000;font-weight:600"` + } + statusLabel := gpu.Status + if statusLabel == "" { + statusLabel = "OK" + } + statusColor := "var(--ok)" + if statusLabel == "FAILED" { + statusColor = "var(--crit-fg,#9f3a38)" + } else if statusLabel != "OK" { + statusColor = "var(--warn)" + } + nominalStr := "-" + if gpu.DefaultPowerLimitW > 0 { + nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) + } + singleStr := "-" + if gpu.AppliedPowerLimitW > 0 { + singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) + } + multiStr := "-" + if gpu.StablePowerLimitW > 0 { + multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW) + } + p95Str := "-" + if gpu.MaxObservedPowerW > 0 { + p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW) + } + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(`` + multiStr + ``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + } + b.WriteString(`
GPUModelNominal WSingle-card WMulti-GPU WP95 Observed WStatus
` + strconv.Itoa(gpu.Index) + `` + html.EscapeString(gpu.Name) + `` + nominalStr + `` + singleStr + `` + p95Str + `` + html.EscapeString(statusLabel) + `
`) + } + + if len(runs) > 1 { + b.WriteString(`
` + strconv.Itoa(len(runs)) + ` runs total`) + b.WriteString(`
`) + for i, run := range runs { + statusColor := "var(--ok)" + if run.result.OverallStatus != "OK" { + statusColor = "var(--warn)" + } + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + b.WriteString(``) + } + b.WriteString(`
#TimeGPUsStatus
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + strconv.Itoa(len(run.result.GPUs)) + `` + html.EscapeString(run.result.OverallStatus) + `
`) + } + + b.WriteString(`
`) + return b.String() +} diff --git a/audit/internal/webui/page_burn.go b/audit/internal/webui/page_burn.go new file mode 100644 index 0000000..4d42b9e --- /dev/null +++ b/audit/internal/webui/page_burn.go @@ -0,0 +1,383 @@ +package webui + +func renderBurn() string { + return `
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
+
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.
+

Tasks continue in the background — view progress in Tasks.

+ +
+
Burn Profile
+
+
+
+ + + +
+
+ +

Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.

+
+
+ +

Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.

+
+
+
+ +
+
+ +
+
NVIDIA GPU Selection
+
+

Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.

+
+ + +
+
+

Loading NVIDIA GPUs...

+
+

Select at least one NVIDIA GPU to enable NVIDIA burn recipes.

+
+ + + +
+
+
+ +
Core Burn Paths
+
+
+
GPU Max Load
+
+

Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.

+ + + + +
+
+ +
+
Compute Stress
+
+

Select which subsystems to stress. Each checked item runs as a separate task.

+ + + +
+
+
+ + + + + +` +} diff --git a/audit/internal/webui/page_export_tools.go b/audit/internal/webui/page_export_tools.go new file mode 100644 index 0000000..f79ac53 --- /dev/null +++ b/audit/internal/webui/page_export_tools.go @@ -0,0 +1,434 @@ +package webui + +import ( + "fmt" + "html" + "net/url" + "os" + "path/filepath" + "sort" + "strings" +) + +func renderExport(exportDir string) string { + entries, _ := listExportFiles(exportDir) + var rows strings.Builder + for _, e := range entries { + rows.WriteString(fmt.Sprintf(`%s`, + url.QueryEscape(e), html.EscapeString(e))) + } + if len(entries) == 0 { + rows.WriteString(`No export files found.`) + } + return `
+
Support Bundle
+

Creates a tar.gz archive of all audit files, SAT results, and logs.

+` + renderSupportBundleInline() + ` +
+
Export Files
+` + rows.String() + `
File
+
+
+ +` + renderUSBExportCard() +} + +func listExportFiles(exportDir string) ([]string, error) { + var entries []string + err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + rel, err := filepath.Rel(exportDir, path) + if err != nil { + return err + } + entries = append(entries, rel) + return nil + }) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + sort.Strings(entries) + return entries, nil +} + +func renderSupportBundleInline() string { + return ` +
+` +} + +func renderUSBExportCard() string { + return `
+
Export to USB + +
+
` + renderUSBExportInline() + `
+
` +} + +func renderUSBExportInline() string { + return `

Write audit JSON or support bundle directly to a removable USB drive.

+
Scanning for USB devices...
+
+
+` +} + +func renderNvidiaSelfHealInline() string { + return `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

+
+ + +
+
Loading NVIDIA GPU status...
+

Loading...

+ +` +} + +func renderTools() string { + return `
+
System Install
+
+
+
Install to RAM
+

Detecting boot source...

+

Checking...

+ +
+
+
Install to Disk
` + + renderInstallInline() + ` +
+
+
+ + +
Support Bundle
+

Downloads a tar.gz archive of all audit files, SAT results, and logs.

+` + renderSupportBundleInline() + ` +
+
Export to USB
+ ` + renderUSBExportInline() + ` +
+
+ +
Tool Check
+

Checking...

+ +
NVIDIA Self Heal
` + + renderNvidiaSelfHealInline() + `
+ +
Network
` + + renderNetworkInline() + `
+ +
Services
` + + renderServicesInline() + `
+ + +` +} + +func renderExportIndex(exportDir string) (string, error) { + entries, err := listExportFiles(exportDir) + if err != nil { + return "", err + } + var body strings.Builder + body.WriteString(`Bee Export Files`) + body.WriteString(`

Bee Export Files

`) + return body.String(), nil +} diff --git a/audit/internal/webui/page_install_tasks.go b/audit/internal/webui/page_install_tasks.go new file mode 100644 index 0000000..0f63851 --- /dev/null +++ b/audit/internal/webui/page_install_tasks.go @@ -0,0 +1,314 @@ +package webui + +func renderInstallInline() string { + return ` +
+ Warning: Installing will completely erase the selected + disk and write the live system onto it. All existing data on the target disk will be lost. + This operation cannot be undone. +
+
Loading disk list…
+ + + + + + + +` +} + +func renderInstall() string { + return `
Install Live System to Disk
` + + renderInstallInline() + + `
` +} + +func renderTasks() string { + return `
+ + + +Open a task to view its saved logs and charts. +
+
+

Loading...

+
+` +} diff --git a/audit/internal/webui/page_metrics.go b/audit/internal/webui/page_metrics.go new file mode 100644 index 0000000..66aa0e1 --- /dev/null +++ b/audit/internal/webui/page_metrics.go @@ -0,0 +1,238 @@ +package webui + +func renderMetrics() string { + return `

Live metrics — updated every 2 seconds.

+ +
+
Server — Load
+
+ CPU/Mem load +
+
+ +
+
Temperature — CPU
+
+ CPU temperature +
+
+ + +
+
Temperature — Ambient Sensors
+
+ Ambient temperature sensors +
+
+ +
+
Server — Power
+
+ System power +
+
+ + + + + +` +} diff --git a/audit/internal/webui/page_network_services.go b/audit/internal/webui/page_network_services.go new file mode 100644 index 0000000..92db367 --- /dev/null +++ b/audit/internal/webui/page_network_services.go @@ -0,0 +1,213 @@ +package webui + +import "html" + +// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools). +func renderNetworkInline() string { + return ` +

Loading...

+
+
DHCP
+
+ +
+
+
Static IPv4
+
+
+
+
+
+ +
+
+
+` +} + +func renderNetwork() string { + return `
Network Interfaces
` + + renderNetworkInline() + + `
` +} + +func renderServicesInline() string { + return `

` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `

+
+

Loading...

+ +` +} + +func renderServices() string { + return `
Bee Services
` + + renderServicesInline() + + `
` +} diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go new file mode 100644 index 0000000..a8fc138 --- /dev/null +++ b/audit/internal/webui/page_validate.go @@ -0,0 +1,716 @@ +package webui + +import ( + "encoding/json" + "fmt" + "html" + "sort" + "strings" + + "bee/audit/internal/platform" + "bee/audit/internal/schema" +) + +type validateInventory struct { + CPU string + Memory string + Storage string + NVIDIA string + AMD string + NvidiaGPUCount int + AMDGPUCount int +} + +func validateFmtDur(secs int) string { + if secs < 120 { + return fmt.Sprintf("~%d s", secs) + } + mins := (secs + 29) / 60 + return fmt.Sprintf("~%d min", mins) +} + +func validateTotalValidateSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUValidateSec + + platform.SATEstimatedMemoryValidateSec + + n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total +} + +func validateTotalStressSec(n int) int { + if n < 0 { + n = 0 + } + total := platform.SATEstimatedCPUStressSec + + platform.SATEstimatedMemoryStressSec + + n*platform.SATEstimatedNvidiaGPUStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec + + n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + + platform.SATEstimatedNvidiaPulseTestSec + + platform.SATEstimatedNvidiaInterconnectSec + + platform.SATEstimatedNvidiaBandwidthSec + return total +} + +func renderValidate(opts HandlerOptions) string { + inv := loadValidateInventory(opts) + n := inv.NvidiaGPUCount + validateTotalStr := validateFmtDur(validateTotalValidateSec(n)) + stressTotalStr := validateFmtDur(validateTotalStressSec(n)) + gpuNote := "" + if n > 0 { + gpuNote = fmt.Sprintf(" (%d GPU)", n) + } + return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
+

Tasks continue in the background — view progress in Tasks.

+ +
+
Validate Profile
+
+
+
+ + +
+
+

Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.

+ +
+ +
+
+
+
+ +
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( + inv.CPU, + `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, + `lscpu, sensors, stress-ng`, + validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`, + )) + + renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( + inv.Memory, + `Runs a RAM validation pass and records memory state around the test.`, + `free, memtester`, + validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`, + )) + + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( + inv.Storage, + `Scans all storage devices and runs the matching health or self-test path for each device type.`, + `lsblk; NVMe: nvme; SATA/SAS: smartctl`, + `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`, + )) + + `
+
+
+
NVIDIA GPU Selection
+
+

` + inv.NVIDIA + `

+

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.

+
+ + +
+
+

Loading NVIDIA GPUs...

+
+

Select at least one NVIDIA GPU to enable NVIDIA validate tasks.

+
+
+ +
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( + inv.NVIDIA, + `Runs NVIDIA diagnostics and board inventory checks.`, + `nvidia-smi, dmidecode, dcgmi diag`, + func() string { + perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec + perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec + if n > 0 { + return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", + validateFmtDur(perV), n, validateFmtDur(perV*n), + validateFmtDur(perS), n, validateFmtDur(perS*n)) + } + return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", + validateFmtDur(perV), validateFmtDur(perS)) + }(), + )) + + `
` + + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( + inv.NVIDIA, + `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, + `dcgmi diag targeted_stress`, + func() string { + per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` + }(), + )) + + `
` + + `
` + + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( + inv.NVIDIA, + `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, + `dcgmi diag targeted_power`, + func() string { + per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + s := "Skipped in Validate. " + if n > 0 { + s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) + } else { + s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) + } + return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` + }(), + )) + + `
` + + `
` + + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( + inv.NVIDIA, + `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, + `dcgmi diag pulse_test`, + `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + )) + + `
` + + `
` + + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( + inv.NVIDIA, + `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, + `all_reduce_perf (NCCL tests)`, + `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, + )) + + `
` + + `
` + + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( + inv.NVIDIA, + `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, + `nvbandwidth`, + `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, + )) + + `
` + + `
+
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( + inv.AMD, + `Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`, + `GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`, + `
`, + )) + + `
+ + + +` +} + +func loadValidateInventory(opts HandlerOptions) validateInventory { + unknown := "Audit snapshot not loaded." + out := validateInventory{ + CPU: unknown, + Memory: unknown, + Storage: unknown, + NVIDIA: unknown, + AMD: unknown, + } + data, err := loadSnapshot(opts.AuditPath) + if err != nil { + return out + } + var snap schema.HardwareIngestRequest + if err := json.Unmarshal(data, &snap); err != nil { + return out + } + + cpuCounts := map[string]int{} + cpuTotal := 0 + for _, cpu := range snap.Hardware.CPUs { + if cpu.Present != nil && !*cpu.Present { + continue + } + cpuTotal++ + addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown")) + } + + memCounts := map[string]int{} + memTotal := 0 + for _, dimm := range snap.Hardware.Memory { + if dimm.Present != nil && !*dimm.Present { + continue + } + memTotal++ + addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown")) + } + + storageCounts := map[string]int{} + storageTotal := 0 + for _, dev := range snap.Hardware.Storage { + if dev.Present != nil && !*dev.Present { + continue + } + storageTotal++ + addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) + } + + nvidiaCounts := map[string]int{} + nvidiaTotal := 0 + amdCounts := map[string]int{} + amdTotal := 0 + for _, dev := range snap.Hardware.PCIeDevices { + if dev.Present != nil && !*dev.Present { + continue + } + if validateIsVendorGPU(dev, "nvidia") { + nvidiaTotal++ + addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) + } + if validateIsVendorGPU(dev, "amd") { + amdTotal++ + addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) + } + } + + out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU") + out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module") + out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device") + out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU") + out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU") + out.NvidiaGPUCount = nvidiaTotal + out.AMDGPUCount = amdTotal + return out +} + +func renderValidateCardBody(devices, description, commands, settings string) string { + return `
` + devices + `
` + + `
` + description + `
` + + `
` + commands + `
` + + `
` + settings + `
` +} + +func formatValidateDeviceSummary(total int, models map[string]int, unit string) string { + if total == 0 { + return "0 " + unit + "s detected." + } + keys := make([]string, 0, len(models)) + for key := range models { + keys = append(keys, key) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key))) + } + label := unit + if total != 1 { + label += "s" + } + if len(parts) == 1 { + return parts[0] + " " + label + } + return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", ")) +} + +func addValidateModel(counts map[string]int, name string) { + name = strings.TrimSpace(name) + if name == "" { + name = "unknown" + } + counts[name]++ +} + +func validateTrimPtr(value *string) string { + if value == nil { + return "" + } + return strings.TrimSpace(*value) +} + +func validateFirstNonEmpty(values ...string) string { + for _, value := range values { + value = strings.TrimSpace(value) + if value != "" { + return value + } + } + return "" +} + +func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { + model := strings.ToLower(validateTrimPtr(dev.Model)) + manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer)) + class := strings.ToLower(validateTrimPtr(dev.DeviceClass)) + if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") { + return false + } + switch vendor { + case "nvidia": + return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") + case "amd": + isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller" + isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati") + isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd") + return isGPUClass && (isAMDVendor || isAMDModel) + default: + return false + } +} + +func renderSATCard(id, label, runAction, headerActions, body string) string { + actions := `` + if strings.TrimSpace(headerActions) != "" { + actions += headerActions + } + return fmt.Sprintf(`
%s
%s
%s
`, + label, actions, body) +} diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 5ee6ac3..a868792 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -4,150 +4,14 @@ import ( "encoding/json" "fmt" "html" - "net/url" - "os" "path/filepath" "sort" - "strconv" "strings" - "time" "bee/audit/internal/app" - "bee/audit/internal/platform" "bee/audit/internal/schema" ) -// ── Layout ──────────────────────────────────────────────────────────────────── - -func layoutHead(title string) string { - return ` - - - - -` + html.EscapeString(title) + ` - - - -` -} - -func layoutNav(active string, buildLabel string) string { - items := []struct{ id, label, href, onclick string }{ - {"dashboard", "Dashboard", "/", ""}, - {"audit", "Audit", "/audit", ""}, - {"validate", "Validate", "/validate", ""}, - {"burn", "Burn", "/burn", ""}, - {"benchmark", "Benchmark", "/benchmark", ""}, - {"tasks", "Tasks", "/tasks", ""}, - {"tools", "Tools", "/tools", ""}, - } - var b strings.Builder - b.WriteString(``) - return b.String() -} - // renderPage dispatches to the appropriate page renderer. func renderPage(page string, opts HandlerOptions) string { var pageID, title, body string @@ -1135,2860 +999,3 @@ func rowIssueHTML(issue string) string { } return html.EscapeString(issue) } - -// ── Metrics ─────────────────────────────────────────────────────────────────── - -func renderMetrics() string { - return `

Live metrics — updated every 2 seconds.

- -
-
Server — Load
-
- CPU/Mem load -
-
- -
-
Temperature — CPU
-
- CPU temperature -
-
- - -
-
Temperature — Ambient Sensors
-
- Ambient temperature sensors -
-
- -
-
Server — Power
-
- System power -
-
- - - - - -` -} - -// ── Validate (Acceptance Tests) ─────────────────────────────────────────────── - -type validateInventory struct { - CPU string - Memory string - Storage string - NVIDIA string - AMD string - NvidiaGPUCount int - AMDGPUCount int -} - -// validateFmtDur formats a duration in seconds as a human-readable "~N min" or "~N s" string. -func validateFmtDur(secs int) string { - if secs < 120 { - return fmt.Sprintf("~%d s", secs) - } - mins := (secs + 29) / 60 - return fmt.Sprintf("~%d min", mins) -} - -// validateTotalValidateSec returns the estimated wall-clock duration of -// "Validate one by one" in Validate mode for n NVIDIA GPUs. -func validateTotalValidateSec(n int) int { - if n < 0 { - n = 0 - } - total := platform.SATEstimatedCPUValidateSec + - platform.SATEstimatedMemoryValidateSec + - n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec + - platform.SATEstimatedNvidiaInterconnectSec + - platform.SATEstimatedNvidiaBandwidthSec - return total -} - -// validateTotalStressSec returns the estimated wall-clock duration of -// "Validate one by one" in Stress mode for n NVIDIA GPUs. -func validateTotalStressSec(n int) int { - if n < 0 { - n = 0 - } - total := platform.SATEstimatedCPUStressSec + - platform.SATEstimatedMemoryStressSec + - n*platform.SATEstimatedNvidiaGPUStressPerGPUSec + - n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec + - n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + - platform.SATEstimatedNvidiaPulseTestSec + - platform.SATEstimatedNvidiaInterconnectSec + - platform.SATEstimatedNvidiaBandwidthSec - return total -} - -func renderValidate(opts HandlerOptions) string { - inv := loadValidateInventory(opts) - n := inv.NvidiaGPUCount - validateTotalStr := validateFmtDur(validateTotalValidateSec(n)) - stressTotalStr := validateFmtDur(validateTotalStressSec(n)) - gpuNote := "" - if n > 0 { - gpuNote = fmt.Sprintf(" (%d GPU)", n) - } - return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
-

Tasks continue in the background — view progress in Tasks.

- -
-
Validate Profile
-
-
-
- - -
-
-

Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.

- -
- -
-
-
-
- -
-` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( - inv.CPU, - `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, - `lscpu, sensors, stress-ng`, - validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`, - )) + - renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( - inv.Memory, - `Runs a RAM validation pass and records memory state around the test.`, - `free, memtester`, - validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`, - )) + - renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( - inv.Storage, - `Scans all storage devices and runs the matching health or self-test path for each device type.`, - `lsblk; NVMe: nvme; SATA/SAS: smartctl`, - `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`, - )) + - `
-
-
-
NVIDIA GPU Selection
-
-

` + inv.NVIDIA + `

-

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.

-
- - -
-
-

Loading NVIDIA GPUs...

-
-

Select at least one NVIDIA GPU to enable NVIDIA validate tasks.

-
-
- -
-` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( - inv.NVIDIA, - `Runs NVIDIA diagnostics and board inventory checks.`, - `nvidia-smi, dmidecode, dcgmi diag`, - func() string { - perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec - perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec - if n > 0 { - return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", - validateFmtDur(perV), n, validateFmtDur(perV*n), - validateFmtDur(perS), n, validateFmtDur(perS*n)) - } - return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", - validateFmtDur(perV), validateFmtDur(perS)) - }(), - )) + - `
` + - renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( - inv.NVIDIA, - `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, - `dcgmi diag targeted_stress`, - func() string { - per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec - s := "Skipped in Validate. " - if n > 0 { - s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) - } else { - s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) - } - return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` - }(), - )) + - `
` + - `
` + - renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( - inv.NVIDIA, - `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, - `dcgmi diag targeted_power`, - func() string { - per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec - s := "Skipped in Validate. " - if n > 0 { - s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) - } else { - s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) - } - return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` - }(), - )) + - `
` + - `
` + - renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( - inv.NVIDIA, - `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, - `dcgmi diag pulse_test`, - `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`, - )) + - `
` + - `
` + - renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( - inv.NVIDIA, - `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, - `all_reduce_perf (NCCL tests)`, - `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, - )) + - `
` + - `
` + - renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( - inv.NVIDIA, - `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, - `nvbandwidth`, - `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, - )) + - `
` + - `
-
-` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( - inv.AMD, - `Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`, - `GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`, - `
`, - )) + - `
- - - -` -} - -func loadValidateInventory(opts HandlerOptions) validateInventory { - unknown := "Audit snapshot not loaded." - out := validateInventory{ - CPU: unknown, - Memory: unknown, - Storage: unknown, - NVIDIA: unknown, - AMD: unknown, - } - data, err := loadSnapshot(opts.AuditPath) - if err != nil { - return out - } - var snap schema.HardwareIngestRequest - if err := json.Unmarshal(data, &snap); err != nil { - return out - } - - cpuCounts := map[string]int{} - cpuTotal := 0 - for _, cpu := range snap.Hardware.CPUs { - if cpu.Present != nil && !*cpu.Present { - continue - } - cpuTotal++ - addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown")) - } - - memCounts := map[string]int{} - memTotal := 0 - for _, dimm := range snap.Hardware.Memory { - if dimm.Present != nil && !*dimm.Present { - continue - } - memTotal++ - addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown")) - } - - storageCounts := map[string]int{} - storageTotal := 0 - for _, dev := range snap.Hardware.Storage { - if dev.Present != nil && !*dev.Present { - continue - } - storageTotal++ - addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) - } - - nvidiaCounts := map[string]int{} - nvidiaTotal := 0 - amdCounts := map[string]int{} - amdTotal := 0 - for _, dev := range snap.Hardware.PCIeDevices { - if dev.Present != nil && !*dev.Present { - continue - } - if validateIsVendorGPU(dev, "nvidia") { - nvidiaTotal++ - addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) - } - if validateIsVendorGPU(dev, "amd") { - amdTotal++ - addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) - } - } - - out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU") - out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module") - out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device") - out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU") - out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU") - out.NvidiaGPUCount = nvidiaTotal - out.AMDGPUCount = amdTotal - return out -} - -func renderValidateCardBody(devices, description, commands, settings string) string { - return `
` + devices + `
` + - `
` + description + `
` + - `
` + commands + `
` + - `
` + settings + `
` -} - -func formatValidateDeviceSummary(total int, models map[string]int, unit string) string { - if total == 0 { - return "0 " + unit + "s detected." - } - keys := make([]string, 0, len(models)) - for key := range models { - keys = append(keys, key) - } - sort.Strings(keys) - parts := make([]string, 0, len(keys)) - for _, key := range keys { - parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key))) - } - label := unit - if total != 1 { - label += "s" - } - // If there is only one model the leading count duplicates the per-model - // count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …"). - if len(parts) == 1 { - return parts[0] + " " + label - } - return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", ")) -} - -func addValidateModel(counts map[string]int, name string) { - name = strings.TrimSpace(name) - if name == "" { - name = "unknown" - } - counts[name]++ -} - -func validateTrimPtr(value *string) string { - if value == nil { - return "" - } - return strings.TrimSpace(*value) -} - -func validateFirstNonEmpty(values ...string) string { - for _, value := range values { - value = strings.TrimSpace(value) - if value != "" { - return value - } - } - return "" -} - -func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { - model := strings.ToLower(validateTrimPtr(dev.Model)) - manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer)) - class := strings.ToLower(validateTrimPtr(dev.DeviceClass)) - if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") { - return false - } - switch vendor { - case "nvidia": - return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") - case "amd": - isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller" - isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati") - isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd") - return isGPUClass && (isAMDVendor || isAMDModel) - default: - return false - } -} - -func renderSATCard(id, label, runAction, headerActions, body string) string { - actions := `` - if strings.TrimSpace(headerActions) != "" { - actions += headerActions - } - return fmt.Sprintf(`
%s
%s
%s
`, - label, actions, body) -} - -// ── Benchmark ───────────────────────────────────────────────────────────────── - -type benchmarkHistoryRun struct { - generatedAt time.Time - displayTime string - gpuScores map[int]float64 // GPU index → composite score - gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …) - overallStatus string -} - -func renderBenchmark(opts HandlerOptions) string { - return `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

- -
-
-
Benchmark Setup
-
-
- - -
-
- -
- - -
-
-

Loading NVIDIA GPUs...

-
-
- - - -

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

-
- - -
- - -
-
- -
-
Method Split
-
-

The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.

- - - - -
Run TypeEngineQuestionStandardStability
Performance Benchmarkbee-gpu-burnHow much isolated compute performance does the GPU realize in this server?` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `
Power / Thermal Fitdcgmi targeted_powerHow much power per GPU can this server sustain as GPU count ramps up?` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `
-

Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.

-
-
-
- -` + `
` + renderBenchmarkResultsCard(opts.ExportDir) + `
` + ` - - - - - -` -} - -func renderBenchmarkResultsCard(exportDir string) string { - maxIdx, runs := loadBenchmarkHistory(exportDir) - perf := renderBenchmarkResultsCardFromRuns( - "Perf Results", - "Composite score by saved benchmark run and GPU.", - "No saved performance benchmark runs yet.", - maxIdx, - runs, - ) - power := renderPowerBenchmarkResultsCard(exportDir) - return perf + "\n" + power -} - -func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string { - if len(runs) == 0 { - return `
` + html.EscapeString(title) + `

` + html.EscapeString(emptyMessage) + `

` - } - var b strings.Builder - b.WriteString(`
` + html.EscapeString(title) + `
`) - if strings.TrimSpace(description) != "" { - b.WriteString(`

` + html.EscapeString(description) + `

`) - } - b.WriteString(`
`) - b.WriteString(``) - for i := 0; i <= maxGPUIndex; i++ { - b.WriteString(``) - } - b.WriteString(``) - for i, run := range runs { - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - overallColor := "var(--ok)" - overallLabel := run.overallStatus - if overallLabel == "" { - overallLabel = "OK" - } - if overallLabel == "FAILED" { - overallColor = "var(--crit-fg,#9f3a38)" - } else if overallLabel != "OK" { - overallColor = "var(--warn)" - } - b.WriteString(``) - for idx := 0; idx <= maxGPUIndex; idx++ { - score, ok := run.gpuScores[idx] - if !ok { - b.WriteString(``) - continue - } - gpuStatus := run.gpuStatuses[idx] - scoreColor := "" - switch gpuStatus { - case "FAILED": - scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"` - case "WARNING", "PARTIAL": - scoreColor = ` style="color:var(--warn);font-weight:600"` - case "", "OK": - // no override - default: - scoreColor = ` style="color:var(--warn);font-weight:600"` - } - b.WriteString(`` + fmt.Sprintf("%.2f", score) + ``) - } - b.WriteString(``) - } - b.WriteString(`
RunTimeStatusGPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + html.EscapeString(overallLabel) + `-
`) - return b.String() -} - -func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { - baseDir := app.DefaultBeeBenchPerfDir - if strings.TrimSpace(exportDir) != "" { - baseDir = filepath.Join(exportDir, "bee-bench", "perf") - } - paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json")) - if err != nil || len(paths) == 0 { - return -1, nil - } - sort.Strings(paths) - return loadBenchmarkHistoryFromPaths(paths) -} - -func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) { - runs := make([]benchmarkHistoryRun, 0, len(paths)) - maxGPUIndex := -1 - for _, path := range paths { - raw, err := os.ReadFile(path) - if err != nil { - continue - } - var result platform.NvidiaBenchmarkResult - if err := json.Unmarshal(raw, &result); err != nil { - continue - } - run := benchmarkHistoryRun{ - generatedAt: result.GeneratedAt, - displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), - gpuScores: make(map[int]float64), - gpuStatuses: make(map[int]string), - overallStatus: result.OverallStatus, - } - for _, gpu := range result.GPUs { - run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore - run.gpuStatuses[gpu.Index] = gpu.Status - if gpu.Index > maxGPUIndex { - maxGPUIndex = gpu.Index - } - } - runs = append(runs, run) - } - sort.Slice(runs, func(i, j int) bool { - return runs[i].generatedAt.After(runs[j].generatedAt) - }) - return maxGPUIndex, runs -} - -func renderPowerBenchmarkResultsCard(exportDir string) string { - baseDir := app.DefaultBeeBenchPowerDir - if strings.TrimSpace(exportDir) != "" { - baseDir = filepath.Join(exportDir, "bee-bench", "power") - } - paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json")) - if err != nil || len(paths) == 0 { - return `
Power / Thermal Fit Results

No saved power benchmark runs yet.

` - } - sort.Strings(paths) - - type powerRun struct { - generatedAt time.Time - displayTime string - result platform.NvidiaPowerBenchResult - } - var runs []powerRun - for _, path := range paths { - raw, err := os.ReadFile(path) - if err != nil { - continue - } - var r platform.NvidiaPowerBenchResult - if err := json.Unmarshal(raw, &r); err != nil { - continue - } - runs = append(runs, powerRun{ - generatedAt: r.GeneratedAt, - displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"), - result: r, - }) - } - sort.Slice(runs, func(i, j int) bool { - return runs[i].generatedAt.After(runs[j].generatedAt) - }) - - // Show only the most recent run's GPU slot table, plus a run history summary. - var b strings.Builder - b.WriteString(`
Power / Thermal Fit Results
`) - - latest := runs[0].result - b.WriteString(`

Latest run: ` + html.EscapeString(runs[0].displayTime)) - if latest.Hostname != "" { - b.WriteString(` — ` + html.EscapeString(latest.Hostname)) - } - if latest.OverallStatus != "" { - statusColor := "var(--ok)" - if latest.OverallStatus != "OK" { - statusColor = "var(--warn)" - } - b.WriteString(` — ` + html.EscapeString(latest.OverallStatus) + ``) - } - b.WriteString(`

`) - - if len(latest.GPUs) > 0 { - b.WriteString(`
`) - b.WriteString(``) - b.WriteString(``) - for _, gpu := range latest.GPUs { - // finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp, - // falling back to single-card applied limit if the ramp hasn't run. - finalLimitW := gpu.StablePowerLimitW - if finalLimitW <= 0 { - finalLimitW = gpu.AppliedPowerLimitW - } - // Derate is relative to nominal (DefaultPowerLimitW), using the final limit. - derated := gpu.Derated || - (gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1) - rowStyle := "" - finalStyle := "" - if derated { - rowStyle = ` style="background:rgba(255,180,0,0.08)"` - finalStyle = ` style="color:#e6a000;font-weight:600"` - } - statusLabel := gpu.Status - if statusLabel == "" { - statusLabel = "OK" - } - statusColor := "var(--ok)" - if statusLabel == "FAILED" { - statusColor = "var(--crit-fg,#9f3a38)" - } else if statusLabel != "OK" { - statusColor = "var(--warn)" - } - nominalStr := "-" - if gpu.DefaultPowerLimitW > 0 { - nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) - } - singleStr := "-" - if gpu.AppliedPowerLimitW > 0 { - singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) - } - multiStr := "-" - if gpu.StablePowerLimitW > 0 { - multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW) - } - p95Str := "-" - if gpu.MaxObservedPowerW > 0 { - p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW) - } - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(`` + multiStr + ``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - } - b.WriteString(`
GPUModelNominal WSingle-card WMulti-GPU WP95 Observed WStatus
` + strconv.Itoa(gpu.Index) + `` + html.EscapeString(gpu.Name) + `` + nominalStr + `` + singleStr + `` + p95Str + `` + html.EscapeString(statusLabel) + `
`) - } - - if len(runs) > 1 { - b.WriteString(`
` + strconv.Itoa(len(runs)) + ` runs total`) - b.WriteString(`
`) - for i, run := range runs { - statusColor := "var(--ok)" - if run.result.OverallStatus != "OK" { - statusColor = "var(--warn)" - } - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - b.WriteString(``) - } - b.WriteString(`
#TimeGPUsStatus
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + strconv.Itoa(len(run.result.GPUs)) + `` + html.EscapeString(run.result.OverallStatus) + `
`) - } - - b.WriteString(`
`) - return b.String() -} - -// ── Burn ────────────────────────────────────────────────────────────────────── - -func renderBurn() string { - return `
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
-
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.
-

Tasks continue in the background — view progress in Tasks.

- -
-
Burn Profile
-
-
-
- - - -
-
- -

Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.

-
-
- -

Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.

-
-
-
- -
-
- -
-
NVIDIA GPU Selection
-
-

Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.

-
- - -
-
-

Loading NVIDIA GPUs...

-
-

Select at least one NVIDIA GPU to enable NVIDIA burn recipes.

-
- - - -
-
-
- -
Core Burn Paths
-
-
-
GPU Max Load
-
-

Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.

- - - - -
-
- -
-
Compute Stress
-
-

Select which subsystems to stress. Each checked item runs as a separate task.

- - - -
-
-
- - - - - -` -} - -// ── Network ─────────────────────────────────────────────────────────────────── - -// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools). -func renderNetworkInline() string { - return ` -

Loading...

-
-
DHCP
-
- -
-
-
Static IPv4
-
-
-
-
-
- -
-
-
-` -} - -func renderNetwork() string { - return `
Network Interfaces
` + - renderNetworkInline() + - `
` -} - -// ── Services ────────────────────────────────────────────────────────────────── - -func renderServicesInline() string { - return `

` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `

-
-

Loading...

- -` -} - -func renderServices() string { - return `
Bee Services
` + - renderServicesInline() + - `
` -} - -// ── Export ──────────────────────────────────────────────────────────────────── - -func renderExport(exportDir string) string { - entries, _ := listExportFiles(exportDir) - var rows strings.Builder - for _, e := range entries { - rows.WriteString(fmt.Sprintf(`%s`, - url.QueryEscape(e), html.EscapeString(e))) - } - if len(entries) == 0 { - rows.WriteString(`No export files found.`) - } - return `
-
Support Bundle
-

Creates a tar.gz archive of all audit files, SAT results, and logs.

-` + renderSupportBundleInline() + ` -
-
Export Files
-` + rows.String() + `
File
-
-
- -` + renderUSBExportCard() -} - -func listExportFiles(exportDir string) ([]string, error) { - var entries []string - err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if info.IsDir() { - return nil - } - rel, err := filepath.Rel(exportDir, path) - if err != nil { - return err - } - entries = append(entries, rel) - return nil - }) - if err != nil && !os.IsNotExist(err) { - return nil, err - } - sort.Strings(entries) - return entries, nil -} - -func renderSupportBundleInline() string { - return ` -
-` -} - -func renderUSBExportCard() string { - return `
-
Export to USB - -
-
` + renderUSBExportInline() + `
-
` -} - -func renderUSBExportInline() string { - return `

Write audit JSON or support bundle directly to a removable USB drive.

-
Scanning for USB devices...
-
-
-` -} - -func renderNvidiaSelfHealInline() string { - return `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

-
- - -
-
Loading NVIDIA GPU status...
-

Loading...

- -` -} - -// ── Tools ───────────────────────────────────────────────────────────────────── - -func renderTools() string { - return `
-
System Install
-
-
-
Install to RAM
-

Detecting boot source...

-

Checking...

- -
-
-
Install to Disk
` + - renderInstallInline() + ` -
-
-
- - -
Support Bundle
-

Downloads a tar.gz archive of all audit files, SAT results, and logs.

-` + renderSupportBundleInline() + ` -
-
Export to USB
- ` + renderUSBExportInline() + ` -
-
- -
Tool Check
-

Checking...

- -
NVIDIA Self Heal
` + - renderNvidiaSelfHealInline() + `
- -
Network
` + - renderNetworkInline() + `
- -
Services
` + - renderServicesInline() + `
- - -` -} - -// ── Install to Disk ────────────────────────────────────────────────────────── - -func renderInstallInline() string { - return ` -
- Warning: Installing will completely erase the selected - disk and write the live system onto it. All existing data on the target disk will be lost. - This operation cannot be undone. -
-
Loading disk list…
- - - - - - - -` -} - -func renderInstall() string { - return `
Install Live System to Disk
` + - renderInstallInline() + - `
` -} - -// ── Tasks ───────────────────────────────────────────────────────────────────── - -func renderTasks() string { - return `
- - - -Open a task to view its saved logs and charts. -
-
-

Loading...

-
-` -} - -func renderExportIndex(exportDir string) (string, error) { - entries, err := listExportFiles(exportDir) - if err != nil { - return "", err - } - var body strings.Builder - body.WriteString(`Bee Export Files`) - body.WriteString(`

Bee Export Files

`) - return body.String(), nil -}