` + html.EscapeString(title) + `
`
}
func layoutNav(active string, buildLabel string) string {
items := []struct{ id, label, href, onclick string }{
{"dashboard", "Dashboard", "/", ""},
{"audit", "Audit", "/audit", ""},
{"validate", "Validate", "/validate", ""},
{"burn", "Burn", "/burn", ""},
{"benchmark", "Benchmark", "/benchmark", ""},
{"tasks", "Tasks", "/tasks", ""},
{"tools", "Tools", "/tools", ""},
}
var b strings.Builder
b.WriteString(``)
return b.String()
}
// renderPage dispatches to the appropriate page renderer.
func renderPage(page string, opts HandlerOptions) string {
var pageID, title, body string
switch page {
case "dashboard", "":
pageID = "dashboard"
title = "Dashboard"
body = renderDashboard(opts)
case "audit":
pageID = "audit"
title = "Audit"
body = renderAudit()
case "validate":
pageID = "validate"
title = "Validate"
body = renderValidate(opts)
case "burn":
pageID = "burn"
title = "Burn"
body = renderBurn()
case "benchmark":
pageID = "benchmark"
title = "Benchmark"
body = renderBenchmark(opts)
case "tasks":
pageID = "tasks"
title = "Tasks"
body = renderTasks()
case "tools":
pageID = "tools"
title = "Tools"
body = renderTools()
// Legacy routes kept accessible but not in nav
case "metrics":
pageID = "metrics"
title = "Live Metrics"
body = renderMetrics()
case "tests":
pageID = "validate"
title = "Acceptance Tests"
body = renderValidate(opts)
case "burn-in":
pageID = "burn"
title = "Burn-in Tests"
body = renderBurn()
case "network":
pageID = "network"
title = "Network"
body = renderNetwork()
case "services":
pageID = "services"
title = "Services"
body = renderServices()
case "export":
pageID = "export"
title = "Export"
body = renderExport(opts.ExportDir)
case "install":
pageID = "install"
title = "Install to Disk"
body = renderInstall()
default:
pageID = "dashboard"
title = "Not Found"
body = `
` +
renderAuditModal() +
`` +
``
}
// ── Dashboard ─────────────────────────────────────────────────────────────────
func renderDashboard(opts HandlerOptions) string {
var b strings.Builder
b.WriteString(renderAuditStatusBanner(opts))
b.WriteString(renderHardwareSummaryCard(opts))
b.WriteString(renderHealthCard(opts))
b.WriteString(renderMetrics())
return b.String()
}
// renderAuditStatusBanner shows a live progress banner when an audit task is
// running and auto-reloads the page when it completes.
func renderAuditStatusBanner(opts HandlerOptions) string {
// If audit data already exists, no banner needed — data is fresh.
// We still inject the polling script so a newly-triggered audit also reloads.
hasData := false
if _, err := loadSnapshot(opts.AuditPath); err == nil {
hasData = true
}
_ = hasData
return `
▶ Hardware audit is running — page will refresh automatically when complete.View in Tasks
`
}
var ingest schema.HardwareIngestRequest
if err := json.Unmarshal(data, &ingest); err != nil {
return `
Hardware Summary
Parse error
`
}
hw := ingest.Hardware
var records []app.ComponentStatusRecord
if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
records = db.All()
}
var b strings.Builder
b.WriteString(`
Hardware Summary
`)
// Server identity block above the component table.
{
var model, serial string
parts := []string{}
if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" {
parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer))
}
if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" {
parts = append(parts, strings.TrimSpace(*hw.Board.ProductName))
}
if len(parts) > 0 {
model = strings.Join(parts, " ")
}
serial = strings.TrimSpace(hw.Board.SerialNumber)
if model != "" || serial != "" {
b.WriteString(`
`)
if model != "" {
fmt.Fprintf(&b, `
%s
`, html.EscapeString(model))
}
if serial != "" {
fmt.Fprintf(&b, `
`
}
var health schema.RuntimeHealth
if err := json.Unmarshal(data, &health); err != nil {
return `
Runtime Health
Parse error
`
}
status := strings.TrimSpace(health.Status)
if status == "" {
status = "UNKNOWN"
}
badge := "badge-ok"
if status == "PARTIAL" {
badge = "badge-warn"
} else if status == "FAIL" || status == "FAILED" {
badge = "badge-err"
}
var b strings.Builder
b.WriteString(`
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
Tasks continue in the background — view progress in Tasks.
Validate Profile
Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`lscpu, sensors, stress-ng`,
`60s in Validate, 30 min in Stress.`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
`Short self-test in Validate, extended self-test in Stress.`,
)) +
`
NVIDIA GPU Selection
` + inv.NVIDIA + `
All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA validate tasks.
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`nvidia-smi, dmidecode, dcgmi diag`,
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
)) +
`
` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`dcgmi diag targeted_stress`,
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.
Only runs in Stress mode. Switch mode above to enable in Run All.
`,
)) +
`
` +
`
` +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`dcgmi diag targeted_power`,
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.
Only runs in Stress mode. Switch mode above to enable in Run All.
`,
)) +
`
` +
`
` +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`dcgmi diag pulse_test`,
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.
Only runs in Stress mode. Switch mode above to enable in Run All.
`,
)) +
`
` +
`
` +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`all_reduce_perf (NCCL tests)`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) +
`
` +
`
` +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`nvbandwidth`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) +
`
` +
`
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
`GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
``,
)) +
`
Test Output
`
}
func loadValidateInventory(opts HandlerOptions) validateInventory {
unknown := "Audit snapshot not loaded."
out := validateInventory{
CPU: unknown,
Memory: unknown,
Storage: unknown,
NVIDIA: unknown,
AMD: unknown,
}
data, err := loadSnapshot(opts.AuditPath)
if err != nil {
return out
}
var snap schema.HardwareIngestRequest
if err := json.Unmarshal(data, &snap); err != nil {
return out
}
cpuCounts := map[string]int{}
cpuTotal := 0
for _, cpu := range snap.Hardware.CPUs {
if cpu.Present != nil && !*cpu.Present {
continue
}
cpuTotal++
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
}
memCounts := map[string]int{}
memTotal := 0
for _, dimm := range snap.Hardware.Memory {
if dimm.Present != nil && !*dimm.Present {
continue
}
memTotal++
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
}
storageCounts := map[string]int{}
storageTotal := 0
for _, dev := range snap.Hardware.Storage {
if dev.Present != nil && !*dev.Present {
continue
}
storageTotal++
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
nvidiaCounts := map[string]int{}
nvidiaTotal := 0
amdCounts := map[string]int{}
amdTotal := 0
for _, dev := range snap.Hardware.PCIeDevices {
if dev.Present != nil && !*dev.Present {
continue
}
if validateIsVendorGPU(dev, "nvidia") {
nvidiaTotal++
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
if validateIsVendorGPU(dev, "amd") {
amdTotal++
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
}
}
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
return out
}
func renderValidateCardBody(devices, description, commands, settings string) string {
return `
` + devices + `
` +
`
` + description + `
` +
`
` + commands + `
` +
`
` + settings + `
`
}
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
if total == 0 {
return "0 " + unit + "s detected."
}
keys := make([]string, 0, len(models))
for key := range models {
keys = append(keys, key)
}
sort.Strings(keys)
parts := make([]string, 0, len(keys))
for _, key := range keys {
parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key)))
}
label := unit
if total != 1 {
label += "s"
}
// If there is only one model the leading count duplicates the per-model
// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
if len(parts) == 1 {
return parts[0] + " " + label
}
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
}
func addValidateModel(counts map[string]int, name string) {
name = strings.TrimSpace(name)
if name == "" {
name = "unknown"
}
counts[name]++
}
func validateTrimPtr(value *string) string {
if value == nil {
return ""
}
return strings.TrimSpace(*value)
}
func validateFirstNonEmpty(values ...string) string {
for _, value := range values {
value = strings.TrimSpace(value)
if value != "" {
return value
}
}
return ""
}
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
model := strings.ToLower(validateTrimPtr(dev.Model))
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
return false
}
switch vendor {
case "nvidia":
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
case "amd":
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
return isGPUClass && (isAMDVendor || isAMDModel)
default:
return false
}
}
func renderSATCard(id, label, runAction, headerActions, body string) string {
actions := ``
if strings.TrimSpace(headerActions) != "" {
actions += headerActions
}
return fmt.Sprintf(`
Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.
Benchmark Setup
Loading NVIDIA GPUs...
Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.
nccl-auto
Method Split
The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.
Run Type
Engine
Question
Performance Benchmark
bee-gpu-burn
How much isolated compute performance does the GPU realize in this server?
Power / Thermal Fit
dcgmi targeted_power
How much power per GPU can this server sustain as GPU count ramps up?
Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.
`
}
sort.Strings(paths)
type powerRun struct {
generatedAt time.Time
displayTime string
result platform.NvidiaPowerBenchResult
}
var runs []powerRun
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var r platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &r); err != nil {
continue
}
runs = append(runs, powerRun{
generatedAt: r.GeneratedAt,
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
result: r,
})
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
// Show only the most recent run's GPU slot table, plus a run history summary.
var b strings.Builder
b.WriteString(`
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.
Tasks continue in the background — view progress in Tasks.
Burn Profile
Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.
Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.
NVIDIA GPU Selection
Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA burn recipes.
Core Burn Paths
GPU Max Load
Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.
Compute Stress
Select which subsystems to stress. Each checked item runs as a separate task.
Output
`
}
// ── Network ───────────────────────────────────────────────────────────────────
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
func renderNetworkInline() string {
return `
⚠ Network change applied. Reverting in 60s unless confirmed.
` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `
Loading...
Output
`
}
func renderServices() string {
return `
Bee Services
` +
renderServicesInline() +
`
`
}
// ── Export ────────────────────────────────────────────────────────────────────
func renderExport(exportDir string) string {
entries, _ := listExportFiles(exportDir)
var rows strings.Builder
for _, e := range entries {
rows.WriteString(fmt.Sprintf(`
Downloads a tar.gz archive of all audit files, SAT results, and logs.
` + renderSupportBundleInline() + `
Export to USB
` + renderUSBExportInline() + `
Tool Check
Checking...
NVIDIA Self Heal
` +
renderNvidiaSelfHealInline() + `
Network
` +
renderNetworkInline() + `
Services
` +
renderServicesInline() + `
`
}
// ── Install to Disk ──────────────────────────────────────────────────────────
func renderInstallInline() string {
return `
Warning: Installing will completely erase the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.