` + html.EscapeString(title) + `

Hardware Summary

`) // Server identity block above the component table. { var model, serial string parts := []string{} if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" { parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer)) } if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" { parts = append(parts, strings.TrimSpace(*hw.Board.ProductName)) } if len(parts) > 0 { model = strings.Join(parts, " ") } serial = strings.TrimSpace(hw.Board.SerialNumber) if model != "" || serial != "" { b.WriteString(`

`) if model != "" { fmt.Fprintf(&b, `

`, html.EscapeString(model)) } if serial != "" { fmt.Fprintf(&b, `

S/N: %s

`, html.EscapeString(serial)) } b.WriteString(`

`) } } b.WriteString(``) writeRow := func(label, value, badgeHTML string) { b.WriteString(fmt.Sprintf(``, html.EscapeString(label), html.EscapeString(value), badgeHTML)) } writeRow("CPU", hwDescribeCPU(hw), renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil))) writeRow("Memory", hwDescribeMemory(hw), renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"}))) writeRow("Storage", hwDescribeStorage(hw), renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"}))) writeRow("GPU", hwDescribeGPU(hw), renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"}))) psuMatched := matchedRecords(records, nil, []string{"psu:"}) if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 { // No PSU records yet — synthesise a single chip from IPMI status. psuStatus := hwPSUStatus(hw.PowerSupplies) psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}} } writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched)) if nicDesc := hwDescribeNIC(hw); nicDesc != "" { writeRow("Network", nicDesc, "") } b.WriteString(`

`) b.WriteString(`

Runtime Health

`) b.WriteString(fmt.Sprintf(`

`, badge, html.EscapeString(status))) if checkedAt := strings.TrimSpace(health.CheckedAt); checkedAt != "" { b.WriteString(`

Checked at: ` + html.EscapeString(checkedAt) + `

`) } rows := []runtimeHealthRow{ buildRuntimeExportRow(health), buildRuntimeNetworkRow(health), buildRuntimeDriverRow(health), buildRuntimeAccelerationRow(health), buildRuntimeToolsRow(health), buildRuntimeServicesRow(health), buildRuntimeUSBExportRow(health), buildRuntimeToRAMRow(health), } b.WriteString(``) for _, row := range rows { b.WriteString(``) } b.WriteString(`

Check	Status	Source	Issue
` + html.EscapeString(row.Title) + `	` + runtimeStatusBadge(row.Status) + `	` + html.EscapeString(row.Source) + `	` + rowIssueHTML(row.Issue) + `

`) b.WriteString(`

` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( inv.CPU, `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, `lscpu, sensors, stress-ng`, `60s in Validate, 30 min in Stress.`, )) + renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( inv.Memory, `Runs a RAM validation pass and records memory state around the test.`, `free, memtester`, `256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`, )) + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( inv.Storage, `Scans all storage devices and runs the matching health or self-test path for each device type.`, `lsblk; NVMe: nvme; SATA/SAS: smartctl`, `Short self-test in Validate, extended self-test in Stress.`, )) + `

` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( inv.NVIDIA, `Runs NVIDIA diagnostics and board inventory checks.`, `nvidia-smi, dmidecode, dcgmi diag`, `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, )) + `

` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, `dcgmi diag targeted_stress`, `Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `

` + `

` + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( inv.NVIDIA, `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, `dcgmi diag targeted_power`, `Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `

` + `

` + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( inv.NVIDIA, `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, `dcgmi diag pulse_test`, `Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `

` + `

` + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( inv.NVIDIA, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `all_reduce_perf (NCCL tests)`, `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`, )) + `

` + `

` + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( inv.NVIDIA, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `nvbandwidth`, `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`, )) + `

` + `

` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( inv.AMD, `Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`, `GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`, `

GPU ValidateMEM IntegrityMEM Bandwidth

`, )) + `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

Benchmark Setup

Profile

GPU Selection

Loading NVIDIA GPUs...

Sequential — one GPU at a time Parallel — all selected GPUs simultaneously Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

Method Split

The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.

Run Type	Engine	Question
Performance Benchmark	`bee-gpu-burn`	How much isolated compute performance does the GPU realize in this server?
Power / Thermal Fit	`dcgmi targeted_power`	How much power per GPU can this server sustain as GPU count ramps up?

Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.

Run	Time	GPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `	` + html.EscapeString(run.displayTime) + `	-	` + fmt.Sprintf("%.2f", score) + `

Run

Time

GPU ` + strconv.Itoa(i) + `

#` + strconv.Itoa(i+1) + `

` + html.EscapeString(run.displayTime) + `

` + fmt.Sprintf("%.2f", score) + `

Power / Thermal Fit Results

`) latest := runs[0].result b.WriteString(`

Latest run: ` + html.EscapeString(runs[0].displayTime)) if latest.Hostname != "" { b.WriteString(` — ` + html.EscapeString(latest.Hostname)) } if latest.OverallStatus != "" { statusColor := "var(--ok)" if latest.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(` — ` + html.EscapeString(latest.OverallStatus) + ``) } b.WriteString(`

`) if len(latest.GPUs) > 0 { b.WriteString(`

`) b.WriteString(``) b.WriteString(``) for _, gpu := range latest.GPUs { derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1) rowStyle := "" achievedStyle := "" if derated { rowStyle = ` style="background:rgba(255,180,0,0.08)"` achievedStyle = ` style="color:#e6a000;font-weight:600"` } statusLabel := gpu.Status if statusLabel == "" { statusLabel = "OK" } statusColor := "var(--ok)" if statusLabel != "OK" { statusColor = "var(--warn)" } nominalStr := "-" if gpu.DefaultPowerLimitW > 0 { nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) } achievedStr := "-" if gpu.AppliedPowerLimitW > 0 { achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) } p95Str := "-" if gpu.MaxObservedPowerW > 0 { p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW) } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(`` + achievedStr + ``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`

GPU	Model	Nominal W	Achieved W	P95 Observed W	Status
` + strconv.Itoa(gpu.Index) + `	` + html.EscapeString(gpu.Name) + `	` + nominalStr + `	` + p95Str + `	` + html.EscapeString(statusLabel) + `

`) } if len(runs) > 1 { b.WriteString(`

` + strconv.Itoa(len(runs)) + ` runs total

`) b.WriteString(`

`) for i, run := range runs { statusColor := "var(--ok)" if run.result.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`

#	Time	GPUs	Status
#` + strconv.Itoa(i+1) + `	` + html.EscapeString(run.displayTime) + `	` + strconv.Itoa(len(run.result.GPUs)) + `	` + html.EscapeString(run.result.OverallStatus) + `

`) } b.WriteString(`

` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

` + html.EscapeString(title) + `

Bee Export Files