` + html.EscapeString(title) + `

Hardware Summary

`) // Server identity block above the component table. { var model, serial string parts := []string{} if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" { parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer)) } if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" { parts = append(parts, strings.TrimSpace(*hw.Board.ProductName)) } if len(parts) > 0 { model = strings.Join(parts, " ") } serial = strings.TrimSpace(hw.Board.SerialNumber) if model != "" || serial != "" { b.WriteString(`

`) if model != "" { fmt.Fprintf(&b, `

`, html.EscapeString(model)) } if serial != "" { fmt.Fprintf(&b, `

S/N: %s

`, html.EscapeString(serial)) } b.WriteString(`

`) } } b.WriteString(``) writeRow := func(label, value, badgeHTML string) { b.WriteString(fmt.Sprintf(``, html.EscapeString(label), html.EscapeString(value), badgeHTML)) } writeRow("CPU", hwDescribeCPU(hw), renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil))) writeRow("Memory", hwDescribeMemory(hw), renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"}))) writeRow("Storage", hwDescribeStorage(hw), renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"}))) writeRow("GPU", hwDescribeGPU(hw), renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"}))) psuMatched := matchedRecords(records, nil, []string{"psu:"}) if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 { // No PSU records yet — synthesise a single chip from IPMI status. psuStatus := hwPSUStatus(hw.PowerSupplies) psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}} } writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched)) if nicDesc := hwDescribeNIC(hw); nicDesc != "" { writeRow("Network", nicDesc, "") } b.WriteString(`

`) b.WriteString(`

Check	Status	Source	Issue
` + html.EscapeString(row.Title) + `	` + runtimeStatusBadge(row.Status) + `	` + html.EscapeString(row.Source) + `	` + rowIssueHTML(row.Issue) + `

Check

Status

Source

Issue

` + html.EscapeString(row.Title) + `

` + runtimeStatusBadge(row.Status) + `

` + html.EscapeString(row.Source) + `

` + rowIssueHTML(row.Issue) + `

` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( inv.CPU, `Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`, `lscpu, sensors, stress-ng`, validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`, )) + renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody( inv.Memory, `Runs a RAM validation pass and records memory state around the test.`, `free, memtester`, validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`, )) + renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody( inv.Storage, `Scans all storage devices and runs the matching health or self-test path for each device type.`, `lsblk; NVMe: nvme; SATA/SAS: smartctl`, `Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`, )) + `

` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( inv.NVIDIA, `Runs NVIDIA diagnostics and board inventory checks.`, `nvidia-smi, dmidecode, dcgmi diag`, func() string { perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec if n > 0 { return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", validateFmtDur(perV), n, validateFmtDur(perV*n), validateFmtDur(perS), n, validateFmtDur(perS*n)) } return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", validateFmtDur(perV), validateFmtDur(perS)) }(), )) + `

` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, `dcgmi diag targeted_stress`, func() string { per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec s := "Skipped in Validate. " if n > 0 { s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) } else { s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) } return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` }(), )) + `

` + `

` + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( inv.NVIDIA, `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, `dcgmi diag targeted_power`, func() string { per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec s := "Skipped in Validate. " if n > 0 { s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) } else { s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) } return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` }(), )) + `

` + `

` + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( inv.NVIDIA, `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, `dcgmi diag pulse_test`, `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `

` + `

` + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( inv.NVIDIA, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `all_reduce_perf (NCCL tests)`, `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, )) + `

` + `

` + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( inv.NVIDIA, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `nvbandwidth`, `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, )) + `

` + `

` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( inv.AMD, `Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`, `GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`, `

GPU ValidateMEM IntegrityMEM Bandwidth

`, )) + `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

Benchmark Setup

Profile

GPU Selection

Loading NVIDIA GPUs...

Sequential — one GPU at a time Parallel — all selected GPUs simultaneously Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

Method Split

The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.

Run Type	Engine	Question	Standard	Stability
Performance Benchmark	`bee-gpu-burn`	How much isolated compute performance does the GPU realize in this server?	` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `	` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `
Power / Thermal Fit	`dcgmi targeted_power`	How much power per GPU can this server sustain as GPU count ramps up?	` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `	` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `

Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.

` + html.EscapeString(title) + `

`) if strings.TrimSpace(description) != "" { b.WriteString(`

` + html.EscapeString(description) + `

`) } b.WriteString(`

`) b.WriteString(``) for i := 0; i <= maxGPUIndex; i++ { b.WriteString(``) } b.WriteString(``) for i, run := range runs { b.WriteString(``) b.WriteString(``) b.WriteString(``) overallColor := "var(--ok)" overallLabel := run.overallStatus if overallLabel == "" { overallLabel = "OK" } if overallLabel == "FAILED" { overallColor = "var(--crit-fg,#9f3a38)" } else if overallLabel != "OK" { overallColor = "var(--warn)" } b.WriteString(``) for idx := 0; idx <= maxGPUIndex; idx++ { score, ok := run.gpuScores[idx] if !ok { b.WriteString(``) continue } gpuStatus := run.gpuStatuses[idx] scoreColor := "" switch gpuStatus { case "FAILED": scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"` case "WARNING", "PARTIAL": scoreColor = ` style="color:var(--warn);font-weight:600"` case "", "OK": // no override default: scoreColor = ` style="color:var(--warn);font-weight:600"` } b.WriteString(`` + fmt.Sprintf("%.2f", score) + ``) } b.WriteString(``) } b.WriteString(`

Run	Time	Status	GPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `	` + html.EscapeString(run.displayTime) + `	` + html.EscapeString(overallLabel) + `	-

Power / Thermal Fit Results

`) latest := runs[0].result b.WriteString(`

Latest run: ` + html.EscapeString(runs[0].displayTime)) if latest.Hostname != "" { b.WriteString(` — ` + html.EscapeString(latest.Hostname)) } if latest.OverallStatus != "" { statusColor := "var(--ok)" if latest.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(` — ` + html.EscapeString(latest.OverallStatus) + ``) } b.WriteString(`

`) if len(latest.GPUs) > 0 { b.WriteString(`

`) b.WriteString(``) b.WriteString(``) for _, gpu := range latest.GPUs { // finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp, // falling back to single-card applied limit if the ramp hasn't run. finalLimitW := gpu.StablePowerLimitW if finalLimitW <= 0 { finalLimitW = gpu.AppliedPowerLimitW } // Derate is relative to nominal (DefaultPowerLimitW), using the final limit. derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1) rowStyle := "" finalStyle := "" if derated { rowStyle = ` style="background:rgba(255,180,0,0.08)"` finalStyle = ` style="color:#e6a000;font-weight:600"` } statusLabel := gpu.Status if statusLabel == "" { statusLabel = "OK" } statusColor := "var(--ok)" if statusLabel == "FAILED" { statusColor = "var(--crit-fg,#9f3a38)" } else if statusLabel != "OK" { statusColor = "var(--warn)" } nominalStr := "-" if gpu.DefaultPowerLimitW > 0 { nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) } singleStr := "-" if gpu.AppliedPowerLimitW > 0 { singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) } multiStr := "-" if gpu.StablePowerLimitW > 0 { multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW) } p95Str := "-" if gpu.MaxObservedPowerW > 0 { p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW) } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(`` + multiStr + ``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`

GPU	Model	Nominal W	Single-card W	Multi-GPU W	P95 Observed W	Status
` + strconv.Itoa(gpu.Index) + `	` + html.EscapeString(gpu.Name) + `	` + nominalStr + `	` + singleStr + `	` + p95Str + `	` + html.EscapeString(statusLabel) + `

`) } if len(runs) > 1 { b.WriteString(`

` + strconv.Itoa(len(runs)) + ` runs total

`) b.WriteString(`

`) for i, run := range runs { statusColor := "var(--ok)" if run.result.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`

#	Time	GPUs	Status
#` + strconv.Itoa(i+1) + `	` + html.EscapeString(run.displayTime) + `	` + strconv.Itoa(len(run.result.GPUs)) + `	` + html.EscapeString(run.result.OverallStatus) + `

`) } b.WriteString(`

` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

` + html.EscapeString(title) + `

Bee Export Files