` + html.EscapeString(title) + `
`
}
// layoutNav returns the navigation markup for the page layout.
//
// NOTE(review): in this view the nav item table is declared but never written
// to the builder, and neither active nor buildLabel is read — the markup
// template appears to have been stripped. Confirm against the full file.
func layoutNav(active string, buildLabel string) string {
// Navigation entries in display order. Every entry listed here has an empty
// onclick value.
items := []struct{ id, label, href, onclick string }{
{"dashboard", "Dashboard", "/", ""},
{"audit", "Audit", "/audit", ""},
{"validate", "Validate", "/validate", ""},
{"burn", "Burn", "/burn", ""},
{"benchmark", "Benchmark", "/benchmark", ""},
{"tasks", "Tasks", "/tasks", ""},
{"tools", "Tools", "/tools", ""},
}
var b strings.Builder
b.WriteString(``)
return b.String()
}
// renderPage dispatches to the appropriate page renderer.
//
// For each known page name it selects a page ID, a title and a rendered body.
// An empty page name is treated as the dashboard. Unknown names keep the
// "dashboard" page ID but get the title "Not Found".
//
// NOTE(review): the end of this function (the not-found body and the final
// assembly of the page) is not fully visible in this view.
func renderPage(page string, opts HandlerOptions) string {
var pageID, title, body string
switch page {
// Pages that are listed in the navigation.
case "dashboard", "":
pageID = "dashboard"
title = "Dashboard"
body = renderDashboard(opts)
case "audit":
pageID = "audit"
title = "Audit"
body = renderAudit()
case "validate":
pageID = "validate"
title = "Validate"
body = renderValidate(opts)
case "burn":
pageID = "burn"
title = "Burn"
body = renderBurn()
case "benchmark":
pageID = "benchmark"
title = "Benchmark"
body = renderBenchmark(opts)
case "tasks":
pageID = "tasks"
title = "Tasks"
body = renderTasks()
case "tools":
pageID = "tools"
title = "Tools"
body = renderTools()
// Legacy routes kept accessible but not in nav
case "metrics":
pageID = "metrics"
title = "Live Metrics"
body = renderMetrics()
// "tests" is an alias of the validate page: same page ID and renderer, but
// with its own title.
case "tests":
pageID = "validate"
title = "Acceptance Tests"
body = renderValidate(opts)
// "burn-in" is an alias of the burn page: same page ID and renderer, but with
// its own title.
case "burn-in":
pageID = "burn"
title = "Burn-in Tests"
body = renderBurn()
case "network":
pageID = "network"
title = "Network"
body = renderNetwork()
case "services":
pageID = "services"
title = "Services"
body = renderServices()
case "export":
pageID = "export"
title = "Export"
body = renderExport(opts.ExportDir)
case "install":
pageID = "install"
title = "Install to Disk"
body = renderInstall()
// Any other page name falls through to the not-found handling.
default:
pageID = "dashboard"
title = "Not Found"
body = `
` +
renderAuditModal() +
`` +
``
}
// ── Dashboard ─────────────────────────────────────────────────────────────────
// renderDashboard assembles the dashboard body by concatenating, in this
// order, the audit status banner, the hardware summary card, the runtime
// health card and the live metrics section.
func renderDashboard(opts HandlerOptions) string {
	sections := []string{
		renderAuditStatusBanner(opts),
		renderHardwareSummaryCard(opts),
		renderHealthCard(opts),
		renderMetrics(),
	}
	return strings.Join(sections, "")
}
// renderAuditStatusBanner returns the banner announcing that a hardware audit
// is running and that the page will refresh automatically once it completes.
func renderAuditStatusBanner(opts HandlerOptions) string {
	// The snapshot is probed, but the outcome does not change the output: the
	// banner is returned whether or not audit data already exists, so that a
	// newly-triggered audit also causes a reload.
	_, err := loadSnapshot(opts.AuditPath)
	hasData := err == nil
	_ = hasData

	return `
▶ Hardware audit is running — page will refresh automatically when complete.View in Tasks
`
}
var ingest schema.HardwareIngestRequest
if err := json.Unmarshal(data, &ingest); err != nil {
return `
Hardware Summary
Parse error
`
}
hw := ingest.Hardware
var records []app.ComponentStatusRecord
if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
records = db.All()
}
var b strings.Builder
b.WriteString(`
Hardware Summary
`)
// Server identity block above the component table.
{
var model, serial string
parts := []string{}
if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" {
parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer))
}
if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" {
parts = append(parts, strings.TrimSpace(*hw.Board.ProductName))
}
if len(parts) > 0 {
model = strings.Join(parts, " ")
}
serial = strings.TrimSpace(hw.Board.SerialNumber)
if model != "" || serial != "" {
b.WriteString(`
`)
if model != "" {
fmt.Fprintf(&b, `
%s
`, html.EscapeString(model))
}
if serial != "" {
fmt.Fprintf(&b, `
`
}
var health schema.RuntimeHealth
if err := json.Unmarshal(data, &health); err != nil {
return `
Runtime Health
Parse error
`
}
status := strings.TrimSpace(health.Status)
if status == "" {
status = "UNKNOWN"
}
badge := "badge-ok"
if status == "PARTIAL" {
badge = "badge-warn"
} else if status == "FAIL" || status == "FAILED" {
badge = "badge-err"
}
var b strings.Builder
b.WriteString(`
`)
return b.String()
}
// runtimeHealthRow is one line of the runtime health table.
type runtimeHealthRow struct {
	// Title names the subsystem the row reports on.
	// Status is the state label, e.g. "OK", "PARTIAL", "WARNING", "FAILED" or
	// "UNKNOWN".
	// Source names the probe or data file the status was derived from.
	// Issue carries the problem description; some builders store plain detail
	// text here instead (for example the USB mount path).
	Title, Status, Source, Issue string
}
// buildRuntimeExportRow reports on the export directory.
//
// The row is FAILED when an "export_dir_unavailable" issue is present, OK when
// no such issue exists and an export directory is set, and UNKNOWN otherwise.
// The export directory, when set, is appended to the source label.
func buildRuntimeExportRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{
		Title:  "Export Directory",
		Status: "UNKNOWN",
		Source: "os.MkdirAll",
		Issue:  runtimeIssueDescriptions(health.Issues, "export_dir_unavailable"),
	}
	exportDir := strings.TrimSpace(health.ExportDir)

	if row.Issue != "" {
		row.Status = "FAILED"
	} else if exportDir != "" {
		row.Status = "OK"
	}
	if exportDir != "" {
		row.Source += " " + exportDir
	}
	return row
}
// buildRuntimeNetworkRow reports on networking. The status is the trimmed
// NetworkStatus from the health record, or UNKNOWN when that is blank; the
// issue text lists any "dhcp_partial" / "dhcp_failed" issues.
func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{
		Title:  "Network",
		Status: "UNKNOWN",
		Source: "ListInterfaces / DHCP",
	}
	if reported := strings.TrimSpace(health.NetworkStatus); reported != "" {
		row.Status = reported
	}
	row.Issue = runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
	return row
}
// buildRuntimeDriverRow reports on the GPU kernel driver.
//
// Status matrix (DriverReady × driver issue present):
//
//	ready,     no issue -> OK
//	ready,     issue    -> PARTIAL
//	not ready, issue    -> FAILED
//	not ready, no issue -> UNKNOWN
func buildRuntimeDriverRow(health schema.RuntimeHealth) runtimeHealthRow {
	problems := runtimeIssueDescriptions(health.Issues, "nvidia_kernel_module_missing", "nvidia_modeset_failed", "amdgpu_kernel_module_missing")
	hasProblems := problems != ""

	var state string
	if health.DriverReady {
		if hasProblems {
			state = "PARTIAL"
		} else {
			state = "OK"
		}
	} else if hasProblems {
		state = "FAILED"
	} else {
		state = "UNKNOWN"
	}

	return runtimeHealthRow{
		Title:  "NVIDIA/AMD Driver",
		Status: state,
		Source: "lsmod / vendor probe",
		Issue:  problems,
	}
}
// buildRuntimeAccelerationRow reports on the GPU compute runtime.
//
// The status is OK when CUDAReady is set and no acceleration issue is
// recorded, PARTIAL when CUDAReady is set despite such an issue, FAILED when
// only the issue is present, and UNKNOWN when neither is.
func buildRuntimeAccelerationRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{
		Title:  "CUDA / ROCm",
		Status: "UNKNOWN",
		Source: "bee-gpu-burn / rocm-smi",
		Issue:  runtimeIssueDescriptions(health.Issues, "cuda_runtime_not_ready", "rocm_smi_unavailable"),
	}
	clean := row.Issue == ""

	if health.CUDAReady && clean {
		row.Status = "OK"
	} else if health.CUDAReady {
		row.Status = "PARTIAL"
	} else if !clean {
		row.Status = "FAILED"
	}
	return row
}
// buildRuntimeToolsRow reports on required command-line utilities. With no
// tool data the row is UNKNOWN; otherwise it is OK when every tool is marked
// OK and PARTIAL (listing the missing tool names) when at least one is not.
func buildRuntimeToolsRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{Title: "Required Utilities", Source: "CheckTools"}
	if len(health.Tools) == 0 {
		row.Status = "UNKNOWN"
		row.Issue = "No tool status data."
		return row
	}

	var absent []string
	for _, tool := range health.Tools {
		if tool.OK {
			continue
		}
		absent = append(absent, tool.Name)
	}

	if len(absent) == 0 {
		row.Status = "OK"
		return row
	}
	row.Status = "PARTIAL"
	row.Issue = "Missing: " + strings.Join(absent, ", ")
	return row
}
// buildRuntimeServicesRow reports on the Bee services. With no service data
// the row is UNKNOWN; otherwise it is OK when every service is in an accepted
// state and PARTIAL when any is not, listing each offender as "name=status"
// with the status exactly as reported.
func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
	if len(health.Services) == 0 {
		return runtimeHealthRow{
			Title:  "Bee Services",
			Status: "UNKNOWN",
			Source: "systemctl is-active",
			Issue:  "No service status data.",
		}
	}

	var offenders []string
	for _, svc := range health.Services {
		state := strings.TrimSpace(strings.ToLower(svc.Status))
		// Besides "active", the transitional states count as healthy:
		// "activating" and "deactivating" are transient states for oneshot
		// services (RemainAfterExit=yes), not failures. Anything else, such
		// as "failed" or "inactive", is reported.
		healthy := state == "active" ||
			state == "activating" ||
			state == "deactivating" ||
			state == "reloading"
		if !healthy {
			offenders = append(offenders, svc.Name+"="+svc.Status)
		}
	}

	row := runtimeHealthRow{Title: "Bee Services", Status: "OK", Source: "ServiceState"}
	if len(offenders) > 0 {
		row.Status = "PARTIAL"
		row.Issue = strings.Join(offenders, ", ")
	}
	return row
}
// buildRuntimeUSBExportRow reports on the USB export drive. When a USB export
// path is known the row is OK and the Issue field carries that path; otherwise
// the row is a WARNING asking the operator to plug in a drive.
func buildRuntimeUSBExportRow(health schema.RuntimeHealth) runtimeHealthRow {
	row := runtimeHealthRow{
		Title:  "USB Export Drive",
		Source: "/proc/mounts + lsblk",
	}
	if mount := strings.TrimSpace(health.USBExportPath); mount == "" {
		row.Status = "WARNING"
		row.Issue = "No writable USB drive mounted. Plug in a USB drive to enable log export."
	} else {
		row.Status = "OK"
		row.Issue = mount
	}
	return row
}
// buildRuntimeToRAMRow reports whether the live image runs from RAM, based on
// the case-insensitive, trimmed ToRAMStatus value: "ok" -> OK, "partial" ->
// WARNING, "failed" -> FAILED, anything else -> WARNING (not copied to RAM).
func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
	// Start from the fields shared by most outcomes, then adjust per state.
	row := runtimeHealthRow{
		Title:  "LiveCD in RAM",
		Source: "live-boot / /proc/mounts",
	}

	switch strings.ToLower(strings.TrimSpace(health.ToRAMStatus)) {
	case "ok":
		row.Status = "OK"
	case "partial":
		row.Status = "WARNING"
		row.Source = "live-boot / /proc/mounts / /dev/shm/bee-live"
		row.Issue = "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried."
	case "failed":
		row.Status = "FAILED"
		row.Issue = "toram boot parameter set but ISO is not mounted from RAM. Copy may have failed."
	default:
		// toram is not active: the ISO is still on the original boot media.
		row.Status = "WARNING"
		row.Issue = "ISO not copied to RAM. Use \u201cCopy to RAM\u201d to free the boot drive and improve performance."
	}
	return row
}
// buildHardwareComponentRows returns one health row per hardware group (CPU,
// Memory, Storage, GPU, PSU), aggregated from the component status DB stored
// as component-status.json inside exportDir. When the DB cannot be opened,
// five UNKNOWN placeholder rows are returned instead.
func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
	const dbFile = "component-status.json"

	db, err := app.OpenComponentStatusDB(filepath.Join(exportDir, dbFile))
	if err != nil {
		const unavailable = "Component status DB not available."
		placeholder := func(component, issue string) runtimeHealthRow {
			return runtimeHealthRow{
				Title:  component + " Component Health",
				Status: "UNKNOWN",
				Source: dbFile,
				Issue:  issue,
			}
		}
		return []runtimeHealthRow{
			placeholder("CPU", unavailable),
			placeholder("Memory", unavailable),
			placeholder("Storage", unavailable),
			placeholder("GPU", unavailable),
			placeholder("PSU", "No PSU component checks recorded."),
		}
	}

	// Each group selects its records by exact keys and/or key prefixes.
	groups := []struct {
		title    string
		exact    []string
		prefixes []string
	}{
		{title: "CPU", exact: []string{"cpu:all"}},
		{title: "Memory", exact: []string{"memory:all"}, prefixes: []string{"memory:"}},
		{title: "Storage", exact: []string{"storage:all"}, prefixes: []string{"storage:"}},
		{title: "GPU", prefixes: []string{"pcie:gpu:"}},
		{title: "PSU", prefixes: []string{"psu:"}},
	}

	all := db.All()
	rows := make([]runtimeHealthRow, 0, len(groups))
	for _, g := range groups {
		rows = append(rows, aggregateComponentStatus(g.title, all, g.exact, g.prefixes))
	}
	return rows
}
// firstNonEmpty returns the first value in vals that is not the empty string,
// or "" when every value is empty or no values are given. Values are compared
// as-is; whitespace is not trimmed.
func firstNonEmpty(vals ...string) string {
	for i := 0; i < len(vals); i++ {
		if len(vals[i]) > 0 {
			return vals[i]
		}
	}
	return ""
}
// matchedRecords returns, in input order, every record whose trimmed
// ComponentKey equals one of the exact keys or starts with one of the
// prefixes. Records with a blank key are skipped. The result is nil when
// nothing matches. Used for per-device chip rendering.
func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
	var hits []app.ComponentStatusRecord
	for i := range records {
		componentKey := strings.TrimSpace(records[i].ComponentKey)
		if componentKey == "" {
			continue
		}
		if !containsExactKey(componentKey, exact) && !hasAnyPrefix(componentKey, prefixes) {
			continue
		}
		hits = append(hits, records[i])
	}
	return hits
}
// aggregateComponentStatus folds every component record selected by the exact
// keys or key prefixes into a single health row titled title.
//
// The row's Status is that of the most severe matching record (CRITICAL >
// WARNING > OK; any other status ranks lowest and yields UNKNOWN). Source and
// Issue are collected only from the records at that highest severity and are
// de-duplicated in first-seen order:
//
//   - Source comes from each record's latest history entry, falling back to
//     "component-status.json" when that is blank; sources are joined with ", ".
//   - Issue comes from ErrorSummary or, when that is blank, from the latest
//     history detail; issues are joined with "; ". With no issue text at all
//     the field is "—".
//
// When no record matches, an UNKNOWN row with the issue
// "No component status data." is returned.
func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
	const defaultSource = "component-status.json"

	// Reuse the shared selector rather than duplicating its matching loop.
	matched := matchedRecords(records, exact, prefixes)
	if len(matched) == 0 {
		return runtimeHealthRow{Title: title, Status: "UNKNOWN", Source: defaultSource, Issue: "No component status data."}
	}

	// Rank every record once; the ranks are needed both to find the maximum
	// and, below, to keep only the records at that maximum.
	severities := make([]int, len(matched))
	maxSev := -1
	for i, rec := range matched {
		severities[i] = runtimeComponentSeverity(rec.Status)
		if severities[i] > maxSev {
			maxSev = severities[i]
		}
	}

	status := "UNKNOWN"
	switch maxSev {
	case 3:
		status = "CRITICAL"
	case 2:
		status = "WARNING"
	case 1:
		status = "OK"
	}

	var sources, issues []string
	sourceSeen := map[string]struct{}{}
	issueSeen := map[string]struct{}{}
	for i, rec := range matched {
		if severities[i] != maxSev {
			continue
		}

		source := latestComponentSource(rec)
		if source == "" {
			source = defaultSource
		}
		if _, ok := sourceSeen[source]; !ok {
			sourceSeen[source] = struct{}{}
			sources = append(sources, source)
		}

		issue := strings.TrimSpace(rec.ErrorSummary)
		if issue == "" {
			issue = latestComponentDetail(rec)
		}
		if issue == "" {
			continue
		}
		if _, ok := issueSeen[issue]; ok {
			continue
		}
		issueSeen[issue] = struct{}{}
		issues = append(issues, issue)
	}

	// sources cannot be empty here: at least one record sits at maxSev and
	// each such record contributes a non-empty (possibly default) source.
	issue := strings.Join(issues, "; ")
	if issue == "" {
		issue = "—"
	}
	return runtimeHealthRow{
		Title:  title,
		Status: status,
		Source: strings.Join(sources, ", "),
		Issue:  issue,
	}
}
// containsExactKey reports whether key is equal to any entry of exact.
// A nil or empty exact list never matches.
func containsExactKey(key string, exact []string) bool {
	for i := 0; i < len(exact); i++ {
		if exact[i] == key {
			return true
		}
	}
	return false
}
// hasAnyPrefix reports whether key starts with at least one of prefixes.
// A nil or empty prefix list never matches; an empty prefix matches any key.
func hasAnyPrefix(key string, prefixes []string) bool {
	found := false
	for i := 0; i < len(prefixes) && !found; i++ {
		found = strings.HasPrefix(key, prefixes[i])
	}
	return found
}
// runtimeComponentSeverity ranks a component status for comparison:
// "critical" is 3, "warning" is 2, "ok" is 1 and anything else is 0.
// Matching ignores case and surrounding whitespace.
func runtimeComponentSeverity(status string) int {
	normalized := strings.TrimSpace(strings.ToLower(status))
	if normalized == "critical" {
		return 3
	}
	if normalized == "warning" {
		return 2
	}
	if normalized == "ok" {
		return 1
	}
	return 0
}
// latestComponentSource returns the trimmed Source of the last entry in the
// record's History, or "" when the history is empty.
func latestComponentSource(rec app.ComponentStatusRecord) string {
	if n := len(rec.History); n > 0 {
		return strings.TrimSpace(rec.History[n-1].Source)
	}
	return ""
}
// latestComponentDetail returns the trimmed Detail of the last entry in the
// record's History, or "" when the history is empty.
func latestComponentDetail(rec app.ComponentStatusRecord) string {
	if n := len(rec.History); n > 0 {
		return strings.TrimSpace(rec.History[n-1].Detail)
	}
	return ""
}
// runtimeIssueDescriptions joins, with "; ", the descriptions of all issues
// whose Code is one of codes, in the order the issues appear. An issue with a
// blank description contributes its code instead. The result is "" when
// either list is empty or nothing matches.
func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) string {
	if len(issues) == 0 || len(codes) == 0 {
		return ""
	}

	wanted := func(code string) bool {
		for _, candidate := range codes {
			if candidate == code {
				return true
			}
		}
		return false
	}

	var joined strings.Builder
	first := true
	for _, issue := range issues {
		if !wanted(issue.Code) {
			continue
		}
		text := strings.TrimSpace(issue.Description)
		if text == "" {
			text = issue.Code
		}
		if !first {
			joined.WriteString("; ")
		}
		joined.WriteString(text)
		first = false
	}
	return joined.String()
}
// chipLetterClass maps a component status to a single display letter and CSS
// class. Matching ignores case and surrounding whitespace; statuses outside
// the known OK / warning / failure groups map to "?" and "chip-unknown".
func chipLetterClass(status string) (letter, cls string) {
	letter, cls = "?", "chip-unknown"
	switch strings.ToUpper(strings.TrimSpace(status)) {
	case "OK":
		letter, cls = "O", "chip-ok"
	case "WARNING", "WARN", "PARTIAL":
		letter, cls = "W", "chip-warn"
	case "CRITICAL", "FAIL", "FAILED", "ERROR":
		letter, cls = "F", "chip-fail"
	}
	return letter, cls
}
// renderComponentChips renders one 20×20 chip per ComponentStatusRecord.
// Hover tooltip shows component key, status, error summary and last check time.
// Falls back to a single unknown chip when no records are available.
//
// The records are sorted by ComponentKey in place, so the caller's slice is
// reordered as a side effect.
//
// NOTE(review): in this view the chip format string holds a single verb while
// three arguments are passed — the markup appears to have been stripped.
// Confirm against the full file.
func renderComponentChips(matched []app.ComponentStatusRecord) string {
if len(matched) == 0 {
return `?`
}
// Sort by key so the chips appear in a stable order.
sort.Slice(matched, func(i, j int) bool {
return matched[i].ComponentKey < matched[j].ComponentKey
})
var b strings.Builder
b.WriteString(``)
for _, rec := range matched {
letter, cls := chipLetterClass(rec.Status)
// Tooltip layout: "<key>: <status>[ — <error summary>][ (checked HH:MM:SS)]".
// A blank status is shown as UNKNOWN.
var tooltip strings.Builder
tooltip.WriteString(rec.ComponentKey)
tooltip.WriteString(": ")
tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN"))
if rec.ErrorSummary != "" {
tooltip.WriteString(" — ")
tooltip.WriteString(rec.ErrorSummary)
}
if !rec.LastCheckedAt.IsZero() {
fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
}
// The tooltip text is HTML-escaped before being placed in the output.
fmt.Fprintf(&b, `%s`,
cls, html.EscapeString(tooltip.String()), letter)
}
b.WriteString(``)
return b.String()
}
// runtimeStatusBadge renders a status label. The status is trimmed and
// upper-cased, mapped to a badge class (ok / warn / err, or unknown for
// anything else), and HTML-escaped before being returned.
//
// NOTE(review): in this view the chosen badge class is not referenced by the
// returned string — the surrounding markup appears to have been stripped.
// Confirm against the full file.
func runtimeStatusBadge(status string) string {
status = strings.ToUpper(strings.TrimSpace(status))
badge := "badge-unknown"
switch status {
case "OK":
badge = "badge-ok"
case "PARTIAL", "WARNING", "WARN":
badge = "badge-warn"
case "FAIL", "FAILED", "CRITICAL":
badge = "badge-err"
}
return `` + html.EscapeString(status) + ``
}
// rowIssueHTML returns the HTML-escaped, trimmed issue text for a table cell,
// or a dash placeholder when the issue is blank.
func rowIssueHTML(issue string) string {
	if trimmed := strings.TrimSpace(issue); trimmed != "" {
		return html.EscapeString(trimmed)
	}
	return `—`
}
// ── Metrics ───────────────────────────────────────────────────────────────────
// renderMetrics returns the static template of the live metrics section:
// server load, CPU and ambient temperatures, power and fan RPM, followed by
// the GPU metric headings. It takes no input and always returns the same text.
func renderMetrics() string {
	const metricsTemplate = `
Live metrics — updated every 2 seconds.
Server — Load
Temperature — CPU
Temperature — Ambient Sensors
Server — Power
Server — Fan RPM
GPU Metrics
Detected GPUs are rendered in a dedicated section.
GPU — Compute Load
GPU — Memory Load
GPU — Core Clock
GPU — Power
GPU — Temperature
`
	return metricsTemplate
}
// ── Validate (Acceptance Tests) ───────────────────────────────────────────────
// validateInventory holds the per-subsystem device summaries shown on the
// Validate page, plus the GPU counts used for duration estimates.
type validateInventory struct {
	// Display-ready device summary for each subsystem.
	CPU, Memory, Storage, NVIDIA, AMD string

	// Number of present GPUs detected per vendor.
	NvidiaGPUCount, AMDGPUCount int
}
// validateFmtDur formats a duration given in seconds as an approximate,
// human-readable string. Durations below 120 s are shown in seconds
// ("~N s"); longer ones are shown in minutes ("~N min"), computed as
// (secs+29)/60 with integer division.
func validateFmtDur(secs int) string {
	value, unit := secs, "s"
	if secs >= 120 {
		value, unit = (secs+29)/60, "min"
	}
	return fmt.Sprintf("~%d %s", value, unit)
}
// validateTotalValidateSec returns the estimated wall-clock duration, in
// seconds, of "Validate one by one" in Validate mode for n NVIDIA GPUs.
// A negative n is treated as zero. Only the per-GPU validate estimate scales
// with n; the CPU, memory, interconnect and bandwidth estimates are added once.
func validateTotalValidateSec(n int) int {
	gpus := n
	if gpus < 0 {
		gpus = 0
	}

	total := gpus * platform.SATEstimatedNvidiaGPUValidatePerGPUSec
	total += platform.SATEstimatedCPUValidateSec
	total += platform.SATEstimatedMemoryValidateSec
	total += platform.SATEstimatedNvidiaInterconnectSec
	total += platform.SATEstimatedNvidiaBandwidthSec
	return total
}
// validateTotalStressSec returns the estimated wall-clock duration, in
// seconds, of "Validate one by one" in Stress mode for n NVIDIA GPUs.
// A negative n is treated as zero. The GPU stress, targeted stress and
// targeted power estimates scale with n; the CPU, memory, pulse test,
// interconnect and bandwidth estimates are added once.
func validateTotalStressSec(n int) int {
	gpus := n
	if gpus < 0 {
		gpus = 0
	}

	// Per-GPU stages.
	total := gpus * platform.SATEstimatedNvidiaGPUStressPerGPUSec
	total += gpus * platform.SATEstimatedNvidiaTargetedStressPerGPUSec
	total += gpus * platform.SATEstimatedNvidiaTargetedPowerPerGPUSec

	// Stages that run once regardless of GPU count.
	total += platform.SATEstimatedCPUStressSec
	total += platform.SATEstimatedMemoryStressSec
	total += platform.SATEstimatedNvidiaPulseTestSec
	total += platform.SATEstimatedNvidiaInterconnectSec
	total += platform.SATEstimatedNvidiaBandwidthSec
	return total
}
// renderValidate returns the body of the Validate page: an introduction, the
// profile summary with total duration estimates, one SAT card per subsystem
// (CPU, memory, storage, the NVIDIA tests, AMD) and the test output area.
//
// Device summaries come from loadValidateInventory; per-test durations are
// formatted from the platform.SATEstimated* values with validateFmtDur.
func renderValidate(opts HandlerOptions) string {
inv := loadValidateInventory(opts)
// n scales the per-GPU estimates shown in the page.
n := inv.NvidiaGPUCount
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
// The GPU count is appended to the totals only when at least one NVIDIA GPU
// was detected.
gpuNote := ""
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
Tasks continue in the background — view progress in Tasks.
Validate Profile
Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`lscpu, sensors, stress-ng`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) +
`
NVIDIA GPU Selection
` + inv.NVIDIA + `
All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA validate tasks.
` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`dcgmi diag targeted_stress`,
func() string {
// With a known GPU count the sequential total is spelled out; otherwise
// only the per-GPU estimate is shown.
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `
Only runs in Stress mode. Switch mode above to enable in Run All.
`
}(),
)) +
`
` +
`
` +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`dcgmi diag targeted_power`,
func() string {
// With a known GPU count the sequential total is spelled out; otherwise
// only the per-GPU estimate is shown.
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `
Only runs in Stress mode. Switch mode above to enable in Run All.
`
}(),
)) +
`
` +
`
` +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`dcgmi diag pulse_test`,
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`
Only runs in Stress mode. Switch mode above to enable in Run All.
`,
)) +
`
` +
`
` +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`all_reduce_perf (NCCL tests)`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
`
` +
`
` +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`nvbandwidth`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`
` +
`
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
`GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
``,
)) +
`
Test Output
`
}
// loadValidateInventory builds the device summaries for the Validate page
// from the audit snapshot at opts.AuditPath.
//
// When the snapshot cannot be loaded or parsed, every summary reads
// "Audit snapshot not loaded." and both GPU counts are zero. Otherwise CPUs,
// memory modules, storage devices and NVIDIA/AMD GPUs are counted per model,
// skipping devices explicitly marked as not present.
func loadValidateInventory(opts HandlerOptions) validateInventory {
	const notLoaded = "Audit snapshot not loaded."
	inv := validateInventory{
		CPU:     notLoaded,
		Memory:  notLoaded,
		Storage: notLoaded,
		NVIDIA:  notLoaded,
		AMD:     notLoaded,
	}

	raw, err := loadSnapshot(opts.AuditPath)
	if err != nil {
		return inv
	}
	var snapshot schema.HardwareIngestRequest
	if err = json.Unmarshal(raw, &snapshot); err != nil {
		return inv
	}

	// tally tracks the number of present devices and the per-model breakdown.
	type tally struct {
		total  int
		models map[string]int
	}
	var cpus, dimms, disks, nvidia, amd tally
	for _, tl := range []*tally{&cpus, &dimms, &disks, &nvidia, &amd} {
		tl.models = map[string]int{}
	}
	count := func(tl *tally, model string) {
		tl.total++
		addValidateModel(tl.models, model)
	}

	// A device counts as present unless it is explicitly flagged otherwise.
	for _, cpu := range snapshot.Hardware.CPUs {
		if cpu.Present == nil || *cpu.Present {
			count(&cpus, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
		}
	}
	for _, dimm := range snapshot.Hardware.Memory {
		if dimm.Present == nil || *dimm.Present {
			count(&dimms, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
		}
	}
	for _, disk := range snapshot.Hardware.Storage {
		if disk.Present == nil || *disk.Present {
			count(&disks, validateFirstNonEmpty(validateTrimPtr(disk.Model), validateTrimPtr(disk.Manufacturer), "unknown"))
		}
	}
	for _, pcie := range snapshot.Hardware.PCIeDevices {
		if pcie.Present != nil && !*pcie.Present {
			continue
		}
		// The two vendor checks are independent of each other.
		if validateIsVendorGPU(pcie, "nvidia") {
			count(&nvidia, validateFirstNonEmpty(validateTrimPtr(pcie.Model), validateTrimPtr(pcie.Manufacturer), "unknown"))
		}
		if validateIsVendorGPU(pcie, "amd") {
			count(&amd, validateFirstNonEmpty(validateTrimPtr(pcie.Model), validateTrimPtr(pcie.Manufacturer), "unknown"))
		}
	}

	inv.CPU = formatValidateDeviceSummary(cpus.total, cpus.models, "CPU")
	inv.Memory = formatValidateDeviceSummary(dimms.total, dimms.models, "module")
	inv.Storage = formatValidateDeviceSummary(disks.total, disks.models, "device")
	inv.NVIDIA = formatValidateDeviceSummary(nvidia.total, nvidia.models, "GPU")
	inv.AMD = formatValidateDeviceSummary(amd.total, amd.models, "GPU")
	inv.NvidiaGPUCount = nvidia.total
	inv.AMDGPUCount = amd.total
	return inv
}
// renderValidateCardBody lays out the four sections of a validate card body —
// devices, description, commands and settings — in that order. Each section is
// emitted verbatim (no escaping is applied here), preceded and followed by a
// line break.
func renderValidateCardBody(devices, description, commands, settings string) string {
	section := func(content string) string {
		return "\n" + content + "\n"
	}
	return section(devices) + section(description) + section(commands) + section(settings)
}
// formatValidateDeviceSummary renders a device count summary.
//
// With total == 0 the result is "0 <unit>s detected.". Otherwise the models
// are listed in alphabetical order as "<count> x <model>" (model names are
// HTML-escaped). A single model is rendered as "<count> x <model> <unit[s]>";
// several models as "<total> <unit[s]>: <list>". The unit is pluralised with
// a trailing "s" unless total is exactly 1.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	plural := unit + "s"
	if total == 0 {
		return "0 " + plural + " detected."
	}

	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := plural
	if total == 1 {
		noun = unit
	}

	// With a single model the leading total would merely repeat the
	// per-model count, so it is omitted.
	if len(entries) == 1 {
		return entries[0] + " " + noun
	}
	return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
}
// addValidateModel increments the counter for the given model name in counts.
// The name is trimmed first; a blank name is counted under "unknown".
// counts must be a non-nil map.
func addValidateModel(counts map[string]int, name string) {
	key := strings.TrimSpace(name)
	if len(key) == 0 {
		key = "unknown"
	}
	counts[key] = counts[key] + 1
}
// validateTrimPtr returns the string value points to with surrounding
// whitespace removed, or "" when value is nil.
func validateTrimPtr(value *string) string {
	var text string
	if value != nil {
		text = strings.TrimSpace(*value)
	}
	return text
}
// validateFirstNonEmpty returns the first value that is non-blank after
// trimming surrounding whitespace, in its trimmed form, or "" when every
// value is blank or no values are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// validateIsVendorGPU reports whether the PCIe device is a GPU of the given
// vendor ("nvidia" or "amd"); any other vendor value yields false.
//
// Model, manufacturer and device class are compared case-insensitively.
// ASPEED devices are never treated as GPUs. NVIDIA devices are recognised by
// name alone. AMD devices must have a display/accelerator device class and an
// AMD/ATI manufacturer or an AMD-branded model name.
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
	model := strings.ToLower(validateTrimPtr(dev.Model))
	manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
	class := strings.ToLower(validateTrimPtr(dev.DeviceClass))

	if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
		return false
	}

	switch vendor {
	case "nvidia":
		return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
	case "amd":
		isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
		// "ati" must match as a whole word: as a plain substring it also
		// occurs in "corporation", which would classify every
		// display-class device from "NVIDIA Corporation" or
		// "Intel Corporation" as an AMD GPU.
		isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") ||
			strings.Contains(manufacturer, "amd") ||
			validateContainsWord(manufacturer, "ati")
		isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
		return isGPUClass && (isAMDVendor || isAMDModel)
	default:
		return false
	}
}

// validateContainsWord reports whether word occurs in text as a complete
// token, where tokens are maximal runs of ASCII lower-case letters and digits.
// Both arguments are expected to be lower-case already.
func validateContainsWord(text, word string) bool {
	tokens := strings.FieldsFunc(text, func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})
	for _, token := range tokens {
		if token == word {
			return true
		}
	}
	return false
}
func renderSATCard(id, label, runAction, headerActions, body string) string {
actions := ``
if strings.TrimSpace(headerActions) != "" {
actions += headerActions
}
return fmt.Sprintf(`
Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.
`
}
sort.Strings(paths)
type powerRun struct {
generatedAt time.Time
displayTime string
result platform.NvidiaPowerBenchResult
}
var runs []powerRun
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var r platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &r); err != nil {
continue
}
runs = append(runs, powerRun{
generatedAt: r.GeneratedAt,
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
result: r,
})
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
// Show only the most recent run's GPU slot table, plus a run history summary.
var b strings.Builder
b.WriteString(`
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.
Tasks continue in the background — view progress in Tasks.
Burn Profile
Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.
Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.
NVIDIA GPU Selection
Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA burn recipes.
Core Burn Paths
GPU Max Load
Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.
Compute Stress
Select which subsystems to stress. Each checked item runs as a separate task.
Output
`
}
// ── Network ───────────────────────────────────────────────────────────────────
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
//
// The result is a fixed template. Its only computed part is a static notice
// about bee-selfheal.timer, which is passed through html.EscapeString before
// being inserted.
//
// NOTE(review): the template text visible here mixes network and service
// wording — the markup appears to have been stripped or spliced. Confirm
// against the full file.
func renderNetworkInline() string {
return `
⚠ Network change applied. Reverting in 60s unless confirmed.
` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `
Loading...
Output
`
}
// renderServices returns the standalone Services page body: a "Bee Services"
// heading wrapped around the inline services UI.
func renderServices() string {
	const heading = `
Bee Services
`
	const footer = `
`
	return heading + renderServicesInline() + footer
}
// ── Export ────────────────────────────────────────────────────────────────────
func renderExport(exportDir string) string {
entries, _ := listExportFiles(exportDir)
var rows strings.Builder
for _, e := range entries {
rows.WriteString(fmt.Sprintf(`
Downloads a tar.gz archive of all audit files, SAT results, and logs.
` + renderSupportBundleInline() + `
Export to USB
` + renderUSBExportInline() + `
Tool Check
Checking...
NVIDIA Self Heal
` +
renderNvidiaSelfHealInline() + `
Network
` +
renderNetworkInline() + `
Services
` +
renderServicesInline() + `
`
}
// ── Install to Disk ──────────────────────────────────────────────────────────
func renderInstallInline() string {
return `
Warning: Installing will completely erase the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.