package webui

import (
	"encoding/json"
	"fmt"
	"html"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	"bee/audit/internal/app"
	"bee/audit/internal/platform"
	"bee/audit/internal/schema"
)

// ── Layout ────────────────────────────────────────────────────────────────────

// layoutHead returns the leading page fragment: the HTML-escaped title placed
// between two literal string fragments.
func layoutHead(title string) string {
	return ` ` + html.EscapeString(title) + ` `
}

// layoutNav returns the navigation fragment. items lists the entries as
// (id, label, href, onclick) tuples in display order.
//
// NOTE(review): in this copy items, active and buildLabel are never read and
// only an empty string is written to the builder — the markup and the code that
// rendered the entries appear to be missing. Confirm against the canonical file.
func layoutNav(active string, buildLabel string) string {
	items := []struct{ id, label, href, onclick string }{
		{"dashboard", "Dashboard", "/", ""},
		{"audit", "Audit", "/audit", ""},
		{"validate", "Validate", "/validate", ""},
		{"burn", "Burn", "/burn", ""},
		{"benchmark", "Benchmark", "/benchmark", ""},
		{"tasks", "Tasks", "/tasks", ""},
		{"tools", "Tools", "/tools", ""},
	}
	var b strings.Builder
	b.WriteString(``)
	return b.String()
}

// renderPage dispatches to the appropriate page renderer.
//
// page selects the renderer; the empty string is treated as "dashboard". Each
// case sets pageID (passed to layoutNav), the page title and the body. The
// legacy routes "tests" and "burn-in" reuse the "validate" and "burn" page IDs.
// An unrecognised page yields a "Not Found" title with the dashboard page ID.
// The result is layoutHead + layoutNav + the escaped title + body + the audit
// modal, joined by literal fragments.
func renderPage(page string, opts HandlerOptions) string {
	var pageID, title, body string
	switch page {
	case "dashboard", "":
		pageID = "dashboard"
		title = "Dashboard"
		body = renderDashboard(opts)
	case "audit":
		pageID = "audit"
		title = "Audit"
		body = renderAudit()
	case "validate":
		pageID = "validate"
		title = "Validate"
		body = renderValidate(opts)
	case "burn":
		pageID = "burn"
		title = "Burn"
		body = renderBurn()
	case "benchmark":
		pageID = "benchmark"
		title = "Benchmark"
		body = renderBenchmark(opts)
	case "tasks":
		pageID = "tasks"
		title = "Tasks"
		body = renderTasks()
	case "tools":
		pageID = "tools"
		title = "Tools"
		body = renderTools()
	// Legacy routes kept accessible but not in nav
	case "metrics":
		pageID = "metrics"
		title = "Live Metrics"
		body = renderMetrics()
	case "tests":
		pageID = "validate"
		title = "Acceptance Tests"
		body = renderValidate(opts)
	case "burn-in":
		pageID = "burn"
		title = "Burn-in Tests"
		body = renderBurn()
	case "network":
		pageID = "network"
		title = "Network"
		body = renderNetwork()
	case "services":
		pageID = "services"
		title = "Services"
		body = renderServices()
	case "export":
		pageID = "export"
		title = "Export"
		body = renderExport(opts.ExportDir)
	case "install":
		pageID = "install"
		title = "Install to Disk"
		body = renderInstall()
	default:
		pageID = "dashboard"
		title = "Not Found"
		body = `
Page not found.
`
	}
	return layoutHead(opts.Title+" — "+title) +
		layoutNav(pageID, opts.BuildLabel) + `

` + html.EscapeString(title) + `

` + body + `
` + renderAuditModal() + `` + ``
}

// ── Dashboard ─────────────────────────────────────────────────────────────────

// renderDashboard concatenates the dashboard sections in display order: audit
// status banner, hardware summary card, runtime health card and live metrics.
func renderDashboard(opts HandlerOptions) string {
	var b strings.Builder
	b.WriteString(renderAuditStatusBanner(opts))
	b.WriteString(renderHardwareSummaryCard(opts))
	b.WriteString(renderHealthCard(opts))
	b.WriteString(renderMetrics())
	return b.String()
}

// renderAuditStatusBanner shows a live progress banner when an audit task is
// running and auto-reloads the page when it completes.
//
// NOTE(review): hasData is computed and then explicitly discarded, and the
// returned literal is a single space in this copy; the banner markup/script
// appears to be missing here.
func renderAuditStatusBanner(opts HandlerOptions) string {
	// If audit data already exists, no banner needed — data is fresh.
	// We still inject the polling script so a newly-triggered audit also reloads.
	hasData := false
	if _, err := loadSnapshot(opts.AuditPath); err == nil {
		hasData = true
	}
	_ = hasData
	return ` `
}

// renderAudit returns the static body of the Audit page.
func renderAudit() string {
	return `
Audit Viewer
`
}

// renderHardwareSummaryCard renders the "Hardware Summary" dashboard card from
// the audit snapshot at opts.AuditPath.
//
// If the snapshot cannot be loaded or does not unmarshal into
// schema.HardwareIngestRequest, a short placeholder card is returned instead.
// Per-component status records are read from component-status.json under
// opts.ExportDir; failure to open that DB is tolerated and leaves records nil.
//
// NOTE(review): several format strings in this copy are empty or hold fewer
// verbs than the arguments passed (see writeRow), and a "%s%s%s" literal is
// written without arguments — the surrounding markup looks stripped. Confirm
// against the canonical file before changing any of the literals.
func renderHardwareSummaryCard(opts HandlerOptions) string {
	data, err := loadSnapshot(opts.AuditPath)
	if err != nil {
		return `
Hardware Summary
`
	}
	var ingest schema.HardwareIngestRequest
	if err := json.Unmarshal(data, &ingest); err != nil {
		return `
Hardware Summary
Parse error
`
	}
	hw := ingest.Hardware

	// Best effort: without the status DB every row falls back to the
	// "no records" rendering of renderComponentChips.
	var records []app.ComponentStatusRecord
	if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
		records = db.All()
	}

	var b strings.Builder
	b.WriteString(`
Hardware Summary
`)

	// Server identity block above the component table.
	{
		var model, serial string
		// model is "<manufacturer> <product>" built from whichever parts are
		// present and non-blank.
		parts := []string{}
		if hw.Board.Manufacturer != nil && strings.TrimSpace(*hw.Board.Manufacturer) != "" {
			parts = append(parts, strings.TrimSpace(*hw.Board.Manufacturer))
		}
		if hw.Board.ProductName != nil && strings.TrimSpace(*hw.Board.ProductName) != "" {
			parts = append(parts, strings.TrimSpace(*hw.Board.ProductName))
		}
		if len(parts) > 0 {
			model = strings.Join(parts, " ")
		}
		serial = strings.TrimSpace(hw.Board.SerialNumber)
		// The identity block is emitted only when there is something to show.
		if model != "" || serial != "" {
			b.WriteString(`
`)
			if model != "" {
				fmt.Fprintf(&b, `
%s
`, html.EscapeString(model))
			}
			if serial != "" {
				fmt.Fprintf(&b, `
S/N: %s
`, html.EscapeString(serial))
			}
			b.WriteString(`
`)
		}
	}

	b.WriteString(``)
	// writeRow appends one component row; label and value are escaped here,
	// badgeHTML is passed through unescaped.
	writeRow := func(label, value, badgeHTML string) {
		b.WriteString(fmt.Sprintf(``, html.EscapeString(label), html.EscapeString(value), badgeHTML))
	}
	writeRow("CPU", hwDescribeCPU(hw), renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
	writeRow("Memory", hwDescribeMemory(hw), renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
	writeRow("Storage", hwDescribeStorage(hw), renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
	writeRow("GPU", hwDescribeGPU(hw), renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
	psuMatched := matchedRecords(records, nil, []string{"psu:"})
	if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
		// No PSU records yet — synthesise a single chip from IPMI status.
		psuStatus := hwPSUStatus(hw.PowerSupplies)
		psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
	}
	writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
	// The Network row is optional and carries no status chips.
	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
		writeRow("Network", nicDesc, "")
	}
	b.WriteString(`
%s%s%s
`)
	b.WriteString(`
`) return b.String() } // hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338". func hwDescribeCPU(hw schema.HardwareSnapshot) string { counts := map[string]int{} order := []string{} for _, cpu := range hw.CPUs { model := "Unknown CPU" if cpu.Model != nil && *cpu.Model != "" { model = *cpu.Model } if counts[model] == 0 { order = append(order, model) } counts[model]++ } if len(order) == 0 { return "—" } parts := make([]string, 0, len(order)) for _, m := range order { if counts[m] > 1 { parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m)) } else { parts = append(parts, m) } } return strings.Join(parts, ", ") } // hwDescribeMemory returns a summary like "16× 32 GB DDR4". func hwDescribeMemory(hw schema.HardwareSnapshot) string { type key struct { sizeMB int typ string } counts := map[key]int{} order := []key{} for _, dimm := range hw.Memory { if dimm.SizeMB == nil || *dimm.SizeMB == 0 { continue } t := "" if dimm.Type != nil { t = *dimm.Type } k := key{*dimm.SizeMB, t} if counts[k] == 0 { order = append(order, k) } counts[k]++ } if len(order) == 0 { return "—" } parts := make([]string, 0, len(order)) for _, k := range order { gb := k.sizeMB / 1024 desc := fmt.Sprintf("%d× %d GB", counts[k], gb) if k.typ != "" { desc += " " + k.typ } parts = append(parts, desc) } return strings.Join(parts, ", ") } // hwDescribeStorage returns a summary like "4× 3.84 TB NVMe, 2× 1.92 TB SATA". 
func hwDescribeStorage(hw schema.HardwareSnapshot) string { type key struct { sizeGB int iface string } counts := map[key]int{} order := []key{} for _, disk := range hw.Storage { sz := 0 if disk.SizeGB != nil { sz = *disk.SizeGB } iface := "" if disk.Interface != nil { iface = *disk.Interface } else if disk.Type != nil { iface = *disk.Type } k := key{sz, iface} if counts[k] == 0 { order = append(order, k) } counts[k]++ } if len(order) == 0 { return "—" } parts := make([]string, 0, len(order)) for _, k := range order { var sizeStr string if k.sizeGB >= 1000 { sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000) } else if k.sizeGB > 0 { sizeStr = fmt.Sprintf("%d GB", k.sizeGB) } else { sizeStr = "?" } desc := fmt.Sprintf("%d× %s", counts[k], sizeStr) if k.iface != "" { desc += " " + k.iface } parts = append(parts, desc) } return strings.Join(parts, ", ") } // hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB". func hwDescribeGPU(hw schema.HardwareSnapshot) string { counts := map[string]int{} order := []string{} for _, dev := range hw.PCIeDevices { if dev.DeviceClass == nil { continue } if !isGPUDeviceClass(*dev.DeviceClass) { continue } model := "Unknown GPU" if dev.Model != nil && *dev.Model != "" { model = *dev.Model } if counts[model] == 0 { order = append(order, model) } counts[model]++ } if len(order) == 0 { return "—" } parts := make([]string, 0, len(order)) for _, m := range order { if counts[m] > 1 { parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m)) } else { parts = append(parts, m) } } return strings.Join(parts, ", ") } // hwPSUStatus returns "OK", "CRITICAL", "WARNING", or "UNKNOWN" based on // PSU statuses from the audit snapshot. Used as fallback when component-status.json // has no psu: records yet (e.g. first boot before audit writes them). 
func hwPSUStatus(psus []schema.HardwarePowerSupply) string { worst := "UNKNOWN" for _, psu := range psus { if psu.Status == nil { continue } switch strings.ToUpper(strings.TrimSpace(*psu.Status)) { case "CRITICAL": return "CRITICAL" case "WARNING": if worst != "CRITICAL" { worst = "WARNING" } case "OK": if worst == "UNKNOWN" { worst = "OK" } } } return worst } // hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU". func hwDescribePSU(hw schema.HardwareSnapshot) string { n := len(hw.PowerSupplies) if n == 0 { return "—" } // Try to get a consistent wattage watt := 0 consistent := true for _, psu := range hw.PowerSupplies { if psu.WattageW == nil { consistent = false break } if watt == 0 { watt = *psu.WattageW } else if *psu.WattageW != watt { consistent = false break } } if consistent && watt > 0 { return fmt.Sprintf("%d× %d W", n, watt) } return fmt.Sprintf("%d× PSU", n) } // hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6". func hwDescribeNIC(hw schema.HardwareSnapshot) string { counts := map[string]int{} order := []string{} for _, dev := range hw.PCIeDevices { isNIC := false if dev.DeviceClass != nil { c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass)) isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel") } if !isNIC && len(dev.MacAddresses) == 0 { continue } model := "" if dev.Model != nil && *dev.Model != "" { model = *dev.Model } else if dev.Manufacturer != nil && *dev.Manufacturer != "" { model = *dev.Manufacturer + " NIC" } else { model = "NIC" } if counts[model] == 0 { order = append(order, model) } counts[model]++ } if len(order) == 0 { return "" } parts := make([]string, 0, len(order)) for _, m := range order { if counts[m] > 1 { parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m)) } else { parts = append(parts, m) } } return strings.Join(parts, ", ") } func isGPUDeviceClass(class string) bool { switch strings.TrimSpace(class) { case "VideoController", "DisplayController", 
"ProcessingAccelerator": return true default: return false } } func renderAuditModal() string { return ` ` } func renderHealthCard(opts HandlerOptions) string { data, err := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")) if err != nil { return `
Runtime Health
No data
`
	}
	var health schema.RuntimeHealth
	if err := json.Unmarshal(data, &health); err != nil {
		return `
Runtime Health
Parse error
`
	}
	// A blank overall status is reported as UNKNOWN.
	status := strings.TrimSpace(health.Status)
	if status == "" {
		status = "UNKNOWN"
	}
	// NOTE(review): the badge defaults to "badge-ok", so UNKNOWN (and any other
	// unrecognised status) is styled as OK here, whereas runtimeStatusBadge
	// defaults to "badge-unknown" — confirm whether that difference is intended.
	badge := "badge-ok"
	if status == "PARTIAL" {
		badge = "badge-warn"
	} else if status == "FAIL" || status == "FAILED" {
		badge = "badge-err"
	}
	var b strings.Builder
	b.WriteString(`
Runtime Health
`)
	// NOTE(review): one verb, two arguments — the markup that consumed badge
	// appears to be missing from this copy.
	b.WriteString(fmt.Sprintf(`
%s
`, badge, html.EscapeString(status)))
	if checkedAt := strings.TrimSpace(health.CheckedAt); checkedAt != "" {
		b.WriteString(`
Checked at: ` + html.EscapeString(checkedAt) + `
`)
	}
	// One row per runtime check, in display order.
	rows := []runtimeHealthRow{
		buildRuntimeExportRow(health),
		buildRuntimeNetworkRow(health),
		buildRuntimeDriverRow(health),
		buildRuntimeAccelerationRow(health),
		buildRuntimeToolsRow(health),
		buildRuntimeServicesRow(health),
		buildRuntimeUSBExportRow(health),
		buildRuntimeToRAMRow(health),
	}
	b.WriteString(``)
	// NOTE(review): in this copy the loop body writes an empty string and the
	// row fields are referenced after the loop; the table markup looks stripped
	// and the statements reordered. Confirm against the canonical file.
	for _, row := range rows {
		b.WriteString(``)
	}
	b.WriteString(`
CheckStatusSourceIssue
` + html.EscapeString(row.Title) + `` + runtimeStatusBadge(row.Status) + `` + html.EscapeString(row.Source) + `` + rowIssueHTML(row.Issue) + `
`)
	b.WriteString(`
`) return b.String() } type runtimeHealthRow struct { Title string Status string Source string Issue string } func buildRuntimeExportRow(health schema.RuntimeHealth) runtimeHealthRow { issue := runtimeIssueDescriptions(health.Issues, "export_dir_unavailable") status := "UNKNOWN" switch { case issue != "": status = "FAILED" case strings.TrimSpace(health.ExportDir) != "": status = "OK" } source := "os.MkdirAll" if dir := strings.TrimSpace(health.ExportDir); dir != "" { source += " " + dir } return runtimeHealthRow{Title: "Export Directory", Status: status, Source: source, Issue: issue} } func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow { status := strings.TrimSpace(health.NetworkStatus) if status == "" { status = "UNKNOWN" } issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed") return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue} } func buildRuntimeDriverRow(health schema.RuntimeHealth) runtimeHealthRow { issue := runtimeIssueDescriptions(health.Issues, "nvidia_kernel_module_missing", "nvidia_modeset_failed", "amdgpu_kernel_module_missing") status := "UNKNOWN" switch { case health.DriverReady && issue == "": status = "OK" case health.DriverReady: status = "PARTIAL" case issue != "": status = "FAILED" } return runtimeHealthRow{Title: "NVIDIA/AMD Driver", Status: status, Source: "lsmod / vendor probe", Issue: issue} } func buildRuntimeAccelerationRow(health schema.RuntimeHealth) runtimeHealthRow { issue := runtimeIssueDescriptions(health.Issues, "cuda_runtime_not_ready", "rocm_smi_unavailable") status := "UNKNOWN" switch { case health.CUDAReady && issue == "": status = "OK" case health.CUDAReady: status = "PARTIAL" case issue != "": status = "FAILED" } return runtimeHealthRow{Title: "CUDA / ROCm", Status: status, Source: "bee-gpu-burn / rocm-smi", Issue: issue} } func buildRuntimeToolsRow(health schema.RuntimeHealth) runtimeHealthRow { if len(health.Tools) == 0 { 
return runtimeHealthRow{Title: "Required Utilities", Status: "UNKNOWN", Source: "CheckTools", Issue: "No tool status data."} } missing := make([]string, 0) for _, tool := range health.Tools { if !tool.OK { missing = append(missing, tool.Name) } } status := "OK" issue := "" if len(missing) > 0 { status = "PARTIAL" issue = "Missing: " + strings.Join(missing, ", ") } return runtimeHealthRow{Title: "Required Utilities", Status: status, Source: "CheckTools", Issue: issue} } func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow { if len(health.Services) == 0 { return runtimeHealthRow{Title: "Bee Services", Status: "UNKNOWN", Source: "systemctl is-active", Issue: "No service status data."} } nonActive := make([]string, 0) for _, svc := range health.Services { state := strings.TrimSpace(strings.ToLower(svc.Status)) // "activating" and "deactivating" are transient states for oneshot services // (RemainAfterExit=yes) — the service is running normally, not failed. // Only "failed" and "inactive" (after services should be running) are problems. switch state { case "active", "activating", "deactivating", "reloading": // OK — service is running or transitioning normally default: nonActive = append(nonActive, svc.Name+"="+svc.Status) } } status := "OK" issue := "" if len(nonActive) > 0 { status = "PARTIAL" issue = strings.Join(nonActive, ", ") } return runtimeHealthRow{Title: "Bee Services", Status: status, Source: "ServiceState", Issue: issue} } func buildRuntimeUSBExportRow(health schema.RuntimeHealth) runtimeHealthRow { path := strings.TrimSpace(health.USBExportPath) if path != "" { return runtimeHealthRow{ Title: "USB Export Drive", Status: "OK", Source: "/proc/mounts + lsblk", Issue: path, } } return runtimeHealthRow{ Title: "USB Export Drive", Status: "WARNING", Source: "/proc/mounts + lsblk", Issue: "No writable USB drive mounted. 
Plug in a USB drive to enable log export.", } } func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow { switch strings.ToLower(strings.TrimSpace(health.ToRAMStatus)) { case "ok": return runtimeHealthRow{ Title: "LiveCD in RAM", Status: "OK", Source: "live-boot / /proc/mounts", Issue: "", } case "partial": return runtimeHealthRow{ Title: "LiveCD in RAM", Status: "WARNING", Source: "live-boot / /proc/mounts / /dev/shm/bee-live", Issue: "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.", } case "failed": return runtimeHealthRow{ Title: "LiveCD in RAM", Status: "FAILED", Source: "live-boot / /proc/mounts", Issue: "toram boot parameter set but ISO is not mounted from RAM. Copy may have failed.", } default: // toram not active — ISO still on original boot media (USB/CD) return runtimeHealthRow{ Title: "LiveCD in RAM", Status: "WARNING", Source: "live-boot / /proc/mounts", Issue: "ISO not copied to RAM. Use \u201cCopy to RAM\u201d to free the boot drive and improve performance.", } } } func buildHardwareComponentRows(exportDir string) []runtimeHealthRow { path := filepath.Join(exportDir, "component-status.json") db, err := app.OpenComponentStatusDB(path) if err != nil { return []runtimeHealthRow{ {Title: "CPU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."}, {Title: "Memory Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."}, {Title: "Storage Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."}, {Title: "GPU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "Component status DB not available."}, {Title: "PSU Component Health", Status: "UNKNOWN", Source: "component-status.json", Issue: "No PSU component checks recorded."}, } } records := db.All() return []runtimeHealthRow{ 
aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil), aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"}), aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"}), aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"}), aggregateComponentStatus("PSU", records, nil, []string{"psu:"}), } } // matchedRecords returns all ComponentStatusRecord entries whose key matches // any exact key or any of the given prefixes. Used for per-device chip rendering. func firstNonEmpty(vals ...string) string { for _, v := range vals { if v != "" { return v } } return "" } func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord { var matched []app.ComponentStatusRecord for _, rec := range records { key := strings.TrimSpace(rec.ComponentKey) if key == "" { continue } if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) { matched = append(matched, rec) } } return matched } func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow { matched := make([]app.ComponentStatusRecord, 0) for _, rec := range records { key := strings.TrimSpace(rec.ComponentKey) if key == "" { continue } if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) { matched = append(matched, rec) } } if len(matched) == 0 { return runtimeHealthRow{Title: title, Status: "UNKNOWN", Source: "component-status.json", Issue: "No component status data."} } maxSev := -1 for _, rec := range matched { if sev := runtimeComponentSeverity(rec.Status); sev > maxSev { maxSev = sev } } status := "UNKNOWN" switch maxSev { case 3: status = "CRITICAL" case 2: status = "WARNING" case 1: status = "OK" } sources := make([]string, 0) sourceSeen := map[string]struct{}{} issues := make([]string, 0) issueSeen := map[string]struct{}{} for _, rec := range matched { if 
runtimeComponentSeverity(rec.Status) != maxSev { continue } source := latestComponentSource(rec) if source == "" { source = "component-status.json" } if _, ok := sourceSeen[source]; !ok { sourceSeen[source] = struct{}{} sources = append(sources, source) } issue := strings.TrimSpace(rec.ErrorSummary) if issue == "" { issue = latestComponentDetail(rec) } if issue == "" { continue } if _, ok := issueSeen[issue]; ok { continue } issueSeen[issue] = struct{}{} issues = append(issues, issue) } if len(sources) == 0 { sources = append(sources, "component-status.json") } issue := strings.Join(issues, "; ") if issue == "" { issue = "—" } return runtimeHealthRow{ Title: title, Status: status, Source: strings.Join(sources, ", "), Issue: issue, } } func containsExactKey(key string, exact []string) bool { for _, candidate := range exact { if key == candidate { return true } } return false } func hasAnyPrefix(key string, prefixes []string) bool { for _, prefix := range prefixes { if strings.HasPrefix(key, prefix) { return true } } return false } func runtimeComponentSeverity(status string) int { switch strings.TrimSpace(strings.ToLower(status)) { case "critical": return 3 case "warning": return 2 case "ok": return 1 default: return 0 } } func latestComponentSource(rec app.ComponentStatusRecord) string { if len(rec.History) == 0 { return "" } return strings.TrimSpace(rec.History[len(rec.History)-1].Source) } func latestComponentDetail(rec app.ComponentStatusRecord) string { if len(rec.History) == 0 { return "" } return strings.TrimSpace(rec.History[len(rec.History)-1].Detail) } func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) string { if len(issues) == 0 || len(codes) == 0 { return "" } allowed := make(map[string]struct{}, len(codes)) for _, code := range codes { allowed[code] = struct{}{} } messages := make([]string, 0) for _, issue := range issues { if _, ok := allowed[issue.Code]; !ok { continue } desc := strings.TrimSpace(issue.Description) if desc 
== "" { desc = issue.Code } messages = append(messages, desc) } return strings.Join(messages, "; ") } // chipLetterClass maps a component status to a single display letter and CSS class. func chipLetterClass(status string) (letter, cls string) { switch strings.ToUpper(strings.TrimSpace(status)) { case "OK": return "O", "chip-ok" case "WARNING", "WARN", "PARTIAL": return "W", "chip-warn" case "CRITICAL", "FAIL", "FAILED", "ERROR": return "F", "chip-fail" default: return "?", "chip-unknown" } } // renderComponentChips renders one 20×20 chip per ComponentStatusRecord. // Hover tooltip shows component key, status, error summary and last check time. // Falls back to a single unknown chip when no records are available. func renderComponentChips(matched []app.ComponentStatusRecord) string { if len(matched) == 0 { return `?` } sort.Slice(matched, func(i, j int) bool { return matched[i].ComponentKey < matched[j].ComponentKey }) var b strings.Builder b.WriteString(``) for _, rec := range matched { letter, cls := chipLetterClass(rec.Status) var tooltip strings.Builder tooltip.WriteString(rec.ComponentKey) tooltip.WriteString(": ") tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN")) if rec.ErrorSummary != "" { tooltip.WriteString(" — ") tooltip.WriteString(rec.ErrorSummary) } if !rec.LastCheckedAt.IsZero() { fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05")) } fmt.Fprintf(&b, `%s`, cls, html.EscapeString(tooltip.String()), letter) } b.WriteString(``) return b.String() } func runtimeStatusBadge(status string) string { status = strings.ToUpper(strings.TrimSpace(status)) badge := "badge-unknown" switch status { case "OK": badge = "badge-ok" case "PARTIAL", "WARNING", "WARN": badge = "badge-warn" case "FAIL", "FAILED", "CRITICAL": badge = "badge-err" } return `` + html.EscapeString(status) + `` } func rowIssueHTML(issue string) string { issue = strings.TrimSpace(issue) if issue == "" { return `` } return html.EscapeString(issue) } // ── Metrics 
// ───────────────────────────────────────────────────────────────────

// renderMetrics returns the static body of the live-metrics section; it is
// used both as the "metrics" page and as the last part of the dashboard.
func renderMetrics() string {
	return `

Live metrics — updated every 2 seconds.

Server — Load
CPU/Mem load
Temperature — CPU
CPU temperature
Temperature — Ambient Sensors
Ambient temperature sensors
Server — Power
System power
`
}

// ── Validate (Acceptance Tests) ───────────────────────────────────────────────

// validateInventory holds the per-category device summary text shown on each
// Validate card. The values are produced by loadValidateInventory.
type validateInventory struct {
	CPU     string
	Memory  string
	Storage string
	NVIDIA  string
	AMD     string
}

// renderValidate renders the Validate page: introductory text followed by one
// renderSATCard per validate module. Each card body comes from
// renderValidateCardBody(devices, description, commands, settings), where the
// devices text is the matching validateInventory field.
//
// NOTE(review): inv.NVIDIA is also concatenated directly into the page in the
// "NVIDIA GPU Selection" section; loadValidateInventory's summaries escape the
// model names via formatValidateDeviceSummary, which this relies on.
func renderValidate(opts HandlerOptions) string {
	inv := loadValidateInventory(opts)
	return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.

Tasks continue in the background — view progress in Tasks.

Validate Profile

Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).

` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
		// Arguments: devices summary, description, commands, settings.
		inv.CPU,
		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
		`lscpu, sensors, stress-ng`,
		`60s in Validate, 30 min in Stress.`,
	)) +
		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
			inv.Memory,
			`Runs a RAM validation pass and records memory state around the test.`,
			`free, memtester`,
			`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
		)) +
		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
			inv.Storage,
			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
			`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
			`Short self-test in Validate, extended self-test in Stress.`,
		)) + `
NVIDIA GPU Selection

` + inv.NVIDIA + `

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.

Loading NVIDIA GPUs...

Select at least one NVIDIA GPU to enable NVIDIA validate tasks.

` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Runs NVIDIA diagnostics and board inventory checks.`,
		`nvidia-smi, dmidecode, dcgmi diag`,
		`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
	)) + `
` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
		`dcgmi diag targeted_stress`,
		`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.

Only runs in Stress mode. Switch mode above to enable in Run All.

`,
	)) + `
` + `
` + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
		`dcgmi diag targeted_power`,
		`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.

Only runs in Stress mode. Switch mode above to enable in Run All.

`,
	)) + `
` + `
` + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
		`dcgmi diag pulse_test`,
		`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.

Only runs in Stress mode. Switch mode above to enable in Run All.

`,
	)) + `
` + `
` + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
		`all_reduce_perf (NCCL tests)`,
		`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
	)) + `
` + `
` + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
		`nvbandwidth`,
		`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
	)) + `
` + `
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
		inv.AMD,
		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
		`GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
		`
`,
	)) + `
` } func loadValidateInventory(opts HandlerOptions) validateInventory { unknown := "Audit snapshot not loaded." out := validateInventory{ CPU: unknown, Memory: unknown, Storage: unknown, NVIDIA: unknown, AMD: unknown, } data, err := loadSnapshot(opts.AuditPath) if err != nil { return out } var snap schema.HardwareIngestRequest if err := json.Unmarshal(data, &snap); err != nil { return out } cpuCounts := map[string]int{} cpuTotal := 0 for _, cpu := range snap.Hardware.CPUs { if cpu.Present != nil && !*cpu.Present { continue } cpuTotal++ addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown")) } memCounts := map[string]int{} memTotal := 0 for _, dimm := range snap.Hardware.Memory { if dimm.Present != nil && !*dimm.Present { continue } memTotal++ addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown")) } storageCounts := map[string]int{} storageTotal := 0 for _, dev := range snap.Hardware.Storage { if dev.Present != nil && !*dev.Present { continue } storageTotal++ addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) } nvidiaCounts := map[string]int{} nvidiaTotal := 0 amdCounts := map[string]int{} amdTotal := 0 for _, dev := range snap.Hardware.PCIeDevices { if dev.Present != nil && !*dev.Present { continue } if validateIsVendorGPU(dev, "nvidia") { nvidiaTotal++ addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) } if validateIsVendorGPU(dev, "amd") { amdTotal++ addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown")) } } out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU") out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module") out.Storage = 
formatValidateDeviceSummary(storageTotal, storageCounts, "device") out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU") out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU") return out } func renderValidateCardBody(devices, description, commands, settings string) string { return `
` + devices + `
` + `
` + description + `
` + `
` + commands + `
` + `
` + settings + `
` } func formatValidateDeviceSummary(total int, models map[string]int, unit string) string { if total == 0 { return "0 " + unit + "s detected." } keys := make([]string, 0, len(models)) for key := range models { keys = append(keys, key) } sort.Strings(keys) parts := make([]string, 0, len(keys)) for _, key := range keys { parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key))) } label := unit if total != 1 { label += "s" } // If there is only one model the leading count duplicates the per-model // count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …"). if len(parts) == 1 { return parts[0] + " " + label } return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", ")) } func addValidateModel(counts map[string]int, name string) { name = strings.TrimSpace(name) if name == "" { name = "unknown" } counts[name]++ } func validateTrimPtr(value *string) string { if value == nil { return "" } return strings.TrimSpace(*value) } func validateFirstNonEmpty(values ...string) string { for _, value := range values { value = strings.TrimSpace(value) if value != "" { return value } } return "" } func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { model := strings.ToLower(validateTrimPtr(dev.Model)) manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer)) class := strings.ToLower(validateTrimPtr(dev.DeviceClass)) if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") { return false } switch vendor { case "nvidia": return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") case "amd": isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller" isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati") isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, 
"amd") return isGPUClass && (isAMDVendor || isAMDModel) default: return false } } func renderSATCard(id, label, runAction, headerActions, body string) string { actions := `` if strings.TrimSpace(headerActions) != "" { actions += headerActions } return fmt.Sprintf(`
%s
%s
%s
`, label, actions, body)
}

// ── Benchmark ─────────────────────────────────────────────────────────────────

// benchmarkHistoryRun is one saved performance benchmark run as shown in the
// results table: when it was generated, that timestamp pre-formatted for
// display, and the composite score recorded for each GPU index.
type benchmarkHistoryRun struct {
	generatedAt time.Time
	displayTime string
	gpuScores   map[int]float64 // GPU index → composite score
}

// renderBenchmark returns the Benchmark page body. The saved-results cards are
// rendered from opts.ExportDir via renderBenchmarkResultsCard.
func renderBenchmark(opts HandlerOptions) string {
	return `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

Benchmark Setup

Loading NVIDIA GPUs...

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

Method Split

The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.

Run TypeEngineQuestion
Performance Benchmarkbee-gpu-burnHow much isolated compute performance does the GPU realize in this server?
Power / Thermal Fitdcgmi targeted_powerHow much power per GPU can this server sustain as GPU count ramps up?

Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.

` + `
` + renderBenchmarkResultsCard(opts.ExportDir) + `
` + ` `
}

// renderBenchmarkResultsCard renders both saved-result cards for the Benchmark
// page — the performance history table followed by the power / thermal fit
// card — separated by a single newline. Both read their data from exportDir.
func renderBenchmarkResultsCard(exportDir string) string {
	highestGPU, history := loadBenchmarkHistory(exportDir)
	cards := []string{
		renderBenchmarkResultsCardFromRuns(
			"Performance Results",
			"Composite score by saved benchmark run and GPU.",
			"No saved performance benchmark runs yet.",
			highestGPU,
			history,
		),
		renderPowerBenchmarkResultsCard(exportDir),
	}
	return strings.Join(cards, "\n")
}

func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
	if len(runs) == 0 {
		return `
` + html.EscapeString(title) + `

` + html.EscapeString(emptyMessage) + `

` } var b strings.Builder b.WriteString(`
` + html.EscapeString(title) + `
`) if strings.TrimSpace(description) != "" { b.WriteString(`

` + html.EscapeString(description) + `

`) } b.WriteString(`
`) b.WriteString(``) for i := 0; i <= maxGPUIndex; i++ { b.WriteString(``) } b.WriteString(``) for i, run := range runs { b.WriteString(``) b.WriteString(``) b.WriteString(``) for idx := 0; idx <= maxGPUIndex; idx++ { score, ok := run.gpuScores[idx] if !ok { b.WriteString(``) continue } b.WriteString(``) } b.WriteString(``) } b.WriteString(`
RunTimeGPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `-` + fmt.Sprintf("%.2f", score) + `
`) return b.String() } func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { baseDir := app.DefaultBeeBenchPerfDir if strings.TrimSpace(exportDir) != "" { baseDir = filepath.Join(exportDir, "bee-bench", "perf") } paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json")) if err != nil || len(paths) == 0 { return -1, nil } sort.Strings(paths) return loadBenchmarkHistoryFromPaths(paths) } func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) { runs := make([]benchmarkHistoryRun, 0, len(paths)) maxGPUIndex := -1 for _, path := range paths { raw, err := os.ReadFile(path) if err != nil { continue } var result platform.NvidiaBenchmarkResult if err := json.Unmarshal(raw, &result); err != nil { continue } run := benchmarkHistoryRun{ generatedAt: result.GeneratedAt, displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), gpuScores: make(map[int]float64), } for _, gpu := range result.GPUs { run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore if gpu.Index > maxGPUIndex { maxGPUIndex = gpu.Index } } runs = append(runs, run) } sort.Slice(runs, func(i, j int) bool { return runs[i].generatedAt.After(runs[j].generatedAt) }) return maxGPUIndex, runs } func renderPowerBenchmarkResultsCard(exportDir string) string { baseDir := app.DefaultBeeBenchPowerDir if strings.TrimSpace(exportDir) != "" { baseDir = filepath.Join(exportDir, "bee-bench", "power") } paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json")) if err != nil || len(paths) == 0 { return `
Power / Thermal Fit Results

No saved power benchmark runs yet.

` } sort.Strings(paths) type powerRun struct { generatedAt time.Time displayTime string result platform.NvidiaPowerBenchResult } var runs []powerRun for _, path := range paths { raw, err := os.ReadFile(path) if err != nil { continue } var r platform.NvidiaPowerBenchResult if err := json.Unmarshal(raw, &r); err != nil { continue } runs = append(runs, powerRun{ generatedAt: r.GeneratedAt, displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"), result: r, }) } sort.Slice(runs, func(i, j int) bool { return runs[i].generatedAt.After(runs[j].generatedAt) }) // Show only the most recent run's GPU slot table, plus a run history summary. var b strings.Builder b.WriteString(`
Power / Thermal Fit Results
`) latest := runs[0].result b.WriteString(`

Latest run: ` + html.EscapeString(runs[0].displayTime)) if latest.Hostname != "" { b.WriteString(` — ` + html.EscapeString(latest.Hostname)) } if latest.OverallStatus != "" { statusColor := "var(--ok)" if latest.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(` — ` + html.EscapeString(latest.OverallStatus) + ``) } b.WriteString(`

`) if len(latest.GPUs) > 0 { b.WriteString(`
`) b.WriteString(``) b.WriteString(``) for _, gpu := range latest.GPUs { derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1) rowStyle := "" achievedStyle := "" if derated { rowStyle = ` style="background:rgba(255,180,0,0.08)"` achievedStyle = ` style="color:#e6a000;font-weight:600"` } statusLabel := gpu.Status if statusLabel == "" { statusLabel = "OK" } statusColor := "var(--ok)" if statusLabel != "OK" { statusColor = "var(--warn)" } nominalStr := "-" if gpu.DefaultPowerLimitW > 0 { nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) } achievedStr := "-" if gpu.AppliedPowerLimitW > 0 { achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) } p95Str := "-" if gpu.MaxObservedPowerW > 0 { p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW) } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(`` + achievedStr + ``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`
GPUModelNominal WAchieved WP95 Observed WStatus
` + strconv.Itoa(gpu.Index) + `` + html.EscapeString(gpu.Name) + `` + nominalStr + `` + p95Str + `` + html.EscapeString(statusLabel) + `
`) } if len(runs) > 1 { b.WriteString(`
` + strconv.Itoa(len(runs)) + ` runs total`) b.WriteString(`
`) for i, run := range runs { statusColor := "var(--ok)" if run.result.OverallStatus != "OK" { statusColor = "var(--warn)" } b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) b.WriteString(``) } b.WriteString(`
#TimeGPUsStatus
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + strconv.Itoa(len(run.result.GPUs)) + `` + html.EscapeString(run.result.OverallStatus) + `
`) } b.WriteString(`
`)
	return b.String()
}

// ── Burn ──────────────────────────────────────────────────────────────────────

// renderBurn returns the static body of the Burn page.
func renderBurn() string {
	return `
⚠ Warning: Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.
Scope: Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in Validate → Stress mode; NCCL and NVBandwidth are available directly from Validate.

Tasks continue in the background — view progress in Tasks.

Burn Profile

Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.

Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.

NVIDIA GPU Selection

Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.

Loading NVIDIA GPUs...

Select at least one NVIDIA GPU to enable NVIDIA burn recipes.

Core Burn Paths
GPU Max Load

Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.

Compute Stress

Select which subsystems to stress. Each checked item runs as a separate task.

`
}

// ── Network ───────────────────────────────────────────────────────────────────

// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
func renderNetworkInline() string {
	return `

Loading...

DHCP
Static IPv4
`
}

// renderNetwork returns the Network page body: the "Network Interfaces"
// heading followed by the output of renderNetworkInline.
func renderNetwork() string {
	return `
Network Interfaces
` + renderNetworkInline() + `
`
}

// ── Services ──────────────────────────────────────────────────────────────────

// renderServicesInline returns the services UI without a page-level heading.
// It includes an HTML-escaped note about bee-selfheal.timer.
func renderServicesInline() string {
	return `

` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `

Loading...

`
}

// renderServices returns the Services page body: the "Bee Services" heading
// followed by the output of renderServicesInline.
func renderServices() string {
	return `
Bee Services
` + renderServicesInline() + `
`
}

// ── Export ────────────────────────────────────────────────────────────────────

// renderExport returns the Export page body: the support-bundle section, one
// row per file found under exportDir, and the USB export card.
func renderExport(exportDir string) string {
	// The listing error is discarded; listExportFiles returns a nil slice on
	// error, so the "No export files found." row is rendered in that case.
	entries, _ := listExportFiles(exportDir)
	var rows strings.Builder
	for _, e := range entries {
		rows.WriteString(fmt.Sprintf(`%s`, url.QueryEscape(e), html.EscapeString(e)))
	}
	if len(entries) == 0 {
		rows.WriteString(`No export files found.`)
	}
	return `
Support Bundle

Creates a tar.gz archive of all audit files, SAT results, and logs.

` + renderSupportBundleInline() + `
Export Files
` + rows.String() + `
File
` + renderUSBExportCard() } func listExportFiles(exportDir string) ([]string, error) { var entries []string err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error { if err != nil { return err } if info.IsDir() { return nil } rel, err := filepath.Rel(exportDir, path) if err != nil { return err } entries = append(entries, rel) return nil }) if err != nil && !os.IsNotExist(err) { return nil, err } sort.Strings(entries) return entries, nil } func renderSupportBundleInline() string { return `
`
}

// renderUSBExportCard returns the "Export to USB" card, wrapping the output of
// renderUSBExportInline.
func renderUSBExportCard() string {
	return `
Export to USB
` + renderUSBExportInline() + `
`
}

// renderUSBExportInline returns the USB export UI without a wrapping card.
func renderUSBExportInline() string {
	return `

Write audit JSON or support bundle directly to a removable USB drive.

Scanning for USB devices...
`
}

// renderNvidiaSelfHealInline returns the NVIDIA self-heal UI without a
// wrapping card.
func renderNvidiaSelfHealInline() string {
	return `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

Loading NVIDIA GPU status...

Loading...

`
}

// ── Tools ─────────────────────────────────────────────────────────────────────

// renderTools returns the Tools page body. It embeds the inline fragments for
// install-to-disk, support bundle, USB export, NVIDIA self heal, network and
// services.
func renderTools() string {
	return `
System Install
Install to RAM

Detecting boot source...

Checking...

Install to Disk
` + renderInstallInline() + `
Support Bundle

Downloads a tar.gz archive of all audit files, SAT results, and logs.

` + renderSupportBundleInline() + `
Export to USB
` + renderUSBExportInline() + `
Tool Check

Checking...

NVIDIA Self Heal
` + renderNvidiaSelfHealInline() + `
Network
` + renderNetworkInline() + `
Services
` + renderServicesInline() + `
`
}

// ── Install to Disk ──────────────────────────────────────────────────────────

// renderInstallInline returns the install-to-disk UI without a wrapping card,
// including the warning that the target disk will be erased.
func renderInstallInline() string {
	return `
Warning: Installing will completely erase the selected disk and write the live system onto it. All existing data on the target disk will be lost. This operation cannot be undone.
Loading disk list…
`
}

// renderInstall returns the Install page body: the "Install Live System to
// Disk" heading followed by the output of renderInstallInline.
func renderInstall() string {
	return `
Install Live System to Disk
` + renderInstallInline() + `
`
}

// ── Tasks ─────────────────────────────────────────────────────────────────────

// renderTasks returns the static body of the Tasks page.
func renderTasks() string {
	return `
Open a task to view its saved logs and charts.

Loading...

`
}

// renderExportIndex builds the "Bee Export Files" page for exportDir. It
// returns an error, and an empty string, when listExportFiles fails.
// NOTE(review): entries is not referenced in the visible body — the per-file
// listing appears to be missing from this copy; confirm against the full file.
func renderExportIndex(exportDir string) (string, error) {
	entries, err := listExportFiles(exportDir)
	if err != nil {
		return "", err
	}
	var body strings.Builder
	body.WriteString(`Bee Export Files`)
	body.WriteString(`

Bee Export Files

`)
	return body.String(), nil
}