package webui

import (
	"encoding/json"
	"fmt"
	"html"
	"sort"
	"strings"

	"bee/audit/internal/platform"
	"bee/audit/internal/schema"
)

// validateInventory holds the per-category device summaries shown on the
// Validate page together with the GPU counts derived from the audit snapshot.
//
// The string fields are either the placeholder set by loadValidateInventory
// or the output of formatValidateDeviceSummary.
type validateInventory struct {
	CPU            string
	Memory         string
	Storage        string
	NVIDIA         string
	AMD            string
	NvidiaGPUCount int
	AMDGPUCount    int
}

// validateFmtDur formats a duration given in seconds as an approximate,
// human-readable string: "~N s" below 120 seconds, otherwise "~N min".
func validateFmtDur(secs int) string {
	if secs < 120 {
		return fmt.Sprintf("~%d s", secs)
	}
	// Adding 29 before dividing rounds to the nearest minute, with an exact
	// half minute rounding down.
	mins := (secs + 29) / 60
	return fmt.Sprintf("~%d min", mins)
}

// validateTotalValidateSec sums the estimated durations of the modules that
// contribute to a Validate run for n NVIDIA GPUs. Negative n is treated as 0.
func validateTotalValidateSec(n int) int {
	if n < 0 {
		n = 0
	}
	total := platform.SATEstimatedCPUValidateSec +
		platform.SATEstimatedMemoryValidateSec +
		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	return total
}

// validateTotalStressSec sums the estimated durations of the modules that
// contribute to a Stress run for n NVIDIA GPUs. Negative n is treated as 0.
func validateTotalStressSec(n int) int {
	if n < 0 {
		n = 0
	}
	total := platform.SATEstimatedCPUStressSec +
		platform.SATEstimatedMemoryStressSec +
		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
		platform.SATEstimatedNvidiaPulseTestSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	return total
}

// validateStressOnlyPerGPUSettings builds the settings text for a card whose
// test is skipped in Validate and runs once per GPU in Stress. perGPUSec is
// the estimated duration for one GPU and n is the detected GPU count; when n
// is not positive only the per-GPU figure is shown.
func validateStressOnlyPerGPUSettings(perGPUSec, n int) string {
	s := "Skipped in Validate. "
	if n > 0 {
		s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(perGPUSec), n, validateFmtDur(perGPUSec*n))
	} else {
		s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(perGPUSec))
	}
	return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

`
}

// renderValidate builds the Validate page body: an introduction with total
// time estimates followed by one card per test module (CPU, memory, storage,
// the NVIDIA tests and AMD). Device summaries and the NVIDIA GPU count come
// from loadValidateInventory.
func renderValidate(opts HandlerOptions) string {
	inv := loadValidateInventory(opts)
	n := inv.NvidiaGPUCount
	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
	gpuNote := ""
	if n > 0 {
		gpuNote = fmt.Sprintf(" (%d GPU)", n)
	}
	return `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.

Tasks continue in the background — view progress in Tasks.

Validate Profile

Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.

` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
		inv.CPU,
		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
		`lscpu, sensors, stress-ng`,
		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
	)) +
		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
			inv.Memory,
			`Runs a RAM validation pass and records memory state around the test.`,
			`free, memtester`,
			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
		)) +
		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
			inv.Storage,
			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
			`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
		)) + `
NVIDIA GPU Selection

` + inv.NVIDIA + `

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.

Loading NVIDIA GPUs...

Select at least one NVIDIA GPU to enable NVIDIA validate tasks.

` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Runs NVIDIA diagnostics and board inventory checks.`,
		`nvidia-smi, dmidecode, dcgmi diag`,
		func() string {
			perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
			perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
			if n > 0 {
				return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", validateFmtDur(perV), n, validateFmtDur(perV*n), validateFmtDur(perS), n, validateFmtDur(perS*n))
			}
			return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", validateFmtDur(perV), validateFmtDur(perS))
		}(),
	)) + `
` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
		`dcgmi diag targeted_stress`,
		validateStressOnlyPerGPUSettings(platform.SATEstimatedNvidiaTargetedStressPerGPUSec, n),
	)) + `
` + `
` + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
		`dcgmi diag targeted_power`,
		validateStressOnlyPerGPUSettings(platform.SATEstimatedNvidiaTargetedPowerPerGPUSec, n),
	)) + `
` + `
` + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
		`dcgmi diag pulse_test`,
		`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`,
	)) + `
` + `
` + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
		`all_reduce_perf (NCCL tests)`,
		`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
	)) + `
` + `
` + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
		inv.NVIDIA,
		`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
		`nvbandwidth`,
		`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
	)) + `
` + `
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
		inv.AMD,
		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
		`GPU Validate: rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
		`
`,
	)) + `
`
}

// loadValidateInventory reads the audit snapshot at opts.AuditPath and
// summarizes the CPUs, memory modules, storage devices and NVIDIA/AMD GPUs it
// lists. Entries whose Present flag is explicitly false are skipped.
//
// If the snapshot cannot be loaded or decoded, every text field is the
// placeholder "Audit snapshot not loaded." and both GPU counts are zero.
func loadValidateInventory(opts HandlerOptions) validateInventory {
	unknown := "Audit snapshot not loaded."
	out := validateInventory{
		CPU:     unknown,
		Memory:  unknown,
		Storage: unknown,
		NVIDIA:  unknown,
		AMD:     unknown,
	}
	data, err := loadSnapshot(opts.AuditPath)
	if err != nil {
		return out
	}
	var snap schema.HardwareIngestRequest
	if err := json.Unmarshal(data, &snap); err != nil {
		return out
	}

	cpuCounts := map[string]int{}
	cpuTotal := 0
	for _, cpu := range snap.Hardware.CPUs {
		if cpu.Present != nil && !*cpu.Present {
			continue
		}
		cpuTotal++
		addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
	}

	memCounts := map[string]int{}
	memTotal := 0
	for _, dimm := range snap.Hardware.Memory {
		if dimm.Present != nil && !*dimm.Present {
			continue
		}
		memTotal++
		addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
	}

	storageCounts := map[string]int{}
	storageTotal := 0
	for _, dev := range snap.Hardware.Storage {
		if dev.Present != nil && !*dev.Present {
			continue
		}
		storageTotal++
		addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
	}

	nvidiaCounts := map[string]int{}
	nvidiaTotal := 0
	amdCounts := map[string]int{}
	amdTotal := 0
	for _, dev := range snap.Hardware.PCIeDevices {
		if dev.Present != nil && !*dev.Present {
			continue
		}
		// The two vendor checks are evaluated independently.
		if validateIsVendorGPU(dev, "nvidia") {
			nvidiaTotal++
			addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
		}
		if validateIsVendorGPU(dev, "amd") {
			amdTotal++
			addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
		}
	}

	out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
	out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
	out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
	out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
	out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
	out.NvidiaGPUCount = nvidiaTotal
	out.AMDGPUCount = amdTotal
	return out
}

// renderValidateCardBody joins the four text sections of a card (device
// summary, description, commands, settings) in that order, each on its own
// line and separated by blank lines. The arguments are inserted verbatim.
func renderValidateCardBody(devices, description, commands, settings string) string {
	return `
` + devices + `
` + `
` + description + `
` + `
` + commands + `
` + `
` + settings + `
`
}

// formatValidateDeviceSummary renders a one-line summary of detected devices.
//
// total is the number of devices, models maps a model name to its count and
// unit is the singular noun ("CPU", "module", ...). Model names are
// HTML-escaped and listed in sorted order so the output is deterministic.
// A single model yields "N x model unit(s)"; several models yield
// "total unit(s): N x a, M x b"; zero devices yields "0 units detected.".
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}
	keys := make([]string, 0, len(models))
	for key := range models {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	parts := make([]string, 0, len(keys))
	for _, key := range keys {
		parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key)))
	}
	label := unit
	if total != 1 {
		label += "s"
	}
	if len(parts) == 1 {
		return parts[0] + " " + label
	}
	return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
}

// addValidateModel increments the counter for name in counts. The name is
// trimmed first and a blank name is counted under "unknown".
func addValidateModel(counts map[string]int, name string) {
	name = strings.TrimSpace(name)
	if name == "" {
		name = "unknown"
	}
	counts[name]++
}

// validateTrimPtr returns the whitespace-trimmed string behind value, or ""
// when value is nil.
func validateTrimPtr(value *string) string {
	if value == nil {
		return ""
	}
	return strings.TrimSpace(*value)
}

// validateFirstNonEmpty returns the first argument that is non-empty after
// trimming whitespace (in trimmed form), or "" when there is none.
func validateFirstNonEmpty(values ...string) string {
	for _, value := range values {
		value = strings.TrimSpace(value)
		if value != "" {
			return value
		}
	}
	return ""
}

// validateHasWord reports whether word occurs in s as a whole token, where
// tokens are maximal runs of ASCII lowercase letters and digits. Both s and
// word are expected to be lowercase already.
func validateHasWord(s, word string) bool {
	tokens := strings.FieldsFunc(s, func(r rune) bool {
		return !(r >= 'a' && r <= 'z' || r >= '0' && r <= '9')
	})
	for _, tok := range tokens {
		if tok == word {
			return true
		}
	}
	return false
}

// validateIsVendorGPU reports whether dev looks like a GPU from the given
// vendor ("nvidia" or "amd"); any other vendor value yields false.
//
// Devices whose model or manufacturer mentions "aspeed" are always rejected.
// For "nvidia" only the model/manufacturer text is checked. For "amd" the
// device class must be a processing accelerator or a display/video
// controller, and the manufacturer or model must identify AMD.
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
	model := strings.ToLower(validateTrimPtr(dev.Model))
	manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
	class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
	if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
		return false
	}
	switch vendor {
	case "nvidia":
		// NOTE(review): no device-class check here, so any PCIe function whose
		// model or manufacturer mentions NVIDIA is counted — confirm intended.
		return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
	case "amd":
		isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
		// "ati" must match as a whole word: as a plain substring it also occurs
		// in "corporation", which would classify e.g. "NVIDIA Corporation" or
		// "Intel Corporation" display controllers as AMD GPUs.
		isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") ||
			strings.Contains(manufacturer, "amd") ||
			validateHasWord(manufacturer, "ati")
		isAMDModel := strings.Contains(model, "instinct") ||
			strings.Contains(model, "radeon") ||
			strings.Contains(model, "amd")
		return isGPUClass && (isAMDVendor || isAMDModel)
	default:
		return false
	}
}

// renderSATCard renders one test card from its label, optional header actions
// and body, each on its own line. headerActions is emitted only when it is
// not blank. The id and runAction parameters are not referenced by this body.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	actions := ``
	if strings.TrimSpace(headerActions) != "" {
		actions += headerActions
	}
	return fmt.Sprintf(`
%s
%s
%s
`, label, actions, body)
}