package webui

import (
	"encoding/json"
	"fmt"
	"html"
	"sort"
	"strings"

	"bee/audit/internal/platform"
	"bee/audit/internal/schema"
)

// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
const (
	pciVendorNvidia = 0x10de
	pciVendorAMD    = 0x1002
	pciVendorAspeed = 0x1a03
)

// validateInventory holds the per-subsystem description strings and the GPU
// counts consumed by the validate page renderer.
type validateInventory struct {
	CPU            string
	Memory         string
	Storage        string
	NVIDIA         string
	AMD            string
	NvidiaGPUCount int
	AMDGPUCount    int
}

// validateFmtDur formats an estimated duration for display.
//
// Durations shorter than two minutes are shown in seconds ("~90 s"); longer
// ones are shown in whole minutes ("~3 min"), rounded to the nearest minute
// with an exact half minute rounding down.
func validateFmtDur(secs int) string {
	if secs >= 120 {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}

// validateTotalValidateSec returns the estimated total run time, in seconds,
// of the Validate set for n NVIDIA GPUs.
//
// The CPU, memory, NVIDIA interconnect and NVIDIA bandwidth estimates are
// always included; the NVIDIA GPU validate estimate is added only when n is
// positive. The result does not scale with n beyond that on/off switch.
func validateTotalValidateSec(n int) int {
	sum := platform.SATEstimatedCPUValidateSec +
		platform.SATEstimatedMemoryValidateSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	if n <= 0 {
		// Zero or negative GPU counts contribute no GPU time.
		return sum
	}
	return sum + platform.SATEstimatedNvidiaGPUValidateSec
}

// validateTotalStressSec returns the estimated total run time, in seconds,
// of the Stress set for n NVIDIA GPUs.
//
// The CPU, memory, NVIDIA pulse test, interconnect and bandwidth estimates
// are always included; the GPU stress, targeted stress and targeted power
// estimates are added only when n is positive.
func validateTotalStressSec(n int) int {
	sum := platform.SATEstimatedCPUStressSec +
		platform.SATEstimatedMemoryStressSec +
		platform.SATEstimatedNvidiaPulseTestSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	if n <= 0 {
		// Zero or negative GPU counts contribute no GPU time.
		return sum
	}
	return sum +
		platform.SATEstimatedNvidiaGPUStressSec +
		platform.SATEstimatedNvidiaTargetedStressSec +
		platform.SATEstimatedNvidiaTargetedPowerSec
}

// renderValidate renders the validate page with stress mode off by default.
func renderValidate(opts HandlerOptions) string {
	return renderValidateMode(opts, false)
}

// renderValidateStress renders the validate page with stress mode on by default.
func renderValidateStress(opts HandlerOptions) string {
	return renderValidateMode(opts, true)
}

// renderValidateMode builds the validate page markup. stressDefault selects
// the Stress total estimate instead of the Validate one as the default.
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
	inv := loadValidateInventory(opts)
	n := inv.NvidiaGPUCount
	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
	gpuNote := ""
	if n > 0 {
		gpuNote = fmt.Sprintf(" (%d GPU)", n)
	}
	estStr := validateTotalStr
	if stressDefault {
		estStr = stressTotalStr
	}
	alert := `
dcgmi diag targeted_stress`,
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`dcgmi diag targeted_power`,
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`dcgmi diag pulse_test`,
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
))
}
satStressModeJS := "function satStressMode() { return false; }"
if stressDefault {
satStressModeJS = "function satStressMode() { return true; }"
}
return alert + `
Tasks continue in the background — view progress in Tasks.
lscpu, sensors, stress-ng`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) +
`` + inv.NVIDIA + `
All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA validate tasks.
nvidia-smi, dmidecode, dcgmi diag`,
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
)) +
stressOnlyCards +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`all_reduce_perf (NCCL tests)`,
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`nvbandwidth`,
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
``,
)) +
`lscpu, sensors, stress-ng`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`free, memtester`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each.`,
`lsblk; NVMe: nvme; SATA/SAS: smartctl`,
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
)) +
`` + inv.NVIDIA + `
Loading NVIDIA GPUs...
Select at least one NVIDIA GPU to enable NVIDIA check tasks.
nvidia-smi, dmidecode, dcgmi diag`,
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
`all_reduce_perf (NCCL tests)`,
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`nvbandwidth`,
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
)) +
`rocm-smi, dmidecode; MEM Integrity: rvs mem; MEM Bandwidth: rocm-bandwidth-test, rvs babel`,
``,
)) +
`