diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index a6919c7..d9c420a 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -84,38 +84,49 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string { if n > 0 { gpuNote = fmt.Sprintf(" (%d GPU)", n) } - validateChecked, stressChecked := "checked", "" + estStr := validateTotalStr if stressDefault { - validateChecked, stressChecked = "", "checked" + estStr = stressTotalStr } alert := `
Non-destructive: Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.
` if stressDefault { alert = `
⚠ Stress mode: Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.
` } - onloadJS := "" + + stressOnlyCards := "" if stressDefault { - onloadJS = `` + stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( + inv.NVIDIA, + `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, + `dcgmi diag targeted_stress`, + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`, + )) + + renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( + inv.NVIDIA, + `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, + `dcgmi diag targeted_power`, + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`, + )) + + renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( + inv.NVIDIA, + `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, + `dcgmi diag pulse_test`, + validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`, + )) } + + satStressModeJS := "function satStressMode() { return false; }" + if stressDefault { + satStressModeJS = "function satStressMode() { return true; }" + } + return alert + `

Tasks continue in the background — view progress in Tasks.

- -
-
Validate Profile
-
-
-
- - -
-
-

Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.

- -
- -
-
-
-
` + onloadJS +
+ + + est. ` + estStr + gpuNote + ` +
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( @@ -142,7 +153,7 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
NVIDIA GPU Selection

` + inv.NVIDIA + `

-

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.

+

All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.

@@ -163,46 +174,19 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string { validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec), validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)), )) + - `
` + - renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( - inv.NVIDIA, - `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, - `dcgmi diag targeted_stress`, - "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, - )) + - `
` + - `
` + - renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody( - inv.NVIDIA, - `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, - `dcgmi diag targeted_power`, - "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, - )) + - `
` + - `
` + - renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody( - inv.NVIDIA, - `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`, - `dcgmi diag pulse_test`, - `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`

Only runs in Stress mode. Switch mode above to enable in Run All.

`, - )) + - `
` + - `
` + + stressOnlyCards + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( inv.NVIDIA, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `all_reduce_perf (NCCL tests)`, - `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, + validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`, )) + - `
` + - `
` + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( inv.NVIDIA, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `nvbandwidth`, - `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, + validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`, )) + - `
` + `
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( @@ -217,36 +201,15 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {