diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index a6919c7..d9c420a 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -84,38 +84,49 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string { if n > 0 { gpuNote = fmt.Sprintf(" (%d GPU)", n) } - validateChecked, stressChecked := "checked", "" + estStr := validateTotalStr if stressDefault { - validateChecked, stressChecked = "", "checked" + estStr = stressTotalStr } alert := `
dcgmi diag targeted_stress`,
+ validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
+ )) +
+ renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+ inv.NVIDIA,
+ `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+ `dcgmi diag targeted_power`,
+ validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
+ )) +
+ renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
+ inv.NVIDIA,
+ `Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
+ `dcgmi diag pulse_test`,
+ validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
+ ))
}
+
+ satStressModeJS := "function satStressMode() { return false; }"
+ if stressDefault {
+ satStressModeJS = "function satStressMode() { return true; }"
+ }
+
return alert + `
Tasks continue in the background — view progress in Tasks.
- -Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.
- -` + inv.NVIDIA + `
-All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.
+All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.
dcgmi diag targeted_stress`,
- "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).Only runs in Stress mode. Switch mode above to enable in Run All.
`, - )) + - `dcgmi diag targeted_power`,
- "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).Only runs in Stress mode. Switch mode above to enable in Run All.
`, - )) + - `dcgmi diag pulse_test`,
- `Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`Only runs in Stress mode. Switch mode above to enable in Run All.
`, - )) + - `all_reduce_perf (NCCL tests)`,
- `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
+ validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
- `nvbandwidth`,
- `Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
+ validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
- `