Split validate/stress into separate fixed-mode pages

- Check (2): validate mode only — no mode switcher, no stress-only cards
  (nvidia-targeted-stress, nvidia-targeted-power, nvidia-pulse hidden)
- Load (3): stress mode only — no mode switcher, all cards shown
- satStressMode() hardcoded per page; satModeChanged() removed
- Profile card with radio buttons removed from both pages
- Replaced with simple Run All button + est. time

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-06-18 19:12:17 +03:00
parent a2d7513153
commit 85d1acdaa3

View File

@@ -84,38 +84,49 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
validateChecked, stressChecked := "checked", ""
estStr := validateTotalStr
if stressDefault {
validateChecked, stressChecked = "", "checked"
estStr = stressTotalStr
}
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
if stressDefault {
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
}
onloadJS := ""
stressOnlyCards := ""
if stressDefault {
onloadJS = `<script>satModeChanged();</script>`
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`,
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`<code>dcgmi diag pulse_test</code>`,
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
))
}
satStressModeJS := "function satStressMode() { return false; }"
if stressDefault {
satStressModeJS = "function satStressMode() { return true; }"
}
return alert + `
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px">
<div class="card-head">Validate Profile</div>
<div class="card-body validate-profile-body">
<div class="validate-profile-col">
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" ` + validateChecked + ` onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" ` + stressChecked + ` onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
</div>
<div class="validate-profile-col validate-profile-action">
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
<div style="margin-top:12px">
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
</div>
</div>
</div>
</div>` + onloadJS
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
</div>
<div class="grid3">
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
@@ -142,7 +153,7 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
@@ -163,46 +174,19 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
)) +
`<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-targeted-power">` +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`,
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-pulse">` +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes running per-GPU would miss PSU-level failures.`,
`<code>dcgmi diag pulse_test</code>`,
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-interconnect">` +
stressOnlyCards +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires 2).`,
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit duration set by the tool).`,
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`</div>` +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
@@ -217,36 +201,15 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
</div>
<style>
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
.validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style>
<script>
let satES = null;
function satStressMode() {
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
}
function satModeChanged() {
const stress = satStressMode();
[
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
card.style.opacity = stress ? '1' : '0.5';
const hint = document.getElementById(item.hint);
if (hint) hint.style.display = stress ? 'none' : '';
}
});
}
` + satStressModeJS + `
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}