From 679aeb994711fab29d7f5f4c7d0c7166f04c3814 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Mon, 20 Apr 2026 11:53:25 +0300 Subject: [PATCH] Run NVIDIA DCGM diag tests on all selected GPUs simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit targeted_stress, targeted_power, and the Level 2/3 diag were dispatched one GPU at a time from the UI, turning a single dcgmi command into 8 sequential ~350–450 s runs. DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel. Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update sat.go constants and page_validate.go estimates to reflect all-GPU simultaneous execution (remove n× multiplier from total time estimates). Stress test on 8-GPU system: ~5.3 h → ~2.5 h. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/sat.go | 24 ++++---- audit/internal/webui/page_validate.go | 89 ++++++--------------------- 2 files changed, 30 insertions(+), 83 deletions(-) diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index b880370..88e844a 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -30,10 +30,10 @@ import ( // Sources: // - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s // - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s -// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU -// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU -// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead) -// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU +// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous) +// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous) +// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous) +// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous) // - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous) // - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous) // - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous) @@ -48,15 +48,15 @@ const ( // RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size). SATEstimatedMemoryStressSec = 140 - // NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential. - SATEstimatedNvidiaGPUValidatePerGPUSec = 85 - // NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential. - SATEstimatedNvidiaGPUStressPerGPUSec = 450 + // NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously. + SATEstimatedNvidiaGPUValidateSec = 85 + // NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously. + SATEstimatedNvidiaGPUStressSec = 450 - // NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential. - SATEstimatedNvidiaTargetedStressPerGPUSec = 350 - // NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential. - SATEstimatedNvidiaTargetedPowerPerGPUSec = 350 + // NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously. + SATEstimatedNvidiaTargetedStressSec = 350 + // NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously. + SATEstimatedNvidiaTargetedPowerSec = 350 // NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU). SATEstimatedNvidiaPulseTestSec = 5000 diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index a8fc138..4c2ac74 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int { } total := platform.SATEstimatedCPUValidateSec + platform.SATEstimatedMemoryValidateSec + - n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec + platform.SATEstimatedNvidiaInterconnectSec + platform.SATEstimatedNvidiaBandwidthSec + if n > 0 { + total += platform.SATEstimatedNvidiaGPUValidateSec + } return total } @@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int { } total := platform.SATEstimatedCPUStressSec + platform.SATEstimatedMemoryStressSec + - n*platform.SATEstimatedNvidiaGPUStressPerGPUSec + - n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec + - n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec + platform.SATEstimatedNvidiaPulseTestSec + platform.SATEstimatedNvidiaInterconnectSec + platform.SATEstimatedNvidiaBandwidthSec + if n > 0 { + total += platform.SATEstimatedNvidiaGPUStressSec + + platform.SATEstimatedNvidiaTargetedStressSec + + platform.SATEstimatedNvidiaTargetedPowerSec + } return total } @@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Runs NVIDIA diagnostics and board inventory checks.`, `nvidia-smi, dmidecode, dcgmi diag`, - func() string { - perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec - perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec - if n > 0 { - return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).", - validateFmtDur(perV), n, validateFmtDur(perV*n), - validateFmtDur(perS), n, validateFmtDur(perS*n)) - } - return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).", - validateFmtDur(perV), validateFmtDur(perS)) - }(), + fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).", + validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec), + validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)), )) + `
` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, `Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`, `dcgmi diag targeted_stress`, - func() string { - per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec - s := "Skipped in Validate. " - if n > 0 { - s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) - } else { - s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) - } - return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` - }(), + "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `
` + `
` + @@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`, `dcgmi diag targeted_power`, - func() string { - per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec - s := "Skipped in Validate. " - if n > 0 { - s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n)) - } else { - s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per)) - } - return s + `

Only runs in Stress mode. Switch mode above to enable in Run All.

` - }(), + "Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `
` + `
` + @@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) { return enqueueSATTarget(target, overrides) .then(d => streamSATTask(d.task_id, title, false)); } -const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power']; -const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; +const nvidiaPerGPUTargets = []; +const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; function satAllGPUIndicesForMulti() { return Promise.resolve(satSelectedGPUIndices()); } @@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) { }); } function runNvidiaValidateSet(target) { - return loadSatNvidiaGPUs().then(gpus => { - const selected = satSelectedGPUIndices(); - const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0); - if (!picked.length) { - throw new Error('Select at least one NVIDIA GPU.'); - } - if (picked.length === 1) { - const gpu = picked[0]; - return runSATWithOverrides(target, { - gpu_indices: [Number(gpu.index)], - display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')', - }); - } - document.getElementById('sat-output').style.display='block'; - document.getElementById('sat-title').textContent = '— ' + target; - const term = document.getElementById('sat-terminal'); - term.textContent = 'Running ' + target + ' one GPU at a time...\n'; - const labelBase = satLabels()[target] || ('Validate ' + target); - const runNext = (idx) => { - if (idx >= picked.length) return Promise.resolve(); - const gpu = picked[idx]; - const gpuLabel = satGPUDisplayName(gpu); - term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n'; - return enqueueSATTarget(target, { - gpu_indices: [Number(gpu.index)], - display_name: labelBase + ' (' + gpuLabel + ')', - }).then(d => { - return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false); - }).then(function() { - return runNext(idx + 1); - }); - }; - return runNext(0); - }); + const selected = satSelectedGPUIndices(); + if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; } + return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target}); } function runAMDValidateSet() { const targets = selectedAMDValidateTargets();