Run NVIDIA DCGM diag tests on all selected GPUs simultaneously
The Level 2/3 diag, targeted_stress, and targeted_power were dispatched one GPU at a time from the UI, turning what could be a single dcgmi command into one sequential run per GPU — on an 8-GPU system, 8 runs of ~85 s each (Level 2 diag) or ~350–450 s each (Level 3 diag, targeted_stress, targeted_power). DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel. Move nvidia, nvidia-targeted-stress, and nvidia-targeted-power into nvidiaAllGPUTargets so that expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update the sat.go constants and the page_validate.go estimates to reflect simultaneous all-GPU execution (remove the n× multiplier from the total-time estimates). Stress test on an 8-GPU system: ~5.3 h → ~2.5 h. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,10 +30,10 @@ import (
|
|||||||
// Sources:
|
// Sources:
|
||||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
@@ -48,15 +48,15 @@ const (
|
|||||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
SATEstimatedMemoryStressSec = 140
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
SATEstimatedNvidiaPulseTestSec = 5000
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|||||||
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUValidateSec +
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
platform.SATEstimatedMemoryValidateSec +
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUStressSec +
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
platform.SATEstimatedMemoryStressSec +
|
platform.SATEstimatedMemoryStressSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaPulseTestSec +
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
func() string {
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
if n > 0 {
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
|
||||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), validateFmtDur(perS))
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
|
|||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => streamSATTask(d.task_id, title, false));
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
}
|
}
|
||||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
const nvidiaPerGPUTargets = [];
|
||||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
function satAllGPUIndicesForMulti() {
|
function satAllGPUIndicesForMulti() {
|
||||||
return Promise.resolve(satSelectedGPUIndices());
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
}
|
}
|
||||||
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function runNvidiaValidateSet(target) {
|
function runNvidiaValidateSet(target) {
|
||||||
return loadSatNvidiaGPUs().then(gpus => {
|
const selected = satSelectedGPUIndices();
|
||||||
const selected = satSelectedGPUIndices();
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
if (!picked.length) {
|
|
||||||
throw new Error('Select at least one NVIDIA GPU.');
|
|
||||||
}
|
|
||||||
if (picked.length === 1) {
|
|
||||||
const gpu = picked[0];
|
|
||||||
return runSATWithOverrides(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
|
||||||
});
|
|
||||||
}
|
|
||||||
document.getElementById('sat-output').style.display='block';
|
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
|
||||||
const term = document.getElementById('sat-terminal');
|
|
||||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
|
||||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
|
||||||
const runNext = (idx) => {
|
|
||||||
if (idx >= picked.length) return Promise.resolve();
|
|
||||||
const gpu = picked[idx];
|
|
||||||
const gpuLabel = satGPUDisplayName(gpu);
|
|
||||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
|
||||||
return enqueueSATTarget(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: labelBase + ' (' + gpuLabel + ')',
|
|
||||||
}).then(d => {
|
|
||||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
|
||||||
}).then(function() {
|
|
||||||
return runNext(idx + 1);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
return runNext(0);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
function runAMDValidateSet() {
|
function runAMDValidateSet() {
|
||||||
const targets = selectedAMDValidateTargets();
|
const targets = selectedAMDValidateTargets();
|
||||||
|
|||||||
Reference in New Issue
Block a user