Run NVIDIA DCGM diag tests on all selected GPUs simultaneously

targeted_stress, targeted_power, and the Level 2/3 diag were dispatched one GPU at a time from the UI, turning what could be a single dcgmi command into 8 sequential ~350–450 s runs.

DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel.

Move nvidia, nvidia-targeted-stress, and nvidia-targeted-power into nvidiaAllGPUTargets so that expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update the sat.go constants and the page_validate.go estimates to reflect all-GPU simultaneous execution (remove the n× multiplier from the total time estimates).

Stress test on an 8-GPU system: ~5.3 h → ~2.5 h.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:53:25 +03:00
parent 647e99b697
commit 679aeb9947
2 changed files with 30 additions and 83 deletions
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -30,10 +30,10 @@ import (
 // Sources:
 //   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
 //   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
-//   - SATEstimatedNvidiaGPUValidatePerGPUSec:      xFusion v8.6/v8.22 — 77–87 s/GPU
+//   - SATEstimatedNvidiaGPUValidateSec:            xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
-//   - SATEstimatedNvidiaGPUStressPerGPUSec:        xFusion v8.6/v8.22 — 444–448 s/GPU
+//   - SATEstimatedNvidiaGPUStressSec:              xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
-//   - SATEstimatedNvidiaTargetedStressPerGPUSec:   xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
+//   - SATEstimatedNvidiaTargetedStressSec:         xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
-//   - SATEstimatedNvidiaTargetedPowerPerGPUSec:    MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
+//   - SATEstimatedNvidiaTargetedPowerSec:          MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
 //   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
@@ -48,15 +48,15 @@ const (
 	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
 	SATEstimatedMemoryStressSec = 140
-	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
-	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
+	SATEstimatedNvidiaGPUValidateSec = 85
-	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
-	SATEstimatedNvidiaGPUStressPerGPUSec = 450
+	SATEstimatedNvidiaGPUStressSec = 450
-	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
-	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
+	SATEstimatedNvidiaTargetedStressSec = 350
-	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
-	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
+	SATEstimatedNvidiaTargetedPowerSec = 350
 	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
 	SATEstimatedNvidiaPulseTestSec = 5000
--- a/audit/internal/webui/page_validate.go
+++ b/audit/internal/webui/page_validate.go
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUValidateSec +
 		platform.SATEstimatedMemoryValidateSec +
 		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
 	if n > 0 {
 		total += platform.SATEstimatedNvidiaGPUValidateSec
 	}
 	return total
 }
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUStressSec +
 		platform.SATEstimatedMemoryStressSec +
 		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
 		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
 		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
 		platform.SATEstimatedNvidiaPulseTestSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
 	if n > 0 {
 		total += platform.SATEstimatedNvidiaGPUStressSec +
 			platform.SATEstimatedNvidiaTargetedStressSec +
 			platform.SATEstimatedNvidiaTargetedPowerSec
 	}
 	return total
 }
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
 		inv.NVIDIA,
 		`Runs NVIDIA diagnostics and board inventory checks.`,
 		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-		func() string {
+		fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
-			perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
+			validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
-			perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
+			validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
 			if n > 0 {
 				return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
 					validateFmtDur(perV), n, validateFmtDur(perV*n),
 					validateFmtDur(perS), n, validateFmtDur(perS*n))
 			}
 			return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
 				validateFmtDur(perV), validateFmtDur(perS))
 		}(),
 	)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			func() string {
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
 				s := "Skipped in Validate. "
 				if n > 0 {
 					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
 				} else {
 					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
 				}
 				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
 			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			func() string {
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
 				s := "Skipped in Validate. "
 				if n > 0 {
 					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
 				} else {
 					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
 				}
 				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
 			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
  return enqueueSATTarget(target, overrides)
    .then(d => streamSATTask(d.task_id, title, false));
 }
-const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
+const nvidiaPerGPUTargets = [];
-const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
  return Promise.resolve(satSelectedGPUIndices());
 }
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
  });
 }
 function runNvidiaValidateSet(target) {
-  return loadSatNvidiaGPUs().then(gpus => {
+  const selected = satSelectedGPUIndices();
-    const selected = satSelectedGPUIndices();
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
-    const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
    if (!picked.length) {
      throw new Error('Select at least one NVIDIA GPU.');
    }
    if (picked.length === 1) {
      const gpu = picked[0];
      return runSATWithOverrides(target, {
        gpu_indices: [Number(gpu.index)],
        display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
      });
    }
    document.getElementById('sat-output').style.display='block';
    document.getElementById('sat-title').textContent = '— ' + target;
    const term = document.getElementById('sat-terminal');
    term.textContent = 'Running ' + target + ' one GPU at a time...\n';
    const labelBase = satLabels()[target] || ('Validate ' + target);
    const runNext = (idx) => {
      if (idx >= picked.length) return Promise.resolve();
      const gpu = picked[idx];
      const gpuLabel = satGPUDisplayName(gpu);
      term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
      return enqueueSATTarget(target, {
        gpu_indices: [Number(gpu.index)],
        display_name: labelBase + ' (' + gpuLabel + ')',
      }).then(d => {
        return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
      }).then(function() {
        return runNext(idx + 1);
      });
    };
    return runNext(0);
  });
 }
 function runAMDValidateSet() {
  const targets = selectedAMDValidateTargets();