Move power diag tests to validate/stress; fix GPU burn power saturation

- bee-gpu-stress.c: remove per-wave cuCtxSynchronize barrier in both cuBLASLt and PTX hot loops; sync at most once/sec so the GPU queue stays continuously full — eliminates the CPU↔GPU ping-pong that prevented reaching full TDP
- sat_fan_stress.go: default SizeMB 0 (auto = 95% VRAM) instead of hardcoded 64 MB; tiny matrices caused <0.1 ms kernels where CPU re-queue overhead dominated
- pages.go: move nvidia-targeted-power and nvidia-pulse from Burn → Validate stress section alongside nvidia-targeted-stress; these are DCGM pass/fail diagnostics, not sustained burn loads; remove the Power Delivery / Power Budget card from Burn entirely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 00:13:52 +03:00
parent 0a4bb596f6
commit d1a22d782d
3 changed files with 196 additions and 150 deletions
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1036,10 +1036,12 @@ func renderValidate(opts HandlerOptions) string {
  <div class="card-body validate-profile-body">
    <div class="validate-profile-col">
      <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
-      <div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
+      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
    </div>
    <div class="validate-profile-col validate-profile-action">
-      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
+      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
      <button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
    </div>
    <div class="validate-profile-col"></div>
@@ -1054,19 +1056,19 @@ func renderValidate(opts HandlerOptions) string {
 		inv.CPU,
 		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
 		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
-		`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
+		`60s in Validate, 30 min in Stress.`,
 	)) +
 		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
 			inv.Memory,
-			`Runs a short RAM validation pass and records memory state around the test.`,
+			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`No extra settings.`,
+			`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
 			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
 			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
-			`No extra settings.`,
+			`Short self-test in Validate, extended self-test in Stress.`,
 		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1091,14 +1093,32 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Runs NVIDIA diagnostics and board inventory checks.`,
 			`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-			`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
+			`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
 		)) +
+		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
-			`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
+			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
+			`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-targeted-power">` +
+		renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag targeted_power</code>`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-pulse">` +
+		renderSATCard("nvidia-pulse", "NVIDIA Pulse Test", "runNvidiaValidateSet('nvidia-pulse')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies GPU transient power response using DCGM pulse load. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag pulse_test</code>`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
 ` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
@@ -1125,17 +1145,26 @@ func renderValidate(opts HandlerOptions) string {
 </style>
 <script>
 let satES = null;
-function satDiagLevel() {
-  return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
+function satStressMode() {
+  return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
 }
-function satCPUDurationFromDiagLevel() {
-  const level = satDiagLevel();
-  if (level === 1) return 60;
-  if (level === 2) return 5 * 60;
-  return 60 * 60;
+function satModeChanged() {
+  const stress = satStressMode();
+  [
+    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
+    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
+    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
+  ].forEach(function(item) {
+    const card = document.getElementById(item.card);
+    if (card) {
+      card.style.opacity = stress ? '1' : '0.5';
+      const hint = document.getElementById(item.hint);
+      if (hint) hint.style.display = stress ? 'none' : '';
+    }
+  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1211,9 +1240,8 @@ function satRequestBody(target, overrides) {
  const body = {};
  const labels = satLabels();
  body.display_name = labels[target] || ('Validate ' + target);
-  if (target === 'nvidia') body.diag_level = satDiagLevel();
-  if (target === 'nvidia-targeted-stress') body.duration = 300;
-  if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
+  body.stress_mode = satStressMode();
+  if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
  if (overrides) {
    Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
  }
@@ -1275,8 +1303,9 @@ function runSATWithOverrides(target, overrides) {
  return enqueueSATTarget(target, overrides)
    .then(d => streamSATTask(d.task_id, title, false));
 }
+const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
 function expandSATTarget(target) {
-  if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
+  if (nvidiaPerGPUTargets.indexOf(target) < 0) {
    return Promise.resolve([{target: target}]);
  }
  const selected = satSelectedGPUIndices();
@@ -1354,8 +1383,10 @@ function runAllSAT() {
  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
+    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
    const btn = document.getElementById('sat-btn-' + target);
    return !(btn && btn.disabled);
  });
@@ -1390,6 +1421,8 @@ function runAllSAT() {
 fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
    if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
    if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
    if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
 });
@@ -1583,10 +1616,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────

 type benchmarkHistoryColumn struct {
-	key   string
-	label string
-	name  string
-	index int
+	key      string
+	label    string
+	name     string
+	index    int
+	parallel bool
 }

 type benchmarkHistoryCell struct {
@@ -1894,29 +1928,43 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 			cells:       make(map[string]benchmarkHistoryCell),
 		}

-		// Count how many GPUs of each model appear in this run (for the label).
-		gpuModelCount := make(map[string]int)
-		for _, gpu := range result.GPUs {
-			gpuModelCount[strings.TrimSpace(gpu.Name)]++
-		}
-
-		// Track best composite score per column key within this run.
-		runBest := make(map[string]float64)
-		for _, gpu := range result.GPUs {
-			key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
-			count := gpuModelCount[strings.TrimSpace(gpu.Name)]
-			columnByKey[key] = benchmarkHistoryColumn{
-				key:   key,
-				label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
-				name:  strings.TrimSpace(gpu.Name),
-				index: gpu.Index,
+		if result.ParallelGPUs {
+			// All GPUs ran simultaneously — one column per server, score = avg composite.
+			gpuModelCount := make(map[string]int)
+			for _, gpu := range result.GPUs {
+				gpuModelCount[strings.TrimSpace(gpu.Name)]++
 			}
-			if gpu.Scores.CompositeScore > runBest[key] {
-				runBest[key] = gpu.Scores.CompositeScore
+			scoreSum := make(map[string]float64)
+			scoreCnt := make(map[string]int)
+			for _, gpu := range result.GPUs {
+				key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
+				scoreSum[key] += gpu.Scores.CompositeScore
+				scoreCnt[key]++
+				count := gpuModelCount[strings.TrimSpace(gpu.Name)]
+				columnByKey[key] = benchmarkHistoryColumn{
+					key:      key,
+					label:    benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
+					name:     strings.TrimSpace(gpu.Name),
+					index:    -1,
+					parallel: true,
+				}
+			}
+			for key, sum := range scoreSum {
+				run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
+			}
+		} else {
+			// Each GPU ran independently — one column per GPU index.
+			for _, gpu := range result.GPUs {
+				key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
+				columnByKey[key] = benchmarkHistoryColumn{
+					key:      key,
+					label:    benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
+					name:     strings.TrimSpace(gpu.Name),
+					index:    gpu.Index,
+					parallel: false,
+				}
+				run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
 			}
-		}
-		for key, score := range runBest {
-			run.cells[key] = benchmarkHistoryCell{score: score, present: true}
 		}
 		runs = append(runs, run)
 	}
@@ -1925,13 +1973,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	for _, col := range columnByKey {
 		columns = append(columns, col)
 	}
+	// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
 	sort.Slice(columns, func(i, j int) bool {
-		li := strings.ToLower(columns[i].label)
-		lj := strings.ToLower(columns[j].label)
-		if li != lj {
-			return li < lj
+		if columns[i].parallel != columns[j].parallel {
+			return !columns[i].parallel // sequential first
 		}
-		return columns[i].key < columns[j].key
+		if columns[i].parallel {
+			li := strings.ToLower(columns[i].label)
+			lj := strings.ToLower(columns[j].label)
+			if li != lj {
+				return li < lj
+			}
+			return columns[i].key < columns[j].key
+		}
+		// Sequential: sort by GPU index, then name.
+		if columns[i].index != columns[j].index {
+			return columns[i].index < columns[j].index
+		}
+		return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
 	})
 	sort.Slice(runs, func(i, j int) bool {
 		return runs[i].generatedAt.After(runs[j].generatedAt)
@@ -1939,25 +1998,28 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	return columns, runs
 }

-// benchmarkHistoryColumnKey groups results by server model + GPU model so that
-// runs on the same hardware produce one column regardless of individual GPU index.
-func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
-	return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
+// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
+func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
+	gpuName = strings.TrimSpace(gpuName)
+	if gpuName == "" {
+		gpuName = "Unknown GPU"
+	}
+	return fmt.Sprintf("GPU #%d — %s", index, gpuName)
 }

-// benchmarkHistoryColumnLabel formats the column header as
-// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
-func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
+// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
+// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
+func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
 	serverModel = strings.TrimSpace(serverModel)
 	gpuName = strings.TrimSpace(gpuName)
 	if gpuName == "" {
 		gpuName = "Unknown GPU"
 	}
-	gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
+	gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
 	if serverModel == "" {
 		return gpuPart
 	}
-	return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
+	return fmt.Sprintf("%s — %s", serverModel, gpuPart)
 }

 // ── Burn ──────────────────────────────────────────────────────────────────────
@@ -2031,15 +2093,6 @@ func renderBurn() string {

 <div class="burn-section">GPU-Specific Tests</div>
 <div class="grid2 burn-grid" style="margin-bottom:16px">
-<div class="card burn-card">
-  <div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
-  <div class="card-body burn-card-body">
-    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
-    <label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
-  </div>
-</div>
-
 <div class="card burn-card">
  <div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
  <div class="card-body burn-card-body">
@@ -2299,8 +2352,6 @@ function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
-    {id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
-    {id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
    {id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
    {id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
@@ -2317,8 +2368,6 @@ function runAllBurnTasks() {
 fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
-    'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
-    'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
    'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
    'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},