Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs

A single dcgmproftester process without -i only loads GPU 0 regardless of
CUDA_VISIBLE_DEVICES. Multi-GPU runs are now always routed through
bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode),
which spawns one process per GPU so all GPUs are loaded simultaneously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 18:31:34 +03:00
parent f8cd9a7376
commit df1385d3d6

View File

@@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
profCmd []string
profEnv []string
)
if staggerSec > 0 && len(selected) > 1 {
if len(selected) > 1 {
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
// A single dcgmproftester process without -i only loads GPU 0 regardless
// of CUDA_VISIBLE_DEVICES.
stagger := staggerSec
if stagger < 0 {
stagger = 0
}
profCmd = []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
"--stagger-seconds", strconv.Itoa(staggerSec),
"--stagger-seconds", strconv.Itoa(stagger),
"--devices", joinIndexList(selected),
}
} else {