From df1385d3d65c0ea2e94360e9e73f0259af899a32 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 19 Apr 2026 18:31:34 +0300 Subject: [PATCH] Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs A single dcgmproftester process without -i only loads GPU 0 regardless of CUDA_VISIBLE_DEVICES. Now always routes multi-GPU runs through bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode), which spawns one process per GPU so all GPUs are loaded simultaneously. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/sat.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 349d5aa..fb15b8c 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin profCmd []string profEnv []string ) - if staggerSec > 0 && len(selected) > 1 { + if len(selected) > 1 { + // For multiple GPUs, always spawn one dcgmproftester process per GPU via + // bee-dcgmproftester-staggered (stagger=0 means all start simultaneously). + // A single dcgmproftester process without -i only loads GPU 0 regardless + // of CUDA_VISIBLE_DEVICES. + stagger := staggerSec + if stagger < 0 { + stagger = 0 + } profCmd = []string{ "bee-dcgmproftester-staggered", "--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)), - "--stagger-seconds", strconv.Itoa(staggerSec), + "--stagger-seconds", strconv.Itoa(stagger), "--devices", joinIndexList(selected), } } else {