Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs

A single dcgmproftester process without -i only loads GPU 0 regardless of
CUDA_VISIBLE_DEVICES. Multi-GPU runs are now always routed through
bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode),
which spawns one process per GPU so all GPUs are loaded simultaneously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 18:31:34 +03:00
parent f8cd9a7376
commit df1385d3d6

View File

@@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
profCmd []string
profEnv []string
)
if staggerSec > 0 && len(selected) > 1 {
if len(selected) > 1 {
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
// A single dcgmproftester process without -i only loads GPU 0 regardless
// of CUDA_VISIBLE_DEVICES.
stagger := staggerSec
if stagger < 0 {
stagger = 0
}
profCmd = []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
"--stagger-seconds", strconv.Itoa(staggerSec),
"--stagger-seconds", strconv.Itoa(stagger),
"--devices", joinIndexList(selected),
}
} else {