Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs
A single dcgmproftester process without -i only loads GPU 0 regardless of CUDA_VISIBLE_DEVICES. All multi-GPU runs are now routed through bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode), which spawns one dcgmproftester process per GPU so that all GPUs are loaded simultaneously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
if len(selected) > 1 {
|
||||
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||
// of CUDA_VISIBLE_DEVICES.
|
||||
stagger := staggerSec
|
||||
if stagger < 0 {
|
||||
stagger = 0
|
||||
}
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--stagger-seconds", strconv.Itoa(stagger),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user