Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs
A single dcgmproftester process without -i only loads GPU 0 regardless of CUDA_VISIBLE_DEVICES. Multi-GPU runs are now always routed through bee-dcgmproftester-staggered (with --stagger-seconds 0 for parallel mode), which spawns one dcgmproftester process per GPU so that all selected GPUs are loaded simultaneously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
|||||||
profCmd []string
|
profCmd []string
|
||||||
profEnv []string
|
profEnv []string
|
||||||
)
|
)
|
||||||
if staggerSec > 0 && len(selected) > 1 {
|
if len(selected) > 1 {
|
||||||
|
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||||
|
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||||
|
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||||
|
// of CUDA_VISIBLE_DEVICES.
|
||||||
|
stagger := staggerSec
|
||||||
|
if stagger < 0 {
|
||||||
|
stagger = 0
|
||||||
|
}
|
||||||
profCmd = []string{
|
profCmd = []string{
|
||||||
"bee-dcgmproftester-staggered",
|
"bee-dcgmproftester-staggered",
|
||||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
"--stagger-seconds", strconv.Itoa(stagger),
|
||||||
"--devices", joinIndexList(selected),
|
"--devices", joinIndexList(selected),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
Reference in New Issue
Block a user