Stabilize NVIDIA GPU device mapping across loaders
This commit is contained in:
@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
|
||||
"-g", strconv.Itoa(len(gpuIndices)),
|
||||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||||
}
|
||||
- env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
||||
+ env := []string{
|
||||
+ "CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||
+ "CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||
+ }
|
||||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||||
|
||||
Reference in New Issue
Block a user