Stabilize NVIDIA GPU device mapping across loaders

This commit is contained in:
Mikhail Chusavitin
2026-04-06 12:22:04 +03:00
parent 981315e6fd
commit 35f4c53887
6 changed files with 27 additions and 8 deletions

View File

@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
"-g", strconv.Itoa(len(gpuIndices)),
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
}
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
env := []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)

View File

@@ -630,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
return []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {

View File

@@ -253,11 +253,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
if len(env) != 2 {
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
}
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
}
}