From 1cfabc9230b39dcda66a696bf35014e693b1be5b Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Mon, 20 Apr 2026 09:42:19 +0300 Subject: [PATCH] Reset GPUs before power benchmark --- audit/internal/platform/benchmark.go | 55 ++++++++++++++--- audit/internal/platform/benchmark_test.go | 74 +++++++++++++++++++++++ 2 files changed, 121 insertions(+), 8 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 3e06579..cddd630 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -97,6 +97,8 @@ var ( benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`) benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`) benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`) + benchmarkGeteuid = os.Geteuid + benchmarkSleep = time.Sleep ) // benchmarkPrecisionPhases lists the precision categories run as individual @@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po return nil } +func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int { + if len(gpuIndices) == 0 { + return nil + } + if benchmarkGeteuid() != 0 { + if logFunc != nil { + logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped") + } + return append([]int(nil), gpuIndices...) + } + if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil { + for _, p := range killed { + logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) + } + } + var failed []int + for _, idx := range gpuIndices { + name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx) + if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil { + failed = append(failed, idx) + if logFunc != nil { + logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err)) + } + continue + } + if logFunc != nil { + logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx)) + } + benchmarkSleep(time.Second) + } + return failed +} + func benchmarkPowerEngine() string { switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) { case BenchmarkPowerEngineTargetedPower: @@ -4150,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N return "", fmt.Errorf("mkdir %s: %w", runDir, err) } verboseLog := filepath.Join(runDir, "verbose.log") - infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) - if infoErr != nil { - return "", infoErr - } - // Capture full nvidia-smi -q snapshot at the start of the run. - if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { - _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) - } hostname, _ := os.Hostname() result := NvidiaPowerBenchResult{ BenchmarkVersion: benchmarkVersion, @@ -4168,6 +4195,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N SelectedGPUIndices: append([]int(nil), selected...), OverallStatus: "OK", } + if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 { + result.Findings = append(result.Findings, + fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed))) + } + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) + if infoErr != nil { + return "", infoErr + } + // Capture full nvidia-smi -q snapshot at the start of the run. + if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { + _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) + } durationSec := powerBenchDurationSec(opts.Profile) // Sample server idle power before any GPU load. diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index b2293c7..07f17c2 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -1,8 +1,13 @@ package platform import ( + "context" + "os" + "os/exec" + "path/filepath" "strings" "testing" + "time" ) func TestResolveBenchmarkProfile(t *testing.T) { @@ -182,6 +187,75 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) { } } +func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) { + t.Parallel() + + oldGeteuid := benchmarkGeteuid + oldExec := satExecCommand + benchmarkGeteuid = func() int { return 1000 } + satExecCommand = func(name string, args ...string) *exec.Cmd { + t.Fatalf("unexpected command: %s %v", name, args) + return nil + } + t.Cleanup(func() { + benchmarkGeteuid = oldGeteuid + satExecCommand = oldExec + }) + + var logs []string + failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) { + logs = append(logs, line) + }) + if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) { + t.Fatalf("logs=%q want substring %q", got, want) + } + if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 { + t.Fatalf("failed=%v want [0 2]", failed) + } +} + +func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + script := filepath.Join(dir, "nvidia-smi") + argsLog := filepath.Join(dir, "args.log") + if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil { + t.Fatalf("write script: %v", err) + } + + oldGeteuid := benchmarkGeteuid + oldSleep := benchmarkSleep + oldLookPath := satLookPath + benchmarkGeteuid = func() int { return 0 } + benchmarkSleep = func(time.Duration) {} + satLookPath = func(file string) (string, error) { + if file == "nvidia-smi" { + return script, nil + } + return exec.LookPath(file) + } + t.Cleanup(func() { + benchmarkGeteuid = oldGeteuid + benchmarkSleep = oldSleep + satLookPath = oldLookPath + }) + + failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil) + if len(failed) != 0 { + t.Fatalf("failed=%v want no failures", failed) + } + raw, err := os.ReadFile(argsLog) + if err != nil { + t.Fatalf("read args log: %v", err) + } + got := strings.Fields(string(raw)) + want := []string{"-i", "2", "-r", "-i", "5", "-r"} + if strings.Join(got, " ") != strings.Join(want, " ") { + t.Fatalf("args=%v want %v", got, want) + } +} + func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { t.Parallel()