Reset GPUs before power benchmark

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:42:19 +03:00
parent 5dc711de23
commit 1cfabc9230
2 changed files with 121 additions and 8 deletions

View File

@@ -97,6 +97,8 @@ var (
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkSleep = time.Sleep
)
// benchmarkPrecisionPhases lists the precision categories run as individual
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
if len(gpuIndices) == 0 {
return nil
}
if benchmarkGeteuid() != 0 {
if logFunc != nil {
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
}
return append([]int(nil), gpuIndices...)
}
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
for _, p := range killed {
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
var failed []int
for _, idx := range gpuIndices {
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
failed = append(failed, idx)
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
}
continue
}
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
}
benchmarkSleep(time.Second)
}
return failed
}
func benchmarkPowerEngine() string {
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
case BenchmarkPowerEngineTargetedPower:
@@ -4150,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
@@ -4168,6 +4195,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
durationSec := powerBenchDurationSec(opts.Profile)
// Sample server idle power before any GPU load.

View File

@@ -1,8 +1,13 @@
package platform
import (
"context"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"time"
)
func TestResolveBenchmarkProfile(t *testing.T) {
@@ -182,6 +187,75 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
}
}
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
t.Parallel()
oldGeteuid := benchmarkGeteuid
oldExec := satExecCommand
benchmarkGeteuid = func() int { return 1000 }
satExecCommand = func(name string, args ...string) *exec.Cmd {
t.Fatalf("unexpected command: %s %v", name, args)
return nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
satExecCommand = oldExec
})
var logs []string
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
logs = append(logs, line)
})
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
t.Fatalf("logs=%q want substring %q", got, want)
}
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
t.Fatalf("failed=%v want [0 2]", failed)
}
}
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
t.Parallel()
dir := t.TempDir()
script := filepath.Join(dir, "nvidia-smi")
argsLog := filepath.Join(dir, "args.log")
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
t.Fatalf("write script: %v", err)
}
oldGeteuid := benchmarkGeteuid
oldSleep := benchmarkSleep
oldLookPath := satLookPath
benchmarkGeteuid = func() int { return 0 }
benchmarkSleep = func(time.Duration) {}
satLookPath = func(file string) (string, error) {
if file == "nvidia-smi" {
return script, nil
}
return exec.LookPath(file)
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
benchmarkSleep = oldSleep
satLookPath = oldLookPath
})
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
if len(failed) != 0 {
t.Fatalf("failed=%v want no failures", failed)
}
raw, err := os.ReadFile(argsLog)
if err != nil {
t.Fatalf("read args log: %v", err)
}
got := strings.Fields(string(raw))
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
if strings.Join(got, " ") != strings.Join(want, " ") {
t.Fatalf("args=%v want %v", got, want)
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()