Reset GPUs before power benchmark

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:42:19 +03:00
parent 5dc711de23
commit 1cfabc9230
2 changed files with 121 additions and 8 deletions

View File

@@ -97,6 +97,8 @@ var (
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkSleep = time.Sleep
)
// benchmarkPrecisionPhases lists the precision categories run as individual
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
if len(gpuIndices) == 0 {
return nil
}
if benchmarkGeteuid() != 0 {
if logFunc != nil {
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
}
return append([]int(nil), gpuIndices...)
}
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
for _, p := range killed {
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
var failed []int
for _, idx := range gpuIndices {
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
failed = append(failed, idx)
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
}
continue
}
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
}
benchmarkSleep(time.Second)
}
return failed
}
func benchmarkPowerEngine() string {
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
case BenchmarkPowerEngineTargetedPower:
@@ -4150,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
@@ -4168,6 +4195,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
durationSec := powerBenchDurationSec(opts.Profile)
// Sample server idle power before any GPU load.