Reset GPUs before power benchmark
This commit is contained in:
@@ -97,6 +97,8 @@ var (
|
||||
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||
benchmarkGeteuid = os.Geteuid
|
||||
benchmarkSleep = time.Sleep
|
||||
)
|
||||
|
||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
}
|
||||
if benchmarkGeteuid() != 0 {
|
||||
if logFunc != nil {
|
||||
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
|
||||
}
|
||||
return append([]int(nil), gpuIndices...)
|
||||
}
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
var failed []int
|
||||
for _, idx := range gpuIndices {
|
||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||
failed = append(failed, idx)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||
}
|
||||
continue
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
|
||||
}
|
||||
benchmarkSleep(time.Second)
|
||||
}
|
||||
return failed
|
||||
}
|
||||
|
||||
func benchmarkPowerEngine() string {
|
||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||
case BenchmarkPowerEngineTargetedPower:
|
||||
@@ -4150,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||
if infoErr != nil {
|
||||
return "", infoErr
|
||||
}
|
||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||
}
|
||||
hostname, _ := os.Hostname()
|
||||
result := NvidiaPowerBenchResult{
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
@@ -4168,6 +4195,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
SelectedGPUIndices: append([]int(nil), selected...),
|
||||
OverallStatus: "OK",
|
||||
}
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
|
||||
}
|
||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||
if infoErr != nil {
|
||||
return "", infoErr
|
||||
}
|
||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||
}
|
||||
durationSec := powerBenchDurationSec(opts.Profile)
|
||||
|
||||
// Sample server idle power before any GPU load.
|
||||
|
||||
Reference in New Issue
Block a user