Update power benchmark GPU reset flow

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:46:00 +03:00
parent 84a2551dc0
commit 5f0103635b
2 changed files with 4 additions and 4 deletions

BIN
audit/bee

Binary file not shown.

View File

@@ -4195,10 +4195,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
-if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
-result.Findings = append(result.Findings,
-fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
-}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
@@ -4232,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
+if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
+result.Findings = append(result.Findings,
+fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
+}
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)