Update power benchmark GPU reset flow
This commit is contained in:
@@ -4195,10 +4195,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
SelectedGPUIndices: append([]int(nil), selected...),
|
||||
OverallStatus: "OK",
|
||||
}
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
|
||||
}
|
||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||
if infoErr != nil {
|
||||
return "", infoErr
|
||||
@@ -4232,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
|
||||
Reference in New Issue
Block a user