Update power benchmark GPU reset flow
This commit is contained in:
@@ -4195,10 +4195,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
|
|
||||||
result.Findings = append(result.Findings,
|
|
||||||
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
|
|
||||||
}
|
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
return "", infoErr
|
return "", infoErr
|
||||||
@@ -4232,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
|
result.Findings = append(result.Findings,
|
||||||
|
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
|
|||||||
Reference in New Issue
Block a user