Compare commits
1 Commit
| Author | SHA1 | Date | |
|---|---|---|---|
| 19dbabd71d |
@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
|
||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||
const calibSearchTolerance = 10
|
||||
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
|
||||
// pre-throttle power draw to produce a smarter initial search candidate.
|
||||
const calibPreThrottleMarginW = 10
|
||||
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||||
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||||
// doubling each retry until it would exceed the cap, at which point the
|
||||
@@ -2717,20 +2714,8 @@ func runBenchmarkPowerCalibration(
|
||||
break
|
||||
}
|
||||
|
||||
// Compute the next candidate.
|
||||
// For thermal throttle: use the pre-throttle power draw from telemetry
|
||||
// as a smarter initial estimate instead of the binary midpoint — it
|
||||
// lands much closer to the true limit on the first attempt.
|
||||
nextLimitW := (lo + hi) / 2
|
||||
if strings.Contains(throttleReason, "thermal") {
|
||||
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
|
||||
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
|
||||
if candidate > lo && candidate < hi {
|
||||
nextLimitW = candidate
|
||||
}
|
||||
}
|
||||
}
|
||||
nextLimitW = roundTo5W(nextLimitW)
|
||||
// Binary midpoint within the remaining search range.
|
||||
nextLimitW := roundTo5W((lo + hi) / 2)
|
||||
// Ensure the candidate is strictly inside the search range.
|
||||
if nextLimitW <= lo {
|
||||
nextLimitW = lo + calibSearchTolerance
|
||||
@@ -2770,28 +2755,6 @@ func isDCGMResourceBusy(err error) bool {
|
||||
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
||||
}
|
||||
|
||||
// calibPreThrottlePowerW estimates the GPU power draw just before thermal
|
||||
// throttle onset by averaging the first quarter of telemetry rows. The early
|
||||
// samples capture the GPU at peak before clock/power reduction kicks in.
|
||||
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
|
||||
if len(rows) < 4 {
|
||||
return 0
|
||||
}
|
||||
n := len(rows) / 4
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range rows[:n] {
|
||||
if r.PowerW > 0 {
|
||||
sum += r.PowerW
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt == 0 {
|
||||
return 0
|
||||
}
|
||||
return sum / float64(cnt)
|
||||
}
|
||||
|
||||
// roundTo5W rounds w to the nearest 5 W boundary.
|
||||
func roundTo5W(w int) int {
|
||||
return ((w + 2) / 5) * 5
|
||||
|
||||
Reference in New Issue
Block a user