Compare commits

..

1 Commit
v8.4 ... v8.5

Author SHA1 Message Date
19dbabd71d Simplify power calibration: pure binary search, no telemetry guessing
Remove the telemetry-guided initial candidate and use the strict binary-search
midpoint at every step. This gives clean, predictable convergence in O(log N)
attempts within the allowed power range [minLimitW, startingLimitW].

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:12:45 +03:00

View File

@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
// pre-throttle power draw to produce a smarter initial search candidate.
const calibPreThrottleMarginW = 10
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
@@ -2717,20 +2714,8 @@ func runBenchmarkPowerCalibration(
break
}
// Compute the next candidate.
// For thermal throttle: use the pre-throttle power draw from telemetry
// as a smarter initial estimate instead of the binary midpoint — it
// lands much closer to the true limit on the first attempt.
nextLimitW := (lo + hi) / 2
if strings.Contains(throttleReason, "thermal") {
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
if candidate > lo && candidate < hi {
nextLimitW = candidate
}
}
}
nextLimitW = roundTo5W(nextLimitW)
// Binary midpoint within the remaining search range.
nextLimitW := roundTo5W((lo + hi) / 2)
// Ensure the candidate is strictly inside the search range.
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
@@ -2770,28 +2755,6 @@ func isDCGMResourceBusy(err error) bool {
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
// calibPreThrottlePowerW returns the mean PowerW of the leading quarter
// (len(rows)/4, rounded down) of the telemetry rows, which serves as an
// estimate of GPU power draw just before thermal throttling set in.
// Rows whose PowerW is not strictly positive are left out of the average.
// The result is 0 when fewer than four rows are supplied or when none of
// the sampled rows carries a positive reading.
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
	// Integer division: fewer than four rows yields an empty window.
	quarter := len(rows) / 4
	if quarter == 0 {
		return 0
	}
	total, samples := 0.0, 0
	for i := 0; i < quarter; i++ {
		if p := rows[i].PowerW; p > 0 {
			total += p
			samples++
		}
	}
	if samples > 0 {
		return total / float64(samples)
	}
	return 0
}
// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
return ((w + 2) / 5) * 5