Compare commits

...

2 Commits
v8.2 ... v8.5

Author SHA1 Message Date
19dbabd71d Simplify power calibration: pure binary search, no telemetry guessing
Remove telemetry-guided initial candidate; use strict binary search
midpoint at every step. Clean and predictable convergence in O(log N)
attempts within the allowed power range [minLimitW, startingLimitW].

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:12:45 +03:00
a6a07f2626 Replace linear power derate with binary search + telemetry-guided jump
Power calibration previously stepped down 25 W at a time (linear),
requiring up to 6 attempts to find a stable limit within a 150 W range.

New strategy:
- Binary search between minLimitW (lo, assumed stable floor) and the
  starting/failed limit (hi, confirmed unstable), converging within a
  10 W tolerance in ~4 attempts.
- For thermal throttle: the first-quarter telemetry rows estimate the
  GPU's pre-throttle power draw. nextLimit = round5W(onset - 10 W) is
  used as the initial candidate instead of the binary midpoint, landing
  much closer to the true limit on the first step.
- On success: lo is updated and a higher level is tried (binary search
  upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is
  found rather than the first stable one.
- Let targeted_power run to natural completion on throttle (no mid-run
  SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before
  the next attempt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:05:23 +03:00

View File

@@ -2472,8 +2472,10 @@ func runBenchmarkPowerCalibration(
logFunc func(string),
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
@@ -2523,13 +2525,18 @@ func runBenchmarkPowerCalibration(
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
}
if minLimitW < derateStepW {
minLimitW = derateStepW
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
}
calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
}
// Binary search bounds for finding the highest stable power limit.
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 {
@@ -2573,9 +2580,15 @@ func runBenchmarkPowerCalibration(
if err != nil {
continue
}
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
// Record the throttle reason but do NOT cancel the dcgmi
// process. Killing it mid-run leaves nv-hostengine holding
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
// subsequent attempt. Let targeted_power run to its natural
// end so the daemon releases the slot cleanly before we
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
cancel()
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
}
case <-ctx.Done():
cancel()
@@ -2590,10 +2603,25 @@ func runBenchmarkPowerCalibration(
perGPU := filterRowsByGPU(attempt.rows, idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
lo = appliedLimitW
// If there is still headroom to search, try a higher level.
if canDerate && hi-lo > calibSearchTolerance {
nextLimitW := roundTo5W((lo + hi) / 2)
if nextLimitW > lo && nextLimitW < hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
continue calibLoop
}
}
}
break
}
@@ -2667,22 +2695,50 @@ func runBenchmarkPowerCalibration(
if !canDerate || appliedLimitW <= 0 {
break
}
nextLimitW := appliedLimitW - derateStepW
// Binary-search for the highest stable power limit.
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
if hi-lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level.
if lo > minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
appliedLimitW = lo
calib.AppliedPowerLimitW = float64(lo)
calib.Derated = lo < originalLimitW
}
} else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
break
}
// Binary midpoint within the remaining search range.
nextLimitW := roundTo5W((lo + hi) / 2)
// Ensure the candidate is strictly inside the search range.
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
}
if nextLimitW >= hi {
nextLimitW = (lo + hi) / 2
}
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = true
calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
@@ -2699,6 +2755,11 @@ func isDCGMResourceBusy(err error) bool {
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
// roundTo5W rounds w to the nearest multiple of 5 W.
//
// An integer is never exactly halfway between two multiples of 5, so no
// tie-breaking rule is needed. Negative inputs are rounded symmetrically
// (e.g. -3 -> -5); the previous add-then-truncate form relied on Go's
// truncating integer division and pulled negative values toward zero.
func roundTo5W(w int) int {
	const stepW = 5
	// rem carries the sign of w and lies strictly between -stepW and stepW.
	rem := w % stepW
	base := w - rem
	switch {
	case rem >= 3:
		// Closer to the next multiple above.
		return base + stepW
	case rem <= -3:
		// Closer to the next multiple below (negative inputs only).
		return base - stepW
	}
	return base
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability: