Simplify power calibration: pure binary search, no telemetry guessing

Remove the telemetry-guided initial candidate and use a strict binary-search midpoint at every step. This gives clean, predictable convergence in O(log N) attempts, where N is the width in watts of the allowed power range [minLimitW, startingLimitW]. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Replace linear power derate with binary search + telemetry-guided jump
2026-04-14 22:12:45 +03:00 · 2026-04-14 22:05:23 +03:00
1 changed files with 72 additions and 11 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -2472,8 +2472,10 @@ func runBenchmarkPowerCalibration(
 	logFunc func(string),
 ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
 	const calibDurationSec = 120
-	const derateStepW = 25
 	const maxDerateW = 150
+	// calibSearchTolerance is the binary-search convergence threshold in watts.
+	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
+	const calibSearchTolerance = 10
 	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
 	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
 	// doubling each retry until it would exceed the cap, at which point the
@@ -2523,13 +2525,18 @@ func runBenchmarkPowerCalibration(
 		case appliedLimitW > 0:
 			minLimitW = appliedLimitW - maxDerateW
 		}
-		if minLimitW < derateStepW {
-			minLimitW = derateStepW
+		if minLimitW < calibSearchTolerance {
+			minLimitW = calibSearchTolerance
 		}

 		calib := benchmarkPowerCalibrationResult{
 			AppliedPowerLimitW: float64(appliedLimitW),
 		}
+		// Binary search bounds for finding the highest stable power limit.
+		// lo = highest verified-stable level (assumed: minLimitW).
+		// hi = lowest verified-unstable level (assumed: above the starting limit).
+		lo := minLimitW
+		hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
 		busyRetries := 0
 		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
@@ -2573,9 +2580,15 @@ func runBenchmarkPowerCalibration(
 					if err != nil {
 						continue
 					}
-					if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
+					// Record the throttle reason but do NOT cancel the dcgmi
+					// process. Killing it mid-run leaves nv-hostengine holding
+					// the diagnostic slot, which causes DCGM_ST_IN_USE on every
+					// subsequent attempt. Let targeted_power run to its natural
+					// end so the daemon releases the slot cleanly before we
+					// reduce power and retry.
+					if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
 						throttleReason = reason
-						cancel()
+						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
 					}
 				case <-ctx.Done():
 					cancel()
@@ -2590,10 +2603,25 @@ func runBenchmarkPowerCalibration(
 			perGPU := filterRowsByGPU(attempt.rows, idx)
 			summary := summarizeBenchmarkTelemetry(perGPU)
 			if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
+				// Stable at appliedLimitW: record it and binary-search upward.
 				calib.Summary = summary
 				calib.Completed = true
 				calib.AppliedPowerLimitW = float64(appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
+				lo = appliedLimitW
+				// If there is still headroom to search, try a higher level.
+				if canDerate && hi-lo > calibSearchTolerance {
+					nextLimitW := roundTo5W((lo + hi) / 2)
+					if nextLimitW > lo && nextLimitW < hi {
+						if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
+							appliedLimitW = nextLimitW
+							calib.AppliedPowerLimitW = float64(appliedLimitW)
+							calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
+							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
+							continue calibLoop
+						}
+					}
+				}
 				break
 			}

@@ -2667,22 +2695,50 @@ func runBenchmarkPowerCalibration(
 			if !canDerate || appliedLimitW <= 0 {
 				break
 			}
-			nextLimitW := appliedLimitW - derateStepW
+			// Binary-search for the highest stable power limit.
+			// This attempt failed or throttled, so update the upper bound.
+			hi = appliedLimitW
+
+			if hi-lo <= calibSearchTolerance {
+				// Search range exhausted: lo is the highest verified-stable level, unless it is still at the assumed floor minLimitW.
+				if lo > minLimitW {
+					calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
+					if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
+						appliedLimitW = lo
+						calib.AppliedPowerLimitW = float64(lo)
+						calib.Derated = lo < originalLimitW
+					}
+				} else {
+					calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
+				}
+				break
+			}
+
+			// Binary midpoint within the remaining search range.
+			nextLimitW := roundTo5W((lo + hi) / 2)
+			// Ensure the candidate is strictly inside the search range.
+			if nextLimitW <= lo {
+				nextLimitW = lo + calibSearchTolerance
+			}
+			if nextLimitW >= hi {
+				nextLimitW = (lo + hi) / 2
+			}
 			if nextLimitW < minLimitW {
-				calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
+				calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				break
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
-				calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
-				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
+				calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
+				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
 				break
 			}
 			appliedLimitW = nextLimitW
 			calib.AppliedPowerLimitW = float64(appliedLimitW)
-			calib.Derated = true
+			calib.Derated = appliedLimitW < originalLimitW
 			info.PowerLimitW = float64(appliedLimitW)
 			infoByIndex[idx] = info
-			calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
+			calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
+			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
 		}

 		if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
@@ -2699,6 +2755,11 @@ func isDCGMResourceBusy(err error) bool {
 	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
 }

+// roundTo5W rounds w to the nearest multiple of 5 W; the result is only correct for w >= 0, because Go integer division truncates toward zero.
+func roundTo5W(w int) int {
+	return ((w + 2) / 5) * 5
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability: