From a6a07f262636e3d04c3f24e40e258240aa73a5e6 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Tue, 14 Apr 2026 22:05:23 +0300 Subject: [PATCH] Replace linear power derate with binary search + telemetry-guided jump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Power calibration previously stepped down 25 W at a time (linear), requiring up to 6 attempts to find a stable limit within a 150 W range. New strategy: - Binary search between minLimitW (lo, assumed stable floor) and the starting/failed limit (hi, confirmed unstable), converging within a 10 W tolerance in ~4 attempts. - For thermal throttle: the first-quarter telemetry rows estimate the GPU's pre-throttle power draw. nextLimit = roundTo5W(onset - 10 W) is used as the initial candidate instead of the binary midpoint, landing much closer to the true limit on the first step. - On success: lo is updated and a higher level is tried (binary search upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is found rather than the first stable one. - Let targeted_power run to natural completion on throttle (no mid-run SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before the next attempt. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 120 ++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 11 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 9a52e5c..a182582 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -2472,8 +2472,13 @@ func runBenchmarkPowerCalibration( logFunc func(string), ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 - const derateStepW = 25 const maxDerateW = 150 + // calibSearchTolerance is the binary-search convergence threshold in watts. + // When hi-lo ≤ this, the highest verified-stable limit (lo) is used. 
+ const calibSearchTolerance = 10 + // calibPreThrottleMarginW is subtracted from the telemetry-estimated + // pre-throttle power draw to produce a smarter initial search candidate. + const calibPreThrottleMarginW = 10 // dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM // returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, … // doubling each retry until it would exceed the cap, at which point the @@ -2523,13 +2528,18 @@ func runBenchmarkPowerCalibration( case appliedLimitW > 0: minLimitW = appliedLimitW - maxDerateW } - if minLimitW < derateStepW { - minLimitW = derateStepW + if minLimitW < calibSearchTolerance { + minLimitW = calibSearchTolerance } calib := benchmarkPowerCalibrationResult{ AppliedPowerLimitW: float64(appliedLimitW), } + // Binary search bounds for finding the highest stable power limit. + // lo = highest verified-stable level (assumed: minLimitW). + // hi = lowest verified-unstable level (assumed: above the starting limit). + lo := minLimitW + hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable busyRetries := 0 busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec if canDerate && originalLimitW > 0 { @@ -2573,9 +2583,15 @@ func runBenchmarkPowerCalibration( if err != nil { continue } - if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" { + // Record the throttle reason but do NOT cancel the dcgmi + // process. Killing it mid-run leaves nv-hostengine holding + // the diagnostic slot, which causes DCGM_ST_IN_USE on every + // subsequent attempt. Let targeted_power run to its natural + // end so the daemon releases the slot cleanly before we + // reduce power and retry. 
+ if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" { throttleReason = reason - cancel() + logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW)) } case <-ctx.Done(): cancel() @@ -2590,10 +2606,25 @@ func runBenchmarkPowerCalibration( perGPU := filterRowsByGPU(attempt.rows, idx) summary := summarizeBenchmarkTelemetry(perGPU) if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { + // Stable at appliedLimitW: record it and binary-search upward. calib.Summary = summary calib.Completed = true calib.AppliedPowerLimitW = float64(appliedLimitW) logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) + lo = appliedLimitW + // If there is still headroom to search, try a higher level. + if canDerate && hi-lo > calibSearchTolerance { + nextLimitW := roundTo5W((lo + hi) / 2) + if nextLimitW > lo && nextLimitW < hi { + if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil { + appliedLimitW = nextLimitW + calib.AppliedPowerLimitW = float64(appliedLimitW) + calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi)) + logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW)) + continue calibLoop + } + } + } break } @@ -2667,22 +2698,62 @@ func runBenchmarkPowerCalibration( if !canDerate || appliedLimitW <= 0 { break } - nextLimitW := appliedLimitW - derateStepW + // Binary-search for the highest stable power limit. + // This attempt failed or throttled, so update the upper bound. + hi = appliedLimitW + + if hi-lo <= calibSearchTolerance { + // Search range exhausted: lo is the highest verified-stable level. 
+ if lo > minLimitW { + calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi)) + if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil { + appliedLimitW = lo + calib.AppliedPowerLimitW = float64(lo) + calib.Derated = lo < originalLimitW + } + } else { + calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) + } + break + } + + // Compute the next candidate. + // For thermal throttle: use the pre-throttle power draw from telemetry + // as a smarter initial estimate instead of the binary midpoint — it + // lands much closer to the true limit on the first attempt. + nextLimitW := (lo + hi) / 2 + if strings.Contains(throttleReason, "thermal") { + if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 { + candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW) + if candidate > lo && candidate < hi { + nextLimitW = candidate + } + } + } + nextLimitW = roundTo5W(nextLimitW) + // Ensure the candidate is strictly inside the search range. 
+ if nextLimitW <= lo { + nextLimitW = lo + calibSearchTolerance + } + if nextLimitW >= hi { + nextLimitW = (lo + hi) / 2 + } if nextLimitW < minLimitW { - calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW)) + calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) break } if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { - calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error()) - logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err)) + calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error()) + logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err)) break } appliedLimitW = nextLimitW calib.AppliedPowerLimitW = float64(appliedLimitW) - calib.Derated = true + calib.Derated = appliedLimitW < originalLimitW info.PowerLimitW = float64(appliedLimitW) infoByIndex[idx] = info - calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW)) + calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi)) + logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi)) } if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { @@ -2699,6 +2770,33 @@ func isDCGMResourceBusy(err error) bool { return errors.As(err, &exitErr) && exitErr.ExitCode() == 222 } +// calibPreThrottlePowerW estimates the GPU power draw just before thermal +// throttle onset by averaging the first quarter of telemetry rows. The early +// samples capture the GPU at peak before clock/power reduction kicks in. 
+func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
+	if len(rows) < 4 { // too few samples to carve out a leading quarter
+		return 0
+	}
+	n := len(rows) / 4 // size of the leading quarter; at least 1 because len(rows) >= 4
+	var sum float64
+	var cnt int
+	for _, r := range rows[:n] {
+		if r.PowerW > 0 { // only rows with a positive power reading count toward the mean
+			sum += r.PowerW
+			cnt++
+		}
+	}
+	if cnt == 0 { // no usable reading: 0 means "no estimate" (the caller checks onsetW > 0)
+		return 0
+	}
+	return sum / float64(cnt)
+}
+
+// roundTo5W rounds w to the nearest multiple of 5 W, symmetrically for negative w.
+func roundTo5W(w int) int {
+	return int(math.Round(float64(w)/5)) * 5 // integer ((w+2)/5)*5 truncates toward zero and misrounds w < 0
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability: