Compare commits

..

4 Commits
v8.0 ... v8.5

Author SHA1 Message Date
19dbabd71d Simplify power calibration: pure binary search, no telemetry guessing
Remove telemetry-guided initial candidate; use strict binary search
midpoint at every step. Clean and predictable convergence in O(log N)
attempts within the allowed power range [minLimitW, startingLimitW].

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:12:45 +03:00
a6a07f2626 Replace linear power derate with binary search + telemetry-guided jump
Power calibration previously stepped down 25 W at a time (linear),
requiring up to 6 attempts to find a stable limit within 150 W range.

New strategy:
- Binary search between minLimitW (lo, assumed stable floor) and the
  starting/failed limit (hi, confirmed unstable), converging within a
  10 W tolerance in ~4 attempts.
- For thermal throttle: the first-quarter telemetry rows estimate the
  GPU's pre-throttle power draw. nextLimit = round5W(onset - 10 W) is
  used as the initial candidate instead of the binary midpoint, landing
  much closer to the true limit on the first step.
- On success: lo is updated and a higher level is tried (binary search
  upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is
  found rather than the first stable one.
- Let targeted_power run to natural completion on throttle (no mid-run
  SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before
  the next attempt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:05:23 +03:00
f87461ee4a Detect thermal throttle with fans below 100% as cooling misconfiguration
During power calibration: if a thermal throttle (sw_thermal/hw_thermal)
causes ≥20% clock drop while server fans are below 98% P95 duty cycle,
record a CoolingWarning on the GPU result and emit an actionable finding
telling the operator to rerun with fans manually fixed at 100%.

During steady-state benchmark: same signal enriches the existing
thermal_limited finding with fan duty cycle and clock drift values.

Covers both the main benchmark (buildBenchmarkFindings) and the power
bench (NvidiaPowerBenchResult.Findings).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 21:44:57 +03:00
a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the back-off delay would exceed 300s the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
2 changed files with 180 additions and 12 deletions

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"encoding/csv" "encoding/csv"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"math" "math"
"os" "os"
@@ -48,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
Derated bool Derated bool
Completed bool Completed bool
Notes []string Notes []string
// CoolingWarning is set when the GPU throttled thermally with a clock drop
// ≥20% while server fans were below a 98% P95 duty cycle — a signal that the
// cooling system may not be correctly configured for full GPU load.
CoolingWarning string
} }
type benchmarkBurnProfile struct { type benchmarkBurnProfile struct {
@@ -343,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.PowerCalibrationTries = calib.Attempts gpuResult.PowerCalibrationTries = calib.Attempts
gpuResult.PowerLimitDerated = calib.Derated gpuResult.PowerLimitDerated = calib.Derated
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...) gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
gpuResult.CoolingWarning = calib.CoolingWarning
}
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1624,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
case "power_capped": case "power_capped":
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
case "thermal_limited": case "thermal_limited":
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)) msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
msg += fmt.Sprintf(
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
)
}
findings = append(findings, msg)
case "sync_boost_limited": case "sync_boost_limited":
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
case "low_sm_clock_vs_target": case "low_sm_clock_vs_target":
@@ -1641,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
} }
} }
if gpu.CoolingWarning != "" {
findings = append(findings, fmt.Sprintf(
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
gpu.Index, gpu.CoolingWarning,
))
}
if len(gpu.PrecisionFailures) > 0 { if len(gpu.PrecisionFailures) > 0 {
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", "))) findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
} }
@@ -2043,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
r.PowerCalibrationTries = calib.Attempts r.PowerCalibrationTries = calib.Attempts
r.PowerLimitDerated = calib.Derated r.PowerLimitDerated = calib.Derated
r.Notes = append(r.Notes, calib.Notes...) r.Notes = append(r.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
r.CoolingWarning = calib.CoolingWarning
}
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -2447,8 +2472,15 @@ func runBenchmarkPowerCalibration(
logFunc func(string), logFunc func(string),
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120 const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150 const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
if _, err := exec.LookPath("dcgmi"); err != nil { if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2493,13 +2525,20 @@ func runBenchmarkPowerCalibration(
case appliedLimitW > 0: case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW minLimitW = appliedLimitW - maxDerateW
} }
if minLimitW < derateStepW { if minLimitW < calibSearchTolerance {
minLimitW = derateStepW minLimitW = calibSearchTolerance
} }
calib := benchmarkPowerCalibrationResult{ calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW), AppliedPowerLimitW: float64(appliedLimitW),
} }
// Binary search bounds for finding the highest stable power limit.
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 { if canDerate && originalLimitW > 0 {
idxCopy := idx idxCopy := idx
orig := originalLimitW orig := originalLimitW
@@ -2511,6 +2550,7 @@ func runBenchmarkPowerCalibration(
}) })
} }
calibLoop:
for { for {
calib.Attempts++ calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2540,9 +2580,15 @@ func runBenchmarkPowerCalibration(
if err != nil { if err != nil {
continue continue
} }
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" { // Record the throttle reason but do NOT cancel the dcgmi
// process. Killing it mid-run leaves nv-hostengine holding
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
// subsequent attempt. Let targeted_power run to its natural
// end so the daemon releases the slot cleanly before we
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason throttleReason = reason
cancel() logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
} }
case <-ctx.Done(): case <-ctx.Done():
cancel() cancel()
@@ -2557,17 +2603,87 @@ func runBenchmarkPowerCalibration(
perGPU := filterRowsByGPU(attempt.rows, idx) perGPU := filterRowsByGPU(attempt.rows, idx)
summary := summarizeBenchmarkTelemetry(perGPU) summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary calib.Summary = summary
calib.Completed = true calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW) calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
lo = appliedLimitW
// If there is still headroom to search, try a higher level.
if canDerate && hi-lo > calibSearchTolerance {
nextLimitW := roundTo5W((lo + hi) / 2)
if nextLimitW > lo && nextLimitW < hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
continue calibLoop
}
}
}
break break
} }
// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch { switch {
case throttleReason != "": case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
// Check whether the thermal throttle coincided with fans below
// maximum: that combination suggests cooling misconfiguration
// rather than a fundamental power-delivery limit.
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil: case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err)) calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err)) logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
@@ -2579,22 +2695,50 @@ func runBenchmarkPowerCalibration(
if !canDerate || appliedLimitW <= 0 { if !canDerate || appliedLimitW <= 0 {
break break
} }
nextLimitW := appliedLimitW - derateStepW // Binary-search for the highest stable power limit.
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
if hi-lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level.
if lo > minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
appliedLimitW = lo
calib.AppliedPowerLimitW = float64(lo)
calib.Derated = lo < originalLimitW
}
} else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
break
}
// Binary midpoint within the remaining search range.
nextLimitW := roundTo5W((lo + hi) / 2)
// Ensure the candidate is strictly inside the search range.
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
}
if nextLimitW >= hi {
nextLimitW = (lo + hi) / 2
}
if nextLimitW < minLimitW { if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW)) calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break break
} }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error()) calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err)) logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
break break
} }
appliedLimitW = nextLimitW appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW) calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = true calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW) info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW)) calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
} }
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
@@ -2604,6 +2748,18 @@ func runBenchmarkPowerCalibration(
return results, restore return results, restore
} }
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
var exitErr *exec.ExitError
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
return ((w + 2) / 5) * 5
}
func powerBenchDurationSec(profile string) int { func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) { switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability: case NvidiaBenchmarkProfileStability:
@@ -2778,6 +2934,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
OccupiedSlots: occupied, OccupiedSlots: occupied,
OccupiedSlotsNote: note, OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...), Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
}) })
} }
sort.Slice(gpus, func(i, j int) bool { sort.Slice(gpus, func(i, j int) bool {
@@ -2804,6 +2961,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if gpu.Derated { if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
} }
if gpu.CoolingWarning != "" {
result.Findings = append(result.Findings, fmt.Sprintf(
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
gpu.Index, gpu.CoolingWarning,
))
}
} }
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus)) singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
for _, gpu := range gpus { for _, gpu := range gpus {

View File

@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
Scores BenchmarkScorecard `json:"scores"` Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"` DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// CoolingWarning is non-empty when a thermal throttle event occurred with
// a clock drop ≥20% while server fans were below a 98% P95 duty cycle.
CoolingWarning string `json:"cooling_warning,omitempty"`
} }
type BenchmarkTelemetrySummary struct { type BenchmarkTelemetrySummary struct {
@@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct {
OccupiedSlots []int `json:"occupied_slots,omitempty"` OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"` OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`
} }
type NvidiaPowerBenchStep struct { type NvidiaPowerBenchStep struct {