Replace linear power derate with binary search + telemetry-guided jump
Power calibration previously stepped down 25 W at a time (linear), requiring up to 6 attempts to find a stable limit within a 150 W range.

New strategy:
- Binary search between minLimitW (lo, assumed stable floor) and the starting/failed limit (hi, confirmed unstable), converging within a 10 W tolerance in ~4 attempts.
- For thermal throttle: the first-quarter telemetry rows estimate the GPU's pre-throttle power draw. nextLimit = roundTo5W(onset - 10 W) is used as the initial candidate instead of the binary midpoint, landing much closer to the true limit on the first step.
- On success: lo is updated and a higher level is tried (binary search upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is found rather than the first stable one.
- Let targeted_power run to natural completion on throttle (no mid-run SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before the next attempt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2472,8 +2472,13 @@ func runBenchmarkPowerCalibration(
|
|||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
||||||
const calibDurationSec = 120
|
const calibDurationSec = 120
|
||||||
const derateStepW = 25
|
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
|
const calibSearchTolerance = 10
|
||||||
|
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
|
||||||
|
// pre-throttle power draw to produce a smarter initial search candidate.
|
||||||
|
const calibPreThrottleMarginW = 10
|
||||||
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||||||
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||||||
// doubling each retry until it would exceed the cap, at which point the
|
// doubling each retry until it would exceed the cap, at which point the
|
||||||
@@ -2523,13 +2528,18 @@ func runBenchmarkPowerCalibration(
|
|||||||
case appliedLimitW > 0:
|
case appliedLimitW > 0:
|
||||||
minLimitW = appliedLimitW - maxDerateW
|
minLimitW = appliedLimitW - maxDerateW
|
||||||
}
|
}
|
||||||
if minLimitW < derateStepW {
|
if minLimitW < calibSearchTolerance {
|
||||||
minLimitW = derateStepW
|
minLimitW = calibSearchTolerance
|
||||||
}
|
}
|
||||||
|
|
||||||
calib := benchmarkPowerCalibrationResult{
|
calib := benchmarkPowerCalibrationResult{
|
||||||
AppliedPowerLimitW: float64(appliedLimitW),
|
AppliedPowerLimitW: float64(appliedLimitW),
|
||||||
}
|
}
|
||||||
|
// Binary search bounds for finding the highest stable power limit.
|
||||||
|
// lo = highest verified-stable level (assumed: minLimitW).
|
||||||
|
// hi = lowest verified-unstable level (assumed: above the starting limit).
|
||||||
|
lo := minLimitW
|
||||||
|
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
|
||||||
busyRetries := 0
|
busyRetries := 0
|
||||||
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
|
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
|
||||||
if canDerate && originalLimitW > 0 {
|
if canDerate && originalLimitW > 0 {
|
||||||
@@ -2573,9 +2583,15 @@ func runBenchmarkPowerCalibration(
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
|
// Record the throttle reason but do NOT cancel the dcgmi
|
||||||
|
// process. Killing it mid-run leaves nv-hostengine holding
|
||||||
|
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
|
||||||
|
// subsequent attempt. Let targeted_power run to its natural
|
||||||
|
// end so the daemon releases the slot cleanly before we
|
||||||
|
// reduce power and retry.
|
||||||
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
|
||||||
throttleReason = reason
|
throttleReason = reason
|
||||||
cancel()
|
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
|
||||||
}
|
}
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
cancel()
|
cancel()
|
||||||
@@ -2590,10 +2606,25 @@ func runBenchmarkPowerCalibration(
|
|||||||
perGPU := filterRowsByGPU(attempt.rows, idx)
|
perGPU := filterRowsByGPU(attempt.rows, idx)
|
||||||
summary := summarizeBenchmarkTelemetry(perGPU)
|
summary := summarizeBenchmarkTelemetry(perGPU)
|
||||||
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
|
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
|
||||||
|
// Stable at appliedLimitW: record it and binary-search upward.
|
||||||
calib.Summary = summary
|
calib.Summary = summary
|
||||||
calib.Completed = true
|
calib.Completed = true
|
||||||
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||||
|
lo = appliedLimitW
|
||||||
|
// If there is still headroom to search, try a higher level.
|
||||||
|
if canDerate && hi-lo > calibSearchTolerance {
|
||||||
|
nextLimitW := roundTo5W((lo + hi) / 2)
|
||||||
|
if nextLimitW > lo && nextLimitW < hi {
|
||||||
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
|
||||||
|
appliedLimitW = nextLimitW
|
||||||
|
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||||
|
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
|
||||||
|
continue calibLoop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2667,22 +2698,62 @@ func runBenchmarkPowerCalibration(
|
|||||||
if !canDerate || appliedLimitW <= 0 {
|
if !canDerate || appliedLimitW <= 0 {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
nextLimitW := appliedLimitW - derateStepW
|
// Binary-search for the highest stable power limit.
|
||||||
|
// This attempt failed or throttled, so update the upper bound.
|
||||||
|
hi = appliedLimitW
|
||||||
|
|
||||||
|
if hi-lo <= calibSearchTolerance {
|
||||||
|
// Search range exhausted: lo is the highest verified-stable level.
|
||||||
|
if lo > minLimitW {
|
||||||
|
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
|
||||||
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
|
||||||
|
appliedLimitW = lo
|
||||||
|
calib.AppliedPowerLimitW = float64(lo)
|
||||||
|
calib.Derated = lo < originalLimitW
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute the next candidate.
|
||||||
|
// For thermal throttle: use the pre-throttle power draw from telemetry
|
||||||
|
// as a smarter initial estimate instead of the binary midpoint — it
|
||||||
|
// lands much closer to the true limit on the first attempt.
|
||||||
|
nextLimitW := (lo + hi) / 2
|
||||||
|
if strings.Contains(throttleReason, "thermal") {
|
||||||
|
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
|
||||||
|
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
|
||||||
|
if candidate > lo && candidate < hi {
|
||||||
|
nextLimitW = candidate
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nextLimitW = roundTo5W(nextLimitW)
|
||||||
|
// Ensure the candidate is strictly inside the search range.
|
||||||
|
if nextLimitW <= lo {
|
||||||
|
nextLimitW = lo + calibSearchTolerance
|
||||||
|
}
|
||||||
|
if nextLimitW >= hi {
|
||||||
|
nextLimitW = (lo + hi) / 2
|
||||||
|
}
|
||||||
if nextLimitW < minLimitW {
|
if nextLimitW < minLimitW {
|
||||||
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
|
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
|
||||||
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
|
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
appliedLimitW = nextLimitW
|
appliedLimitW = nextLimitW
|
||||||
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||||
calib.Derated = true
|
calib.Derated = appliedLimitW < originalLimitW
|
||||||
info.PowerLimitW = float64(appliedLimitW)
|
info.PowerLimitW = float64(appliedLimitW)
|
||||||
infoByIndex[idx] = info
|
infoByIndex[idx] = info
|
||||||
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
|
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
|
||||||
}
|
}
|
||||||
|
|
||||||
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
|
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
|
||||||
@@ -2699,6 +2770,33 @@ func isDCGMResourceBusy(err error) bool {
|
|||||||
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// calibPreThrottlePowerW estimates the GPU power draw just before thermal
|
||||||
|
// throttle onset by averaging the first quarter of telemetry rows. The early
|
||||||
|
// samples capture the GPU at peak before clock/power reduction kicks in.
|
||||||
|
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
|
||||||
|
if len(rows) < 4 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
n := len(rows) / 4
|
||||||
|
var sum float64
|
||||||
|
var cnt int
|
||||||
|
for _, r := range rows[:n] {
|
||||||
|
if r.PowerW > 0 {
|
||||||
|
sum += r.PowerW
|
||||||
|
cnt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return sum / float64(cnt)
|
||||||
|
}
|
||||||
|
|
||||||
|
// roundTo5W rounds w to the nearest multiple of 5 W.
//
// Go's integer division truncates toward zero, so the plain (w+2)/5*5 form
// only rounds correctly for non-negative inputs. Negative inputs are rounded
// on their magnitude and negated, so that e.g. -3 yields -5 rather than 0.
func roundTo5W(w int) int {
	if w < 0 {
		return -(((2 - w) / 5) * 5)
	}
	return ((w + 2) / 5) * 5
}
|
||||||
|
|
||||||
func powerBenchDurationSec(profile string) int {
|
func powerBenchDurationSec(profile string) int {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
|
|||||||
Reference in New Issue
Block a user