Fix power calibration failing due to DCGM resource contention

When a targeted_power attempt is cancelled (e.g. after a sw_thermal throttle), nv-hostengine releases the diagnostic slot asynchronously, so the slot can still be held when the next attempt starts. That next attempt immediately received DCGM_ST_IN_USE (exit 222), which was incorrectly treated as a reason to derate the power limit. Now exit 222 is detected via isDCGMResourceBusy and triggers an exponential back-off retry at the same power limit (1s, 2s, 4s, … up to 256s). Once the next back-off delay would exceed 300s, the calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
1 changed files with 45 additions and 0 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/csv"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -2449,6 +2450,11 @@ func runBenchmarkPowerCalibration(
 	const calibDurationSec = 120
 	const derateStepW = 25
 	const maxDerateW = 150
 	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
 	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
 	// doubling each retry until it would exceed the cap, at which point the
 	// next busy response fails the calibration immediately.
 	const dcgmResourceBusyMaxDelaySec = 300
 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2506,8 @@ func runBenchmarkPowerCalibration(
 		calib := benchmarkPowerCalibrationResult{
 			AppliedPowerLimitW: float64(appliedLimitW),
 		}
 		busyRetries := 0
 		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2511,6 +2519,7 @@ func runBenchmarkPowerCalibration(
 			})
 		}
 	calibLoop:
 		for {
 			calib.Attempts++
 			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,6 +2573,35 @@ func runBenchmarkPowerCalibration(
 				break
 			}
 			// If DCGM reports the resource is in use, nv-hostengine has not yet
 			// released the diagnostic slot from the previous attempt. Do not
 			// derate: wait with exponential back-off and retry at the same
 			// power limit. Once the back-off delay would exceed
 			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
 			// held by something else.
 			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
 				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
 					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
 					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
 					break
 				}
 				busyRetries++
 				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
 				select {
 				case <-ctx.Done():
 					break calibLoop
 				case <-time.After(time.Duration(busyDelaySec) * time.Second):
 				}
 				next := busyDelaySec * 2
 				if next > dcgmResourceBusyMaxDelaySec {
 					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
 				}
 				busyDelaySec = next
 				continue calibLoop
 			}
 			busyRetries = 0    // reset on any non-busy outcome
 			busyDelaySec = 1   // reset back-off
 			switch {
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
@@ -2604,6 +2642,13 @@ func runBenchmarkPowerCalibration(
 	return results, restore
 }
 // isDCGMResourceBusy reports whether err is (or wraps) an *exec.ExitError
 // whose exit status is 222, the status dcgmi exits with for DCGM_ST_IN_USE:
 // nv-hostengine is still holding the diagnostic slot from an earlier run.
 //
 // A nil err, or an error that does not wrap a process exit error, yields false.
 func isDCGMResourceBusy(err error) bool {
 	// NOTE(review): 222 is presumably the low byte of DCGM_ST_IN_USE (-34)
 	// as seen through the process exit status — confirm against DCGM headers.
 	const dcgmInUseExitCode = 222
 
 	var procErr *exec.ExitError
 	if !errors.As(err, &procErr) {
 		// No exit status to inspect, so this is not the busy condition.
 		return false
 	}
 	return procErr.ExitCode() == dcgmInUseExitCode
 }
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability: