Fix power calibration failing due to DCGM resource contention

When a targeted_power attempt is cancelled (e.g. after a sw_thermal throttle), nv-hostengine releases the diagnostic slot asynchronously, so the slot can still be held when the next attempt starts. That next attempt immediately received DCGM_ST_IN_USE (exit 222), which was incorrectly treated as a reason to derate the power limit.

Now exit 222 is detected via isDCGMResourceBusy and triggers an exponential back-off retry at the same power limit (1s, 2s, 4s, … up to 256s). Once the next doubled delay would exceed 300s, a further busy response fails the calibration, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
1 changed files with 45 additions and 0 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/csv"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -2449,6 +2450,11 @@ func runBenchmarkPowerCalibration(
 	const calibDurationSec = 120
 	const derateStepW = 25
 	const maxDerateW = 150
+	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
+	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
+	// doubling each retry until it would exceed the cap, at which point the
+	// next busy response fails the calibration immediately.
+	const dcgmResourceBusyMaxDelaySec = 300

 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2506,8 @@ func runBenchmarkPowerCalibration(
 		calib := benchmarkPowerCalibrationResult{
 			AppliedPowerLimitW: float64(appliedLimitW),
 		}
+		busyRetries := 0
+		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2511,6 +2519,7 @@ func runBenchmarkPowerCalibration(
 			})
 		}

+	calibLoop:
 		for {
 			calib.Attempts++
 			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,6 +2573,35 @@ func runBenchmarkPowerCalibration(
 				break
 			}

+			// If DCGM reports the resource is in use, nv-hostengine has not yet
+			// released the diagnostic slot from the previous attempt. Do not
+			// derate: wait with exponential back-off and retry at the same
+			// power limit. Once the back-off delay would exceed
+			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
+			// held by something else.
+			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
+				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
+					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
+					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
+					break
+				}
+				busyRetries++
+				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
+				select {
+				case <-ctx.Done():
+					break calibLoop
+				case <-time.After(time.Duration(busyDelaySec) * time.Second):
+				}
+				next := busyDelaySec * 2
+				if next > dcgmResourceBusyMaxDelaySec {
+					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
+				}
+				busyDelaySec = next
+				continue calibLoop
+			}
+			busyRetries = 0    // reset on any non-busy outcome
+			busyDelaySec = 1   // reset back-off
+
 			switch {
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
@@ -2604,6 +2642,13 @@ func runBenchmarkPowerCalibration(
 	return results, restore
 }

+// isDCGMResourceBusy reports whether err wraps an *exec.ExitError whose exit code is
+// 222 (DCGM_ST_IN_USE: nv-hostengine still holds the diagnostic slot); nil yields false.
+func isDCGMResourceBusy(err error) bool {
+	var exitErr *exec.ExitError
+	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability: