diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index 719b14f..825d454 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/csv"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -2449,6 +2450,11 @@ func runBenchmarkPowerCalibration(
 	const calibDurationSec = 120
 	const derateStepW = 25
 	const maxDerateW = 150
+	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
+	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
+	// doubling each retry until it would exceed the cap, at which point the
+	// next busy response fails the calibration immediately.
+	const dcgmResourceBusyMaxDelaySec = 300
 
 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2506,8 @@ func runBenchmarkPowerCalibration(
 		calib := benchmarkPowerCalibrationResult{
 			AppliedPowerLimitW: float64(appliedLimitW),
 		}
+		busyRetries := 0
+		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2511,6 +2519,7 @@ func runBenchmarkPowerCalibration(
 			})
 		}
 
+	calibLoop:
 		for {
 			calib.Attempts++
 			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,6 +2573,35 @@ func runBenchmarkPowerCalibration(
 				break
 			}
 
+			// If DCGM reports the resource is in use, nv-hostengine has not yet
+			// released the diagnostic slot from the previous attempt. Do not
+			// derate: wait with exponential back-off and retry at the same
+			// power limit. Once the back-off delay would exceed
+			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
+			// held by something else.
+			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
+				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
+					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
+					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
+					break
+				}
+				busyRetries++
+				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
+				select {
+				case <-ctx.Done():
+					break calibLoop
+				case <-time.After(time.Duration(busyDelaySec) * time.Second):
+				}
+				next := busyDelaySec * 2
+				if next > dcgmResourceBusyMaxDelaySec {
+					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
+				}
+				busyDelaySec = next
+				continue calibLoop
+			}
+			busyRetries = 0 // reset on any non-busy outcome
+			busyDelaySec = 1 // reset back-off
+
 			switch {
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
@@ -2604,6 +2642,13 @@ func runBenchmarkPowerCalibration(
 	return results, restore
 }
 
+// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
+// meaning nv-hostengine still holds the diagnostic slot from a prior run.
+func isDCGMResourceBusy(err error) bool {
+	var exitErr *exec.ExitError
+	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability: