Compare commits

..

1 Commits
v8.0 ... v8.1

Author SHA1 Message Date
a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the back-off delay would exceed 300s the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"math"
"os"
@@ -2449,6 +2450,11 @@ func runBenchmarkPowerCalibration(
const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2506,8 @@ func runBenchmarkPowerCalibration(
calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
}
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
@@ -2511,6 +2519,7 @@ func runBenchmarkPowerCalibration(
})
}
calibLoop:
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,6 +2573,35 @@ func runBenchmarkPowerCalibration(
break
}
// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
@@ -2604,6 +2642,13 @@ func runBenchmarkPowerCalibration(
return results, restore
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
var exitErr *exec.ExitError
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability: