Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a636146dbd |
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/csv"
|
"encoding/csv"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
@@ -2449,6 +2450,11 @@ func runBenchmarkPowerCalibration(
|
|||||||
const calibDurationSec = 120
|
const calibDurationSec = 120
|
||||||
const derateStepW = 25
|
const derateStepW = 25
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
|
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||||||
|
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||||||
|
// doubling each retry until it would exceed the cap, at which point the
|
||||||
|
// next busy response fails the calibration immediately.
|
||||||
|
const dcgmResourceBusyMaxDelaySec = 300
|
||||||
|
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
@@ -2500,6 +2506,8 @@ func runBenchmarkPowerCalibration(
|
|||||||
calib := benchmarkPowerCalibrationResult{
|
calib := benchmarkPowerCalibrationResult{
|
||||||
AppliedPowerLimitW: float64(appliedLimitW),
|
AppliedPowerLimitW: float64(appliedLimitW),
|
||||||
}
|
}
|
||||||
|
busyRetries := 0
|
||||||
|
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
|
||||||
if canDerate && originalLimitW > 0 {
|
if canDerate && originalLimitW > 0 {
|
||||||
idxCopy := idx
|
idxCopy := idx
|
||||||
orig := originalLimitW
|
orig := originalLimitW
|
||||||
@@ -2511,6 +2519,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
calibLoop:
|
||||||
for {
|
for {
|
||||||
calib.Attempts++
|
calib.Attempts++
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
|
||||||
@@ -2564,6 +2573,35 @@ func runBenchmarkPowerCalibration(
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If DCGM reports the resource is in use, nv-hostengine has not yet
|
||||||
|
// released the diagnostic slot from the previous attempt. Do not
|
||||||
|
// derate: wait with exponential back-off and retry at the same
|
||||||
|
// power limit. Once the back-off delay would exceed
|
||||||
|
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
|
||||||
|
// held by something else.
|
||||||
|
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
|
||||||
|
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
||||||
|
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
|
||||||
|
break
|
||||||
|
}
|
||||||
|
busyRetries++
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
break calibLoop
|
||||||
|
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
||||||
|
}
|
||||||
|
next := busyDelaySec * 2
|
||||||
|
if next > dcgmResourceBusyMaxDelaySec {
|
||||||
|
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
|
||||||
|
}
|
||||||
|
busyDelaySec = next
|
||||||
|
continue calibLoop
|
||||||
|
}
|
||||||
|
busyRetries = 0 // reset on any non-busy outcome
|
||||||
|
busyDelaySec = 1 // reset back-off
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case throttleReason != "":
|
case throttleReason != "":
|
||||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
||||||
@@ -2604,6 +2642,13 @@ func runBenchmarkPowerCalibration(
|
|||||||
return results, restore
|
return results, restore
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isDCGMResourceBusy reports whether err carries a dcgmi process exit status
// of 222, which this file treats as DCGM_ST_IN_USE: nv-hostengine still holds
// the diagnostic slot from a prior run.
//
// The error chain is searched with errors.As, so a wrapped *exec.ExitError is
// recognised as well. A nil error, or one with no *exec.ExitError in its
// chain, yields false.
func isDCGMResourceBusy(err error) bool {
	// Exit status dcgmi returns for DCGM_ST_IN_USE.
	const dcgmInUseExitCode = 222

	var exitErr *exec.ExitError
	if !errors.As(err, &exitErr) {
		return false
	}
	return exitErr.ExitCode() == dcgmInUseExitCode
}
|
||||||
|
|
||||||
func powerBenchDurationSec(profile string) int {
|
func powerBenchDurationSec(profile string) int {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
|
|||||||
Reference in New Issue
Block a user