Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 3cf2e9c9dc | | |
| | 19dbabd71d | | |
@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
|
||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||
const calibSearchTolerance = 10
|
||||
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
|
||||
// pre-throttle power draw to produce a smarter initial search candidate.
|
||||
const calibPreThrottleMarginW = 10
|
||||
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||||
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||||
// doubling each retry until it would exceed the cap, at which point the
|
||||
@@ -2501,8 +2498,25 @@ func runBenchmarkPowerCalibration(
|
||||
err error
|
||||
}
|
||||
|
||||
|
||||
// gpuCalibState holds per-GPU binary search state during parallel calibration.
|
||||
type gpuCalibState struct {
|
||||
idx int
|
||||
info benchmarkGPUInfo
|
||||
originalLimitW int
|
||||
appliedLimitW int
|
||||
minLimitW int
|
||||
lo int // highest verified-stable limit (assumed: minLimitW)
|
||||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||||
calib benchmarkPowerCalibrationResult
|
||||
converged bool
|
||||
}
|
||||
|
||||
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
||||
var restore []benchmarkRestoreAction
|
||||
|
||||
// Initialise per-GPU state.
|
||||
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
||||
for _, idx := range gpuIndices {
|
||||
info := infoByIndex[idx]
|
||||
originalLimitW := int(math.Round(info.PowerLimitW))
|
||||
@@ -2531,17 +2545,17 @@ func runBenchmarkPowerCalibration(
|
||||
if minLimitW < calibSearchTolerance {
|
||||
minLimitW = calibSearchTolerance
|
||||
}
|
||||
|
||||
calib := benchmarkPowerCalibrationResult{
|
||||
AppliedPowerLimitW: float64(appliedLimitW),
|
||||
s := &gpuCalibState{
|
||||
idx: idx,
|
||||
info: info,
|
||||
originalLimitW: originalLimitW,
|
||||
appliedLimitW: appliedLimitW,
|
||||
minLimitW: minLimitW,
|
||||
lo: minLimitW,
|
||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||
}
|
||||
// Binary search bounds for finding the highest stable power limit.
|
||||
// lo = highest verified-stable level (assumed: minLimitW).
|
||||
// hi = lowest verified-unstable level (assumed: above the starting limit).
|
||||
lo := minLimitW
|
||||
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
|
||||
busyRetries := 0
|
||||
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
|
||||
states = append(states, s)
|
||||
if canDerate && originalLimitW > 0 {
|
||||
idxCopy := idx
|
||||
orig := originalLimitW
|
||||
@@ -2552,212 +2566,243 @@ func runBenchmarkPowerCalibration(
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
calibLoop:
|
||||
// Shared DCGM resource-busy back-off state (single diagnostic session).
|
||||
busyRetries := 0
|
||||
busyDelaySec := 1
|
||||
sharedAttempt := 0
|
||||
|
||||
type sharedAttemptResult struct {
|
||||
out []byte
|
||||
rows []GPUMetricRow
|
||||
err error
|
||||
}
|
||||
|
||||
calibDone:
|
||||
for {
|
||||
// Collect non-converged GPUs.
|
||||
var active []*gpuCalibState
|
||||
for _, s := range states {
|
||||
if !s.converged {
|
||||
active = append(active, s)
|
||||
}
|
||||
}
|
||||
if len(active) == 0 || ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
|
||||
sharedAttempt++
|
||||
for _, s := range active {
|
||||
s.calib.Attempts++
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
||||
}
|
||||
|
||||
// Snapshot throttle counters for all active GPUs before the run.
|
||||
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
|
||||
for _, s := range active {
|
||||
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
||||
}
|
||||
|
||||
// Run targeted_power for ALL gpuIndices simultaneously so every card
|
||||
// is under load during calibration — this reflects real server thermals.
|
||||
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||
doneCh := make(chan sharedAttemptResult, 1)
|
||||
go func() {
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
|
||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||
}()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
throttleReasons := make(map[int]string, len(active))
|
||||
var ar sharedAttemptResult
|
||||
|
||||
attemptLoop:
|
||||
for {
|
||||
calib.Attempts++
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
|
||||
|
||||
beforeThrottle, _ := queryThrottleCounters(idx)
|
||||
attemptCtx, cancel := context.WithCancel(ctx)
|
||||
doneCh := make(chan calibrationAttemptResult, 1)
|
||||
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
|
||||
go func() {
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
|
||||
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
|
||||
}()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
var (
|
||||
attempt calibrationAttemptResult
|
||||
throttleReason string
|
||||
)
|
||||
attemptLoop:
|
||||
for {
|
||||
select {
|
||||
case attempt = <-doneCh:
|
||||
break attemptLoop
|
||||
case <-ticker.C:
|
||||
afterThrottle, err := queryThrottleCounters(idx)
|
||||
select {
|
||||
case ar = <-doneCh:
|
||||
break attemptLoop
|
||||
case <-ticker.C:
|
||||
// Poll throttle counters for each active GPU independently.
|
||||
for _, s := range active {
|
||||
if throttleReasons[s.idx] != "" {
|
||||
continue // already detected for this GPU
|
||||
}
|
||||
after, err := queryThrottleCounters(s.idx)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Record the throttle reason but do NOT cancel the dcgmi
|
||||
// process. Killing it mid-run leaves nv-hostengine holding
|
||||
// the diagnostic slot, which causes DCGM_ST_IN_USE on every
|
||||
// subsequent attempt. Let targeted_power run to its natural
|
||||
// end so the daemon releases the slot cleanly before we
|
||||
// reduce power and retry.
|
||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
|
||||
throttleReason = reason
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
|
||||
// Record throttle but do NOT cancel — let dcgmi finish so
|
||||
// nv-hostengine releases the slot cleanly before the next attempt.
|
||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||||
throttleReasons[s.idx] = reason
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||||
}
|
||||
case <-ctx.Done():
|
||||
cancel()
|
||||
attempt = <-doneCh
|
||||
break attemptLoop
|
||||
}
|
||||
case <-ctx.Done():
|
||||
cancelAttempt()
|
||||
ar = <-doneCh
|
||||
break attemptLoop
|
||||
}
|
||||
ticker.Stop()
|
||||
cancel()
|
||||
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
|
||||
}
|
||||
ticker.Stop()
|
||||
cancelAttempt()
|
||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||
|
||||
perGPU := filterRowsByGPU(attempt.rows, idx)
|
||||
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
||||
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
||||
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
||||
for _, s := range active {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
||||
s.converged = true
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
|
||||
break calibDone
|
||||
}
|
||||
busyRetries++
|
||||
// Undo attempt counter: busy retries don't count as real attempts.
|
||||
for _, s := range active {
|
||||
s.calib.Attempts--
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break calibDone
|
||||
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
||||
}
|
||||
next := busyDelaySec * 2
|
||||
if next > dcgmResourceBusyMaxDelaySec {
|
||||
next = dcgmResourceBusyMaxDelaySec + 1
|
||||
}
|
||||
busyDelaySec = next
|
||||
sharedAttempt-- // retry same logical attempt number
|
||||
continue
|
||||
}
|
||||
busyRetries = 0
|
||||
busyDelaySec = 1
|
||||
|
||||
// Per-GPU analysis and binary search update.
|
||||
for _, s := range active {
|
||||
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
||||
summary := summarizeBenchmarkTelemetry(perGPU)
|
||||
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
|
||||
// Stable at appliedLimitW: record it and binary-search upward.
|
||||
calib.Summary = summary
|
||||
calib.Completed = true
|
||||
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||
lo = appliedLimitW
|
||||
// If there is still headroom to search, try a higher level.
|
||||
if canDerate && hi-lo > calibSearchTolerance {
|
||||
nextLimitW := roundTo5W((lo + hi) / 2)
|
||||
if nextLimitW > lo && nextLimitW < hi {
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
|
||||
appliedLimitW = nextLimitW
|
||||
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
|
||||
continue calibLoop
|
||||
throttle := throttleReasons[s.idx]
|
||||
|
||||
// Cooling warning: thermal throttle with fans not at maximum.
|
||||
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
||||
clocks := make([]float64, 0, len(perGPU))
|
||||
var fanDutyValues []float64
|
||||
fanDutyAvail := false
|
||||
for _, r := range perGPU {
|
||||
if r.ClockMHz > 0 {
|
||||
clocks = append(clocks, r.ClockMHz)
|
||||
}
|
||||
if r.FanDutyCycleAvailable {
|
||||
fanDutyAvail = true
|
||||
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
|
||||
}
|
||||
}
|
||||
dropPct := benchmarkClockDrift(clocks)
|
||||
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
|
||||
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
|
||||
s.calib.CoolingWarning = fmt.Sprintf(
|
||||
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
|
||||
throttle, dropPct, p95FanDuty,
|
||||
)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
|
||||
}
|
||||
}
|
||||
|
||||
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
|
||||
// Stable at current limit — update lo and binary-search upward.
|
||||
s.calib.Summary = summary
|
||||
s.calib.Completed = true
|
||||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||
s.lo = s.appliedLimitW
|
||||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||||
next := roundTo5W((s.lo + s.hi) / 2)
|
||||
if next > s.lo && next < s.hi {
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
|
||||
s.appliedLimitW = next
|
||||
s.calib.AppliedPowerLimitW = float64(next)
|
||||
s.calib.Completed = false // keep searching
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
|
||||
continue // next GPU in active list
|
||||
}
|
||||
}
|
||||
}
|
||||
break
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
|
||||
// If DCGM reports the resource is in use, nv-hostengine has not yet
|
||||
// released the diagnostic slot from the previous attempt. Do not
|
||||
// derate: wait with exponential back-off and retry at the same
|
||||
// power limit. Once the back-off delay would exceed
|
||||
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
|
||||
// held by something else.
|
||||
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
|
||||
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
|
||||
break
|
||||
}
|
||||
busyRetries++
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break calibLoop
|
||||
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
||||
}
|
||||
next := busyDelaySec * 2
|
||||
if next > dcgmResourceBusyMaxDelaySec {
|
||||
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
|
||||
}
|
||||
busyDelaySec = next
|
||||
continue calibLoop
|
||||
}
|
||||
busyRetries = 0 // reset on any non-busy outcome
|
||||
busyDelaySec = 1 // reset back-off
|
||||
|
||||
// Failed or throttled — log and binary-search downward.
|
||||
switch {
|
||||
case throttleReason != "":
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
|
||||
// Check whether the thermal throttle coincided with fans below
|
||||
// maximum: that combination suggests cooling misconfiguration
|
||||
// rather than a fundamental power-delivery limit.
|
||||
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
|
||||
clocks := make([]float64, 0, len(perGPU))
|
||||
var fanDutyValues []float64
|
||||
fanDutyAvail := false
|
||||
for _, r := range perGPU {
|
||||
if r.ClockMHz > 0 {
|
||||
clocks = append(clocks, r.ClockMHz)
|
||||
}
|
||||
if r.FanDutyCycleAvailable {
|
||||
fanDutyAvail = true
|
||||
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
|
||||
}
|
||||
}
|
||||
dropPct := benchmarkClockDrift(clocks)
|
||||
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
|
||||
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
|
||||
calib.CoolingWarning = fmt.Sprintf(
|
||||
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
|
||||
throttleReason, dropPct, p95FanDuty,
|
||||
)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
|
||||
}
|
||||
}
|
||||
case attempt.err != nil:
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
|
||||
case throttle != "":
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||||
case ar.err != nil:
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
||||
default:
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||||
}
|
||||
|
||||
if !canDerate || appliedLimitW <= 0 {
|
||||
break
|
||||
if !canDerate || s.appliedLimitW <= 0 {
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
// Binary-search for the highest stable power limit.
|
||||
// This attempt failed or throttled, so update the upper bound.
|
||||
hi = appliedLimitW
|
||||
s.hi = s.appliedLimitW
|
||||
|
||||
if hi-lo <= calibSearchTolerance {
|
||||
// Search range exhausted: lo is the highest verified-stable level.
|
||||
if lo > minLimitW {
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
|
||||
appliedLimitW = lo
|
||||
calib.AppliedPowerLimitW = float64(lo)
|
||||
calib.Derated = lo < originalLimitW
|
||||
if s.hi-s.lo <= calibSearchTolerance {
|
||||
if s.lo > s.minLimitW {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||||
s.appliedLimitW = s.lo
|
||||
s.calib.AppliedPowerLimitW = float64(s.lo)
|
||||
s.calib.Derated = s.lo < s.originalLimitW
|
||||
}
|
||||
} else {
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
}
|
||||
break
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
|
||||
// Compute the next candidate.
|
||||
// For thermal throttle: use the pre-throttle power draw from telemetry
|
||||
// as a smarter initial estimate instead of the binary midpoint — it
|
||||
// lands much closer to the true limit on the first attempt.
|
||||
nextLimitW := (lo + hi) / 2
|
||||
if strings.Contains(throttleReason, "thermal") {
|
||||
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
|
||||
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
|
||||
if candidate > lo && candidate < hi {
|
||||
nextLimitW = candidate
|
||||
}
|
||||
}
|
||||
next := roundTo5W((s.lo + s.hi) / 2)
|
||||
if next <= s.lo {
|
||||
next = s.lo + calibSearchTolerance
|
||||
}
|
||||
nextLimitW = roundTo5W(nextLimitW)
|
||||
// Ensure the candidate is strictly inside the search range.
|
||||
if nextLimitW <= lo {
|
||||
nextLimitW = lo + calibSearchTolerance
|
||||
if next >= s.hi {
|
||||
next = (s.lo + s.hi) / 2
|
||||
}
|
||||
if nextLimitW >= hi {
|
||||
nextLimitW = (lo + hi) / 2
|
||||
if next < s.minLimitW {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
if nextLimitW < minLimitW {
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
break
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
|
||||
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
|
||||
break
|
||||
}
|
||||
appliedLimitW = nextLimitW
|
||||
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
||||
calib.Derated = appliedLimitW < originalLimitW
|
||||
info.PowerLimitW = float64(appliedLimitW)
|
||||
infoByIndex[idx] = info
|
||||
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
|
||||
s.appliedLimitW = next
|
||||
s.calib.AppliedPowerLimitW = float64(next)
|
||||
s.calib.Derated = next < s.originalLimitW
|
||||
s.info.PowerLimitW = float64(next)
|
||||
infoByIndex[s.idx] = s.info
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
||||
}
|
||||
}
|
||||
|
||||
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
|
||||
results[idx] = calib
|
||||
for _, s := range states {
|
||||
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
|
||||
results[s.idx] = s.calib
|
||||
}
|
||||
}
|
||||
return results, restore
|
||||
@@ -2770,28 +2815,6 @@ func isDCGMResourceBusy(err error) bool {
|
||||
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
||||
}
|
||||
|
||||
// calibPreThrottlePowerW estimates the GPU power draw just before thermal
|
||||
// throttle onset by averaging the first quarter of telemetry rows. The early
|
||||
// samples capture the GPU at peak before clock/power reduction kicks in.
|
||||
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
|
||||
if len(rows) < 4 {
|
||||
return 0
|
||||
}
|
||||
n := len(rows) / 4
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range rows[:n] {
|
||||
if r.PowerW > 0 {
|
||||
sum += r.PowerW
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt == 0 {
|
||||
return 0
|
||||
}
|
||||
return sum / float64(cnt)
|
||||
}
|
||||
|
||||
// roundTo5W rounds w to the nearest 5 W boundary.
|
||||
func roundTo5W(w int) int {
|
||||
return ((w + 2) / 5) * 5
|
||||
|
||||
Reference in New Issue
Block a user