Compare commits

..

2 Commits
v8.0 ... v8.2

Author SHA1 Message Date
f87461ee4a Detect thermal throttle with fans below 100% as cooling misconfiguration
During power calibration: if a thermal throttle (sw_thermal/hw_thermal)
causes a ≥20% clock drop while the P95 server fan duty cycle is below 98%,
record a CoolingWarning on the GPU result and emit an actionable finding
telling the operator to rerun with fans manually fixed at 100%.

During steady-state benchmark: same signal enriches the existing
thermal_limited finding with fan duty cycle and clock drift values.

Covers both the main benchmark (buildBenchmarkFindings) and the power
bench (NvidiaPowerBenchResult.Findings).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 21:44:57 +03:00
a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the back-off delay would exceed 300s the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
2 changed files with 108 additions and 1 deletions

View File

@@ -4,6 +4,7 @@ import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"math"
"os"
@@ -48,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
Derated bool
Completed bool
Notes []string
// CoolingWarning is set when the GPU throttled thermally with a clock drop
// ≥20% while the P95 server fan duty cycle was below 98% (fans not at
// maximum) — a signal that the cooling system may not be correctly
// configured for full GPU load.
CoolingWarning string
}
type benchmarkBurnProfile struct {
@@ -343,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.PowerCalibrationTries = calib.Attempts
gpuResult.PowerLimitDerated = calib.Derated
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
gpuResult.CoolingWarning = calib.CoolingWarning
}
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1624,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
case "power_capped":
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
case "thermal_limited":
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
msg += fmt.Sprintf(
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
)
}
findings = append(findings, msg)
case "sync_boost_limited":
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
case "low_sm_clock_vs_target":
@@ -1641,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
}
}
if gpu.CoolingWarning != "" {
findings = append(findings, fmt.Sprintf(
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
gpu.Index, gpu.CoolingWarning,
))
}
if len(gpu.PrecisionFailures) > 0 {
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
}
@@ -2043,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
r.PowerCalibrationTries = calib.Attempts
r.PowerLimitDerated = calib.Derated
r.Notes = append(r.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
r.CoolingWarning = calib.CoolingWarning
}
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -2449,6 +2474,11 @@ func runBenchmarkPowerCalibration(
const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2530,8 @@ func runBenchmarkPowerCalibration(
calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
}
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
@@ -2511,6 +2543,7 @@ func runBenchmarkPowerCalibration(
})
}
calibLoop:
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,10 +2597,65 @@ func runBenchmarkPowerCalibration(
break
}
// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
// Check whether the thermal throttle coincided with fans below
// maximum: that combination suggests cooling misconfiguration
// rather than a fundamental power-delivery limit.
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
@@ -2604,6 +2692,13 @@ func runBenchmarkPowerCalibration(
return results, restore
}
// isDCGMResourceBusy reports whether err is, or wraps, an *exec.ExitError
// whose exit code is 222 (DCGM_ST_IN_USE as surfaced by dcgmi), which means
// nv-hostengine still holds the diagnostic slot from a prior run.
//
// A nil error, an error that does not wrap an *exec.ExitError, or any other
// exit code yields false.
func isDCGMResourceBusy(err error) bool {
	// Exit status dcgmi returns for DCGM_ST_IN_USE.
	const inUseExitCode = 222

	var processErr *exec.ExitError
	if !errors.As(err, &processErr) {
		return false
	}
	return processErr.ExitCode() == inUseExitCode
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
@@ -2778,6 +2873,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
}
sort.Slice(gpus, func(i, j int) bool {
@@ -2804,6 +2900,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
}
if gpu.CoolingWarning != "" {
result.Findings = append(result.Findings, fmt.Sprintf(
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
gpu.Index, gpu.CoolingWarning,
))
}
}
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
for _, gpu := range gpus {

View File

@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning is non-empty when a thermal throttle event occurred with
// a clock drop ≥20% while the P95 server fan duty cycle was below 98%.
CoolingWarning string `json:"cooling_warning,omitempty"`
}
type BenchmarkTelemetrySummary struct {
@@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct {
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`
}
type NvidiaPowerBenchStep struct {