Detect thermal throttle with fans below 100% as cooling misconfiguration

During power calibration, if a thermal throttle (sw_thermal/hw_thermal) causes a ≥20% clock drop while the P95 server fan duty cycle is below 98%, record a CoolingWarning on the GPU result and emit an actionable finding telling the operator to rerun with fans manually fixed at 100%. During the steady-state benchmark, the same signal enriches the existing thermal_limited finding with the fan duty cycle and clock drift values. Covers both the main benchmark (buildBenchmarkFindings) and the power bench (NvidiaPowerBenchResult.Findings). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Fix power calibration failing due to DCGM resource contention
2026-04-14 21:44:57 +03:00 · 2026-04-14 20:41:17 +03:00
2 changed files with 108 additions and 1 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/csv"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -48,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
 	Derated            bool
 	Completed          bool
 	Notes              []string
+	// CoolingWarning is set when the GPU throttled thermally with a clock drop
+	// ≥20% while the P95 server fan duty cycle was below 98% — a signal that the
+	// cooling system may not be correctly configured for full GPU load.
+	CoolingWarning string
 }

 type benchmarkBurnProfile struct {
@@ -343,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.PowerCalibrationTries = calib.Attempts
 				gpuResult.PowerLimitDerated = calib.Derated
 				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
+				if calib.CoolingWarning != "" {
+					gpuResult.CoolingWarning = calib.CoolingWarning
+				}
 			}
 			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1624,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 			case "power_capped":
 				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
 			case "thermal_limited":
-				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
+				msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
+				if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
+					result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
+					msg += fmt.Sprintf(
+						" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
+						result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
+					)
+				}
+				findings = append(findings, msg)
 			case "sync_boost_limited":
 				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
 			case "low_sm_clock_vs_target":
@@ -1641,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
 			}
 		}
+		if gpu.CoolingWarning != "" {
+			findings = append(findings, fmt.Sprintf(
+				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
+				gpu.Index, gpu.CoolingWarning,
+			))
+		}
 		if len(gpu.PrecisionFailures) > 0 {
 			findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
 		}
@@ -2043,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
 			r.PowerCalibrationTries = calib.Attempts
 			r.PowerLimitDerated = calib.Derated
 			r.Notes = append(r.Notes, calib.Notes...)
+			if calib.CoolingWarning != "" {
+				r.CoolingWarning = calib.CoolingWarning
+			}
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -2449,6 +2474,11 @@ func runBenchmarkPowerCalibration(
 	const calibDurationSec = 120
 	const derateStepW = 25
 	const maxDerateW = 150
+	// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
+	// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
+	// doubling each retry until it would exceed the cap, at which point the
+	// next busy response fails the calibration immediately.
+	const dcgmResourceBusyMaxDelaySec = 300

 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
@@ -2500,6 +2530,8 @@ func runBenchmarkPowerCalibration(
 		calib := benchmarkPowerCalibrationResult{
 			AppliedPowerLimitW: float64(appliedLimitW),
 		}
+		busyRetries := 0
+		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2511,6 +2543,7 @@ func runBenchmarkPowerCalibration(
 			})
 		}

+	calibLoop:
 		for {
 			calib.Attempts++
 			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
@@ -2564,10 +2597,65 @@ func runBenchmarkPowerCalibration(
 				break
 			}

+			// If DCGM reports the resource is in use, nv-hostengine has not yet
+			// released the diagnostic slot from the previous attempt. Do not
+			// derate: wait with exponential back-off and retry at the same
+			// power limit. Once the back-off delay would exceed
+			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
+			// held by something else.
+			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
+				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
+					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
+					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
+					break
+				}
+				busyRetries++
+				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
+				select {
+				case <-ctx.Done():
+					break calibLoop
+				case <-time.After(time.Duration(busyDelaySec) * time.Second):
+				}
+				next := busyDelaySec * 2
+				if next > dcgmResourceBusyMaxDelaySec {
+					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
+				}
+				busyDelaySec = next
+				continue calibLoop
+			}
+			busyRetries = 0    // reset on any non-busy outcome
+			busyDelaySec = 1   // reset back-off
+
 			switch {
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
+				// Check whether the thermal throttle coincided with fans below
+				// maximum: that combination suggests cooling misconfiguration
+				// rather than a fundamental power-delivery limit.
+				if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
+					clocks := make([]float64, 0, len(perGPU))
+					var fanDutyValues []float64
+					fanDutyAvail := false
+					for _, r := range perGPU {
+						if r.ClockMHz > 0 {
+							clocks = append(clocks, r.ClockMHz)
+						}
+						if r.FanDutyCycleAvailable {
+							fanDutyAvail = true
+							fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
+						}
+					}
+					dropPct := benchmarkClockDrift(clocks)
+					p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
+					if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
+						calib.CoolingWarning = fmt.Sprintf(
+							"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
+							throttleReason, dropPct, p95FanDuty,
+						)
+						logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
+					}
+				}
 			case attempt.err != nil:
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
 				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
@@ -2604,6 +2692,13 @@ func runBenchmarkPowerCalibration(
 	return results, restore
 }

+// isDCGMResourceBusy reports whether err wraps an *exec.ExitError whose exit
+// code is 222 (DCGM_ST_IN_USE): nv-hostengine still holds the diagnostic slot.
+func isDCGMResourceBusy(err error) bool {
+	var exitErr *exec.ExitError
+	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
@@ -2778,6 +2873,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			OccupiedSlots:       occupied,
 			OccupiedSlotsNote:   note,
 			Notes:               append([]string(nil), calib.Notes...),
+			CoolingWarning:      calib.CoolingWarning,
 		})
 	}
 	sort.Slice(gpus, func(i, j int) bool {
@@ -2804,6 +2900,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		if gpu.Derated {
 			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
 		}
+		if gpu.CoolingWarning != "" {
+			result.Findings = append(result.Findings, fmt.Sprintf(
+				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
+				gpu.Index, gpu.CoolingWarning,
+			))
+		}
 	}
 	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
 	for _, gpu := range gpus {
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
 	Scores             BenchmarkScorecard         `json:"scores"`
 	DegradationReasons []string                   `json:"degradation_reasons,omitempty"`
 	Notes              []string                   `json:"notes,omitempty"`
+	// CoolingWarning is non-empty when a thermal throttle event occurred with
+	// a clock drop ≥20% while the P95 server fan duty cycle was below 98%.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
 }

 type BenchmarkTelemetrySummary struct {
@@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct {
 	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
 	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	Notes               []string `json:"notes,omitempty"`
+	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
 }

 type NvidiaPowerBenchStep struct {