From f87461ee4a7491df12e8d09ae916e05e13295349 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Tue, 14 Apr 2026 21:44:57 +0300 Subject: [PATCH] Detect thermal throttle with fans below 100% as cooling misconfiguration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During power calibration: if a thermal throttle (sw_thermal/hw_thermal) causes a ≥20% clock drop while the P95 server fan duty cycle is below 98%, record a CoolingWarning on the GPU result and emit an actionable finding telling the operator to rerun with fans manually fixed at 100%. During the steady-state benchmark: the same signal enriches the existing thermal_limited finding with the fan duty cycle and clock drift values. Covers both the main benchmark (buildBenchmarkFindings) and the power bench (NvidiaPowerBenchResult.Findings). Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 59 +++++++++++++++++++++- audit/internal/platform/benchmark_types.go | 5 ++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 825d454..9a52e5c 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -49,6 +49,10 @@ type benchmarkPowerCalibrationResult struct { Derated bool Completed bool Notes []string + // CoolingWarning is set when the GPU throttled thermally with a clock drop + // ≥20% while server fans were below 100% duty cycle — a signal that the + // cooling system may not be correctly configured for full GPU load. + CoolingWarning string } type benchmarkBurnProfile struct { @@ -344,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.PowerCalibrationTries = calib.Attempts gpuResult.PowerLimitDerated = calib.Derated gpuResult.Notes = append(gpuResult.Notes, calib.Notes...) 
+ if calib.CoolingWarning != "" { + gpuResult.CoolingWarning = calib.CoolingWarning + } } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz @@ -1625,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { case "power_capped": findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) case "thermal_limited": - findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)) + msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index) + if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && + result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 { + msg += fmt.Sprintf( + " Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.", + result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct, + ) + } + findings = append(findings, msg) case "sync_boost_limited": findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) case "low_sm_clock_vs_target": @@ -1642,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) } } + if gpu.CoolingWarning != "" { + findings = append(findings, fmt.Sprintf( + "GPU %d: %s. 
Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", + gpu.Index, gpu.CoolingWarning, + )) + } if len(gpu.PrecisionFailures) > 0 { findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", "))) } @@ -2044,6 +2065,9 @@ func runNvidiaBenchmarkParallel( r.PowerCalibrationTries = calib.Attempts r.PowerLimitDerated = calib.Derated r.Notes = append(r.Notes, calib.Notes...) + if calib.CoolingWarning != "" { + r.CoolingWarning = calib.CoolingWarning + } } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { r.LockedGraphicsClockMHz = norm.GPUClockLockMHz @@ -2606,6 +2630,32 @@ func runBenchmarkPowerCalibration( case throttleReason != "": calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) + // Check whether the thermal throttle coincided with fans below + // maximum: that combination suggests cooling misconfiguration + // rather than a fundamental power-delivery limit. 
+ if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" { + clocks := make([]float64, 0, len(perGPU)) + var fanDutyValues []float64 + fanDutyAvail := false + for _, r := range perGPU { + if r.ClockMHz > 0 { + clocks = append(clocks, r.ClockMHz) + } + if r.FanDutyCycleAvailable { + fanDutyAvail = true + fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct) + } + } + dropPct := benchmarkClockDrift(clocks) + p95FanDuty := benchmarkPercentile(fanDutyValues, 95) + if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 { + calib.CoolingWarning = fmt.Sprintf( + "thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load", + throttleReason, dropPct, p95FanDuty, + ) + logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning)) + } + } case attempt.err != nil: calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err)) logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err)) @@ -2823,6 +2873,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N OccupiedSlots: occupied, OccupiedSlotsNote: note, Notes: append([]string(nil), calib.Notes...), + CoolingWarning: calib.CoolingWarning, }) } sort.Slice(gpus, func(i, j int) bool { @@ -2849,6 +2900,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N if gpu.Derated { result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) } + if gpu.CoolingWarning != "" { + result.Findings = append(result.Findings, fmt.Sprintf( + "GPU %d: %s. 
Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", + gpu.Index, gpu.CoolingWarning, + )) + } } singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus)) for _, gpu := range gpus { diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 6c497b0..ea9330a 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -131,6 +131,9 @@ type BenchmarkGPUResult struct { Scores BenchmarkScorecard `json:"scores"` DegradationReasons []string `json:"degradation_reasons,omitempty"` Notes []string `json:"notes,omitempty"` + // CoolingWarning is non-empty when a thermal throttle event occurred with + // a clock drop ≥20% while server fans were not at 100% duty cycle. + CoolingWarning string `json:"cooling_warning,omitempty"` } type BenchmarkTelemetrySummary struct { @@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct { OccupiedSlots []int `json:"occupied_slots,omitempty"` OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"` Notes []string `json:"notes,omitempty"` + // CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow. + CoolingWarning string `json:"cooling_warning,omitempty"` } type NvidiaPowerBenchStep struct {