From f87461ee4a7491df12e8d09ae916e05e13295349 Mon Sep 17 00:00:00 2001
From: Michael Chus <mike@mchus.pro>
Date: Tue, 14 Apr 2026 21:44:57 +0300
Subject: [PATCH] Detect thermal throttle with fans below 100% as cooling
 misconfiguration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During power calibration: if a thermal throttle (sw_thermal/hw_thermal)
causes a ≥20% clock drop while the P95 server fan duty cycle is below 98%,
record a CoolingWarning on the GPU result and emit an actionable finding
telling the operator to rerun with fans manually fixed at 100%.

During steady-state benchmark: same signal enriches the existing
thermal_limited finding with fan duty cycle and clock drift values.

Covers both the main benchmark (buildBenchmarkFindings) and the power
bench (NvidiaPowerBenchResult.Findings).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 audit/internal/platform/benchmark.go       | 59 +++++++++++++++++++++-
 audit/internal/platform/benchmark_types.go |  5 ++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index 825d454..9a52e5c 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -49,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
 	Derated            bool
 	Completed          bool
 	Notes              []string
+	// CoolingWarning is set when the GPU throttled thermally with a clock drop
+	// ≥20% while the P95 server fan duty cycle was below 98% — a signal that the
+	// cooling system may not be correctly configured for full GPU load.
+	CoolingWarning string
 }
 
 type benchmarkBurnProfile struct {
@@ -344,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.PowerCalibrationTries = calib.Attempts
 				gpuResult.PowerLimitDerated = calib.Derated
 				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
+				if calib.CoolingWarning != "" {
+					gpuResult.CoolingWarning = calib.CoolingWarning
+				}
 			}
 			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1625,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 			case "power_capped":
 				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
 			case "thermal_limited":
-				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
+				msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
+				if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
+					result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
+					msg += fmt.Sprintf(
+						" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
+						result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
+					)
+				}
+				findings = append(findings, msg)
 			case "sync_boost_limited":
 				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
 			case "low_sm_clock_vs_target":
@@ -1642,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
 			}
 		}
+		if gpu.CoolingWarning != "" {
+			findings = append(findings, fmt.Sprintf(
+				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
+				gpu.Index, gpu.CoolingWarning,
+			))
+		}
 		if len(gpu.PrecisionFailures) > 0 {
 			findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
 		}
@@ -2044,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
 			r.PowerCalibrationTries = calib.Attempts
 			r.PowerLimitDerated = calib.Derated
 			r.Notes = append(r.Notes, calib.Notes...)
+			if calib.CoolingWarning != "" {
+				r.CoolingWarning = calib.CoolingWarning
+			}
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -2606,6 +2630,32 @@ func runBenchmarkPowerCalibration(
 			case throttleReason != "":
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
 				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
+				// Check whether the thermal throttle coincided with fans below
+				// maximum: that combination suggests cooling misconfiguration
+				// rather than a fundamental power-delivery limit.
+				if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
+					clocks := make([]float64, 0, len(perGPU))
+					var fanDutyValues []float64
+					fanDutyAvail := false
+					for _, r := range perGPU {
+						if r.ClockMHz > 0 {
+							clocks = append(clocks, r.ClockMHz)
+						}
+						if r.FanDutyCycleAvailable {
+							fanDutyAvail = true
+							fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
+						}
+					}
+					dropPct := benchmarkClockDrift(clocks)
+					p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
+					if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
+						calib.CoolingWarning = fmt.Sprintf(
+							"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
+							throttleReason, dropPct, p95FanDuty,
+						)
+						logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
+					}
+				}
 			case attempt.err != nil:
 				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
 				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
@@ -2823,6 +2873,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			OccupiedSlots:       occupied,
 			OccupiedSlotsNote:   note,
 			Notes:               append([]string(nil), calib.Notes...),
+			CoolingWarning:      calib.CoolingWarning,
 		})
 	}
 	sort.Slice(gpus, func(i, j int) bool {
@@ -2849,6 +2900,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		if gpu.Derated {
 			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
 		}
+		if gpu.CoolingWarning != "" {
+			result.Findings = append(result.Findings, fmt.Sprintf(
+				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
+				gpu.Index, gpu.CoolingWarning,
+			))
+		}
 	}
 	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
 	for _, gpu := range gpus {
diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go
index 6c497b0..ea9330a 100644
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
 	Scores             BenchmarkScorecard         `json:"scores"`
 	DegradationReasons []string                   `json:"degradation_reasons,omitempty"`
 	Notes              []string                   `json:"notes,omitempty"`
+	// CoolingWarning is non-empty when a thermal throttle event occurred with
+	// a clock drop ≥20% while the P95 server fan duty cycle was below 98%.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
 }
 
 type BenchmarkTelemetrySummary struct {
@@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct {
 	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
 	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	Notes               []string `json:"notes,omitempty"`
+	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
 }
 
 type NvidiaPowerBenchStep struct {