Detect thermal throttle with fans below 100% as cooling misconfiguration
During power calibration: if a thermal throttle (sw_thermal/hw_thermal) causes a clock drop of ≥20% while the P95 server fan duty cycle is below 98%, record a CoolingWarning on the GPU result and emit an actionable finding telling the operator to rerun with fans manually fixed at 100%. During the steady-state benchmark: the same signal enriches the existing thermal_limited finding with the fan duty cycle and clock drift values. This covers both the main benchmark (buildBenchmarkFindings) and the power bench (NvidiaPowerBenchResult.Findings).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -49,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
|
|||||||
Derated bool
|
Derated bool
|
||||||
Completed bool
|
Completed bool
|
||||||
Notes []string
|
Notes []string
|
||||||
|
// CoolingWarning is set when the GPU throttled thermally with a clock drop
|
||||||
|
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
||||||
|
// cooling system may not be correctly configured for full GPU load.
|
||||||
|
CoolingWarning string
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
@@ -344,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.PowerCalibrationTries = calib.Attempts
|
gpuResult.PowerCalibrationTries = calib.Attempts
|
||||||
gpuResult.PowerLimitDerated = calib.Derated
|
gpuResult.PowerLimitDerated = calib.Derated
|
||||||
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
|
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
|
||||||
|
if calib.CoolingWarning != "" {
|
||||||
|
gpuResult.CoolingWarning = calib.CoolingWarning
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
@@ -1625,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
case "power_capped":
|
case "power_capped":
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
|
||||||
case "thermal_limited":
|
case "thermal_limited":
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
|
msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
|
||||||
|
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
|
||||||
|
msg += fmt.Sprintf(
|
||||||
|
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
|
||||||
|
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
findings = append(findings, msg)
|
||||||
case "sync_boost_limited":
|
case "sync_boost_limited":
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
|
||||||
case "low_sm_clock_vs_target":
|
case "low_sm_clock_vs_target":
|
||||||
@@ -1642,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
|
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if gpu.CoolingWarning != "" {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
||||||
|
gpu.Index, gpu.CoolingWarning,
|
||||||
|
))
|
||||||
|
}
|
||||||
if len(gpu.PrecisionFailures) > 0 {
|
if len(gpu.PrecisionFailures) > 0 {
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
|
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
|
||||||
}
|
}
|
||||||
@@ -2044,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
|
|||||||
r.PowerCalibrationTries = calib.Attempts
|
r.PowerCalibrationTries = calib.Attempts
|
||||||
r.PowerLimitDerated = calib.Derated
|
r.PowerLimitDerated = calib.Derated
|
||||||
r.Notes = append(r.Notes, calib.Notes...)
|
r.Notes = append(r.Notes, calib.Notes...)
|
||||||
|
if calib.CoolingWarning != "" {
|
||||||
|
r.CoolingWarning = calib.CoolingWarning
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
@@ -2606,6 +2630,32 @@ func runBenchmarkPowerCalibration(
|
|||||||
case throttleReason != "":
|
case throttleReason != "":
|
||||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
|
||||||
|
// Check whether the thermal throttle coincided with fans below
|
||||||
|
// maximum: that combination suggests cooling misconfiguration
|
||||||
|
// rather than a fundamental power-delivery limit.
|
||||||
|
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
|
||||||
|
clocks := make([]float64, 0, len(perGPU))
|
||||||
|
var fanDutyValues []float64
|
||||||
|
fanDutyAvail := false
|
||||||
|
for _, r := range perGPU {
|
||||||
|
if r.ClockMHz > 0 {
|
||||||
|
clocks = append(clocks, r.ClockMHz)
|
||||||
|
}
|
||||||
|
if r.FanDutyCycleAvailable {
|
||||||
|
fanDutyAvail = true
|
||||||
|
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dropPct := benchmarkClockDrift(clocks)
|
||||||
|
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
|
||||||
|
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
|
||||||
|
calib.CoolingWarning = fmt.Sprintf(
|
||||||
|
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
|
||||||
|
throttleReason, dropPct, p95FanDuty,
|
||||||
|
)
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
|
||||||
|
}
|
||||||
|
}
|
||||||
case attempt.err != nil:
|
case attempt.err != nil:
|
||||||
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
|
||||||
@@ -2823,6 +2873,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
OccupiedSlots: occupied,
|
OccupiedSlots: occupied,
|
||||||
OccupiedSlotsNote: note,
|
OccupiedSlotsNote: note,
|
||||||
Notes: append([]string(nil), calib.Notes...),
|
Notes: append([]string(nil), calib.Notes...),
|
||||||
|
CoolingWarning: calib.CoolingWarning,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
sort.Slice(gpus, func(i, j int) bool {
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
@@ -2849,6 +2900,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
if gpu.Derated {
|
if gpu.Derated {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
if gpu.CoolingWarning != "" {
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf(
|
||||||
|
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
||||||
|
gpu.Index, gpu.CoolingWarning,
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
|
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
|
||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
|
|||||||
@@ -131,6 +131,9 @@ type BenchmarkGPUResult struct {
|
|||||||
Scores BenchmarkScorecard `json:"scores"`
|
Scores BenchmarkScorecard `json:"scores"`
|
||||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||||
|
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkTelemetrySummary struct {
|
type BenchmarkTelemetrySummary struct {
|
||||||
@@ -280,6 +283,8 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
OccupiedSlots []int `json:"occupied_slots,omitempty"`
|
OccupiedSlots []int `json:"occupied_slots,omitempty"`
|
||||||
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
|
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user