Use per-GPU temperature limits from nvidia-smi -q for headroom calculation

Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from the verbose
nvidia-smi -q output in enrichGPUInfoWithMaxClocks and store them as
ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult.
When a limit is not available, fall back to 90°C (shutdown) / 80°C
(slowdown).

TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C).
Status is Critical when headroom < 10°C; otherwise Warning when
p95 >= SlowdownTempC.
Report table shows both limits alongside headroom and p95 temp.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:45:15 +03:00
parent a8d5e019a5
commit 0d925299ff
3 changed files with 75 additions and 18 deletions

View File

@@ -115,23 +115,31 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Perspective 2: Thermal headroom
b.WriteString("### 2. Thermal Headroom\n\n")
b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n")
b.WriteString("|-----|----------|-------------------|------------------|--------|\n")
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
for _, gpu := range result.GPUs {
shutdownTemp := gpu.ShutdownTempC
if shutdownTemp <= 0 {
shutdownTemp = 90
}
slowdownTemp := gpu.SlowdownTempC
if slowdownTemp <= 0 {
slowdownTemp = 80
}
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
switch {
case headroom < 10:
thermalStatus = "⛔ CRITICAL"
case headroom < 20:
case gpu.Steady.P95TempC >= slowdownTemp:
thermalStatus = "⚠ WARNING"
}
throttlePct := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, headroom, throttlePct, thermalStatus)
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
}
b.WriteString("\n")