Use per-GPU temperature limits from nvidia-smi -q for headroom calculation

Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from nvidia-smi -q verbose
output in enrichGPUInfoWithMaxClocks. Store as ShutdownTempC/SlowdownTempC
on benchmarkGPUInfo and BenchmarkGPUResult. When nvidia-smi does not report
a limit the field stays 0, and scoring, findings and the report fall back to
90°C shutdown / 80°C slowdown.

TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C).
Warning threshold: p95 >= SlowdownTempC. Critical: headroom < 10°C.
Report table shows both limits alongside headroom and p95 temp.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:45:15 +03:00
parent a8d5e019a5
commit 0d925299ff
3 changed files with 75 additions and 18 deletions

View File

@@ -40,6 +40,12 @@ type benchmarkGPUInfo struct {
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
MultiprocessorCount int
// Temperature limits sourced from nvidia-smi -q verbose output.
// ShutdownTempC is the hardware thermal shutdown threshold.
// SlowdownTempC is the software throttle onset threshold.
// A zero value means the limit was not parsed; consumers then substitute
// conservative defaults rather than reading a pre-filled value here.
ShutdownTempC float64 // fallback: 90°C
SlowdownTempC float64 // fallback: 80°C
}
type benchmarkPowerCalibrationResult struct {
@@ -330,6 +336,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.ShutdownTempC = info.ShutdownTempC
gpuResult.SlowdownTempC = info.SlowdownTempC
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
@@ -741,6 +749,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts {
@@ -804,6 +814,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
}
}
}
if info.ShutdownTempC == 0 {
if m := shutdownTempRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.ShutdownTempC = v
}
}
}
if info.SlowdownTempC == 0 {
if m := slowdownTempRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.SlowdownTempC = v
}
}
}
infoByIndex[benchIdx] = info
}
}
@@ -1448,13 +1472,19 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
score.StabilityScore = clampScore(100 - combinedThrottlePct)
// TempHeadroomC: distance from p95 temperature to the 100°C destruction
// threshold. Assessed independently of throttle — a GPU at 86°C without
// any throttle counter still has only 14°C headroom, which is a concern.
// Warning zone: < 20°C headroom (p95 > 80°C).
// Critical zone: < 10°C headroom (p95 > 90°C).
// TempHeadroomC: distance from p95 temperature to the GPU's hardware
// shutdown threshold (sourced from nvidia-smi -q "GPU Shutdown Temp").
// Fallback: 90°C when not available.
// Assessed independently of throttle — a GPU at 86°C without any throttle
// counter still has limited headroom and operates in a degraded reliability zone.
// Warning zone: p95 >= slowdownTemp, i.e. headroom <= (shutdownTemp - slowdownTemp).
// Critical zone: less than 10°C of headroom to shutdown.
if gpu.Steady.P95TempC > 0 {
score.TempHeadroomC = 100 - gpu.Steady.P95TempC
shutdownTemp := gpu.ShutdownTempC
if shutdownTemp <= 0 {
shutdownTemp = 90
}
score.TempHeadroomC = shutdownTemp - gpu.Steady.P95TempC
}
score.ServerQualityScore = serverQualityScore(score)
score.CompositeScore = score.ComputeScore
@@ -1797,16 +1827,27 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
}
}
// Temperature headroom checks — independent of throttle counters.
if gpu.Scores.TempHeadroomC > 0 {
// Shutdown and slowdown thresholds are per-GPU from nvidia-smi -q;
// fall back to 90°C / 80°C when unavailable.
if gpu.Steady.P95TempC > 0 {
shutdownTemp := gpu.ShutdownTempC
if shutdownTemp <= 0 {
shutdownTemp = 90
}
slowdownTemp := gpu.SlowdownTempC
if slowdownTemp <= 0 {
slowdownTemp = 80
}
headroom := shutdownTemp - gpu.Steady.P95TempC
switch {
case gpu.Scores.TempHeadroomC < 10:
case headroom < 10:
findings = append(findings, fmt.Sprintf(
"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from destruction threshold (100°C). Do not operate.",
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
case gpu.Scores.TempHeadroomC < 20:
"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from shutdown threshold (%.0f°C). Do not operate.",
gpu.Index, gpu.Steady.P95TempC, headroom, shutdownTemp))
case gpu.Steady.P95TempC >= slowdownTemp:
findings = append(findings, fmt.Sprintf(
"[THERMAL] GPU %d: p95 temperature %.1f°C — %.1f°C headroom to limit. Operating in degraded reliability zone (>80°C).",
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
"[THERMAL] GPU %d: p95 temperature %.1f°C exceeds slowdown threshold (%.0f°C) — %.1f°C headroom to shutdown. Operating in degraded reliability zone.",
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, headroom))
}
}
if gpu.CoolingWarning != "" {
@@ -2207,6 +2248,8 @@ func runNvidiaBenchmarkParallel(
r.PowerLimitW = info.PowerLimitW
r.MultiprocessorCount = info.MultiprocessorCount
r.DefaultPowerLimitW = info.DefaultPowerLimitW
r.ShutdownTempC = info.ShutdownTempC
r.SlowdownTempC = info.SlowdownTempC
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz

View File

@@ -115,23 +115,31 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Perspective 2: Thermal headroom
b.WriteString("### 2. Thermal Headroom\n\n")
b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n")
b.WriteString("|-----|----------|-------------------|------------------|--------|\n")
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
for _, gpu := range result.GPUs {
shutdownTemp := gpu.ShutdownTempC
if shutdownTemp <= 0 {
shutdownTemp = 90
}
slowdownTemp := gpu.SlowdownTempC
if slowdownTemp <= 0 {
slowdownTemp = 80
}
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
switch {
case headroom < 10:
thermalStatus = "⛔ CRITICAL"
case headroom < 20:
case gpu.Steady.P95TempC >= slowdownTemp:
thermalStatus = "⚠ WARNING"
}
throttlePct := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, headroom, throttlePct, thermalStatus)
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
}
b.WriteString("\n")

View File

@@ -112,6 +112,12 @@ type BenchmarkGPUResult struct {
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Zero (omitted from JSON)
// when unavailable; readers fall back to 90°C.
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
// Zero (omitted from JSON) when unavailable; readers fall back to 80°C.
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short
// dcgmi targeted_power calibration run before the main benchmark.
// Used as the reference denominator for PowerSustainScore instead of