Use per-GPU temperature limits from nvidia-smi -q for headroom calculation
Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from the verbose nvidia-smi -q output in enrichGPUInfoWithMaxClocks, and store them as ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult.

- Fallback when a limit is not available: 90°C shutdown / 80°C slowdown.
- TempHeadroomC = ShutdownTempC - P95TempC (per GPU, not the hardcoded 100°C).
- Warning threshold: p95 >= SlowdownTempC.
- Critical threshold: headroom < 10°C.
- The report table shows both limits alongside headroom and p95 temperature.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,12 @@ type benchmarkGPUInfo struct {
|
||||
MaxMemoryClockMHz float64
|
||||
BaseGraphicsClockMHz float64
|
||||
MultiprocessorCount int
|
||||
// Temperature limits sourced from nvidia-smi -q verbose output.
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold.
|
||||
// SlowdownTempC is the software throttle onset threshold.
|
||||
// Both fall back to safe conservative defaults when not available.
|
||||
ShutdownTempC float64 // fallback: 90°C
|
||||
SlowdownTempC float64 // fallback: 80°C
|
||||
}
|
||||
|
||||
type benchmarkPowerCalibrationResult struct {
|
||||
@@ -330,6 +336,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
gpuResult.PowerLimitW = info.PowerLimitW
|
||||
gpuResult.MultiprocessorCount = info.MultiprocessorCount
|
||||
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||
gpuResult.ShutdownTempC = info.ShutdownTempC
|
||||
gpuResult.SlowdownTempC = info.SlowdownTempC
|
||||
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
@@ -741,6 +749,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
||||
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
||||
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
||||
|
||||
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
|
||||
for i, loc := range sectionStarts {
|
||||
@@ -804,6 +814,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.ShutdownTempC == 0 {
|
||||
if m := shutdownTempRe.FindSubmatch(section); m != nil {
|
||||
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||
info.ShutdownTempC = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.SlowdownTempC == 0 {
|
||||
if m := slowdownTempRe.FindSubmatch(section); m != nil {
|
||||
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||
info.SlowdownTempC = v
|
||||
}
|
||||
}
|
||||
}
|
||||
infoByIndex[benchIdx] = info
|
||||
}
|
||||
}
|
||||
@@ -1448,13 +1472,19 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
|
||||
score.StabilityScore = clampScore(100 - combinedThrottlePct)
|
||||
|
||||
// TempHeadroomC: distance from p95 temperature to the 100°C destruction
|
||||
// threshold. Assessed independently of throttle — a GPU at 86°C without
|
||||
// any throttle counter still has only 14°C headroom, which is a concern.
|
||||
// Warning zone: < 20°C headroom (p95 > 80°C).
|
||||
// Critical zone: < 10°C headroom (p95 > 90°C).
|
||||
// TempHeadroomC: distance from p95 temperature to the GPU's hardware
|
||||
// shutdown threshold (sourced from nvidia-smi -q "GPU Shutdown Temp").
|
||||
// Fallback: 90°C when not available.
|
||||
// Assessed independently of throttle — a GPU at 86°C without any throttle
|
||||
// counter still has limited headroom and operates in degraded reliability zone.
|
||||
// Warning zone: headroom <= (shutdownTemp - slowdownTemp), i.e. p95 at or past slowdown onset.
|
||||
// Critical zone: headroom < 10°C from shutdown.
|
||||
if gpu.Steady.P95TempC > 0 {
|
||||
score.TempHeadroomC = 100 - gpu.Steady.P95TempC
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
score.TempHeadroomC = shutdownTemp - gpu.Steady.P95TempC
|
||||
}
|
||||
score.ServerQualityScore = serverQualityScore(score)
|
||||
score.CompositeScore = score.ComputeScore
|
||||
@@ -1797,16 +1827,27 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
}
|
||||
}
|
||||
// Temperature headroom checks — independent of throttle counters.
|
||||
if gpu.Scores.TempHeadroomC > 0 {
|
||||
// Shutdown and slowdown thresholds are per-GPU from nvidia-smi -q;
|
||||
// fall back to 90°C / 80°C when unavailable.
|
||||
if gpu.Steady.P95TempC > 0 {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := shutdownTemp - gpu.Steady.P95TempC
|
||||
switch {
|
||||
case gpu.Scores.TempHeadroomC < 10:
|
||||
case headroom < 10:
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from destruction threshold (100°C). Do not operate.",
|
||||
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
|
||||
case gpu.Scores.TempHeadroomC < 20:
|
||||
"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from shutdown threshold (%.0f°C). Do not operate.",
|
||||
gpu.Index, gpu.Steady.P95TempC, headroom, shutdownTemp))
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[THERMAL] GPU %d: p95 temperature %.1f°C — %.1f°C headroom to limit. Operating in degraded reliability zone (>80°C).",
|
||||
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
|
||||
"[THERMAL] GPU %d: p95 temperature %.1f°C exceeds slowdown threshold (%.0f°C) — %.1f°C headroom to shutdown. Operating in degraded reliability zone.",
|
||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, headroom))
|
||||
}
|
||||
}
|
||||
if gpu.CoolingWarning != "" {
|
||||
@@ -2207,6 +2248,8 @@ func runNvidiaBenchmarkParallel(
|
||||
r.PowerLimitW = info.PowerLimitW
|
||||
r.MultiprocessorCount = info.MultiprocessorCount
|
||||
r.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||
r.ShutdownTempC = info.ShutdownTempC
|
||||
r.SlowdownTempC = info.SlowdownTempC
|
||||
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
|
||||
@@ -115,23 +115,31 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n")
|
||||
b.WriteString("|-----|----------|-------------------|------------------|--------|\n")
|
||||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
||||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case headroom < 20:
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n",
|
||||
gpu.Index, gpu.Steady.P95TempC, headroom, throttlePct, thermalStatus)
|
||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
|
||||
@@ -112,6 +112,12 @@ type BenchmarkGPUResult struct {
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
|
||||
Reference in New Issue
Block a user