diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index aa87177..361e427 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -40,6 +40,12 @@ type benchmarkGPUInfo struct { MaxMemoryClockMHz float64 BaseGraphicsClockMHz float64 MultiprocessorCount int + // Temperature limits sourced from nvidia-smi -q verbose output. + // ShutdownTempC is the hardware thermal shutdown threshold. + // SlowdownTempC is the software throttle onset threshold. + // Both fall back to safe conservative defaults when not available. + ShutdownTempC float64 // fallback: 90°C + SlowdownTempC float64 // fallback: 80°C } type benchmarkPowerCalibrationResult struct { @@ -330,6 +336,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.PowerLimitW = info.PowerLimitW gpuResult.MultiprocessorCount = info.MultiprocessorCount gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW + gpuResult.ShutdownTempC = info.ShutdownTempC + gpuResult.SlowdownTempC = info.SlowdownTempC gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz @@ -741,6 +749,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) + shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`) + slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) for i, loc := range sectionStarts { @@ -804,6 +814,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b } } } + if info.ShutdownTempC == 0 { + if m := 
shutdownTempRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.ShutdownTempC = v + } + } + } + if info.SlowdownTempC == 0 { + if m := slowdownTempRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.SlowdownTempC = v + } + } + } infoByIndex[benchIdx] = info } } @@ -1448,13 +1472,19 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100) score.StabilityScore = clampScore(100 - combinedThrottlePct) - // TempHeadroomC: distance from p95 temperature to the 100°C destruction - // threshold. Assessed independently of throttle — a GPU at 86°C without - // any throttle counter still has only 14°C headroom, which is a concern. - // Warning zone: < 20°C headroom (p95 > 80°C). - // Critical zone: < 10°C headroom (p95 > 90°C). + // TempHeadroomC: distance from p95 temperature to the GPU's hardware + // shutdown threshold (sourced from nvidia-smi -q "GPU Shutdown Temp"). + // Fallback: 90°C when not available. + // Assessed independently of throttle — a GPU at 86°C without any throttle + // counter still has limited headroom and operates in a degraded reliability zone. + // Warning zone: p95 >= slowdownTemp (headroom <= shutdownTemp - slowdownTemp), i.e. at or past slowdown onset. + // Critical zone: headroom < 10°C from shutdown. if gpu.Steady.P95TempC > 0 { - score.TempHeadroomC = 100 - gpu.Steady.P95TempC + shutdownTemp := gpu.ShutdownTempC + if shutdownTemp <= 0 { + shutdownTemp = 90 + } + score.TempHeadroomC = shutdownTemp - gpu.Steady.P95TempC } score.ServerQualityScore = serverQualityScore(score) score.CompositeScore = score.ComputeScore @@ -1797,16 +1827,27 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { } } // Temperature headroom checks — independent of throttle counters. 
- if gpu.Scores.TempHeadroomC > 0 { + // Shutdown and slowdown thresholds are per-GPU from nvidia-smi -q; + // fall back to 90°C / 80°C when unavailable. + if gpu.Steady.P95TempC > 0 { + shutdownTemp := gpu.ShutdownTempC + if shutdownTemp <= 0 { + shutdownTemp = 90 + } + slowdownTemp := gpu.SlowdownTempC + if slowdownTemp <= 0 { + slowdownTemp = 80 + } + headroom := shutdownTemp - gpu.Steady.P95TempC switch { - case gpu.Scores.TempHeadroomC < 10: + case headroom < 10: findings = append(findings, fmt.Sprintf( - "[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from destruction threshold (100°C). Do not operate.", - gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC)) - case gpu.Scores.TempHeadroomC < 20: + "[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from shutdown threshold (%.0f°C). Do not operate.", + gpu.Index, gpu.Steady.P95TempC, headroom, shutdownTemp)) + case gpu.Steady.P95TempC >= slowdownTemp: findings = append(findings, fmt.Sprintf( - "[THERMAL] GPU %d: p95 temperature %.1f°C — %.1f°C headroom to limit. Operating in degraded reliability zone (>80°C).", - gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC)) + "[THERMAL] GPU %d: p95 temperature %.1f°C exceeds slowdown threshold (%.0f°C) — %.1f°C headroom to shutdown. 
Operating in degraded reliability zone.", + gpu.Index, gpu.Steady.P95TempC, slowdownTemp, headroom)) } } if gpu.CoolingWarning != "" { @@ -2207,6 +2248,8 @@ func runNvidiaBenchmarkParallel( r.PowerLimitW = info.PowerLimitW r.MultiprocessorCount = info.MultiprocessorCount r.DefaultPowerLimitW = info.DefaultPowerLimitW + r.ShutdownTempC = info.ShutdownTempC + r.SlowdownTempC = info.SlowdownTempC r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 5dede3b..22ca59e 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -115,23 +115,31 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Perspective 2: Thermal headroom b.WriteString("### 2. Thermal Headroom\n\n") - b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n") - b.WriteString("|-----|----------|-------------------|------------------|--------|\n") + b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n") + b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n") for _, gpu := range result.GPUs { + shutdownTemp := gpu.ShutdownTempC + if shutdownTemp <= 0 { + shutdownTemp = 90 + } + slowdownTemp := gpu.SlowdownTempC + if slowdownTemp <= 0 { + slowdownTemp = 80 + } headroom := gpu.Scores.TempHeadroomC thermalStatus := "✓ OK" switch { case headroom < 10: thermalStatus = "⛔ CRITICAL" - case headroom < 20: + case gpu.Steady.P95TempC >= slowdownTemp: thermalStatus = "⚠ WARNING" } throttlePct := "-" if gpu.Scores.ThermalThrottlePct > 0 { throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) } - fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n", - gpu.Index, gpu.Steady.P95TempC, 
headroom, throttlePct, thermalStatus) + fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n", + gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus) } b.WriteString("\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index fe6b3ff..b02958c 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -112,6 +112,12 @@ type BenchmarkGPUResult struct { PowerLimitDerated bool `json:"power_limit_derated,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + // ShutdownTempC is the hardware thermal shutdown threshold for this GPU, + // sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C. + ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"` + // SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp"). + // Fallback: 80°C. + SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"` // CalibratedPeakPowerW is the p95 power measured during a short // dcgmi targeted_power calibration run before the main benchmark. // Used as the reference denominator for PowerSustainScore instead of