diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 361e427..b6df699 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -520,6 +520,13 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Notes = append(gpuResult.Notes, fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly)) } + if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" { + gpuResult.Notes = append(gpuResult.Notes, + fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn)) + if gpuResult.Status == "OK" { + gpuResult.Status = "PARTIAL" + } + } if planErr != nil { gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) } else if len(gpuResult.PrecisionFailures) > 0 { @@ -1561,6 +1568,39 @@ func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string { return "" } +// detectSlowdownTempExceedance scans steady-state metric rows for a GPU and +// returns a warning string if any temperature sample exceeded the GPU's +// SlowdownTempC threshold. Uses fallback 80°C when SlowdownTempC is zero. +// This is a real-time signal distinct from p95 stats — even a single spike +// above the slowdown threshold is worth flagging. +func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string { + if slowdownTempC <= 0 { + slowdownTempC = 80 + } + var maxTemp float64 + var exceedCount int + for _, r := range rows { + if r.GPUIndex != gpuIndex { + continue + } + if !strings.Contains(r.Stage, "steady") { + continue + } + if r.TempC > maxTemp { + maxTemp = r.TempC + } + if r.TempC >= slowdownTempC { + exceedCount++ + } + } + if exceedCount == 0 { + return "" + } + return fmt.Sprintf( + "temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C", + slowdownTempC, exceedCount, maxTemp) +} + func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { var reasons []string runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)