From 3732e64a4a2d8d5654931629bdf2bf59feb22adf Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 06:46:45 +0300 Subject: [PATCH] Add slowdown temperature exceedance detector to benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit detectSlowdownTempExceedance scans steady-state metric rows per GPU and emits a [WARNING] note + PARTIAL status if any sample >= SlowdownTempC. Uses per-GPU threshold from nvidia-smi -q, fallback 80°C. Distinct from p95-based TempHeadroomC check: catches even a single spike above the slowdown threshold that would be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 361e427..b6df699 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -520,6 +520,13 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Notes = append(gpuResult.Notes, fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly)) } + if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" { + gpuResult.Notes = append(gpuResult.Notes, + fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn)) + if gpuResult.Status == "OK" { + gpuResult.Status = "PARTIAL" + } + } if planErr != nil { gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) } else if len(gpuResult.PrecisionFailures) > 0 { @@ -1561,6 +1568,39 @@ func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string { return "" } +// detectSlowdownTempExceedance scans steady-state metric rows for a GPU and +// returns a warning string if any temperature sample exceeded the GPU's +// SlowdownTempC threshold. Uses fallback 80°C when SlowdownTempC is zero. +// This is a real-time signal distinct from p95 stats — even a single spike +// above the slowdown threshold is worth flagging. +func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string { + if slowdownTempC <= 0 { + slowdownTempC = 80 + } + var maxTemp float64 + var exceedCount int + for _, r := range rows { + if r.GPUIndex != gpuIndex { + continue + } + if !strings.Contains(r.Stage, "steady") { + continue + } + if r.TempC > maxTemp { + maxTemp = r.TempC + } + if r.TempC >= slowdownTempC { + exceedCount++ + } + } + if exceedCount == 0 { + return "" + } + return fmt.Sprintf( + "temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C", + slowdownTempC, exceedCount, maxTemp) +} + func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { var reasons []string runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)