Add slowdown temperature exceedance detector to benchmark

detectSlowdownTempExceedance scans steady-state metric rows per GPU and
emits a [WARNING] note if any sample is >= SlowdownTempC; a GPU whose status
is still OK is downgraded to PARTIAL. Uses the per-GPU threshold from
nvidia-smi -q, falling back to 80°C when no positive threshold is available.

Distinct from p95-based TempHeadroomC check: catches even a single spike
above the slowdown threshold that would be smoothed out in aggregates.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:46:45 +03:00
parent 0d925299ff
commit 3732e64a4a

View File

@@ -520,6 +520,13 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Notes = append(gpuResult.Notes,
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
}
if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" {
gpuResult.Notes = append(gpuResult.Notes,
fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn))
if gpuResult.Status == "OK" {
gpuResult.Status = "PARTIAL"
}
}
if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
@@ -1561,6 +1568,39 @@ func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
return ""
}
// detectSlowdownTempExceedance checks one GPU's steady-state samples against
// its slowdown temperature and describes any samples at or above it.
//
// Only rows whose GPUIndex equals gpuIndex and whose Stage contains "steady"
// are considered. A non-positive slowdownTempC is replaced by a default of
// 80°C. The result is the empty string when no considered sample reaches the
// threshold; otherwise it is a message giving the threshold, the number of
// samples at or above it, and the hottest considered sample.
//
// Unlike aggregate (p95) statistics, a single hot sample is enough to
// trigger the warning.
func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string {
	threshold := slowdownTempC
	if threshold <= 0 {
		// No usable per-GPU threshold was supplied; use the default.
		threshold = 80
	}

	hits := 0
	peak := 0.0
	for i := range rows {
		row := &rows[i]
		if row.GPUIndex != gpuIndex || !strings.Contains(row.Stage, "steady") {
			continue
		}
		temp := row.TempC
		if temp >= threshold {
			hits++
		}
		// The peak covers every considered sample, not only those over the threshold.
		if temp > peak {
			peak = temp
		}
	}

	if hits > 0 {
		return fmt.Sprintf(
			"temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C",
			threshold, hits, peak)
	}
	return ""
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
var reasons []string
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)