Add slowdown temperature exceedance detector to benchmark
detectSlowdownTempExceedance scans the steady-state metric rows of each GPU and emits a [WARNING] note plus a PARTIAL status if any sample is >= SlowdownTempC. It uses the per-GPU threshold reported by nvidia-smi -q, falling back to 80°C when none is available. This is distinct from the p95-based TempHeadroomC check: it catches even a single spike above the slowdown threshold that would otherwise be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -520,6 +520,13 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
gpuResult.Notes = append(gpuResult.Notes,
|
||||
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
|
||||
}
|
||||
if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" {
|
||||
gpuResult.Notes = append(gpuResult.Notes,
|
||||
fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn))
|
||||
if gpuResult.Status == "OK" {
|
||||
gpuResult.Status = "PARTIAL"
|
||||
}
|
||||
}
|
||||
if planErr != nil {
|
||||
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||
} else if len(gpuResult.PrecisionFailures) > 0 {
|
||||
@@ -1561,6 +1568,39 @@ func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// detectSlowdownTempExceedance scans steady-state metric rows for a GPU and
|
||||
// returns a warning string if any temperature sample exceeded the GPU's
|
||||
// SlowdownTempC threshold. Uses fallback 80°C when SlowdownTempC is zero.
|
||||
// This is a real-time signal distinct from p95 stats — even a single spike
|
||||
// above the slowdown threshold is worth flagging.
|
||||
func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string {
|
||||
if slowdownTempC <= 0 {
|
||||
slowdownTempC = 80
|
||||
}
|
||||
var maxTemp float64
|
||||
var exceedCount int
|
||||
for _, r := range rows {
|
||||
if r.GPUIndex != gpuIndex {
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(r.Stage, "steady") {
|
||||
continue
|
||||
}
|
||||
if r.TempC > maxTemp {
|
||||
maxTemp = r.TempC
|
||||
}
|
||||
if r.TempC >= slowdownTempC {
|
||||
exceedCount++
|
||||
}
|
||||
}
|
||||
if exceedCount == 0 {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
"temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C",
|
||||
slowdownTempC, exceedCount, maxTemp)
|
||||
}
|
||||
|
||||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||||
var reasons []string
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
|
||||
Reference in New Issue
Block a user