Add slowdown temperature exceedance detector to benchmark
detectSlowdownTempExceedance scans the steady-state metric rows of each GPU and, if any sample is >= SlowdownTempC, emits a [WARNING] note and downgrades an "OK" status to PARTIAL. It uses the per-GPU threshold read from nvidia-smi -q and falls back to 80°C when none is available. This is distinct from the p95-based TempHeadroomC check: it catches even a single spike at or above the slowdown threshold that would be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -520,6 +520,13 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.Notes = append(gpuResult.Notes,
|
gpuResult.Notes = append(gpuResult.Notes,
|
||||||
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
|
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
|
||||||
}
|
}
|
||||||
|
if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" {
|
||||||
|
gpuResult.Notes = append(gpuResult.Notes,
|
||||||
|
fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn))
|
||||||
|
if gpuResult.Status == "OK" {
|
||||||
|
gpuResult.Status = "PARTIAL"
|
||||||
|
}
|
||||||
|
}
|
||||||
if planErr != nil {
|
if planErr != nil {
|
||||||
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||||
} else if len(gpuResult.PrecisionFailures) > 0 {
|
} else if len(gpuResult.PrecisionFailures) > 0 {
|
||||||
@@ -1561,6 +1568,39 @@ func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// detectSlowdownTempExceedance scans steady-state metric rows for a GPU and
|
||||||
|
// returns a warning string if any temperature sample exceeded the GPU's
|
||||||
|
// SlowdownTempC threshold. Uses fallback 80°C when SlowdownTempC is zero.
|
||||||
|
// This is a real-time signal distinct from p95 stats — even a single spike
|
||||||
|
// above the slowdown threshold is worth flagging.
|
||||||
|
func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string {
|
||||||
|
if slowdownTempC <= 0 {
|
||||||
|
slowdownTempC = 80
|
||||||
|
}
|
||||||
|
var maxTemp float64
|
||||||
|
var exceedCount int
|
||||||
|
for _, r := range rows {
|
||||||
|
if r.GPUIndex != gpuIndex {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !strings.Contains(r.Stage, "steady") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.TempC > maxTemp {
|
||||||
|
maxTemp = r.TempC
|
||||||
|
}
|
||||||
|
if r.TempC >= slowdownTempC {
|
||||||
|
exceedCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if exceedCount == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(
|
||||||
|
"temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C",
|
||||||
|
slowdownTempC, exceedCount, maxTemp)
|
||||||
|
}
|
||||||
|
|
||||||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||||||
var reasons []string
|
var reasons []string
|
||||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||||
|
|||||||
Reference in New Issue
Block a user