Translate report to English; add power anomaly detector

All report strings are now English only.

Add detectPowerAnomaly: scans steady-state metric rows per GPU with a
5-sample rolling baseline; flags a sudden power drop ≥30% while GPU usage is
≥50% as [HARD STOP] — a possible sign of bad cable contact or a VRM fault.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:42:00 +03:00
parent 72ec086568
commit a8d5e019a5
2 changed files with 67 additions and 16 deletions

View File

@@ -508,6 +508,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
gpuResult.Notes = append(gpuResult.Notes,
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
}
if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
@@ -1480,6 +1484,53 @@ func serverQualityScore(score BenchmarkScorecard) float64 {
return clampScore(q * 100)
}
// detectPowerAnomaly looks for an abrupt power drop in the steady-state
// metric rows of one GPU and returns a human-readable description of the
// first one it finds, or "" when nothing qualifies.
//
// Only rows whose GPUIndex equals gpuIndex and whose Stage contains "steady"
// are considered. If fewer than windowSize+2 such rows exist, no analysis is
// attempted.
//
// The baseline is the mean PowerW of the windowSize samples immediately
// preceding the sample under test. A sample is reported when the baseline is
// positive, the sample's UsagePct is at least minUsagePct, and its PowerW is
// at least dropThresholdPct percent below the baseline. Every sample, whether
// reported or not, is then folded into the rolling window, so a power level
// that stays low becomes the new baseline after windowSize samples.
func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
	const (
		windowSize       = 5
		dropThresholdPct = 30.0
		minUsagePct      = 50.0
	)

	// Keep only this GPU's steady-state samples, preserving input order.
	var samples []GPUMetricRow
	for _, row := range rows {
		if row.GPUIndex != gpuIndex {
			continue
		}
		// An empty Stage can never contain "steady", so no separate check.
		if !strings.Contains(row.Stage, "steady") {
			continue
		}
		samples = append(samples, row)
	}
	if len(samples) < windowSize+2 {
		return ""
	}

	// Seed the rolling sum with the leading window.
	var windowSum float64
	for _, seed := range samples[:windowSize] {
		windowSum += seed.PowerW
	}

	// offset indexes the oldest sample of the current window; cur is the
	// sample directly after that window.
	for offset, cur := range samples[windowSize:] {
		baseline := windowSum / float64(windowSize)
		if baseline > 0 && cur.UsagePct >= minUsagePct {
			dropPct := (baseline - cur.PowerW) / baseline * 100
			if dropPct >= dropThresholdPct {
				return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault",
					cur.ElapsedSec, baseline, cur.PowerW, dropPct)
			}
		}
		// Advance the window: evict the oldest sample, admit the current one.
		windowSum -= samples[offset].PowerW
		windowSum += cur.PowerW
	}
	return ""
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
var reasons []string
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)