Translate report to English; add power anomaly detector
All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage is ≥50% as [HARD STOP] — indicates possible bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -508,6 +508,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
|
||||
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
|
||||
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
|
||||
gpuResult.Notes = append(gpuResult.Notes,
|
||||
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
|
||||
}
|
||||
if planErr != nil {
|
||||
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||
} else if len(gpuResult.PrecisionFailures) > 0 {
|
||||
@@ -1480,6 +1484,53 @@ func serverQualityScore(score BenchmarkScorecard) float64 {
|
||||
return clampScore(q * 100)
|
||||
}
|
||||
|
||||
// detectPowerAnomaly scans per-GPU steady-state metric rows for a sudden
|
||||
// power drop — a symptom of bad cable contact, VRM fault, or thermal event
|
||||
// on the power delivery path. Returns a non-empty string if an anomaly is found.
|
||||
//
|
||||
// Algorithm: uses a 5-sample rolling baseline; flags any sample that falls
|
||||
// more than 30% below the baseline while the GPU was otherwise loaded
|
||||
// (usage > 50%). A sustained throttle (power cap) is not flagged here —
|
||||
// that is already captured by PowerCapThrottlePct.
|
||||
func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
|
||||
const windowSize = 5
|
||||
const dropThresholdPct = 30.0
|
||||
const minUsagePct = 50.0
|
||||
|
||||
// Filter rows for this GPU during steady state only.
|
||||
var steady []GPUMetricRow
|
||||
for _, r := range rows {
|
||||
if r.GPUIndex == gpuIndex && r.Stage != "" && strings.Contains(r.Stage, "steady") {
|
||||
steady = append(steady, r)
|
||||
}
|
||||
}
|
||||
if len(steady) < windowSize+2 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute initial baseline from the first window.
|
||||
var baseSum float64
|
||||
for i := 0; i < windowSize; i++ {
|
||||
baseSum += steady[i].PowerW
|
||||
}
|
||||
|
||||
for i := windowSize; i < len(steady); i++ {
|
||||
baseline := baseSum / float64(windowSize)
|
||||
sample := steady[i]
|
||||
if baseline > 0 && sample.UsagePct >= minUsagePct {
|
||||
dropPct := (baseline - sample.PowerW) / baseline * 100
|
||||
if dropPct >= dropThresholdPct {
|
||||
return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault",
|
||||
sample.ElapsedSec, baseline, sample.PowerW, dropPct)
|
||||
}
|
||||
}
|
||||
// Slide the window baseline.
|
||||
baseSum -= steady[i-windowSize].PowerW
|
||||
baseSum += sample.PowerW
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||||
var reasons []string
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
|
||||
Reference in New Issue
Block a user