Translate report to English; add power anomaly detector

All report strings are now English only.

Add detectPowerAnomaly: scans steady-state metric rows per GPU with a
5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage ≥50%
as [HARD STOP] — indicates bad cable contact or VRM fault.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:42:00 +03:00
parent 72ec086568
commit a8d5e019a5
2 changed files with 67 additions and 16 deletions

View File

@@ -508,6 +508,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
gpuResult.Notes = append(gpuResult.Notes,
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
}
if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
@@ -1480,6 +1484,53 @@ func serverQualityScore(score BenchmarkScorecard) float64 {
return clampScore(q * 100)
}
// detectPowerAnomaly reports the first abrupt power collapse seen on one GPU
// while it is under load. The returned message attributes such a collapse to
// possible bad cable contact or a VRM fault.
//
// Only rows whose GPUIndex equals gpuIndex and whose Stage contains "steady"
// are examined, in the order they appear in rows. Each sample after the first
// five is compared against the mean power of the five samples immediately
// preceding it. A sample is flagged when that mean is positive, the sample's
// usage is at least 50%, and its power is at least 30% below the mean.
//
// The baseline slides over every steady sample, whether or not it met the
// usage threshold. A gradual decline is therefore absorbed into the baseline;
// only a step relative to the previous five samples is reported.
//
// Returns a human-readable description of the first anomaly, or "" when there
// are fewer than seven matching samples or nothing was flagged.
func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
	const (
		windowSize       = 5
		dropThresholdPct = 30.0
		minUsagePct      = 50.0
	)

	// Collect this GPU's steady-state samples, preserving input order.
	var samples []GPUMetricRow
	for _, row := range rows {
		if row.GPUIndex != gpuIndex || !strings.Contains(row.Stage, "steady") {
			continue
		}
		samples = append(samples, row)
	}
	// Need a full baseline window plus at least two samples to evaluate.
	if len(samples) < windowSize+2 {
		return ""
	}

	// Seed the running sum with the first window.
	var windowSum float64
	for _, s := range samples[:windowSize] {
		windowSum += s.PowerW
	}

	for offset, cur := range samples[windowSize:] {
		mean := windowSum / float64(windowSize)
		if mean > 0 && cur.UsagePct >= minUsagePct {
			if drop := (mean - cur.PowerW) / mean * 100; drop >= dropThresholdPct {
				return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault",
					cur.ElapsedSec, mean, cur.PowerW, drop)
			}
		}
		// samples[offset] is the oldest entry in the current window; retire
		// it and admit the sample just examined.
		windowSum -= samples[offset].PowerW
		windowSum += cur.PowerW
	}
	return ""
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
var reasons []string
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)

View File

@@ -88,9 +88,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("## Balanced Scorecard\n\n")
// Perspective 1: Compatibility — hard stops
b.WriteString("### 1. Совместимость\n\n")
b.WriteString("| GPU | Тепл. throttle | Вентиляторы при throttle | ECC uncorr | Статус |\n")
b.WriteString("|-----|---------------|--------------------------|------------|--------|\n")
b.WriteString("### 1. Compatibility\n\n")
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
for _, gpu := range result.GPUs {
thermalThrottle := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
@@ -114,9 +114,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
// Perspective 2: Thermal headroom
b.WriteString("### 2. Тепловой запас\n\n")
b.WriteString("| GPU | p95 темп | Запас до 100°C | Тепл. throttle | Статус |\n")
b.WriteString("|-----|----------|----------------|----------------|--------|\n")
b.WriteString("### 2. Thermal Headroom\n\n")
b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n")
b.WriteString("|-----|----------|-------------------|------------------|--------|\n")
for _, gpu := range result.GPUs {
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
@@ -136,9 +136,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
// Perspective 3: Power delivery
b.WriteString("### 3. Энергетика\n\n")
b.WriteString("| GPU | Power cap throttle | Power CV | Вентиляторы (p95) | Статус |\n")
b.WriteString("|-----|-------------------|----------|-------------------|--------|\n")
b.WriteString("### 3. Power Delivery\n\n")
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
for _, gpu := range result.GPUs {
powerCap := "-"
if gpu.Scores.PowerCapThrottlePct > 0 {
@@ -158,7 +158,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
// Perspective 4: Performance
b.WriteString("### 4. Производительность\n\n")
b.WriteString("### 4. Performance\n\n")
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
for _, gpu := range result.GPUs {
@@ -182,14 +182,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
}
if len(result.PerformanceRampSteps) > 0 {
fmt.Fprintf(&b, "\n**Platform power score (масштабируемость):** %.1f%%\n", result.PlatformPowerScore)
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
}
b.WriteString("\n")
// Perspective 5: Anomaly flags
b.WriteString("### 5. Аномалии\n\n")
b.WriteString("| GPU | ECC corr | Sync boost throttle | Нестаб. питание | Нестаб. охлаждение |\n")
b.WriteString("|-----|----------|---------------------|-----------------|--------------------|\n")
b.WriteString("### 5. Anomalies\n\n")
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
for _, gpu := range result.GPUs {
eccCorr := "-"
if gpu.ECC.Corrected > 0 {
@@ -201,11 +201,11 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
}
powerVar := "OK"
if gpu.Scores.PowerSustainScore < 70 {
powerVar = "⚠ нестабильно"
powerVar = "⚠ unstable"
}
thermalVar := "OK"
if gpu.Scores.ThermalSustainScore < 70 {
thermalVar = "⚠ нестабильно"
thermalVar = "⚠ unstable"
}
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)