From a8d5e019a54377fbde9ded28f73409e7e8089419 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 06:42:00 +0300 Subject: [PATCH] Translate report to English; add power anomaly detector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage >50% as [HARD STOP] — indicates bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 51 +++++++++++++++++++++ audit/internal/platform/benchmark_report.go | 32 ++++++------- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 8c09dae..aa87177 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -508,6 +508,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) + if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" { + gpuResult.Notes = append(gpuResult.Notes, + fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly)) + } if planErr != nil { gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) } else if len(gpuResult.PrecisionFailures) > 0 { @@ -1480,6 +1484,53 @@ func serverQualityScore(score BenchmarkScorecard) float64 { return clampScore(q * 100) } +// detectPowerAnomaly scans per-GPU steady-state metric rows for a sudden +// power drop — a symptom of bad cable contact, VRM fault, or thermal event +// on the power delivery path. Returns a non-empty string if an anomaly is found. 
+// +// Algorithm: uses a 5-sample rolling baseline; flags any sample that falls +// 30% or more below the baseline while the GPU was otherwise loaded +// (usage >= 50%). A sustained throttle (power cap) is not flagged here — +// that is already captured by PowerCapThrottlePct. +func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string { + const windowSize = 5 + const dropThresholdPct = 30.0 + const minUsagePct = 50.0 + + // Filter rows for this GPU during steady state only. + var steady []GPUMetricRow + for _, r := range rows { + if r.GPUIndex == gpuIndex && r.Stage != "" && strings.Contains(r.Stage, "steady") { + steady = append(steady, r) + } + } + if len(steady) < windowSize+2 { + return "" + } + + // Compute initial baseline from the first window. + var baseSum float64 + for i := 0; i < windowSize; i++ { + baseSum += steady[i].PowerW + } + + for i := windowSize; i < len(steady); i++ { + baseline := baseSum / float64(windowSize) + sample := steady[i] + if baseline > 0 && sample.UsagePct >= minUsagePct { + dropPct := (baseline - sample.PowerW) / baseline * 100 + if dropPct >= dropThresholdPct { + return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault", + sample.ElapsedSec, baseline, sample.PowerW, dropPct) + } + } + // Slide the window baseline. 
+ baseSum -= steady[i-windowSize].PowerW + baseSum += sample.PowerW + } + return "" +} + func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { var reasons []string runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index dc89f9a..5dede3b 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -88,9 +88,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("## Balanced Scorecard\n\n") // Perspective 1: Compatibility — hard stops - b.WriteString("### 1. Совместимость\n\n") - b.WriteString("| GPU | Тепл. throttle | Вентиляторы при throttle | ECC uncorr | Статус |\n") - b.WriteString("|-----|---------------|--------------------------|------------|--------|\n") + b.WriteString("### 1. Compatibility\n\n") + b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n") + b.WriteString("|-----|------------------|----------------------|------------|--------|\n") for _, gpu := range result.GPUs { thermalThrottle := "-" if gpu.Scores.ThermalThrottlePct > 0 { @@ -114,9 +114,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("\n") // Perspective 2: Thermal headroom - b.WriteString("### 2. Тепловой запас\n\n") - b.WriteString("| GPU | p95 темп | Запас до 100°C | Тепл. throttle | Статус |\n") - b.WriteString("|-----|----------|----------------|----------------|--------|\n") + b.WriteString("### 2. 
Thermal Headroom\n\n") + b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n") + b.WriteString("|-----|----------|-------------------|------------------|--------|\n") for _, gpu := range result.GPUs { headroom := gpu.Scores.TempHeadroomC thermalStatus := "✓ OK" @@ -136,9 +136,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("\n") // Perspective 3: Power delivery - b.WriteString("### 3. Энергетика\n\n") - b.WriteString("| GPU | Power cap throttle | Power CV | Вентиляторы (p95) | Статус |\n") - b.WriteString("|-----|-------------------|----------|-------------------|--------|\n") + b.WriteString("### 3. Power Delivery\n\n") + b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n") + b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n") for _, gpu := range result.GPUs { powerCap := "-" if gpu.Scores.PowerCapThrottlePct > 0 { @@ -158,7 +158,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("\n") // Perspective 4: Performance - b.WriteString("### 4. Производительность\n\n") + b.WriteString("### 4. Performance\n\n") b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n") b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n") for _, gpu := range result.GPUs { @@ -182,14 +182,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM) } if len(result.PerformanceRampSteps) > 0 { - fmt.Fprintf(&b, "\n**Platform power score (масштабируемость):** %.1f%%\n", result.PlatformPowerScore) + fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore) } b.WriteString("\n") // Perspective 5: Anomaly flags - b.WriteString("### 5. 
Аномалии\n\n") - b.WriteString("| GPU | ECC corr | Sync boost throttle | Нестаб. питание | Нестаб. охлаждение |\n") - b.WriteString("|-----|----------|---------------------|-----------------|--------------------|\n") + b.WriteString("### 5. Anomalies\n\n") + b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n") + b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n") for _, gpu := range result.GPUs { eccCorr := "-" if gpu.ECC.Corrected > 0 { @@ -201,11 +201,11 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } powerVar := "OK" if gpu.Scores.PowerSustainScore < 70 { - powerVar = "⚠ нестабильно" + powerVar = "⚠ unstable" } thermalVar := "OK" if gpu.Scores.ThermalSustainScore < 70 { - thermalVar = "⚠ нестабильно" + thermalVar = "⚠ unstable" } fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n", gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)