Translate report to English; add power anomaly detector
All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage >50% as [HARD STOP] — indicates bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -508,6 +508,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
|
||||
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
|
||||
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
|
||||
gpuResult.Notes = append(gpuResult.Notes,
|
||||
fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
|
||||
}
|
||||
if planErr != nil {
|
||||
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||
} else if len(gpuResult.PrecisionFailures) > 0 {
|
||||
@@ -1480,6 +1484,53 @@ func serverQualityScore(score BenchmarkScorecard) float64 {
|
||||
return clampScore(q * 100)
|
||||
}
|
||||
|
||||
// detectPowerAnomaly scans per-GPU steady-state metric rows for a sudden
|
||||
// power drop — a symptom of bad cable contact, VRM fault, or thermal event
|
||||
// on the power delivery path. Returns a non-empty string if an anomaly is found.
|
||||
//
|
||||
// Algorithm: uses a 5-sample rolling baseline; flags any sample that falls
|
||||
// more than 30% below the baseline while the GPU was otherwise loaded
|
||||
// (usage > 50%). A sustained throttle (power cap) is not flagged here —
|
||||
// that is already captured by PowerCapThrottlePct.
|
||||
func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
|
||||
const windowSize = 5
|
||||
const dropThresholdPct = 30.0
|
||||
const minUsagePct = 50.0
|
||||
|
||||
// Filter rows for this GPU during steady state only.
|
||||
var steady []GPUMetricRow
|
||||
for _, r := range rows {
|
||||
if r.GPUIndex == gpuIndex && r.Stage != "" && strings.Contains(r.Stage, "steady") {
|
||||
steady = append(steady, r)
|
||||
}
|
||||
}
|
||||
if len(steady) < windowSize+2 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute initial baseline from the first window.
|
||||
var baseSum float64
|
||||
for i := 0; i < windowSize; i++ {
|
||||
baseSum += steady[i].PowerW
|
||||
}
|
||||
|
||||
for i := windowSize; i < len(steady); i++ {
|
||||
baseline := baseSum / float64(windowSize)
|
||||
sample := steady[i]
|
||||
if baseline > 0 && sample.UsagePct >= minUsagePct {
|
||||
dropPct := (baseline - sample.PowerW) / baseline * 100
|
||||
if dropPct >= dropThresholdPct {
|
||||
return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault",
|
||||
sample.ElapsedSec, baseline, sample.PowerW, dropPct)
|
||||
}
|
||||
}
|
||||
// Slide the window baseline.
|
||||
baseSum -= steady[i-windowSize].PowerW
|
||||
baseSum += sample.PowerW
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||||
var reasons []string
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
|
||||
@@ -88,9 +88,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Совместимость\n\n")
|
||||
b.WriteString("| GPU | Тепл. throttle | Вентиляторы при throttle | ECC uncorr | Статус |\n")
|
||||
b.WriteString("|-----|---------------|--------------------------|------------|--------|\n")
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
||||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
@@ -114,9 +114,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Тепловой запас\n\n")
|
||||
b.WriteString("| GPU | p95 темп | Запас до 100°C | Тепл. throttle | Статус |\n")
|
||||
b.WriteString("|-----|----------|----------------|----------------|--------|\n")
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
b.WriteString("| GPU | p95 temp | Headroom to 100°C | Thermal throttle | Status |\n")
|
||||
b.WriteString("|-----|----------|-------------------|------------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
@@ -136,9 +136,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Энергетика\n\n")
|
||||
b.WriteString("| GPU | Power cap throttle | Power CV | Вентиляторы (p95) | Статус |\n")
|
||||
b.WriteString("|-----|-------------------|----------|-------------------|--------|\n")
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
||||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
@@ -158,7 +158,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Производительность\n\n")
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
||||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
@@ -182,14 +182,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
||||
}
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (масштабируемость):** %.1f%%\n", result.PlatformPowerScore)
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Аномалии\n\n")
|
||||
b.WriteString("| GPU | ECC corr | Sync boost throttle | Нестаб. питание | Нестаб. охлаждение |\n")
|
||||
b.WriteString("|-----|----------|---------------------|-----------------|--------------------|\n")
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
||||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
@@ -201,11 +201,11 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ нестабильно"
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ нестабильно"
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
||||
|
||||
Reference in New Issue
Block a user