From 72ec086568262c222fbf7baebef3542376afa855 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 06:40:06 +0300 Subject: [PATCH] Restructure benchmark report as balanced scorecard (5 perspectives) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct, SyncBoostThrottlePct. Add TempHeadroomC (100 - p95_temp) as independent thermal headroom metric; warning < 20°C (>80°C), critical < 10°C (>90°C). Hard stop findings: thermal throttle with fans < 95%, ECC uncorrected errors, p95 temp > 90°C. Throttle findings now include per-type percentages and diagnostic context. Replace flat scorecard table with BSC 5-perspective layout: 1. Compatibility (hard stops: thermal+fan, ECC) 2. Thermal headroom (p95 temp, delta to 100°C, throttle %) 3. Power delivery (power cap throttle, power CV, fan duty) 4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz) 5. Anomalies (ECC corrected, sync boost, power/thermal variance) Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 80 +++++++--- audit/internal/platform/benchmark_report.go | 165 +++++++++++++------- audit/internal/platform/benchmark_types.go | 18 ++- 3 files changed, 186 insertions(+), 77 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 82e66ed..8c09dae 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -1425,13 +1425,33 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score.ThermalSustainScore = 100 } - // StabilityScore: what fraction of the benchmark did the GPU spend throttling? - // Counts both thermal (HW+SW) and power-cap throttle events. - // Score = max(0, 100 − throttle_ratio × 100). - // 1% throttle → score 99; 10% throttle → score 90; 100% → score 0. + // Throttle breakdown: compute per-type percentages for diagnosis. 
+ // Each counter measures microseconds spent in that throttle state during + // the steady-state window. Counters can overlap (e.g. thermal + power cap + // simultaneously), so they are reported independently, not summed. runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) - throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS) - score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100) + score.ThermalThrottlePct = math.Min(100, + float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS*100) + score.PowerCapThrottlePct = math.Min(100, + float64(gpu.Throttle.SWPowerCapUS)/runtimeUS*100) + score.SyncBoostThrottlePct = math.Min(100, + float64(gpu.Throttle.SyncBoostUS)/runtimeUS*100) + + // StabilityScore: combined throttle signal (thermal + power cap). + // Score = max(0, 100 − combined_throttle_pct). + // 1% throttle → 99; 10% → 90; any throttle > 0 is penalised. + combinedThrottlePct := math.Min(100, + float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100) + score.StabilityScore = clampScore(100 - combinedThrottlePct) + + // TempHeadroomC: distance from p95 temperature to the 100°C destruction + // threshold. Assessed independently of throttle — a GPU at 86°C without + // any throttle counter still has only 14°C headroom, which is a concern. + // Warning zone: < 20°C headroom (p95 > 80°C). + // Critical zone: < 10°C headroom (p95 > 90°C). 
+ if gpu.Steady.P95TempC > 0 { + score.TempHeadroomC = 100 - gpu.Steady.P95TempC + } score.ServerQualityScore = serverQualityScore(score) score.CompositeScore = score.ComputeScore if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { @@ -1687,19 +1707,26 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { for _, reason := range gpu.DegradationReasons { switch reason { case "power_capped": - findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) + findings = append(findings, fmt.Sprintf( + "[POWER] GPU %d: power cap throttle %.1f%% of steady state — server is not delivering full TDP to the GPU.", + gpu.Index, gpu.Scores.PowerCapThrottlePct)) case "thermal_limited": - msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index) + // Hard stop check: thermal throttle while fans are not at maximum. + // This means the server does not see GPU thermals — incompatible config. if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && - result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 { - msg += fmt.Sprintf( - " Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.", - result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct, - ) + result.Cooling.P95FanDutyCyclePct < 95 { + findings = append(findings, fmt.Sprintf( + "[HARD STOP] GPU %d: thermal throttle (%.1f%% of time) while fans peaked at only %.0f%% duty cycle — server cooling is not responding to GPU heat load. 
Configuration is likely incompatible.", + gpu.Index, gpu.Scores.ThermalThrottlePct, result.Cooling.P95FanDutyCyclePct)) + } else { + findings = append(findings, fmt.Sprintf( + "[THERMAL] GPU %d: thermal throttle %.1f%% of steady state.", + gpu.Index, gpu.Scores.ThermalThrottlePct)) } - findings = append(findings, msg) case "sync_boost_limited": - findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) + findings = append(findings, fmt.Sprintf( + "[SYNC] GPU %d: sync boost throttle %.1f%% of steady state — GPUs are constraining each other's clocks.", + gpu.Index, gpu.Scores.SyncBoostThrottlePct)) case "low_sm_clock_vs_target": findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index)) case "variance_too_high": @@ -1707,11 +1734,28 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { case "normalization_partial": findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) case "power_limit_derated": - findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW)) + findings = append(findings, fmt.Sprintf("[POWER] GPU %d could not sustain full TDP in this server; benchmark ran at reduced limit %.0f W.", gpu.Index, gpu.PowerLimitW)) case "ecc_uncorrected_errors": - findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) + findings = append(findings, fmt.Sprintf( + "[HARD STOP] GPU %d: %d uncorrected ECC error(s) detected — possible hardware fault. 
Do not use in production.", + gpu.Index, gpu.ECC.Uncorrected)) case "ecc_corrected_errors": - findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) + findings = append(findings, fmt.Sprintf( + "[WARNING] GPU %d: %d corrected ECC error(s) — possible DRAM degradation, monitor closely.", + gpu.Index, gpu.ECC.Corrected)) + } + } + // Temperature headroom checks — independent of throttle counters. + if gpu.Scores.TempHeadroomC > 0 { + switch { + case gpu.Scores.TempHeadroomC < 10: + findings = append(findings, fmt.Sprintf( + "[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from destruction threshold (100°C). Do not operate.", + gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC)) + case gpu.Scores.TempHeadroomC < 20: + findings = append(findings, fmt.Sprintf( + "[THERMAL] GPU %d: p95 temperature %.1f°C — %.1f°C headroom to limit. Operating in degraded reliability zone (>80°C).", + gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC)) } } if gpu.CoolingWarning != "" { diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index e7e6c42..dc89f9a 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -84,54 +84,84 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("\n") } - // ── Methodology ─────────────────────────────────────────────────────────── - b.WriteString("## Methodology\n\n") - fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect phases.\n", result.BenchmarkProfile) - b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n") - b.WriteString("- GPUs run at their default power limits — no pre-benchmark power calibration is performed.\n") - b.WriteString("- Throttling and thermal state are inferred from NVIDIA 
clock-event counters and sustained telemetry.\n") - b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") - b.WriteString("**Compute score** is derived from two phases:\n\n") - b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") - b.WriteString("Measures peak throughput with the full GPU memory budget dedicated to one kernel type. ") - b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ") - b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n") - b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ") - b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n") - b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n") - b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ") - b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n") - b.WriteString("**Platform power score** — after all single-GPU runs, a ramp-up phase adds GPUs one by one (k=2..N) ") - b.WriteString("and measures total Synthetic TOPS. Scalability for step k = `k_total_TOPS / (k × best_single_TOPS) × 100`. ") - b.WriteString("`PlatformPowerScore` = mean scalability across all ramp steps. 100 % means linear scaling.\n\n") - b.WriteString("**PowerSustainScore** — power draw variance (CV) during steady-state load. ") - b.WriteString("High variance means unstable power delivery or bursting kernels. Score = max(0, 100 − PowerCV × 3).\n\n") - b.WriteString("**ThermalSustainScore** — temperature variance (CV) during steady-state load. ") - b.WriteString("High variance means inconsistent cooling (fan bursts, flow instability). 
Score = max(0, 100 − TempCV × 3).\n\n") - b.WriteString("**StabilityScore** — fraction of benchmark time the GPU spent throttling (thermal + power-cap). ") - b.WriteString("1% throttle → score 99; 10% throttle → score 90. This is the heaviest quality signal.\n\n") - b.WriteString("**CompositeScore** = raw compute TOPS (fp32-equivalent). A throttling GPU scores lower automatically.\n\n") - b.WriteString("**ServerQualityScore** (0–100) — server infrastructure quality, independent of GPU model: \n") - b.WriteString("`0.40×Stability + 0.30×PowerSustain + 0.30×ThermalSustain` \n") - b.WriteString("Use this to compare servers with the same GPU type, or to flag a bad server.\n\n") + // ── Balanced Scorecard ──────────────────────────────────────────────────── + b.WriteString("## Balanced Scorecard\n\n") - // ── Scorecard table ─────────────────────────────────────────────────────── - b.WriteString("## Scorecard\n\n") - b.WriteString("| GPU | Status | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Server Quality | Power Sustain | Thermal Sustain | Stability | Interconnect |\n") - b.WriteString("|-----|--------|--------------|-----------|-------|------------|-------------|----------------|---------------|-----------------|-----------|-------------|\n") + // Perspective 1: Compatibility — hard stops + b.WriteString("### 1. Совместимость\n\n") + b.WriteString("| GPU | Тепл. 
throttle | Вентиляторы при throttle | ECC uncorr | Статус |\n") + b.WriteString("|-----|---------------|--------------------------|------------|--------|\n") for _, gpu := range result.GPUs { - name := strings.TrimSpace(gpu.Name) - if name == "" { - name = "Unknown GPU" + thermalThrottle := "-" + if gpu.Scores.ThermalThrottlePct > 0 { + thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) } - interconnect := "-" - if gpu.Scores.InterconnectScore > 0 { - interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore) + fanAtThrottle := "-" + if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 { + fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) } - topsPerSM := "-" - if gpu.Scores.TOPSPerSMPerGHz > 0 { - topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz) + ecc := "-" + if gpu.ECC.Uncorrected > 0 { + ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected) } + compatStatus := "✓ OK" + if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) { + compatStatus = "⛔ HARD STOP" + } + fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n", + gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus) + } + b.WriteString("\n") + + // Perspective 2: Thermal headroom + b.WriteString("### 2. Тепловой запас\n\n") + b.WriteString("| GPU | p95 темп | Запас до 100°C | Тепл. 
throttle | Статус |\n") + b.WriteString("|-----|----------|----------------|----------------|--------|\n") + for _, gpu := range result.GPUs { + headroom := gpu.Scores.TempHeadroomC + thermalStatus := "✓ OK" + switch { + case headroom < 10: + thermalStatus = "⛔ CRITICAL" + case headroom < 20: + thermalStatus = "⚠ WARNING" + } + throttlePct := "-" + if gpu.Scores.ThermalThrottlePct > 0 { + throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) + } + fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n", + gpu.Index, gpu.Steady.P95TempC, headroom, throttlePct, thermalStatus) + } + b.WriteString("\n") + + // Perspective 3: Power delivery + b.WriteString("### 3. Энергетика\n\n") + b.WriteString("| GPU | Power cap throttle | Power CV | Вентиляторы (p95) | Статус |\n") + b.WriteString("|-----|-------------------|----------|-------------------|--------|\n") + for _, gpu := range result.GPUs { + powerCap := "-" + if gpu.Scores.PowerCapThrottlePct > 0 { + powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct) + } + fanDuty := "-" + if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable { + fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) + } + powerStatus := "✓ OK" + if gpu.Scores.PowerCapThrottlePct > 5 { + powerStatus = "⚠ POWER LIMITED" + } + fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n", + gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus) + } + b.WriteString("\n") + + // Perspective 4: Performance + b.WriteString("### 4. Производительность\n\n") + b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. 
| TOPS/SM/GHz |\n") + b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n") + for _, gpu := range result.GPUs { synthetic := "-" if gpu.Scores.SyntheticScore > 0 { synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore) @@ -144,20 +174,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if gpu.Scores.MixedEfficiency > 0 { mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100) } - fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %s | %s | %s | %s | %.1f | %.1f | %.1f | %.1f | %s |\n", - gpu.Index, name, - gpu.Status, - gpu.Scores.CompositeScore, - synthetic, - mixed, - mixedEff, - topsPerSM, - gpu.Scores.ServerQualityScore, - gpu.Scores.PowerSustainScore, - gpu.Scores.ThermalSustainScore, - gpu.Scores.StabilityScore, - interconnect, - ) + topsPerSM := "-" + if gpu.Scores.TOPSPerSMPerGHz > 0 { + topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz) + } + fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n", + gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM) + } + if len(result.PerformanceRampSteps) > 0 { + fmt.Fprintf(&b, "\n**Platform power score (масштабируемость):** %.1f%%\n", result.PlatformPowerScore) + } + b.WriteString("\n") + + // Perspective 5: Anomaly flags + b.WriteString("### 5. Аномалии\n\n") + b.WriteString("| GPU | ECC corr | Sync boost throttle | Нестаб. питание | Нестаб. 
охлаждение |\n") + b.WriteString("|-----|----------|---------------------|-----------------|--------------------|\n") + for _, gpu := range result.GPUs { + eccCorr := "-" + if gpu.ECC.Corrected > 0 { + eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected) + } + syncBoost := "-" + if gpu.Scores.SyncBoostThrottlePct > 0 { + syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct) + } + powerVar := "OK" + if gpu.Scores.PowerSustainScore < 70 { + powerVar = "⚠ нестабильно" + } + thermalVar := "OK" + if gpu.Scores.ThermalSustainScore < 70 { + thermalVar = "⚠ нестабильно" + } + fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n", + gpu.Index, eccCorr, syncBoost, powerVar, thermalVar) } b.WriteString("\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index d9d6721..fe6b3ff 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -211,8 +211,22 @@ type BenchmarkScorecard struct { MixedEfficiency float64 `json:"mixed_efficiency,omitempty"` PowerSustainScore float64 `json:"power_sustain_score"` ThermalSustainScore float64 `json:"thermal_sustain_score"` - StabilityScore float64 `json:"stability_score"` - InterconnectScore float64 `json:"interconnect_score"` + // StabilityScore: fraction of steady-state time the GPU spent throttling + // (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0. + StabilityScore float64 `json:"stability_score"` + + // Throttle breakdown — percentage of steady-state time in each throttle type. + // Used for diagnosis: tells WHY the GPU throttled, not just whether it did. + ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown + PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap + SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"` + + // Temperature headroom: distance to the 100°C destruction threshold. + // TempHeadroomC = 100 - P95TempC. 
< 20°C = warning; < 10°C = critical. + // Independent of throttle — a GPU at 86°C without throttle still has only 14°C headroom (warning zone). + TempHeadroomC float64 `json:"temp_headroom_c"` + + InterconnectScore float64 `json:"interconnect_score"` // ServerQualityScore (0–100) reflects server infrastructure quality independent // of GPU model. Combines throttle time, power variance, and temp variance. // Use this to compare servers with the same GPU, or to flag a bad server