From d8ca0dca2c5efb546d013715064d243049b77675 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 00:39:59 +0300 Subject: [PATCH] Redesign scoring metrics: variance-based sustain scores, throttle stability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PowerSustainScore: power draw variance (CV) during load, not deviation from TDP. ThermalSustainScore: temperature variance (CV) during load. StabilityScore: fraction of time spent in thermal+power-cap throttling. Remove NCCL bonus from quality_factor. quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain, cap 1.00. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark_report.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index b4cadc2..5bde8de 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -104,9 +104,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("**Platform power score** — after all single-GPU runs, a ramp-up phase adds GPUs one by one (k=2..N) ") b.WriteString("and measures total Synthetic TOPS. Scalability for step k = `k_total_TOPS / (k × best_single_TOPS) × 100`. ") b.WriteString("`PlatformPowerScore` = mean scalability across all ramp steps. 100 % means linear scaling.\n\n") - b.WriteString("**PowerSustainScore** — measures how close the GPU ran to its rated TDP during steady-state load. ") - b.WriteString("Uses steady-state average power as the reference (no external calibration needed).\n\n") - b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n") + b.WriteString("**PowerSustainScore** — power draw variance (CV) during steady-state load. ") + b.WriteString("High variance means unstable power delivery or bursting kernels. Score = max(0, 100 − PowerCV × 3).\n\n") + b.WriteString("**ThermalSustainScore** — temperature variance (CV) during steady-state load. ") + b.WriteString("High variance means inconsistent cooling (fan bursts, flow instability). Score = max(0, 100 − TempCV × 3).\n\n") + b.WriteString("**StabilityScore** — fraction of benchmark time the GPU spent throttling (thermal + power-cap). ") + b.WriteString("1% throttle → score 99; 10% throttle → score 90. This is the heaviest quality signal.\n\n") + b.WriteString("**Composite score** = `Compute × quality_factor` \n") + b.WriteString("`quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain`, capped at 1.00.\n\n") // ── Scorecard table ─────────────────────────────────────────────────────── b.WriteString("## Scorecard\n\n")