From 7a0b0934df52f69c3caef98d496594d10c6aecc2 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 00:45:55 +0300 Subject: [PATCH] Separate compute score from server quality score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower automatically — no quality multiplier distorting the compute signal. Add ServerQualityScore (0-100): server infrastructure quality independent of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×ThermalSustain. Use it to compare servers with the same GPU or to flag bad server conditions. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 37 +++++++++++---------- audit/internal/platform/benchmark_report.go | 14 ++++---- audit/internal/platform/benchmark_types.go | 9 ++++- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 12aeba6..82e66ed 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -591,7 +591,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv if result.Interconnect != nil && result.Interconnect.Supported { for i := range result.GPUs { result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps - result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores) } } } @@ -1433,28 +1432,32 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS) score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100) - score.CompositeScore = compositeBenchmarkScore(score) + score.ServerQualityScore = serverQualityScore(score) + score.CompositeScore = score.ComputeScore if 
gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0) } return score } +// compositeBenchmarkScore is kept for compatibility with legacy callers. +// CompositeScore = ComputeScore (no quality multiplier; throttling already +// reduces TOPS directly, so no additional penalty is needed). func compositeBenchmarkScore(score BenchmarkScorecard) float64 { - // quality_factor weights: - // base 0.35 — floor so a GPU that fails all sustain checks still scores - // StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up - // PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues - // ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues - // cap 1.00 - quality := 0.35 + - 0.35*(score.StabilityScore/100.0) + - 0.15*(score.PowerSustainScore/100.0) + - 0.15*(score.ThermalSustainScore/100.0) - if quality > 1.00 { - quality = 1.00 - } - return score.ComputeScore * quality + return score.ComputeScore +} + +// serverQualityScore returns a 0–100 score reflecting server infrastructure +// quality, independent of GPU model or compute speed. 
+// +// StabilityScore (throttle time) 0.40 — heaviest: direct evidence GPU can't sustain load +// PowerSustainScore (power CV) 0.30 — unstable draw hints at PSU/VRM issues +// ThermalSustainScore (temp CV) 0.30 — unstable temp hints at airflow/cooling issues +func serverQualityScore(score BenchmarkScorecard) float64 { + q := 0.40*(score.StabilityScore/100.0) + + 0.30*(score.PowerSustainScore/100.0) + + 0.30*(score.ThermalSustainScore/100.0) + return clampScore(q * 100) } func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { @@ -1646,7 +1649,7 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { gpu.Status = "OK" } if gpu.Scores.CompositeScore == 0 { - gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores) + gpu.Scores.CompositeScore = gpu.Scores.ComputeScore } return gpu } diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 5bde8de..e7e6c42 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -110,13 +110,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("High variance means inconsistent cooling (fan bursts, flow instability). Score = max(0, 100 − TempCV × 3).\n\n") b.WriteString("**StabilityScore** — fraction of benchmark time the GPU spent throttling (thermal + power-cap). ") b.WriteString("1% throttle → score 99; 10% throttle → score 90. This is the heaviest quality signal.\n\n") - b.WriteString("**Composite score** = `Compute × quality_factor` \n") - b.WriteString("`quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain`, capped at 1.00.\n\n") + b.WriteString("**CompositeScore** = raw compute TOPS (fp32-equivalent). 
A throttling GPU scores lower automatically.\n\n") + b.WriteString("**ServerQualityScore** (0–100) — server infrastructure quality, independent of GPU model: \n") + b.WriteString("`0.40×Stability + 0.30×PowerSustain + 0.30×ThermalSustain` \n") + b.WriteString("Use this to compare servers with the same GPU type, or to flag a bad server.\n\n") // ── Scorecard table ─────────────────────────────────────────────────────── b.WriteString("## Scorecard\n\n") - b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n") - b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n") + b.WriteString("| GPU | Status | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Server Quality | Power Sustain | Thermal Sustain | Stability | Interconnect |\n") + b.WriteString("|-----|--------|--------------|-----------|-------|------------|-------------|----------------|---------------|-----------------|-----------|-------------|\n") for _, gpu := range result.GPUs { name := strings.TrimSpace(gpu.Name) if name == "" { @@ -142,15 +144,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if gpu.Scores.MixedEfficiency > 0 { mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100) } - fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n", + fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %s | %s | %s | %s | %.1f | %.1f | %.1f | %.1f | %s |\n", gpu.Index, name, gpu.Status, gpu.Scores.CompositeScore, - gpu.Scores.ComputeScore, synthetic, mixed, mixedEff, topsPerSM, + gpu.Scores.ServerQualityScore, gpu.Scores.PowerSustainScore, gpu.Scores.ThermalSustainScore, gpu.Scores.StabilityScore, diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 
3383619..d9d6721 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -213,7 +213,14 @@ type BenchmarkScorecard struct { ThermalSustainScore float64 `json:"thermal_sustain_score"` StabilityScore float64 `json:"stability_score"` InterconnectScore float64 `json:"interconnect_score"` - CompositeScore float64 `json:"composite_score"` + // ServerQualityScore (0–100) reflects server infrastructure quality independent + // of GPU model. Combines throttle time, power variance, and temp variance. + // Use this to compare servers with the same GPU, or to flag a bad server + // that throttles an otherwise fast GPU. + ServerQualityScore float64 `json:"server_quality_score"` + // CompositeScore is the raw compute score (TOPS, fp32-equivalent). + // A throttling GPU will score lower here automatically — no quality multiplier. + CompositeScore float64 `json:"composite_score"` // TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count. TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"` }