Separate compute score from server quality score

CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower
automatically — no quality multiplier distorting the compute signal.

Add ServerQualityScore (0-100): server infrastructure quality independent
of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×Thermal.
Use to compare servers with the same GPU or flag bad server conditions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 00:45:55 +03:00
parent d8ca0dca2c
commit 7a0b0934df
3 changed files with 36 additions and 24 deletions

View File

@@ -591,7 +591,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
if result.Interconnect != nil && result.Interconnect.Supported {
for i := range result.GPUs {
result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
}
}
}
@@ -1433,28 +1432,32 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS)
score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100)
score.CompositeScore = compositeBenchmarkScore(score)
score.ServerQualityScore = serverQualityScore(score)
score.CompositeScore = score.ComputeScore
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
}
return score
}
// compositeBenchmarkScore is kept for compatibility with legacy callers.
//
// It returns score.ComputeScore unchanged. The former quality multiplier
// (0.35 base + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain,
// capped at 1.00) is intentionally no longer applied: throttling already
// reduces the measured TOPS directly, so no additional penalty is needed.
// StabilityScore, PowerSustainScore and ThermalSustainScore therefore have
// no influence on the value returned here.
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
	return score.ComputeScore
}
// serverQualityScore combines the three sustain sub-scores of a scorecard
// into a single 0-100 figure describing server infrastructure quality. It
// does not read ComputeScore, so the result is independent of GPU model or
// compute speed.
//
// Weights applied to each sub-score (each is divided by 100 first):
//
//	StabilityScore      (throttle time) 0.40 — heaviest: direct evidence the GPU can't sustain load
//	PowerSustainScore   (power CV)      0.30 — unstable draw hints at PSU/VRM issues
//	ThermalSustainScore (temp CV)       0.30 — unstable temp hints at airflow/cooling issues
//
// The weighted sum is scaled back to percent and passed through clampScore.
func serverQualityScore(score BenchmarkScorecard) float64 {
	const (
		stabilityWeight = 0.40
		powerWeight     = 0.30
		thermalWeight   = 0.30
	)
	// Same term order as the weights table above; the sum is a 0..1 fraction
	// when every sub-score is within 0..100.
	fraction := stabilityWeight*(score.StabilityScore/100.0) +
		powerWeight*(score.PowerSustainScore/100.0) +
		thermalWeight*(score.ThermalSustainScore/100.0)
	return clampScore(fraction * 100)
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
@@ -1646,7 +1649,7 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
gpu.Status = "OK"
}
if gpu.Scores.CompositeScore == 0 {
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
gpu.Scores.CompositeScore = gpu.Scores.ComputeScore
}
return gpu
}