Separate compute score from server quality score
CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower automatically — no quality multiplier distorting the compute signal. Add ServerQualityScore (0-100): server infrastructure quality independent of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×Thermal. Use it to compare servers with the same GPU or to flag bad server conditions. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -591,7 +591,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||
for i := range result.GPUs {
|
||||
result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
|
||||
result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1433,28 +1432,32 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS)
|
||||
score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100)
|
||||
score.CompositeScore = compositeBenchmarkScore(score)
|
||||
score.ServerQualityScore = serverQualityScore(score)
|
||||
score.CompositeScore = score.ComputeScore
|
||||
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
// compositeBenchmarkScore is kept for compatibility with legacy callers.
|
||||
// CompositeScore = ComputeScore (no quality multiplier; throttling already
|
||||
// reduces TOPS directly, so no additional penalty is needed).
|
||||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||||
// quality_factor weights:
|
||||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||
// StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up
|
||||
// PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues
|
||||
// ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues
|
||||
// cap 1.00
|
||||
quality := 0.35 +
|
||||
0.35*(score.StabilityScore/100.0) +
|
||||
0.15*(score.PowerSustainScore/100.0) +
|
||||
0.15*(score.ThermalSustainScore/100.0)
|
||||
if quality > 1.00 {
|
||||
quality = 1.00
|
||||
}
|
||||
return score.ComputeScore * quality
|
||||
return score.ComputeScore
|
||||
}
|
||||
|
||||
// serverQualityScore returns a 0–100 score reflecting server infrastructure
|
||||
// quality, independent of GPU model or compute speed.
|
||||
//
|
||||
// StabilityScore (throttle time) 0.40 — heaviest: direct evidence GPU can't sustain load
|
||||
// PowerSustainScore (power CV) 0.30 — unstable draw hints at PSU/VRM issues
|
||||
// ThermalSustainScore (temp CV) 0.30 — unstable temp hints at airflow/cooling issues
|
||||
func serverQualityScore(score BenchmarkScorecard) float64 {
|
||||
q := 0.40*(score.StabilityScore/100.0) +
|
||||
0.30*(score.PowerSustainScore/100.0) +
|
||||
0.30*(score.ThermalSustainScore/100.0)
|
||||
return clampScore(q * 100)
|
||||
}
|
||||
|
||||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||||
@@ -1646,7 +1649,7 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
||||
gpu.Status = "OK"
|
||||
}
|
||||
if gpu.Scores.CompositeScore == 0 {
|
||||
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
|
||||
gpu.Scores.CompositeScore = gpu.Scores.ComputeScore
|
||||
}
|
||||
return gpu
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user