Restructure benchmark report as balanced scorecard (5 perspectives)
Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct, SyncBoostThrottlePct.

Add TempHeadroomC (100 - p95_temp) as an independent thermal headroom metric: warning when headroom < 20°C (p95 > 80°C), critical when headroom < 10°C (p95 > 90°C).

Hard stop findings: thermal throttle with fans < 95%, ECC uncorrected errors, p95 temp > 90°C.

Throttle findings now include per-type percentages and diagnostic context.

Replace flat scorecard table with BSC 5-perspective layout:
1. Compatibility (hard stops: thermal+fan, ECC)
2. Thermal headroom (p95 temp, delta to 100°C, throttle %)
3. Power delivery (power cap throttle, power CV, fan duty)
4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz)
5. Anomalies (ECC corrected, sync boost, power/thermal variance)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1425,13 +1425,33 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
score.ThermalSustainScore = 100
|
||||
}
|
||||
|
||||
// StabilityScore: what fraction of the benchmark did the GPU spend throttling?
|
||||
// Counts both thermal (HW+SW) and power-cap throttle events.
|
||||
// Score = max(0, 100 − throttle_ratio × 100).
|
||||
// 1% throttle → score 99; 10% throttle → score 90; 100% → score 0.
|
||||
// Throttle breakdown: compute per-type percentages for diagnosis.
|
||||
// Each counter measures microseconds spent in that throttle state during
|
||||
// the steady-state window. Counters can overlap (e.g. thermal + power cap
|
||||
// simultaneously), so they are reported independently, not summed.
|
||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||
throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS)
|
||||
score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100)
|
||||
score.ThermalThrottlePct = math.Min(100,
|
||||
float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS*100)
|
||||
score.PowerCapThrottlePct = math.Min(100,
|
||||
float64(gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
|
||||
score.SyncBoostThrottlePct = math.Min(100,
|
||||
float64(gpu.Throttle.SyncBoostUS)/runtimeUS*100)
|
||||
|
||||
// StabilityScore: combined throttle signal (thermal + power cap).
|
||||
// Score = max(0, 100 − combined_throttle_pct).
|
||||
// 1% throttle → 99; 10% → 90; any throttle > 0 is penalised.
|
||||
combinedThrottlePct := math.Min(100,
|
||||
float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
|
||||
score.StabilityScore = clampScore(100 - combinedThrottlePct)
|
||||
|
||||
// TempHeadroomC: distance from p95 temperature to the 100°C destruction
|
||||
// threshold. Assessed independently of throttle — a GPU at 86°C without
|
||||
// any throttle counter still has only 14°C headroom, which is a concern.
|
||||
// Warning zone: < 20°C headroom (p95 > 80°C).
|
||||
// Critical zone: < 10°C headroom (p95 > 90°C).
|
||||
if gpu.Steady.P95TempC > 0 {
|
||||
score.TempHeadroomC = 100 - gpu.Steady.P95TempC
|
||||
}
|
||||
score.ServerQualityScore = serverQualityScore(score)
|
||||
score.CompositeScore = score.ComputeScore
|
||||
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||
@@ -1687,19 +1707,26 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
for _, reason := range gpu.DegradationReasons {
|
||||
switch reason {
|
||||
case "power_capped":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[POWER] GPU %d: power cap throttle %.1f%% of steady state — server is not delivering full TDP to the GPU.",
|
||||
gpu.Index, gpu.Scores.PowerCapThrottlePct))
|
||||
case "thermal_limited":
|
||||
msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
|
||||
// Hard stop check: thermal throttle while fans are not at maximum.
|
||||
// This means the server does not see GPU thermals — incompatible config.
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
|
||||
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
|
||||
msg += fmt.Sprintf(
|
||||
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
|
||||
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
|
||||
)
|
||||
result.Cooling.P95FanDutyCyclePct < 95 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[HARD STOP] GPU %d: thermal throttle (%.1f%% of time) while fans peaked at only %.0f%% duty cycle — server cooling is not responding to GPU heat load. Configuration is likely incompatible.",
|
||||
gpu.Index, gpu.Scores.ThermalThrottlePct, result.Cooling.P95FanDutyCyclePct))
|
||||
} else {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[THERMAL] GPU %d: thermal throttle %.1f%% of steady state.",
|
||||
gpu.Index, gpu.Scores.ThermalThrottlePct))
|
||||
}
|
||||
findings = append(findings, msg)
|
||||
case "sync_boost_limited":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[SYNC] GPU %d: sync boost throttle %.1f%% of steady state — GPUs are constraining each other's clocks.",
|
||||
gpu.Index, gpu.Scores.SyncBoostThrottlePct))
|
||||
case "low_sm_clock_vs_target":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
|
||||
case "variance_too_high":
|
||||
@@ -1707,11 +1734,28 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
case "normalization_partial":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
|
||||
case "power_limit_derated":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
|
||||
findings = append(findings, fmt.Sprintf("[POWER] GPU %d could not sustain full TDP in this server; benchmark ran at reduced limit %.0f W.", gpu.Index, gpu.PowerLimitW))
|
||||
case "ecc_uncorrected_errors":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[HARD STOP] GPU %d: %d uncorrected ECC error(s) detected — possible hardware fault. Do not use in production.",
|
||||
gpu.Index, gpu.ECC.Uncorrected))
|
||||
case "ecc_corrected_errors":
|
||||
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[WARNING] GPU %d: %d corrected ECC error(s) — possible DRAM degradation, monitor closely.",
|
||||
gpu.Index, gpu.ECC.Corrected))
|
||||
}
|
||||
}
|
||||
// Temperature headroom checks — independent of throttle counters.
|
||||
if gpu.Scores.TempHeadroomC > 0 {
|
||||
switch {
|
||||
case gpu.Scores.TempHeadroomC < 10:
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from destruction threshold (100°C). Do not operate.",
|
||||
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
|
||||
case gpu.Scores.TempHeadroomC < 20:
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"[THERMAL] GPU %d: p95 temperature %.1f°C — %.1f°C headroom to limit. Operating in degraded reliability zone (>80°C).",
|
||||
gpu.Index, gpu.Steady.P95TempC, gpu.Scores.TempHeadroomC))
|
||||
}
|
||||
}
|
||||
if gpu.CoolingWarning != "" {
|
||||
|
||||
Reference in New Issue
Block a user