Restructure benchmark report as balanced scorecard (5 perspectives)

Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct,
SyncBoostThrottlePct. Add TempHeadroomC (100 - p95_temp) as an independent
thermal headroom metric: warning when headroom < 20°C (p95 temp > 80°C),
critical when headroom < 10°C (p95 temp > 90°C).

Hard stop findings: thermal throttle while fan duty is below 95%, uncorrected
ECC errors, p95 temp > 90°C. Throttle findings now include per-type percentages
and diagnostic context.

Replace flat scorecard table with BSC 5-perspective layout:
1. Compatibility (hard stops: thermal+fan, ECC)
2. Thermal headroom (p95 temp, delta to 100°C, throttle %)
3. Power delivery (power cap throttle, power CV, fan duty)
4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz)
5. Anomalies (ECC corrected, sync boost, power/thermal variance)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-16 06:40:06 +03:00
parent 7a0b0934df
commit 72ec086568
3 changed files with 186 additions and 77 deletions

View File

@@ -84,54 +84,84 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
}
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- GPUs run at their default power limits — no pre-benchmark power calibration is performed.\n")
b.WriteString("- Throttling and thermal state are inferred from NVIDIA clock-event counters and sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU memory budget dedicated to one kernel type. ")
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
b.WriteString("**Platform power score** — after all single-GPU runs, a ramp-up phase adds GPUs one by one (k=2..N) ")
b.WriteString("and measures total Synthetic TOPS. Scalability for step k = `k_total_TOPS / (k × best_single_TOPS) × 100`. ")
b.WriteString("`PlatformPowerScore` = mean scalability across all ramp steps. 100 % means linear scaling.\n\n")
b.WriteString("**PowerSustainScore** — power draw variance (CV) during steady-state load. ")
b.WriteString("High variance means unstable power delivery or bursting kernels. Score = max(0, 100 − PowerCV × 3).\n\n")
b.WriteString("**ThermalSustainScore** — temperature variance (CV) during steady-state load. ")
b.WriteString("High variance means inconsistent cooling (fan bursts, flow instability). Score = max(0, 100 − TempCV × 3).\n\n")
b.WriteString("**StabilityScore** — fraction of benchmark time the GPU spent throttling (thermal + power-cap). ")
b.WriteString("1% throttle → score 99; 10% throttle → score 90. This is the heaviest quality signal.\n\n")
b.WriteString("**CompositeScore** = raw compute TOPS (fp32-equivalent). A throttling GPU scores lower automatically.\n\n")
b.WriteString("**ServerQualityScore** (0–100) — server infrastructure quality, independent of GPU model:  \n")
b.WriteString("`0.40×Stability + 0.30×PowerSustain + 0.30×ThermalSustain` \n")
b.WriteString("Use this to compare servers with the same GPU type, or to flag a bad server.\n\n")
// ── Balanced Scorecard ────────────────────────────────────────────────────
b.WriteString("## Balanced Scorecard\n\n")
// ── Scorecard table ───────────────────────────────────────────────────────
b.WriteString("## Scorecard\n\n")
b.WriteString("| GPU | Status | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Server Quality | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
b.WriteString("|-----|--------|--------------|-----------|-------|------------|-------------|----------------|---------------|-----------------|-----------|-------------|\n")
// Perspective 1: Compatibility — hard stops
b.WriteString("### 1. Совместимость\n\n")
b.WriteString("| GPU | Тепл. throttle | Вентиляторы при throttle | ECC uncorr | Статус |\n")
b.WriteString("|-----|---------------|--------------------------|------------|--------|\n")
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
name = "Unknown GPU"
thermalThrottle := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
interconnect := "-"
if gpu.Scores.InterconnectScore > 0 {
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
fanAtThrottle := "-"
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
topsPerSM := "-"
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
ecc := "-"
if gpu.ECC.Uncorrected > 0 {
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
}
compatStatus := "✓ OK"
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
compatStatus = "⛔ HARD STOP"
}
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
}
b.WriteString("\n")
// Perspective 2: Thermal headroom
b.WriteString("### 2. Тепловой запас\n\n")
b.WriteString("| GPU | p95 темп | Запас до 100°C | Тепл. throttle | Статус |\n")
b.WriteString("|-----|----------|----------------|----------------|--------|\n")
for _, gpu := range result.GPUs {
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
switch {
case headroom < 10:
thermalStatus = "⛔ CRITICAL"
case headroom < 20:
thermalStatus = "⚠ WARNING"
}
throttlePct := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, headroom, throttlePct, thermalStatus)
}
b.WriteString("\n")
// Perspective 3: Power delivery
b.WriteString("### 3. Энергетика\n\n")
b.WriteString("| GPU | Power cap throttle | Power CV | Вентиляторы (p95) | Статус |\n")
b.WriteString("|-----|-------------------|----------|-------------------|--------|\n")
for _, gpu := range result.GPUs {
powerCap := "-"
if gpu.Scores.PowerCapThrottlePct > 0 {
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
}
fanDuty := "-"
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
powerStatus := "✓ OK"
if gpu.Scores.PowerCapThrottlePct > 5 {
powerStatus = "⚠ POWER LIMITED"
}
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
}
b.WriteString("\n")
// Perspective 4: Performance
b.WriteString("### 4. Производительность\n\n")
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
for _, gpu := range result.GPUs {
synthetic := "-"
if gpu.Scores.SyntheticScore > 0 {
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
@@ -144,20 +174,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if gpu.Scores.MixedEfficiency > 0 {
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
}
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %s | %s | %s | %s | %.1f | %.1f | %.1f | %.1f | %s |\n",
gpu.Index, name,
gpu.Status,
gpu.Scores.CompositeScore,
synthetic,
mixed,
mixedEff,
topsPerSM,
gpu.Scores.ServerQualityScore,
gpu.Scores.PowerSustainScore,
gpu.Scores.ThermalSustainScore,
gpu.Scores.StabilityScore,
interconnect,
)
topsPerSM := "-"
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
}
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
}
if len(result.PerformanceRampSteps) > 0 {
fmt.Fprintf(&b, "\n**Platform power score (масштабируемость):** %.1f%%\n", result.PlatformPowerScore)
}
b.WriteString("\n")
// Perspective 5: Anomaly flags
b.WriteString("### 5. Аномалии\n\n")
b.WriteString("| GPU | ECC corr | Sync boost throttle | Нестаб. питание | Нестаб. охлаждение |\n")
b.WriteString("|-----|----------|---------------------|-----------------|--------------------|\n")
for _, gpu := range result.GPUs {
eccCorr := "-"
if gpu.ECC.Corrected > 0 {
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
}
syncBoost := "-"
if gpu.Scores.SyncBoostThrottlePct > 0 {
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
}
powerVar := "OK"
if gpu.Scores.PowerSustainScore < 70 {
powerVar = "⚠ нестабильно"
}
thermalVar := "OK"
if gpu.Scores.ThermalSustainScore < 70 {
thermalVar = "⚠ нестабильно"
}
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
}
b.WriteString("\n")