386 lines
16 KiB
Go
386 lines
16 KiB
Go
package platform
|
||
|
||
import (
|
||
"fmt"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||
return renderBenchmarkReportWithCharts(result)
|
||
}
|
||
|
||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
|
||
// ── Header ────────────────────────────────────────────────────────────────
|
||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||
|
||
// System identity block
|
||
if result.ServerModel != "" {
|
||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||
}
|
||
if result.Hostname != "" {
|
||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||
}
|
||
// GPU models summary
|
||
if len(result.GPUs) > 0 {
|
||
modelCount := make(map[string]int)
|
||
var modelOrder []string
|
||
for _, g := range result.GPUs {
|
||
m := strings.TrimSpace(g.Name)
|
||
if m == "" {
|
||
m = "Unknown GPU"
|
||
}
|
||
if modelCount[m] == 0 {
|
||
modelOrder = append(modelOrder, m)
|
||
}
|
||
modelCount[m]++
|
||
}
|
||
var parts []string
|
||
for _, m := range modelOrder {
|
||
if modelCount[m] == 1 {
|
||
parts = append(parts, m)
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||
}
|
||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||
if result.RampRunID != "" {
|
||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||
}
|
||
} else if result.ParallelGPUs {
|
||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||
}
|
||
if result.ScalabilityScore > 0 {
|
||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||
}
|
||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||
b.WriteString("\n")
|
||
|
||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||
if len(result.Findings) > 0 {
|
||
b.WriteString("## Executive Summary\n\n")
|
||
for _, finding := range result.Findings {
|
||
fmt.Fprintf(&b, "- %s\n", finding)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
if len(result.Warnings) > 0 {
|
||
b.WriteString("## Warnings\n\n")
|
||
for _, warning := range result.Warnings {
|
||
fmt.Fprintf(&b, "- %s\n", warning)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Methodology ───────────────────────────────────────────────────────────
|
||
b.WriteString("## Methodology\n\n")
|
||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
|
||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
||
|
||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||
b.WriteString("## Scorecard\n\n")
|
||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown GPU"
|
||
}
|
||
interconnect := "-"
|
||
if gpu.Scores.InterconnectScore > 0 {
|
||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||
}
|
||
topsPerSM := "-"
|
||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||
}
|
||
synthetic := "-"
|
||
if gpu.Scores.SyntheticScore > 0 {
|
||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||
}
|
||
mixed := "-"
|
||
if gpu.Scores.MixedScore > 0 {
|
||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||
}
|
||
mixedEff := "-"
|
||
if gpu.Scores.MixedEfficiency > 0 {
|
||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
||
gpu.Index, name,
|
||
gpu.Status,
|
||
gpu.Scores.CompositeScore,
|
||
gpu.Scores.ComputeScore,
|
||
synthetic,
|
||
mixed,
|
||
mixedEff,
|
||
topsPerSM,
|
||
gpu.Scores.PowerSustainScore,
|
||
gpu.Scores.ThermalSustainScore,
|
||
gpu.Scores.StabilityScore,
|
||
interconnect,
|
||
)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||
b.WriteString("## Per-GPU Details\n\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown GPU"
|
||
}
|
||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||
|
||
// Identity
|
||
if gpu.BusID != "" {
|
||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||
}
|
||
if gpu.VBIOS != "" {
|
||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||
}
|
||
if gpu.ComputeCapability != "" {
|
||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||
}
|
||
if gpu.MultiprocessorCount > 0 {
|
||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||
}
|
||
if gpu.PowerLimitW > 0 {
|
||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||
}
|
||
if gpu.LockedGraphicsClockMHz > 0 {
|
||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Steady-state telemetry
|
||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||
b.WriteString("\n")
|
||
|
||
// Per-precision stability phases.
|
||
if len(gpu.PrecisionSteady) > 0 {
|
||
b.WriteString("**Per-precision stability:**\n\n")
|
||
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
|
||
for _, p := range gpu.PrecisionSteady {
|
||
eccCorr := "—"
|
||
eccUncorr := "—"
|
||
if !p.ECC.IsZero() {
|
||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||
}
|
||
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||
eccCorr, eccUncorr)
|
||
}
|
||
b.WriteString("\n")
|
||
} else {
|
||
// Legacy: show combined-window variance.
|
||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||
}
|
||
|
||
// ECC summary
|
||
if !gpu.ECC.IsZero() {
|
||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||
}
|
||
|
||
// Throttle
|
||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||
if throttle != "none" {
|
||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||
}
|
||
|
||
// Precision results
|
||
if len(gpu.PrecisionResults) > 0 {
|
||
b.WriteString("**Precision results:**\n\n")
|
||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||
for _, p := range gpu.PrecisionResults {
|
||
if p.Supported {
|
||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||
} else {
|
||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||
}
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Degradation / Notes
|
||
if len(gpu.DegradationReasons) > 0 {
|
||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||
}
|
||
if len(gpu.Notes) > 0 {
|
||
b.WriteString("**Notes:**\n\n")
|
||
for _, note := range gpu.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||
if result.Interconnect != nil {
|
||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||
if result.Interconnect.Supported {
|
||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range result.Interconnect.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(result.Interconnect.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||
if sp := result.ServerPower; sp != nil {
|
||
b.WriteString("## Server Power (IPMI)\n\n")
|
||
if !sp.Available {
|
||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||
} else {
|
||
b.WriteString("| | Value |\n|---|---|\n")
|
||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||
if sp.ReportingRatio > 0 {
|
||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range sp.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(sp.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||
if cooling := result.Cooling; cooling != nil {
|
||
b.WriteString("## Cooling\n\n")
|
||
if cooling.Available {
|
||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||
if cooling.FanDutyCycleAvailable {
|
||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||
} else {
|
||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||
}
|
||
b.WriteString("\n")
|
||
} else {
|
||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||
}
|
||
for _, note := range cooling.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(cooling.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||
b.WriteString("## Raw Files\n\n")
|
||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||
if result.Interconnect != nil {
|
||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||
// duration is unknown (0), raw seconds are shown instead.
|
||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||
type counter struct {
|
||
label string
|
||
us uint64
|
||
}
|
||
counters := []counter{
|
||
{"sw_power", t.SWPowerCapUS},
|
||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||
{"sync_boost", t.SyncBoostUS},
|
||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||
}
|
||
var parts []string
|
||
for _, c := range counters {
|
||
if c.us == 0 {
|
||
continue
|
||
}
|
||
sec := float64(c.us) / 1e6
|
||
if steadyDurationSec > 0 {
|
||
pct := sec / steadyDurationSec * 100
|
||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||
} else if sec < 1 {
|
||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||
}
|
||
}
|
||
if len(parts) == 0 {
|
||
return "none"
|
||
}
|
||
return strings.Join(parts, " ")
|
||
}
|
||
|
||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||
var best float64
|
||
for i, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||
best = gpu.Scores.CompositeScore
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||
if result.Interconnect != nil {
|
||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||
}
|
||
return b.String()
|
||
}
|