package platform

import (
	"fmt"
	"strings"
	"time"
)

// renderBenchmarkReport renders the Markdown benchmark report for result.
// It simply delegates to renderBenchmarkReportWithCharts.
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
	return renderBenchmarkReportWithCharts(result)
}

// renderBenchmarkReportWithCharts builds the complete Markdown report for a
// benchmark run and returns it as a string.
//
// The report is assembled section by section, in this order: header,
// executive summary and warnings, balanced scorecard, per-GPU details,
// interconnect, server power, PSU issues, cooling, platform scalability and
// the list of raw files. Optional sections are omitted when the corresponding
// data is absent from result.
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
	var b strings.Builder
	writeBenchmarkReportHeader(&b, &result)
	writeBenchmarkReportSummary(&b, &result)
	writeBenchmarkReportScorecard(&b, &result)
	writeBenchmarkReportGPUDetails(&b, &result)
	writeBenchmarkReportInterconnect(&b, &result)
	writeBenchmarkReportServerPower(&b, &result)
	writeBenchmarkReportPSUIssues(&b, &result)
	writeBenchmarkReportCooling(&b, &result)
	writeBenchmarkReportScalability(&b, &result)
	writeBenchmarkReportRawFiles(&b, &result)
	return b.String()
}

// benchmarkReportGPUName returns the trimmed GPU name, or "Unknown GPU" when
// the name is empty or whitespace only.
func benchmarkReportGPUName(name string) string {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		return trimmed
	}
	return "Unknown GPU"
}

// writeBenchmarkReportHeader writes the report title and the block of
// key/value lines identifying the system and the run.
func writeBenchmarkReportHeader(b *strings.Builder, result *NvidiaBenchmarkResult) {
	// Markdown only starts a new line inside a paragraph when the line ends
	// with at least two spaces (a "hard line break"). A single trailing space
	// would let renderers fold all header fields into one run-on paragraph.
	const br = "  \n"

	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")

	// System identity block.
	if result.ServerModel != "" {
		fmt.Fprintf(b, "**Server:** %s"+br, result.ServerModel)
	}
	if result.Hostname != "" {
		fmt.Fprintf(b, "**Host:** %s"+br, result.Hostname)
	}

	// GPU models summary: identical models are collapsed into "N× model",
	// keeping the order in which each model first appears.
	if len(result.GPUs) > 0 {
		modelCount := make(map[string]int)
		var modelOrder []string
		for _, g := range result.GPUs {
			m := benchmarkReportGPUName(g.Name)
			if modelCount[m] == 0 {
				modelOrder = append(modelOrder, m)
			}
			modelCount[m]++
		}
		parts := make([]string, 0, len(modelOrder))
		for _, m := range modelOrder {
			if modelCount[m] == 1 {
				parts = append(parts, m)
			} else {
				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
			}
		}
		fmt.Fprintf(b, "**GPU(s):** %s"+br, strings.Join(parts, ", "))
	}

	fmt.Fprintf(b, "**Profile:** %s"+br, result.BenchmarkProfile)
	fmt.Fprintf(b, "**Benchmark version:** %s"+br, result.BenchmarkVersion)
	// The layout carries a literal "UTC" suffix, so the timestamp must be
	// converted to UTC first; otherwise a local time would be mislabelled.
	fmt.Fprintf(b, "**Generated:** %s"+br, result.GeneratedAt.In(time.UTC).Format("2006-01-02 15:04:05 UTC"))

	if result.RampStep > 0 && result.RampTotal > 0 {
		fmt.Fprintf(b, "**Ramp-up step:** %d of %d"+br, result.RampStep, result.RampTotal)
		if result.RampRunID != "" {
			fmt.Fprintf(b, "**Ramp-up run ID:** %s"+br, result.RampRunID)
		}
	} else if result.ParallelGPUs {
		b.WriteString("**Mode:** parallel (all GPUs simultaneously)" + br)
	}

	if result.ScalabilityScore > 0 {
		fmt.Fprintf(b, "**Scalability score:** %.1f%%"+br, result.ScalabilityScore)
	}
	if result.PlatformPowerScore > 0 {
		fmt.Fprintf(b, "**Platform power score:** %.1f%%"+br, result.PlatformPowerScore)
	}
	fmt.Fprintf(b, "**Overall status:** %s"+br, result.OverallStatus)
	b.WriteString("\n")
}

// writeBenchmarkReportSummary writes the "Executive Summary" and "Warnings"
// bullet lists; each section is skipped when it has no entries.
func writeBenchmarkReportSummary(b *strings.Builder, result *NvidiaBenchmarkResult) {
	if len(result.Findings) > 0 {
		b.WriteString("## Executive Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.Warnings) > 0 {
		b.WriteString("## Warnings\n\n")
		for _, warning := range result.Warnings {
			fmt.Fprintf(b, "- %s\n", warning)
		}
		b.WriteString("\n")
	}
}

// writeBenchmarkReportScorecard writes the "Balanced Scorecard" section,
// which consists of five per-GPU tables (one per perspective).
func writeBenchmarkReportScorecard(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("## Balanced Scorecard\n\n")
	writeBenchmarkScorecardCompatibility(b, result)
	writeBenchmarkScorecardThermal(b, result)
	writeBenchmarkScorecardPower(b, result)
	writeBenchmarkScorecardPerformance(b, result)
	writeBenchmarkScorecardAnomalies(b, result)
}

// writeBenchmarkScorecardCompatibility writes perspective 1 (hard stops).
// A GPU is a hard stop when it has uncorrected ECC errors, or when it
// thermally throttled while the p95 fan duty cycle was below 95%.
func writeBenchmarkScorecardCompatibility(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("### 1. Compatibility\n\n")

	// Fan duty is read from the shared cooling data, not per GPU; it is only
	// dereferenced when the cooling block exists and reports duty cycles.
	fanDutyKnown := result.Cooling != nil && result.Cooling.FanDutyCycleAvailable

	var rows [][]string
	for _, gpu := range result.GPUs {
		throttled := gpu.Scores.ThermalThrottlePct > 0

		thermalThrottle := "-"
		if throttled {
			thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
		}
		fanAtThrottle := "-"
		if fanDutyKnown && throttled {
			fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
		}
		ecc := "-"
		if gpu.ECC.Uncorrected > 0 {
			ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
		}
		compatStatus := "✓ OK"
		if gpu.ECC.Uncorrected > 0 || (throttled && fanDutyKnown && result.Cooling.P95FanDutyCyclePct < 95) {
			compatStatus = "⛔ HARD STOP"
		}
		rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
	}
	b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
	b.WriteString("\n")
}

// writeBenchmarkScorecardThermal writes perspective 2 (thermal headroom).
func writeBenchmarkScorecardThermal(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("### 2. Thermal Headroom\n\n")

	var rows [][]string
	for _, gpu := range result.GPUs {
		// Fall back to 90 °C / 80 °C when the GPU reports a non-positive limit.
		shutdownTemp := gpu.ShutdownTempC
		if shutdownTemp <= 0 {
			shutdownTemp = 90
		}
		slowdownTemp := gpu.SlowdownTempC
		if slowdownTemp <= 0 {
			slowdownTemp = 80
		}
		headroom := gpu.Scores.TempHeadroomC

		// Low headroom takes precedence over reaching the slowdown limit.
		thermalStatus := "✓ OK"
		switch {
		case headroom < 10:
			thermalStatus = "⛔ CRITICAL"
		case gpu.Steady.P95TempC >= slowdownTemp:
			thermalStatus = "⚠ WARNING"
		}

		throttlePct := "-"
		if gpu.Scores.ThermalThrottlePct > 0 {
			throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
		}
		rows = append(rows, []string{
			fmt.Sprintf("GPU %d", gpu.Index),
			fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
			fmt.Sprintf("%.0f°C", slowdownTemp),
			fmt.Sprintf("%.0f°C", shutdownTemp),
			fmt.Sprintf("%.1f°C", headroom),
			throttlePct,
			thermalStatus,
		})
	}
	b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
	b.WriteString("\n")
}

// writeBenchmarkScorecardPower writes perspective 3 (power delivery).
func writeBenchmarkScorecardPower(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("### 3. Power Delivery\n\n")

	// The fan duty cell is the same for every GPU, so compute it once.
	fanDuty := "-"
	if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
		fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
	}

	var rows [][]string
	for _, gpu := range result.GPUs {
		powerCap := "-"
		if gpu.Scores.PowerCapThrottlePct > 0 {
			powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
		}
		powerStatus := "✓ OK"
		if gpu.Scores.PowerCapThrottlePct > 5 {
			powerStatus = "⚠ POWER LIMITED"
		}
		rows = append(rows, []string{
			fmt.Sprintf("GPU %d", gpu.Index),
			powerCap,
			fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
			fanDuty,
			powerStatus,
		})
	}
	b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
	b.WriteString("\n")
}

// writeBenchmarkScorecardPerformance writes perspective 4 (performance).
// Scores that are not positive are rendered as "-".
func writeBenchmarkScorecardPerformance(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("### 4. Performance\n\n")

	var rows [][]string
	for _, gpu := range result.GPUs {
		synthetic := "-"
		if gpu.Scores.SyntheticScore > 0 {
			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
		}
		mixed := "-"
		if gpu.Scores.MixedScore > 0 {
			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
		}
		mixedEff := "-"
		if gpu.Scores.MixedEfficiency > 0 {
			// MixedEfficiency is scaled by 100 to be shown as a percentage.
			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
		}
		topsPerSM := "-"
		if gpu.Scores.TOPSPerSMPerGHz > 0 {
			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
		}
		rows = append(rows, []string{
			fmt.Sprintf("GPU %d", gpu.Index),
			fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
			synthetic,
			mixed,
			mixedEff,
			topsPerSM,
		})
	}
	b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
	if len(result.PerformanceRampSteps) > 0 {
		fmt.Fprintf(b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
	}
	b.WriteString("\n")
}

// writeBenchmarkScorecardAnomalies writes perspective 5 (anomaly flags).
// Power and thermal sustain scores below 70 are flagged as unstable.
func writeBenchmarkScorecardAnomalies(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("### 5. Anomalies\n\n")

	var rows [][]string
	for _, gpu := range result.GPUs {
		eccCorr := "-"
		if gpu.ECC.Corrected > 0 {
			eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
		}
		syncBoost := "-"
		if gpu.Scores.SyncBoostThrottlePct > 0 {
			syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
		}
		powerVar := "OK"
		if gpu.Scores.PowerSustainScore < 70 {
			powerVar = "⚠ unstable"
		}
		thermalVar := "OK"
		if gpu.Scores.ThermalSustainScore < 70 {
			thermalVar = "⚠ unstable"
		}
		rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
	}
	b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
	b.WriteString("\n")
}

// writeBenchmarkReportGPUDetails writes the "Per-GPU Details" section: one
// subsection per GPU with identity, steady-state telemetry, stability,
// ECC, throttle, precision results, degradation reasons and notes.
func writeBenchmarkReportGPUDetails(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("## Per-GPU Details\n\n")

	for _, gpu := range result.GPUs {
		fmt.Fprintf(b, "### GPU %d — %s\n\n", gpu.Index, benchmarkReportGPUName(gpu.Name))

		// Identity: only fields that carry a value are listed.
		if gpu.BusID != "" {
			fmt.Fprintf(b, "- **Bus ID:** %s\n", gpu.BusID)
		}
		if gpu.VBIOS != "" {
			fmt.Fprintf(b, "- **vBIOS:** %s\n", gpu.VBIOS)
		}
		if gpu.ComputeCapability != "" {
			fmt.Fprintf(b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
		}
		if gpu.MultiprocessorCount > 0 {
			fmt.Fprintf(b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
		}
		if gpu.PowerLimitW > 0 {
			fmt.Fprintf(b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
		}
		if gpu.PowerLimitDerated {
			fmt.Fprintf(b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
		}
		if gpu.CalibratedPeakPowerW > 0 {
			if gpu.CalibratedPeakTempC > 0 {
				fmt.Fprintf(b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
			} else {
				fmt.Fprintf(b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
			}
		}
		if gpu.LockedGraphicsClockMHz > 0 {
			fmt.Fprintf(b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
		}
		b.WriteString("\n")

		// Steady-state telemetry.
		if benchmarkTelemetryAvailable(gpu.Steady) {
			fmt.Fprintf(b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
			b.WriteString(fmtMDTable(
				[]string{"", "Avg", "P95"},
				[][]string{
					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
				},
			))
			b.WriteString("\n")
		} else {
			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
		}

		// Per-precision stability phases.
		if len(gpu.PrecisionSteady) > 0 {
			b.WriteString("**Per-precision stability:**\n\n")
			var precRows [][]string
			for _, p := range gpu.PrecisionSteady {
				eccCorr := "—"
				eccUncorr := "—"
				if !p.ECC.IsZero() {
					eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
					eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
				}
				status := p.Status
				if strings.TrimSpace(status) == "" {
					status = "OK"
				}
				precRows = append(precRows, []string{
					p.Precision,
					status,
					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
					eccCorr,
					eccUncorr,
				})
			}
			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
			b.WriteString("\n")
		} else {
			// Legacy: show combined-window variance.
			fmt.Fprintf(b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
				gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
		}

		// ECC summary.
		if !gpu.ECC.IsZero() {
			fmt.Fprintf(b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n", gpu.ECC.Corrected, gpu.ECC.Uncorrected)
		}

		// Throttle: formatThrottleLine returns "none" when nothing throttled.
		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
		if throttle != "none" {
			fmt.Fprintf(b, "**Throttle:** %s\n\n", throttle)
		}

		// Precision results.
		if len(gpu.PrecisionResults) > 0 {
			b.WriteString("**Precision results:**\n\n")
			var presRows [][]string
			for _, p := range gpu.PrecisionResults {
				if p.Supported {
					presRows = append(presRows, []string{
						p.Name,
						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
						fmt.Sprintf("×%.3g", p.Weight),
						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
						fmt.Sprintf("%d", p.Lanes),
						fmt.Sprintf("%d", p.Iterations),
					})
				} else {
					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
				}
			}
			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
			b.WriteString("\n")
		}

		// Degradation / Notes.
		if len(gpu.DegradationReasons) > 0 {
			fmt.Fprintf(b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
		}
		if len(gpu.Notes) > 0 {
			b.WriteString("**Notes:**\n\n")
			for _, note := range gpu.Notes {
				fmt.Fprintf(b, "- %s\n", note)
			}
			b.WriteString("\n")
		}
	}
}

// writeBenchmarkReportInterconnect writes the "Interconnect (NCCL)" section.
// Nothing is written when result.Interconnect is nil.
func writeBenchmarkReportInterconnect(b *strings.Builder, result *NvidiaBenchmarkResult) {
	ic := result.Interconnect
	if ic == nil {
		return
	}
	b.WriteString("## Interconnect (NCCL)\n\n")
	fmt.Fprintf(b, "**Status:** %s\n\n", ic.Status)
	if ic.Supported {
		b.WriteString(fmtMDTable(
			[]string{"Metric", "Avg", "Max"},
			[][]string{
				{"Alg BW", fmt.Sprintf("%.1f GB/s", ic.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", ic.MaxAlgBWGBps)},
				{"Bus BW", fmt.Sprintf("%.1f GB/s", ic.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", ic.MaxBusBWGBps)},
			},
		))
		b.WriteString("\n")
	}
	for _, note := range ic.Notes {
		fmt.Fprintf(b, "- %s\n", note)
	}
	if len(ic.Notes) > 0 {
		b.WriteString("\n")
	}
}

// writeBenchmarkReportServerPower writes the "Server Power (IPMI)" section.
// Nothing is written when result.ServerPower is nil.
func writeBenchmarkReportServerPower(b *strings.Builder, result *NvidiaBenchmarkResult) {
	sp := result.ServerPower
	if sp == nil {
		return
	}
	b.WriteString("## Server Power (IPMI)\n\n")
	if !sp.Available {
		b.WriteString("IPMI power measurement unavailable.\n\n")
	} else {
		spRows := [][]string{
			{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
			{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
			{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
			{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
		}
		if sp.ReportingRatio > 0 {
			spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
		}
		b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
		b.WriteString("\n")
	}
	for _, note := range sp.Notes {
		fmt.Fprintf(b, "- %s\n", note)
	}
	if len(sp.Notes) > 0 {
		b.WriteString("\n")
	}
}

// writeBenchmarkReportPSUIssues writes the "PSU Issues" section, or nothing
// when no issues were recorded.
func writeBenchmarkReportPSUIssues(b *strings.Builder, result *NvidiaBenchmarkResult) {
	if len(result.PSUIssues) == 0 {
		return
	}
	b.WriteString("## PSU Issues\n\n")
	b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
	for _, issue := range result.PSUIssues {
		fmt.Fprintf(b, "- ⛔ %s\n", issue)
	}
	b.WriteString("\n")
}

// writeBenchmarkReportCooling writes the "Cooling" section.
// Nothing is written when result.Cooling is nil.
func writeBenchmarkReportCooling(b *strings.Builder, result *NvidiaBenchmarkResult) {
	cooling := result.Cooling
	if cooling == nil {
		return
	}
	b.WriteString("## Cooling\n\n")
	if cooling.Available {
		// Duty cycle may be missing even when fan RPM telemetry is present.
		dutyAvg, dutyP95 := "N/A", "N/A"
		if cooling.FanDutyCycleAvailable {
			dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
			dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
		}
		b.WriteString(fmtMDTable(
			[]string{"Metric", "Value"},
			[][]string{
				{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
				{"Average fan duty cycle", dutyAvg},
				{"P95 fan duty cycle", dutyP95},
			},
		))
		b.WriteString("\n")
	} else {
		b.WriteString("Cooling telemetry unavailable.\n\n")
	}
	for _, note := range cooling.Notes {
		fmt.Fprintf(b, "- %s\n", note)
	}
	if len(cooling.Notes) > 0 {
		b.WriteString("\n")
	}
}

// writeBenchmarkReportScalability writes the "Platform Scalability" section
// with one table row per performance ramp step, or nothing when there are no
// ramp steps.
func writeBenchmarkReportScalability(b *strings.Builder, result *NvidiaBenchmarkResult) {
	if len(result.PerformanceRampSteps) == 0 {
		return
	}
	b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
	fmt.Fprintf(b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
	scalRows := make([][]string, 0, len(result.PerformanceRampSteps))
	for _, step := range result.PerformanceRampSteps {
		scalRows = append(scalRows, []string{
			fmt.Sprintf("%d", step.StepIndex),
			joinIndexList(step.GPUIndices),
			fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
			fmt.Sprintf("%.1f%%", step.ScalabilityPct),
		})
	}
	b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
	b.WriteString("\n")
}

// writeBenchmarkReportRawFiles writes the "Raw Files" section. The NCCL log
// is listed only when interconnect data is present.
func writeBenchmarkReportRawFiles(b *strings.Builder, result *NvidiaBenchmarkResult) {
	b.WriteString("## Raw Files\n\n")
	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
	b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
	if result.Interconnect != nil {
		b.WriteString("- `nccl-all-reduce.log`\n")
	}
}

// formatThrottleLine renders throttle counters as human-readable percentages of
// the steady-state window. Only non-zero counters are shown. When the steady
// duration is unknown (0), raw durations are shown instead: milliseconds below
// one second, seconds otherwise. It returns "none" when every counter is zero.
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string { type counter struct { label string us uint64 } counters := []counter{ {"sw_power", t.SWPowerCapUS}, {"sw_thermal", t.SWThermalSlowdownUS}, {"sync_boost", t.SyncBoostUS}, {"hw_thermal", t.HWThermalSlowdownUS}, {"hw_power_brake", t.HWPowerBrakeSlowdownUS}, } var parts []string for _, c := range counters { if c.us == 0 { continue } sec := float64(c.us) / 1e6 if steadyDurationSec > 0 { pct := sec / steadyDurationSec * 100 parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec)) } else if sec < 1 { parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000)) } else { parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec)) } } if len(parts) == 0 { return "none" } return strings.Join(parts, " ") } func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { var b strings.Builder fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status) var best float64 for i, gpu := range result.GPUs { fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status) fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore) if i == 0 || gpu.Scores.CompositeScore > best { best = gpu.Scores.CompositeScore } } fmt.Fprintf(&b, "best_composite_score=%.2f\n", best) if result.Interconnect != nil { fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status) fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps) } return b.String() }