559 lines
20 KiB
Go
559 lines
20 KiB
Go
package platform
|
||
|
||
import (
|
||
"fmt"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||
return renderBenchmarkReportWithCharts(result)
|
||
}
|
||
|
||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
|
||
// ── Header ────────────────────────────────────────────────────────────────
|
||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||
|
||
// System identity block
|
||
if result.ServerModel != "" {
|
||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||
}
|
||
if result.Hostname != "" {
|
||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||
}
|
||
// GPU models summary
|
||
if len(result.GPUs) > 0 {
|
||
modelCount := make(map[string]int)
|
||
var modelOrder []string
|
||
for _, g := range result.GPUs {
|
||
m := strings.TrimSpace(g.Name)
|
||
if m == "" {
|
||
m = "Unknown GPU"
|
||
}
|
||
if modelCount[m] == 0 {
|
||
modelOrder = append(modelOrder, m)
|
||
}
|
||
modelCount[m]++
|
||
}
|
||
var parts []string
|
||
for _, m := range modelOrder {
|
||
if modelCount[m] == 1 {
|
||
parts = append(parts, m)
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||
}
|
||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||
if result.RampRunID != "" {
|
||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||
}
|
||
} else if result.ParallelGPUs {
|
||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||
}
|
||
if result.ScalabilityScore > 0 {
|
||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||
}
|
||
if result.PlatformPowerScore > 0 {
|
||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||
}
|
||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||
b.WriteString("\n")
|
||
|
||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||
if len(result.Findings) > 0 {
|
||
b.WriteString("## Executive Summary\n\n")
|
||
for _, finding := range result.Findings {
|
||
fmt.Fprintf(&b, "- %s\n", finding)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
if len(result.Warnings) > 0 {
|
||
b.WriteString("## Warnings\n\n")
|
||
for _, warning := range result.Warnings {
|
||
fmt.Fprintf(&b, "- %s\n", warning)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||
b.WriteString("## Balanced Scorecard\n\n")
|
||
|
||
// Perspective 1: Compatibility — hard stops
|
||
b.WriteString("### 1. Compatibility\n\n")
|
||
{
|
||
var rows [][]string
|
||
for _, gpu := range result.GPUs {
|
||
thermalThrottle := "-"
|
||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||
}
|
||
fanAtThrottle := "-"
|
||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||
}
|
||
ecc := "-"
|
||
if gpu.ECC.Uncorrected > 0 {
|
||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||
}
|
||
compatStatus := "✓ OK"
|
||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||
compatStatus = "⛔ HARD STOP"
|
||
}
|
||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Perspective 2: Thermal headroom
|
||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||
{
|
||
var rows [][]string
|
||
for _, gpu := range result.GPUs {
|
||
shutdownTemp := gpu.ShutdownTempC
|
||
if shutdownTemp <= 0 {
|
||
shutdownTemp = 90
|
||
}
|
||
slowdownTemp := gpu.SlowdownTempC
|
||
if slowdownTemp <= 0 {
|
||
slowdownTemp = 80
|
||
}
|
||
headroom := gpu.Scores.TempHeadroomC
|
||
thermalStatus := "✓ OK"
|
||
switch {
|
||
case headroom < 10:
|
||
thermalStatus = "⛔ CRITICAL"
|
||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||
thermalStatus = "⚠ WARNING"
|
||
}
|
||
throttlePct := "-"
|
||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||
}
|
||
rows = append(rows, []string{
|
||
fmt.Sprintf("GPU %d", gpu.Index),
|
||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||
fmt.Sprintf("%.1f°C", headroom),
|
||
throttlePct,
|
||
thermalStatus,
|
||
})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Perspective 3: Power delivery
|
||
b.WriteString("### 3. Power Delivery\n\n")
|
||
{
|
||
var rows [][]string
|
||
for _, gpu := range result.GPUs {
|
||
powerCap := "-"
|
||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||
}
|
||
fanDuty := "-"
|
||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||
}
|
||
powerStatus := "✓ OK"
|
||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||
powerStatus = "⚠ POWER LIMITED"
|
||
}
|
||
rows = append(rows, []string{
|
||
fmt.Sprintf("GPU %d", gpu.Index),
|
||
powerCap,
|
||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||
fanDuty,
|
||
powerStatus,
|
||
})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Perspective 4: Performance
|
||
b.WriteString("### 4. Performance\n\n")
|
||
{
|
||
var rows [][]string
|
||
for _, gpu := range result.GPUs {
|
||
synthetic := "-"
|
||
if gpu.Scores.SyntheticScore > 0 {
|
||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||
}
|
||
mixed := "-"
|
||
if gpu.Scores.MixedScore > 0 {
|
||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||
}
|
||
mixedEff := "-"
|
||
if gpu.Scores.MixedEfficiency > 0 {
|
||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||
}
|
||
topsPerSM := "-"
|
||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||
}
|
||
rows = append(rows, []string{
|
||
fmt.Sprintf("GPU %d", gpu.Index),
|
||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||
synthetic, mixed, mixedEff, topsPerSM,
|
||
})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||
if len(result.PerformanceRampSteps) > 0 {
|
||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Perspective 5: Anomaly flags
|
||
b.WriteString("### 5. Anomalies\n\n")
|
||
{
|
||
var rows [][]string
|
||
for _, gpu := range result.GPUs {
|
||
eccCorr := "-"
|
||
if gpu.ECC.Corrected > 0 {
|
||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||
}
|
||
syncBoost := "-"
|
||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||
}
|
||
powerVar := "OK"
|
||
if gpu.Scores.PowerSustainScore < 70 {
|
||
powerVar = "⚠ unstable"
|
||
}
|
||
thermalVar := "OK"
|
||
if gpu.Scores.ThermalSustainScore < 70 {
|
||
thermalVar = "⚠ unstable"
|
||
}
|
||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||
b.WriteString("## Per-GPU Details\n\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown GPU"
|
||
}
|
||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||
|
||
// Identity
|
||
if gpu.BusID != "" {
|
||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||
}
|
||
if gpu.VBIOS != "" {
|
||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||
}
|
||
if gpu.ComputeCapability != "" {
|
||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||
}
|
||
if gpu.MultiprocessorCount > 0 {
|
||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||
}
|
||
if gpu.PowerLimitW > 0 {
|
||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||
}
|
||
if gpu.PowerLimitDerated {
|
||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||
}
|
||
if gpu.CalibratedPeakPowerW > 0 {
|
||
if gpu.CalibratedPeakTempC > 0 {
|
||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||
} else {
|
||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||
}
|
||
}
|
||
if gpu.LockedGraphicsClockMHz > 0 {
|
||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Steady-state telemetry
|
||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||
b.WriteString(fmtMDTable(
|
||
[]string{"", "Avg", "P95"},
|
||
[][]string{
|
||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||
},
|
||
))
|
||
b.WriteString("\n")
|
||
} else {
|
||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||
}
|
||
|
||
// Per-precision stability phases.
|
||
if len(gpu.PrecisionSteady) > 0 {
|
||
b.WriteString("**Per-precision stability:**\n\n")
|
||
var precRows [][]string
|
||
for _, p := range gpu.PrecisionSteady {
|
||
eccCorr := "—"
|
||
eccUncorr := "—"
|
||
if !p.ECC.IsZero() {
|
||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||
}
|
||
status := p.Status
|
||
if strings.TrimSpace(status) == "" {
|
||
status = "OK"
|
||
}
|
||
precRows = append(precRows, []string{
|
||
p.Precision, status,
|
||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||
eccCorr, eccUncorr,
|
||
})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||
b.WriteString("\n")
|
||
} else {
|
||
// Legacy: show combined-window variance.
|
||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||
}
|
||
|
||
// ECC summary
|
||
if !gpu.ECC.IsZero() {
|
||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||
}
|
||
|
||
// Throttle
|
||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||
if throttle != "none" {
|
||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||
}
|
||
|
||
// Precision results
|
||
if len(gpu.PrecisionResults) > 0 {
|
||
b.WriteString("**Precision results:**\n\n")
|
||
var presRows [][]string
|
||
for _, p := range gpu.PrecisionResults {
|
||
if p.Supported {
|
||
presRows = append(presRows, []string{
|
||
p.Name,
|
||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||
fmt.Sprintf("×%.3g", p.Weight),
|
||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||
fmt.Sprintf("%d", p.Lanes),
|
||
fmt.Sprintf("%d", p.Iterations),
|
||
})
|
||
} else {
|
||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||
}
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Degradation / Notes
|
||
if len(gpu.DegradationReasons) > 0 {
|
||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||
}
|
||
if len(gpu.Notes) > 0 {
|
||
b.WriteString("**Notes:**\n\n")
|
||
for _, note := range gpu.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||
if result.Interconnect != nil {
|
||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||
if result.Interconnect.Supported {
|
||
b.WriteString(fmtMDTable(
|
||
[]string{"Metric", "Avg", "Max"},
|
||
[][]string{
|
||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||
},
|
||
))
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range result.Interconnect.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(result.Interconnect.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Server Power ───────────────────────────────────────────────────────────
|
||
if sp := result.ServerPower; sp != nil {
|
||
title := "## Server Power\n\n"
|
||
if sp.Source != "" {
|
||
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||
}
|
||
b.WriteString(title)
|
||
if !sp.Available {
|
||
b.WriteString("Server power measurement unavailable.\n\n")
|
||
} else {
|
||
spRows := [][]string{
|
||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||
}
|
||
if sp.ReportingRatio > 0 {
|
||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range sp.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(sp.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||
if len(result.PSUIssues) > 0 {
|
||
b.WriteString("## PSU Issues\n\n")
|
||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||
for _, issue := range result.PSUIssues {
|
||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||
if cooling := result.Cooling; cooling != nil {
|
||
b.WriteString("## Cooling\n\n")
|
||
if cooling.Available {
|
||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||
if cooling.FanDutyCycleAvailable {
|
||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||
}
|
||
b.WriteString(fmtMDTable(
|
||
[]string{"Metric", "Value"},
|
||
[][]string{
|
||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||
{"Average fan duty cycle", dutyAvg},
|
||
{"P95 fan duty cycle", dutyP95},
|
||
},
|
||
))
|
||
b.WriteString("\n")
|
||
} else {
|
||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||
}
|
||
for _, note := range cooling.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(cooling.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||
if len(result.PerformanceRampSteps) > 0 {
|
||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||
var scalRows [][]string
|
||
for _, step := range result.PerformanceRampSteps {
|
||
scalRows = append(scalRows, []string{
|
||
fmt.Sprintf("%d", step.StepIndex),
|
||
joinIndexList(step.GPUIndices),
|
||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||
})
|
||
}
|
||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||
b.WriteString("## Raw Files\n\n")
|
||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||
if result.Interconnect != nil {
|
||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||
// duration is unknown (0), raw seconds are shown instead.
|
||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||
type counter struct {
|
||
label string
|
||
us uint64
|
||
}
|
||
counters := []counter{
|
||
{"sw_power", t.SWPowerCapUS},
|
||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||
{"sync_boost", t.SyncBoostUS},
|
||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||
}
|
||
var parts []string
|
||
for _, c := range counters {
|
||
if c.us == 0 {
|
||
continue
|
||
}
|
||
sec := float64(c.us) / 1e6
|
||
if steadyDurationSec > 0 {
|
||
pct := sec / steadyDurationSec * 100
|
||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||
} else if sec < 1 {
|
||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||
}
|
||
}
|
||
if len(parts) == 0 {
|
||
return "none"
|
||
}
|
||
return strings.Join(parts, " ")
|
||
}
|
||
|
||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||
var best float64
|
||
for i, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||
best = gpu.Scores.CompositeScore
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||
if result.Interconnect != nil {
|
||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||
}
|
||
return b.String()
|
||
}
|