Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from nvidia-smi -q verbose output in enrichGPUInfoWithMaxClocks. Store as ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult. Fallback: 90°C shutdown / 80°C slowdown when not available. TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C). Warning threshold: p95 >= SlowdownTempC. Critical: headroom < 10°C. Report table shows both limits alongside headroom and p95 temp. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
489 lines
19 KiB
Go
489 lines
19 KiB
Go
package platform
|
||
|
||
import (
|
||
"fmt"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||
return renderBenchmarkReportWithCharts(result)
|
||
}
|
||
|
||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
|
||
// ── Header ────────────────────────────────────────────────────────────────
|
||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||
|
||
// System identity block
|
||
if result.ServerModel != "" {
|
||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||
}
|
||
if result.Hostname != "" {
|
||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||
}
|
||
// GPU models summary
|
||
if len(result.GPUs) > 0 {
|
||
modelCount := make(map[string]int)
|
||
var modelOrder []string
|
||
for _, g := range result.GPUs {
|
||
m := strings.TrimSpace(g.Name)
|
||
if m == "" {
|
||
m = "Unknown GPU"
|
||
}
|
||
if modelCount[m] == 0 {
|
||
modelOrder = append(modelOrder, m)
|
||
}
|
||
modelCount[m]++
|
||
}
|
||
var parts []string
|
||
for _, m := range modelOrder {
|
||
if modelCount[m] == 1 {
|
||
parts = append(parts, m)
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||
}
|
||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||
if result.RampRunID != "" {
|
||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||
}
|
||
} else if result.ParallelGPUs {
|
||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||
}
|
||
if result.ScalabilityScore > 0 {
|
||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||
}
|
||
if result.PlatformPowerScore > 0 {
|
||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||
}
|
||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||
b.WriteString("\n")
|
||
|
||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||
if len(result.Findings) > 0 {
|
||
b.WriteString("## Executive Summary\n\n")
|
||
for _, finding := range result.Findings {
|
||
fmt.Fprintf(&b, "- %s\n", finding)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
if len(result.Warnings) > 0 {
|
||
b.WriteString("## Warnings\n\n")
|
||
for _, warning := range result.Warnings {
|
||
fmt.Fprintf(&b, "- %s\n", warning)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||
b.WriteString("## Balanced Scorecard\n\n")
|
||
|
||
// Perspective 1: Compatibility — hard stops
|
||
b.WriteString("### 1. Compatibility\n\n")
|
||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
thermalThrottle := "-"
|
||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||
}
|
||
fanAtThrottle := "-"
|
||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||
}
|
||
ecc := "-"
|
||
if gpu.ECC.Uncorrected > 0 {
|
||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||
}
|
||
compatStatus := "✓ OK"
|
||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||
compatStatus = "⛔ HARD STOP"
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Perspective 2: Thermal headroom
|
||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
shutdownTemp := gpu.ShutdownTempC
|
||
if shutdownTemp <= 0 {
|
||
shutdownTemp = 90
|
||
}
|
||
slowdownTemp := gpu.SlowdownTempC
|
||
if slowdownTemp <= 0 {
|
||
slowdownTemp = 80
|
||
}
|
||
headroom := gpu.Scores.TempHeadroomC
|
||
thermalStatus := "✓ OK"
|
||
switch {
|
||
case headroom < 10:
|
||
thermalStatus = "⛔ CRITICAL"
|
||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||
thermalStatus = "⚠ WARNING"
|
||
}
|
||
throttlePct := "-"
|
||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Perspective 3: Power delivery
|
||
b.WriteString("### 3. Power Delivery\n\n")
|
||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
powerCap := "-"
|
||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||
}
|
||
fanDuty := "-"
|
||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||
}
|
||
powerStatus := "✓ OK"
|
||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||
powerStatus = "⚠ POWER LIMITED"
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
||
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Perspective 4: Performance
|
||
b.WriteString("### 4. Performance\n\n")
|
||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
synthetic := "-"
|
||
if gpu.Scores.SyntheticScore > 0 {
|
||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||
}
|
||
mixed := "-"
|
||
if gpu.Scores.MixedScore > 0 {
|
||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||
}
|
||
mixedEff := "-"
|
||
if gpu.Scores.MixedEfficiency > 0 {
|
||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||
}
|
||
topsPerSM := "-"
|
||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
||
}
|
||
if len(result.PerformanceRampSteps) > 0 {
|
||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Perspective 5: Anomaly flags
|
||
b.WriteString("### 5. Anomalies\n\n")
|
||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
eccCorr := "-"
|
||
if gpu.ECC.Corrected > 0 {
|
||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||
}
|
||
syncBoost := "-"
|
||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||
}
|
||
powerVar := "OK"
|
||
if gpu.Scores.PowerSustainScore < 70 {
|
||
powerVar = "⚠ unstable"
|
||
}
|
||
thermalVar := "OK"
|
||
if gpu.Scores.ThermalSustainScore < 70 {
|
||
thermalVar = "⚠ unstable"
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||
b.WriteString("## Per-GPU Details\n\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown GPU"
|
||
}
|
||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||
|
||
// Identity
|
||
if gpu.BusID != "" {
|
||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||
}
|
||
if gpu.VBIOS != "" {
|
||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||
}
|
||
if gpu.ComputeCapability != "" {
|
||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||
}
|
||
if gpu.MultiprocessorCount > 0 {
|
||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||
}
|
||
if gpu.PowerLimitW > 0 {
|
||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||
}
|
||
if gpu.PowerLimitDerated {
|
||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||
}
|
||
if gpu.CalibratedPeakPowerW > 0 {
|
||
if gpu.CalibratedPeakTempC > 0 {
|
||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||
} else {
|
||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||
}
|
||
}
|
||
if gpu.LockedGraphicsClockMHz > 0 {
|
||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Steady-state telemetry
|
||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||
b.WriteString("\n")
|
||
|
||
// Per-precision stability phases.
|
||
if len(gpu.PrecisionSteady) > 0 {
|
||
b.WriteString("**Per-precision stability:**\n\n")
|
||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
||
for _, p := range gpu.PrecisionSteady {
|
||
eccCorr := "—"
|
||
eccUncorr := "—"
|
||
if !p.ECC.IsZero() {
|
||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||
}
|
||
status := p.Status
|
||
if strings.TrimSpace(status) == "" {
|
||
status = "OK"
|
||
}
|
||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||
eccCorr, eccUncorr)
|
||
}
|
||
b.WriteString("\n")
|
||
} else {
|
||
// Legacy: show combined-window variance.
|
||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||
}
|
||
|
||
// ECC summary
|
||
if !gpu.ECC.IsZero() {
|
||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||
}
|
||
|
||
// Throttle
|
||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||
if throttle != "none" {
|
||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||
}
|
||
|
||
// Precision results
|
||
if len(gpu.PrecisionResults) > 0 {
|
||
b.WriteString("**Precision results:**\n\n")
|
||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||
for _, p := range gpu.PrecisionResults {
|
||
if p.Supported {
|
||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||
} else {
|
||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||
}
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Degradation / Notes
|
||
if len(gpu.DegradationReasons) > 0 {
|
||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||
}
|
||
if len(gpu.Notes) > 0 {
|
||
b.WriteString("**Notes:**\n\n")
|
||
for _, note := range gpu.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||
if result.Interconnect != nil {
|
||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||
if result.Interconnect.Supported {
|
||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range result.Interconnect.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(result.Interconnect.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||
if sp := result.ServerPower; sp != nil {
|
||
b.WriteString("## Server Power (IPMI)\n\n")
|
||
if !sp.Available {
|
||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||
} else {
|
||
b.WriteString("| | Value |\n|---|---|\n")
|
||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||
if sp.ReportingRatio > 0 {
|
||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range sp.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(sp.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||
if cooling := result.Cooling; cooling != nil {
|
||
b.WriteString("## Cooling\n\n")
|
||
if cooling.Available {
|
||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||
if cooling.FanDutyCycleAvailable {
|
||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||
} else {
|
||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||
}
|
||
b.WriteString("\n")
|
||
} else {
|
||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||
}
|
||
for _, note := range cooling.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(cooling.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||
if len(result.PerformanceRampSteps) > 0 {
|
||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
||
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
||
for _, step := range result.PerformanceRampSteps {
|
||
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||
b.WriteString("## Raw Files\n\n")
|
||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||
if result.Interconnect != nil {
|
||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||
// duration is unknown (0), raw seconds are shown instead.
|
||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||
type counter struct {
|
||
label string
|
||
us uint64
|
||
}
|
||
counters := []counter{
|
||
{"sw_power", t.SWPowerCapUS},
|
||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||
{"sync_boost", t.SyncBoostUS},
|
||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||
}
|
||
var parts []string
|
||
for _, c := range counters {
|
||
if c.us == 0 {
|
||
continue
|
||
}
|
||
sec := float64(c.us) / 1e6
|
||
if steadyDurationSec > 0 {
|
||
pct := sec / steadyDurationSec * 100
|
||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||
} else if sec < 1 {
|
||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||
}
|
||
}
|
||
if len(parts) == 0 {
|
||
return "none"
|
||
}
|
||
return strings.Join(parts, " ")
|
||
}
|
||
|
||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||
var best float64
|
||
for i, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||
best = gpu.Scores.CompositeScore
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||
if result.Interconnect != nil {
|
||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||
}
|
||
return b.String()
|
||
}
|