- Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
253 lines
8.9 KiB
Go
253 lines
8.9 KiB
Go
package platform
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
|
return renderBenchmarkReportWithCharts(result, nil)
|
|
}
|
|
|
|
// benchmarkReportChart is a titled piece of chart text that can be embedded
// in the text report.
type benchmarkReportChart struct {
	// Title is the heading printed above the chart.
	// Content is the chart text itself.
	Title, Content string
}
|
|
|
|
// ansiEscapePattern matches ANSI escape sequences of the form ESC '[' followed
// by any run of digits and semicolons and a terminating 'm' (the form used for
// colors and text attributes). Escape sequences ending in any other byte are
// not matched.
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
|
|
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
|
fmt.Fprintf(&b, "===========================\n\n")
|
|
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
|
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
|
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
|
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
|
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
|
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
|
|
|
if len(result.Findings) > 0 {
|
|
fmt.Fprintf(&b, "Executive Summary\n")
|
|
fmt.Fprintf(&b, "-----------------\n")
|
|
for _, finding := range result.Findings {
|
|
fmt.Fprintf(&b, "- %s\n", finding)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
if len(result.Warnings) > 0 {
|
|
fmt.Fprintf(&b, "Warnings\n")
|
|
fmt.Fprintf(&b, "--------\n")
|
|
for _, warning := range result.Warnings {
|
|
fmt.Fprintf(&b, "- %s\n", warning)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
|
fmt.Fprintf(&b, "-----------------\n")
|
|
for _, gpu := range result.GPUs {
|
|
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
|
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
|
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
|
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
|
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
|
}
|
|
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
|
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
|
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
|
if gpu.Scores.InterconnectScore > 0 {
|
|
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
|
}
|
|
if len(gpu.DegradationReasons) > 0 {
|
|
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
|
}
|
|
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
|
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
|
if len(gpu.PrecisionResults) > 0 {
|
|
fmt.Fprintf(&b, " Precision results:\n")
|
|
for _, precision := range gpu.PrecisionResults {
|
|
if precision.Supported {
|
|
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
|
} else {
|
|
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
|
}
|
|
}
|
|
}
|
|
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
|
if len(gpu.Notes) > 0 {
|
|
fmt.Fprintf(&b, " Notes:\n")
|
|
for _, note := range gpu.Notes {
|
|
fmt.Fprintf(&b, " - %s\n", note)
|
|
}
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
if result.Interconnect != nil {
|
|
fmt.Fprintf(&b, "Interconnect\n")
|
|
fmt.Fprintf(&b, "------------\n")
|
|
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
|
if result.Interconnect.Supported {
|
|
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
|
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
|
}
|
|
for _, note := range result.Interconnect.Notes {
|
|
fmt.Fprintf(&b, "- %s\n", note)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
if len(charts) > 0 {
|
|
fmt.Fprintf(&b, "Terminal Charts\n")
|
|
fmt.Fprintf(&b, "---------------\n")
|
|
for _, chart := range charts {
|
|
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
|
if content == "" {
|
|
continue
|
|
}
|
|
fmt.Fprintf(&b, "%s\n", chart.Title)
|
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
|
fmt.Fprintf(&b, "%s\n\n", content)
|
|
}
|
|
}
|
|
|
|
if sp := result.ServerPower; sp != nil {
|
|
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
|
fmt.Fprintf(&b, "-------------------\n")
|
|
if !sp.Available {
|
|
fmt.Fprintf(&b, "Unavailable\n")
|
|
} else {
|
|
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
|
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
|
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
|
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
|
if sp.ReportingRatio > 0 {
|
|
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
|
}
|
|
}
|
|
for _, note := range sp.Notes {
|
|
fmt.Fprintf(&b, " Note: %s\n", note)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
fmt.Fprintf(&b, "Methodology\n")
|
|
fmt.Fprintf(&b, "-----------\n")
|
|
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
|
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
|
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
|
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
|
|
|
fmt.Fprintf(&b, "Raw Files\n")
|
|
fmt.Fprintf(&b, "---------\n")
|
|
fmt.Fprintf(&b, "- result.json\n")
|
|
fmt.Fprintf(&b, "- report.txt\n")
|
|
fmt.Fprintf(&b, "- summary.txt\n")
|
|
fmt.Fprintf(&b, "- verbose.log\n")
|
|
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
|
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
|
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
|
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
|
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
|
if result.Interconnect != nil {
|
|
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
|
phases := []struct {
|
|
name string
|
|
label string
|
|
}{
|
|
{name: "baseline", label: "Baseline"},
|
|
{name: "steady", label: "Steady State"},
|
|
{name: "cooldown", label: "Cooldown"},
|
|
}
|
|
var charts []benchmarkReportChart
|
|
for _, idx := range gpuIndices {
|
|
for _, phase := range phases {
|
|
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil || len(raw) == 0 {
|
|
continue
|
|
}
|
|
charts = append(charts, benchmarkReportChart{
|
|
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
|
Content: string(raw),
|
|
})
|
|
}
|
|
}
|
|
return charts
|
|
}
|
|
|
|
func stripANSIEscapeSequences(raw string) string {
|
|
return ansiEscapePattern.ReplaceAllString(raw, "")
|
|
}
|
|
|
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
|
// duration is unknown (0), raw seconds are shown instead.
|
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
|
type counter struct {
|
|
label string
|
|
us uint64
|
|
}
|
|
counters := []counter{
|
|
{"sw_power", t.SWPowerCapUS},
|
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
|
{"sync_boost", t.SyncBoostUS},
|
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
|
}
|
|
var parts []string
|
|
for _, c := range counters {
|
|
if c.us == 0 {
|
|
continue
|
|
}
|
|
sec := float64(c.us) / 1e6
|
|
if steadyDurationSec > 0 {
|
|
pct := sec / steadyDurationSec * 100
|
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
|
} else if sec < 1 {
|
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
|
} else {
|
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
|
}
|
|
}
|
|
if len(parts) == 0 {
|
|
return "none"
|
|
}
|
|
return strings.Join(parts, " ")
|
|
}
|
|
|
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
|
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
|
var best float64
|
|
for i, gpu := range result.GPUs {
|
|
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
|
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
|
if i == 0 || gpu.Scores.CompositeScore > best {
|
|
best = gpu.Scores.CompositeScore
|
|
}
|
|
}
|
|
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
|
if result.Interconnect != nil {
|
|
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
|
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
|
}
|
|
return b.String()
|
|
}
|