- Replace diag level 1-4 dropdown with Validate/Stress radio buttons - Validate: dcgmi L2, 60s CPU, 256MB/1p memtester, SMART short - Stress: dcgmi L3 + targeted_stress in Run All, 30min CPU, 1GB/3p memtester, SMART long/NVMe extended - Parallel GPU mode: spawn single task for all GPUs instead of splitting per model - Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel - Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts - Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading') Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
345 lines
13 KiB
Go
345 lines
13 KiB
Go
package platform
|
||
|
||
import (
|
||
"fmt"
|
||
"os"
|
||
"path/filepath"
|
||
"regexp"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||
return renderBenchmarkReportWithCharts(result, nil)
|
||
}
|
||
|
||
// benchmarkReportChart is one pre-rendered terminal chart that the report
// renderer embeds as a fenced code block under its own sub-heading.
type benchmarkReportChart struct {
	// Title is printed as the chart's "###" heading.
	Title string
	// Content is the raw chart text as read from disk. The renderer strips
	// ANSI escape sequences and surrounding whitespace before embedding it.
	Content string
}
|
||
|
||
// ansiEscapePattern matches ANSI CSI escape sequences: ESC '[' followed by any
// parameter bytes (0x30-0x3F), any intermediate bytes (0x20-0x2F) and exactly
// one final byte (0x40-0x7E). That covers SGR colour codes such as "\x1b[31m"
// as well as cursor-movement, erase and private-mode sequences ("\x1b[2K",
// "\x1b[?25l"), none of which may leak into the plain-text report.
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-?]*[ -/]*[@-~]`)
|
||
|
||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||
var b strings.Builder
|
||
|
||
// ── Header ────────────────────────────────────────────────────────────────
|
||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||
|
||
// System identity block
|
||
if result.ServerModel != "" {
|
||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||
}
|
||
if result.Hostname != "" {
|
||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||
}
|
||
// GPU models summary
|
||
if len(result.GPUs) > 0 {
|
||
modelCount := make(map[string]int)
|
||
var modelOrder []string
|
||
for _, g := range result.GPUs {
|
||
m := strings.TrimSpace(g.Name)
|
||
if m == "" {
|
||
m = "Unknown GPU"
|
||
}
|
||
if modelCount[m] == 0 {
|
||
modelOrder = append(modelOrder, m)
|
||
}
|
||
modelCount[m]++
|
||
}
|
||
var parts []string
|
||
for _, m := range modelOrder {
|
||
if modelCount[m] == 1 {
|
||
parts = append(parts, m)
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||
}
|
||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||
if result.ParallelGPUs {
|
||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||
}
|
||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||
b.WriteString("\n")
|
||
|
||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||
if len(result.Findings) > 0 {
|
||
b.WriteString("## Executive Summary\n\n")
|
||
for _, finding := range result.Findings {
|
||
fmt.Fprintf(&b, "- %s\n", finding)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
if len(result.Warnings) > 0 {
|
||
b.WriteString("## Warnings\n\n")
|
||
for _, warning := range result.Warnings {
|
||
fmt.Fprintf(&b, "- %s\n", warning)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||
b.WriteString("## Scorecard\n\n")
|
||
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown"
|
||
}
|
||
interconnect := "-"
|
||
if gpu.Scores.InterconnectScore > 0 {
|
||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||
}
|
||
topsPerSM := "-"
|
||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||
}
|
||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||
gpu.Index, name,
|
||
gpu.Status,
|
||
gpu.Scores.CompositeScore,
|
||
gpu.Scores.ComputeScore,
|
||
topsPerSM,
|
||
gpu.Scores.PowerSustainScore,
|
||
gpu.Scores.ThermalSustainScore,
|
||
gpu.Scores.StabilityScore,
|
||
interconnect,
|
||
)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||
b.WriteString("## Per-GPU Details\n\n")
|
||
for _, gpu := range result.GPUs {
|
||
name := strings.TrimSpace(gpu.Name)
|
||
if name == "" {
|
||
name = "Unknown GPU"
|
||
}
|
||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||
|
||
// Identity
|
||
if gpu.BusID != "" {
|
||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||
}
|
||
if gpu.VBIOS != "" {
|
||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||
}
|
||
if gpu.ComputeCapability != "" {
|
||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||
}
|
||
if gpu.MultiprocessorCount > 0 {
|
||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||
}
|
||
if gpu.PowerLimitW > 0 {
|
||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||
}
|
||
if gpu.LockedGraphicsClockMHz > 0 {
|
||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||
}
|
||
b.WriteString("\n")
|
||
|
||
// Steady-state telemetry
|
||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||
b.WriteString("\n")
|
||
|
||
// Throttle
|
||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||
if throttle != "none" {
|
||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||
}
|
||
|
||
// Precision results
|
||
if len(gpu.PrecisionResults) > 0 {
|
||
b.WriteString("**Precision results:**\n\n")
|
||
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||
for _, p := range gpu.PrecisionResults {
|
||
if p.Supported {
|
||
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||
} else {
|
||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||
}
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
|
||
// Degradation / Notes
|
||
if len(gpu.DegradationReasons) > 0 {
|
||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||
}
|
||
if len(gpu.Notes) > 0 {
|
||
b.WriteString("**Notes:**\n\n")
|
||
for _, note := range gpu.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||
if result.Interconnect != nil {
|
||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||
if result.Interconnect.Supported {
|
||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range result.Interconnect.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(result.Interconnect.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||
if sp := result.ServerPower; sp != nil {
|
||
b.WriteString("## Server Power (IPMI)\n\n")
|
||
if !sp.Available {
|
||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||
} else {
|
||
b.WriteString("| | Value |\n|---|---|\n")
|
||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||
if sp.ReportingRatio > 0 {
|
||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
for _, note := range sp.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
if len(sp.Notes) > 0 {
|
||
b.WriteString("\n")
|
||
}
|
||
}
|
||
|
||
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||
if len(charts) > 0 {
|
||
b.WriteString("## Steady-State Charts\n\n")
|
||
for _, chart := range charts {
|
||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||
if content == "" {
|
||
continue
|
||
}
|
||
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||
}
|
||
}
|
||
|
||
// ── Methodology ───────────────────────────────────────────────────────────
|
||
b.WriteString("## Methodology\n\n")
|
||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||
|
||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||
b.WriteString("## Raw Files\n\n")
|
||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||
b.WriteString("- `gpu-*-warmup.log`\n")
|
||
b.WriteString("- `gpu-*-steady.log`\n")
|
||
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||
if result.Interconnect != nil {
|
||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||
// cooldown charts are not useful for human review).
|
||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||
var charts []benchmarkReportChart
|
||
for _, idx := range gpuIndices {
|
||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||
raw, err := os.ReadFile(path)
|
||
if err != nil || len(raw) == 0 {
|
||
continue
|
||
}
|
||
charts = append(charts, benchmarkReportChart{
|
||
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||
Content: string(raw),
|
||
})
|
||
}
|
||
return charts
|
||
}
|
||
|
||
func stripANSIEscapeSequences(raw string) string {
|
||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||
}
|
||
|
||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||
// duration is unknown (0), raw seconds are shown instead.
|
||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||
type counter struct {
|
||
label string
|
||
us uint64
|
||
}
|
||
counters := []counter{
|
||
{"sw_power", t.SWPowerCapUS},
|
||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||
{"sync_boost", t.SyncBoostUS},
|
||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||
}
|
||
var parts []string
|
||
for _, c := range counters {
|
||
if c.us == 0 {
|
||
continue
|
||
}
|
||
sec := float64(c.us) / 1e6
|
||
if steadyDurationSec > 0 {
|
||
pct := sec / steadyDurationSec * 100
|
||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||
} else if sec < 1 {
|
||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||
} else {
|
||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||
}
|
||
}
|
||
if len(parts) == 0 {
|
||
return "none"
|
||
}
|
||
return strings.Join(parts, " ")
|
||
}
|
||
|
||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||
var b strings.Builder
|
||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||
var best float64
|
||
for i, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||
best = gpu.Scores.CompositeScore
|
||
}
|
||
}
|
||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||
if result.Interconnect != nil {
|
||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||
}
|
||
return b.String()
|
||
}
|