Files
bee/audit/internal/platform/benchmark_report.go
Michael Chus ba16021cdb Fix GPU model propagation, export filenames, PSU/service status, and chart perf
- nvidia.go: add Name field to nvidiaGPUInfo, include model name in
  nvidia-smi query, set dev.Model in enrichPCIeWithNVIDIAData
- pages.go: fix duplicate GPU count in validate card summary (4 GPU: 4 x …
  → 4 x … GPU); fix PSU UNKNOWN fallback from hw.PowerSupplies; treat
  activating/deactivating/reloading service states as OK in Runtime Health
- support_bundle.go: use "150405" time format (no colons) for exFAT compat
- sat.go / benchmark.go / platform_stress.go / sat_fan_stress.go: remove
  .tar.gz archive creation from export dirs — export packs everything itself
- charts_svg.go: add min-max downsampling (1400 pt cap) for SVG chart perf
- benchmark_report.go / sat.go: normalize GPU fallback to "Unknown GPU"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-11 10:05:27 +03:00

345 lines
13 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package platform
import (
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
return renderBenchmarkReportWithCharts(result, nil)
}
type benchmarkReportChart struct {
Title string
Content string
}
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
var b strings.Builder
// ── Header ────────────────────────────────────────────────────────────────
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
// System identity block
if result.ServerModel != "" {
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
}
if result.Hostname != "" {
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
}
// GPU models summary
if len(result.GPUs) > 0 {
modelCount := make(map[string]int)
var modelOrder []string
for _, g := range result.GPUs {
m := strings.TrimSpace(g.Name)
if m == "" {
m = "Unknown GPU"
}
if modelCount[m] == 0 {
modelOrder = append(modelOrder, m)
}
modelCount[m]++
}
var parts []string
for _, m := range modelOrder {
if modelCount[m] == 1 {
parts = append(parts, m)
} else {
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
}
}
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
}
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
if result.ParallelGPUs {
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
}
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
b.WriteString("\n")
// ── Executive Summary ─────────────────────────────────────────────────────
if len(result.Findings) > 0 {
b.WriteString("## Executive Summary\n\n")
for _, finding := range result.Findings {
fmt.Fprintf(&b, "- %s\n", finding)
}
b.WriteString("\n")
}
if len(result.Warnings) > 0 {
b.WriteString("## Warnings\n\n")
for _, warning := range result.Warnings {
fmt.Fprintf(&b, "- %s\n", warning)
}
b.WriteString("\n")
}
// ── Scorecard table ───────────────────────────────────────────────────────
b.WriteString("## Scorecard\n\n")
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
name = "Unknown GPU"
}
interconnect := "-"
if gpu.Scores.InterconnectScore > 0 {
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
}
topsPerSM := "-"
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
}
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
gpu.Index, name,
gpu.Status,
gpu.Scores.CompositeScore,
gpu.Scores.ComputeScore,
topsPerSM,
gpu.Scores.PowerSustainScore,
gpu.Scores.ThermalSustainScore,
gpu.Scores.StabilityScore,
interconnect,
)
}
b.WriteString("\n")
// ── Per GPU detail ────────────────────────────────────────────────────────
b.WriteString("## Per-GPU Details\n\n")
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
name = "Unknown GPU"
}
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
// Identity
if gpu.BusID != "" {
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
}
if gpu.VBIOS != "" {
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
}
if gpu.ComputeCapability != "" {
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
}
if gpu.MultiprocessorCount > 0 {
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
}
if gpu.PowerLimitW > 0 {
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
}
if gpu.LockedGraphicsClockMHz > 0 {
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
}
b.WriteString("\n")
// Steady-state telemetry
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
// Throttle
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
if throttle != "none" {
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
}
// Precision results
if len(gpu.PrecisionResults) > 0 {
b.WriteString("**Precision results:**\n\n")
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
for _, p := range gpu.PrecisionResults {
if p.Supported {
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
} else {
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
}
}
b.WriteString("\n")
}
// Degradation / Notes
if len(gpu.DegradationReasons) > 0 {
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
}
if len(gpu.Notes) > 0 {
b.WriteString("**Notes:**\n\n")
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
}
}
// ── Interconnect ──────────────────────────────────────────────────────────
if result.Interconnect != nil {
b.WriteString("## Interconnect (NCCL)\n\n")
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
if result.Interconnect.Supported {
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
b.WriteString("\n")
}
for _, note := range result.Interconnect.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
if len(result.Interconnect.Notes) > 0 {
b.WriteString("\n")
}
}
// ── Server Power (IPMI) ───────────────────────────────────────────────────
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server Power (IPMI)\n\n")
if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n")
} else {
b.WriteString("| | Value |\n|---|---|\n")
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server delta (load idle) | %.0f W |\n", sp.DeltaW)
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
if sp.ReportingRatio > 0 {
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
}
b.WriteString("\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
if len(sp.Notes) > 0 {
b.WriteString("\n")
}
}
// ── Terminal charts (steady-state only) ───────────────────────────────────
if len(charts) > 0 {
b.WriteString("## Steady-State Charts\n\n")
for _, chart := range charts {
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
if content == "" {
continue
}
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
}
}
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
// ── Raw files ─────────────────────────────────────────────────────────────
b.WriteString("## Raw Files\n\n")
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-warmup.log`\n")
b.WriteString("- `gpu-*-steady.log`\n")
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
if result.Interconnect != nil {
b.WriteString("- `nccl-all-reduce.log`\n")
}
return b.String()
}
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
// cooldown charts are not useful for human review).
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
var charts []benchmarkReportChart
for _, idx := range gpuIndices {
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
raw, err := os.ReadFile(path)
if err != nil || len(raw) == 0 {
continue
}
charts = append(charts, benchmarkReportChart{
Title: fmt.Sprintf("GPU %d — Steady State", idx),
Content: string(raw),
})
}
return charts
}
func stripANSIEscapeSequences(raw string) string {
return ansiEscapePattern.ReplaceAllString(raw, "")
}
// formatThrottleLine renders throttle counters as human-readable percentages of
// the steady-state window. Only non-zero counters are shown. When the steady
// duration is unknown (0), raw seconds are shown instead.
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
type counter struct {
label string
us uint64
}
counters := []counter{
{"sw_power", t.SWPowerCapUS},
{"sw_thermal", t.SWThermalSlowdownUS},
{"sync_boost", t.SyncBoostUS},
{"hw_thermal", t.HWThermalSlowdownUS},
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
}
var parts []string
for _, c := range counters {
if c.us == 0 {
continue
}
sec := float64(c.us) / 1e6
if steadyDurationSec > 0 {
pct := sec / steadyDurationSec * 100
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
} else if sec < 1 {
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
} else {
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
}
}
if len(parts) == 0 {
return "none"
}
return strings.Join(parts, " ")
}
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
var best float64
for i, gpu := range result.GPUs {
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
if i == 0 || gpu.Scores.CompositeScore > best {
best = gpu.Scores.CompositeScore
}
}
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
if result.Interconnect != nil {
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
}
return b.String()
}