Refactor validate modes, fix benchmark report and IPMI power

- Replace diag level 1-4 dropdown with Validate/Stress radio buttons
- Validate: dcgmi L2, 60 s CPU, memtester 256 MB × 1 pass, SMART short self-test
- Stress: dcgmi L3 + targeted_stress in Run All, 30 min CPU, memtester 1 GB × 3 passes, SMART long / NVMe extended self-test
- Parallel GPU mode: spawn single task for all GPUs instead of splitting per model
- Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel
- Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts
- Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading')

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 00:42:12 +03:00
parent dd26e03b2d
commit b2f8626fee
12 changed files with 332 additions and 164 deletions

View File

@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
archive, err = application.RunNvidiaAcceptancePack("", logLine) archive, err = application.RunNvidiaAcceptancePack("", logLine)
} }
case "memory": case "memory":
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine) archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
case "storage": case "storage":
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine) archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
case "cpu": case "cpu":
dur := *duration dur := *duration
if dur <= 0 { if dur <= 0 {

View File

@@ -124,8 +124,8 @@ type satRunner interface {
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
ResetNvidiaGPU(index int) (string, error) ResetNvidiaGPU(index int) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
ListNvidiaGPUs() ([]platform.NvidiaGPU, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
DetectGPUVendor() string DetectGPUVendor() string
@@ -602,14 +602,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
} }
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc) return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
} }
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
} }
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc) return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
} }
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
@@ -634,14 +634,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
} }
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc) return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
} }
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
} }
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc) return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
} }
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {

View File

@@ -217,11 +217,11 @@ func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
return "", nil return "", nil
} }
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
return f.runMemoryFn(baseDir) return f.runMemoryFn(baseDir)
} }
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
return f.runStorageFn(baseDir) return f.runStorageFn(baseDir)
} }

View File

@@ -326,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected)) report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil { if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.txt: %w", err) return "", fmt.Errorf("write report.md: %w", err)
} }
summary := renderBenchmarkSummary(result) summary := renderBenchmarkSummary(result)
@@ -1183,18 +1183,8 @@ func queryIPMIServerPowerW() (float64, error) {
if err != nil { if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
} }
for _, line := range strings.Split(string(out), "\n") { if w := parseDCMIPowerReading(string(out)); w > 0 {
if strings.Contains(line, "Current Power") { return w, nil
parts := strings.SplitN(line, ":", 2)
if len(parts) == 2 {
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
val = strings.TrimSpace(val)
w, err := strconv.ParseFloat(val, 64)
if err == nil && w > 0 {
return w, nil
}
}
}
} }
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
} }

View File

@@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string { func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
var b strings.Builder var b strings.Builder
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
fmt.Fprintf(&b, "===========================\n\n")
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
// ── Header ────────────────────────────────────────────────────────────────
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
// System identity block
if result.ServerModel != "" {
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
}
if result.Hostname != "" {
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
}
// GPU models summary
if len(result.GPUs) > 0 {
modelCount := make(map[string]int)
var modelOrder []string
for _, g := range result.GPUs {
m := strings.TrimSpace(g.Name)
if m == "" {
m = "Unknown GPU"
}
if modelCount[m] == 0 {
modelOrder = append(modelOrder, m)
}
modelCount[m]++
}
var parts []string
for _, m := range modelOrder {
if modelCount[m] == 1 {
parts = append(parts, m)
} else {
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
}
}
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
}
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
if result.ParallelGPUs {
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
}
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
b.WriteString("\n")
// ── Executive Summary ─────────────────────────────────────────────────────
if len(result.Findings) > 0 { if len(result.Findings) > 0 {
fmt.Fprintf(&b, "Executive Summary\n") b.WriteString("## Executive Summary\n\n")
fmt.Fprintf(&b, "-----------------\n")
for _, finding := range result.Findings { for _, finding := range result.Findings {
fmt.Fprintf(&b, "- %s\n", finding) fmt.Fprintf(&b, "- %s\n", finding)
} }
@@ -41,149 +76,206 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
} }
if len(result.Warnings) > 0 { if len(result.Warnings) > 0 {
fmt.Fprintf(&b, "Warnings\n") b.WriteString("## Warnings\n\n")
fmt.Fprintf(&b, "--------\n")
for _, warning := range result.Warnings { for _, warning := range result.Warnings {
fmt.Fprintf(&b, "- %s\n", warning) fmt.Fprintf(&b, "- %s\n", warning)
} }
b.WriteString("\n") b.WriteString("\n")
} }
fmt.Fprintf(&b, "Per GPU Scorecard\n") // ── Scorecard table ───────────────────────────────────────────────────────
fmt.Fprintf(&b, "-----------------\n") b.WriteString("## Scorecard\n\n")
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name) name := strings.TrimSpace(gpu.Name)
fmt.Fprintf(&b, " Status: %s\n", gpu.Status) if name == "" {
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore) name = "Unknown"
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
if gpu.Scores.TOPSPerSMPerGHz > 0 {
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
} }
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore) interconnect := "-"
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
if gpu.Scores.InterconnectScore > 0 { if gpu.Scores.InterconnectScore > 0 {
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore) interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
} }
if len(gpu.DegradationReasons) > 0 { topsPerSM := "-"
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", ")) if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
} }
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz) fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz) gpu.Index, name,
if len(gpu.PrecisionResults) > 0 { gpu.Status,
fmt.Fprintf(&b, " Precision results:\n") gpu.Scores.CompositeScore,
for _, precision := range gpu.PrecisionResults { gpu.Scores.ComputeScore,
if precision.Supported { topsPerSM,
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations) gpu.Scores.PowerSustainScore,
} else { gpu.Scores.ThermalSustainScore,
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes) gpu.Scores.StabilityScore,
} interconnect,
} )
}
b.WriteString("\n")
// ── Per GPU detail ────────────────────────────────────────────────────────
b.WriteString("## Per-GPU Details\n\n")
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
name = "Unknown GPU"
} }
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)) fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
if len(gpu.Notes) > 0 {
fmt.Fprintf(&b, " Notes:\n") // Identity
for _, note := range gpu.Notes { if gpu.BusID != "" {
fmt.Fprintf(&b, " - %s\n", note) fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
} }
if gpu.VBIOS != "" {
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
}
if gpu.ComputeCapability != "" {
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
}
if gpu.MultiprocessorCount > 0 {
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
}
if gpu.PowerLimitW > 0 {
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
}
if gpu.LockedGraphicsClockMHz > 0 {
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
} }
b.WriteString("\n") b.WriteString("\n")
// Steady-state telemetry
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
// Throttle
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
if throttle != "none" {
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
}
// Precision results
if len(gpu.PrecisionResults) > 0 {
b.WriteString("**Precision results:**\n\n")
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
for _, p := range gpu.PrecisionResults {
if p.Supported {
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
} else {
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
}
}
b.WriteString("\n")
}
// Degradation / Notes
if len(gpu.DegradationReasons) > 0 {
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
}
if len(gpu.Notes) > 0 {
b.WriteString("**Notes:**\n\n")
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
}
} }
// ── Interconnect ──────────────────────────────────────────────────────────
if result.Interconnect != nil { if result.Interconnect != nil {
fmt.Fprintf(&b, "Interconnect\n") b.WriteString("## Interconnect (NCCL)\n\n")
fmt.Fprintf(&b, "------------\n") fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
if result.Interconnect.Supported { if result.Interconnect.Supported {
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps) b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps) fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
b.WriteString("\n")
} }
for _, note := range result.Interconnect.Notes { for _, note := range result.Interconnect.Notes {
fmt.Fprintf(&b, "- %s\n", note) fmt.Fprintf(&b, "- %s\n", note)
} }
b.WriteString("\n") if len(result.Interconnect.Notes) > 0 {
b.WriteString("\n")
}
} }
// ── Server Power (IPMI) ───────────────────────────────────────────────────
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server Power (IPMI)\n\n")
if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n")
} else {
b.WriteString("| | Value |\n|---|---|\n")
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server delta (load idle) | %.0f W |\n", sp.DeltaW)
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
if sp.ReportingRatio > 0 {
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
}
b.WriteString("\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
if len(sp.Notes) > 0 {
b.WriteString("\n")
}
}
// ── Terminal charts (steady-state only) ───────────────────────────────────
if len(charts) > 0 { if len(charts) > 0 {
fmt.Fprintf(&b, "Terminal Charts\n") b.WriteString("## Steady-State Charts\n\n")
fmt.Fprintf(&b, "---------------\n")
for _, chart := range charts { for _, chart := range charts {
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content)) content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
if content == "" { if content == "" {
continue continue
} }
fmt.Fprintf(&b, "%s\n", chart.Title) fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
fmt.Fprintf(&b, "%s\n\n", content)
} }
} }
if sp := result.ServerPower; sp != nil { // ── Methodology ───────────────────────────────────────────────────────────
fmt.Fprintf(&b, "Server Power (IPMI)\n") b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "-------------------\n") fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
if !sp.Available { b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
fmt.Fprintf(&b, "Unavailable\n") b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
} else { b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
if sp.ReportingRatio > 0 {
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
}
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, " Note: %s\n", note)
}
b.WriteString("\n")
}
fmt.Fprintf(&b, "Methodology\n") // ── Raw files ─────────────────────────────────────────────────────────────
fmt.Fprintf(&b, "-----------\n") b.WriteString("## Raw Files\n\n")
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile) b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n") b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n") b.WriteString("- `gpu-*-warmup.log`\n")
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n") b.WriteString("- `gpu-*-steady.log`\n")
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
fmt.Fprintf(&b, "Raw Files\n") b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
fmt.Fprintf(&b, "---------\n")
fmt.Fprintf(&b, "- result.json\n")
fmt.Fprintf(&b, "- report.txt\n")
fmt.Fprintf(&b, "- summary.txt\n")
fmt.Fprintf(&b, "- verbose.log\n")
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
if result.Interconnect != nil { if result.Interconnect != nil {
fmt.Fprintf(&b, "- nccl-all-reduce.log\n") b.WriteString("- `nccl-all-reduce.log`\n")
} }
return b.String() return b.String()
} }
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
// cooldown charts are not useful for human review).
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart { func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
phases := []struct {
name string
label string
}{
{name: "baseline", label: "Baseline"},
{name: "steady", label: "Steady State"},
{name: "cooldown", label: "Cooldown"},
}
var charts []benchmarkReportChart var charts []benchmarkReportChart
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
for _, phase := range phases { path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name)) raw, err := os.ReadFile(path)
raw, err := os.ReadFile(path) if err != nil || len(raw) == 0 {
if err != nil || len(raw) == 0 { continue
continue
}
charts = append(charts, benchmarkReportChart{
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
Content: string(raw),
})
} }
charts = append(charts, benchmarkReportChart{
Title: fmt.Sprintf("GPU %d — Steady State", idx),
Content: string(raw),
})
} }
return charts return charts
} }

View File

@@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
for _, needle := range []string{ for _, needle := range []string{
"Executive Summary", "Executive Summary",
"GPU 0 spent measurable time under SW power cap.", "GPU 0 spent measurable time under SW power cap.",
"Composite score: 1176.00", "1176.00",
"fp16_tensor: 700.00 TOPS", "fp16_tensor",
"700.00",
} { } {
if !strings.Contains(report, needle) { if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report) t.Fatalf("report missing %q\n%s", needle, report)
@@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
}) })
for _, needle := range []string{ for _, needle := range []string{
"Terminal Charts", "Steady-State Charts",
"GPU 0 Steady State", "GPU 0 Steady State",
"GPU 0 chart", "GPU 0 chart",
"42┤───", "42┤───",

View File

@@ -531,9 +531,13 @@ func memoryStressSizeArg() string {
return fmt.Sprintf("%dM", targetMB) return fmt.Sprintf("%dM", targetMB)
} }
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) if sizeMB <= 0 {
passes := envInt("BEE_MEMTESTER_PASSES", 1) sizeMB = 256
}
if passes <= 0 {
passes = 1
}
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, {name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
@@ -590,7 +594,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
}, logFunc) }, logFunc)
} }
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
if baseDir == "" { if baseDir == "" {
baseDir = "/var/log/bee-sat" baseDir = "/var/log/bee-sat"
} }
@@ -622,7 +626,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
break break
} }
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath)) prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
commands := storageSATCommands(devPath) commands := storageSATCommands(devPath, extended)
for cmdIndex, job := range commands { for cmdIndex, job := range commands {
if ctx.Err() != nil { if ctx.Err() != nil {
break break
@@ -1086,17 +1090,25 @@ func listStorageDevices() ([]string, error) {
return parseStorageDevices(string(out)), nil return parseStorageDevices(string(out)), nil
} }
func storageSATCommands(devPath string) []satJob { func storageSATCommands(devPath string, extended bool) []satJob {
if strings.Contains(filepath.Base(devPath), "nvme") { if strings.Contains(filepath.Base(devPath), "nvme") {
selfTestLevel := "1"
if extended {
selfTestLevel = "2"
}
return []satJob{ return []satJob{
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}}, {name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}}, {name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}}, {name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
} }
} }
smartTestType := "short"
if extended {
smartTestType = "long"
}
return []satJob{ return []satJob{
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}}, {name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}}, {name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
} }
} }

View File

@@ -14,12 +14,12 @@ import (
func TestStorageSATCommands(t *testing.T) { func TestStorageSATCommands(t *testing.T) {
t.Parallel() t.Parallel()
nvme := storageSATCommands("/dev/nvme0n1") nvme := storageSATCommands("/dev/nvme0n1", false)
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" { if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
t.Fatalf("unexpected nvme commands: %#v", nvme) t.Fatalf("unexpected nvme commands: %#v", nvme)
} }
sata := storageSATCommands("/dev/sda") sata := storageSATCommands("/dev/sda", false)
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" { if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
t.Fatalf("unexpected sata commands: %#v", sata) t.Fatalf("unexpected sata commands: %#v", sata)
} }

View File

@@ -222,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string {
} }
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) { func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
if !shouldSplitHomogeneousNvidiaTarget(target) { if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
// Parallel mode (or non-splittable target): one task for all selected GPUs.
if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
gpus, err := apiListNvidiaGPUs(appRef)
if err != nil {
return nil, err
}
resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
if err != nil {
return nil, err
}
params.GPUIndices = resolved
params.ExcludeGPUIndices = nil
}
t := &Task{ t := &Task{
ID: newJobID(idPrefix), ID: newJobID(idPrefix),
Name: baseName, Name: baseName,
@@ -262,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params
return tasks, nil return tasks, nil
} }
// expandSelectedGPUIndices resolves the final, ascending list of GPU indices
// for a single task covering every chosen GPU (no per-model splitting).
//
// When include is empty, every index reported in gpus is a candidate.
// Otherwise only include entries that match a reported GPU are kept, and
// repeated entries are dropped. Indices listed in exclude are then removed.
// An error is returned when no index remains.
func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
	known := make(map[int]struct{}, len(gpus))
	candidates := make([]int, 0, len(gpus))
	for i := range gpus {
		known[gpus[i].Index] = struct{}{}
		candidates = append(candidates, gpus[i].Index)
	}

	if len(include) > 0 {
		// An explicit include list replaces the "all GPUs" default.
		picked := make(map[int]struct{}, len(include))
		candidates = make([]int, 0, len(include))
		for _, want := range include {
			_, exists := known[want]
			_, already := picked[want]
			if !exists || already {
				continue
			}
			picked[want] = struct{}{}
			candidates = append(candidates, want)
		}
	}
	sort.Ints(candidates)

	if len(exclude) > 0 {
		banned := make(map[int]struct{}, len(exclude))
		for _, idx := range exclude {
			banned[idx] = struct{}{}
		}
		// Compact the survivors in place, keeping their sorted order.
		kept := 0
		for _, idx := range candidates {
			if _, drop := banned[idx]; !drop {
				candidates[kept] = idx
				kept++
			}
		}
		candidates = candidates[:kept]
	}

	if len(candidates) == 0 {
		return nil, fmt.Errorf("no NVIDIA GPUs selected")
	}
	return candidates, nil
}
// ── SSE helpers ─────────────────────────────────────────────────────────────── // ── SSE helpers ───────────────────────────────────────────────────────────────
func sseWrite(w http.ResponseWriter, event, data string) bool { func sseWrite(w http.ResponseWriter, event, data string) bool {
@@ -423,7 +484,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
var body struct { var body struct {
Duration int `json:"duration"` Duration int `json:"duration"`
DiagLevel int `json:"diag_level"` StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
Loader string `json:"loader"` Loader string `json:"loader"`
@@ -444,7 +505,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
} }
params := taskParams{ params := taskParams{
Duration: body.Duration, Duration: body.Duration,
DiagLevel: body.DiagLevel, StressMode: body.StressMode,
GPUIndices: body.GPUIndices, GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices,
Loader: body.Loader, Loader: body.Loader,

View File

@@ -693,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Benchmark Results`,
`Composite score by saved benchmark run and GPU.`, `Composite score by saved benchmark run and GPU.`,
`NVIDIA H100 PCIe / GPU 0`, `GPU #0 — NVIDIA H100 PCIe`,
`NVIDIA H100 PCIe / GPU 1`, `GPU #1 — NVIDIA H100 PCIe`,
`#1`, `#1`,
wantTime, wantTime,
`1176.25`, `1176.25`,

View File

@@ -115,10 +115,11 @@ type Task struct {
// taskParams holds optional parameters parsed from the run request. // taskParams holds optional parameters parsed from the run request.
type taskParams struct { type taskParams struct {
Duration int `json:"duration,omitempty"` Duration int `json:"duration,omitempty"`
DiagLevel int `json:"diag_level,omitempty"` StressMode bool `json:"stress_mode,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
SizeMB int `json:"size_mb,omitempty"` SizeMB int `json:"size_mb,omitempty"`
Passes int `json:"passes,omitempty"`
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"` BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"` BenchmarkProfile string `json:"benchmark_profile,omitempty"`
@@ -215,11 +216,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
const maxTaskHistory = 50 const maxTaskHistory = 50
var ( var (
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) { runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc) return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
} }
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) { runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc) return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
} }
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc) return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
@@ -552,7 +553,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
diagLevel := t.params.DiagLevel diagLevel := 2
if t.params.StressMode {
diagLevel = 3
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 { if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions( result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append, ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -658,13 +662,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append) sizeMB, passes := 256, 1
if t.params.StressMode {
sizeMB, passes = 1024, 3
}
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
case "storage": case "storage":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
} }
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append) archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "cpu": case "cpu":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -675,7 +683,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
if dur <= 0 { if dur <= 0 {
dur = 60 if t.params.StressMode {
dur = 1800
} else {
dur = 60
}
} }
j.append(fmt.Sprintf("CPU stress duration: %ds", dur)) j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append) archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)

View File

@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Benchmark Results`,
`Composite score for this benchmark task.`, `Composite score for this benchmark task.`,
`NVIDIA H100 PCIe / GPU 0`, `GPU #0 — NVIDIA H100 PCIe`,
`1176.25`, `1176.25`,
} { } {
if !strings.Contains(html, needle) { if !strings.Contains(html, needle) {