From b2f8626fee143bc2877256feecd0933d0b5259ad Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Wed, 8 Apr 2026 00:42:12 +0300 Subject: [PATCH] Refactor validate modes, fix benchmark report and IPMI power - Replace diag level 1-4 dropdown with Validate/Stress radio buttons - Validate: dcgmi L2, 60s CPU, 256MB/1p memtester, SMART short - Stress: dcgmi L3 + targeted_stress in Run All, 30min CPU, 1GB/3p memtester, SMART long/NVMe extended - Parallel GPU mode: spawn single task for all GPUs instead of splitting per model - Benchmark table: per-GPU columns for sequential runs, server-wide column for parallel - Benchmark report converted to Markdown with server model, GPU model, version in header; only steady-state charts - Fix IPMI power parsing in benchmark (was looking for 'Current Power', correct field is 'Instantaneous power reading') Co-Authored-By: Claude Sonnet 4.6 --- audit/cmd/bee/main.go | 4 +- audit/internal/app/app.go | 16 +- audit/internal/app/app_test.go | 4 +- audit/internal/platform/benchmark.go | 18 +- audit/internal/platform/benchmark_report.go | 312 +++++++++++++------- audit/internal/platform/benchmark_test.go | 7 +- audit/internal/platform/sat.go | 28 +- audit/internal/platform/sat_test.go | 4 +- audit/internal/webui/api.go | 67 ++++- audit/internal/webui/server_test.go | 4 +- audit/internal/webui/tasks.go | 30 +- audit/internal/webui/tasks_test.go | 2 +- 12 files changed, 332 insertions(+), 164 deletions(-) diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index c8123ae..1e84410 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int { archive, err = application.RunNvidiaAcceptancePack("", logLine) } case "memory": - archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine) + archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine) case "storage": - archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine) + archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine) case "cpu": dur := *duration if dur <= 0 { diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 8a54063..e93e337 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -124,8 +124,8 @@ type satRunner interface { RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) ResetNvidiaGPU(index int) (string, error) - RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) - RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) + RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) + RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) DetectGPUVendor() string @@ -602,14 +602,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p } func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc) + return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc) } -func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { +func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc) + return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc) } func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { @@ -634,14 +634,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio } func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc) + return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc) } -func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { +func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc) + return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc) } func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 62eb429..b809ce9 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -217,11 +217,11 @@ func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) { return "", nil } -func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) { +func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) { return f.runMemoryFn(baseDir) } -func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) { +func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) { return f.runStorageFn(baseDir) } diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 23f5a31..d25bde4 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -326,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected)) - if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil { - return "", fmt.Errorf("write report.txt: %w", err) + if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil { + return "", fmt.Errorf("write report.md: %w", err) } summary := renderBenchmarkSummary(result) @@ -1183,18 +1183,8 @@ func queryIPMIServerPowerW() (float64, error) { if err != nil { return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) } - for _, line := range strings.Split(string(out), "\n") { - if strings.Contains(line, "Current Power") { - parts := strings.SplitN(line, ":", 2) - if len(parts) == 2 { - val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts")) - val = strings.TrimSpace(val) - w, err := strconv.ParseFloat(val, 64) - if err == nil && w > 0 { - return w, nil - } - } - } + if w := parseDCMIPowerReading(string(out)); w > 0 { + return w, nil } return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") } diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 79c6a49..84c1735 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`) func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string { var b strings.Builder - fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n") - fmt.Fprintf(&b, "===========================\n\n") - fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) - fmt.Fprintf(&b, "Host: %s\n", result.Hostname) - fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile) - fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus) - fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices)) - fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status) + // ── Header ──────────────────────────────────────────────────────────────── + b.WriteString("# Bee NVIDIA Benchmark Report\n\n") + + // System identity block + if result.ServerModel != "" { + fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel) + } + if result.Hostname != "" { + fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname) + } + // GPU models summary + if len(result.GPUs) > 0 { + modelCount := make(map[string]int) + var modelOrder []string + for _, g := range result.GPUs { + m := strings.TrimSpace(g.Name) + if m == "" { + m = "Unknown GPU" + } + if modelCount[m] == 0 { + modelOrder = append(modelOrder, m) + } + modelCount[m]++ + } + var parts []string + for _, m := range modelOrder { + if modelCount[m] == 1 { + parts = append(parts, m) + } else { + parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m)) + } + } + fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", ")) + } + fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) + fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion) + fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) + if result.ParallelGPUs { + fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n") + } + fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) + b.WriteString("\n") + + // ── Executive Summary ───────────────────────────────────────────────────── if len(result.Findings) > 0 { - fmt.Fprintf(&b, "Executive Summary\n") - fmt.Fprintf(&b, "-----------------\n") + b.WriteString("## Executive Summary\n\n") for _, finding := range result.Findings { fmt.Fprintf(&b, "- %s\n", finding) } @@ -41,149 +76,206 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc } if len(result.Warnings) > 0 { - fmt.Fprintf(&b, "Warnings\n") - fmt.Fprintf(&b, "--------\n") + b.WriteString("## Warnings\n\n") for _, warning := range result.Warnings { fmt.Fprintf(&b, "- %s\n", warning) } b.WriteString("\n") } - fmt.Fprintf(&b, "Per GPU Scorecard\n") - fmt.Fprintf(&b, "-----------------\n") + // ── Scorecard table ─────────────────────────────────────────────────────── + b.WriteString("## Scorecard\n\n") + b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n") + b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n") for _, gpu := range result.GPUs { - fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name) - fmt.Fprintf(&b, " Status: %s\n", gpu.Status) - fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore) - fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore) - if gpu.Scores.TOPSPerSMPerGHz > 0 { - fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz) + name := strings.TrimSpace(gpu.Name) + if name == "" { + name = "Unknown" } - fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore) - fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore) - fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore) + interconnect := "-" if gpu.Scores.InterconnectScore > 0 { - fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore) + interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore) } - if len(gpu.DegradationReasons) > 0 { - fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", ")) + topsPerSM := "-" + if gpu.Scores.TOPSPerSMPerGHz > 0 { + topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz) } - fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz) - fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz) - if len(gpu.PrecisionResults) > 0 { - fmt.Fprintf(&b, " Precision results:\n") - for _, precision := range gpu.PrecisionResults { - if precision.Supported { - fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations) - } else { - fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes) - } - } + fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n", + gpu.Index, name, + gpu.Status, + gpu.Scores.CompositeScore, + gpu.Scores.ComputeScore, + topsPerSM, + gpu.Scores.PowerSustainScore, + gpu.Scores.ThermalSustainScore, + gpu.Scores.StabilityScore, + interconnect, + ) + } + b.WriteString("\n") + + // ── Per GPU detail ──────────────────────────────────────────────────────── + b.WriteString("## Per-GPU Details\n\n") + for _, gpu := range result.GPUs { + name := strings.TrimSpace(gpu.Name) + if name == "" { + name = "Unknown GPU" } - fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)) - if len(gpu.Notes) > 0 { - fmt.Fprintf(&b, " Notes:\n") - for _, note := range gpu.Notes { - fmt.Fprintf(&b, " - %s\n", note) - } + fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name) + + // Identity + if gpu.BusID != "" { + fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID) + } + if gpu.VBIOS != "" { + fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS) + } + if gpu.ComputeCapability != "" { + fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability) + } + if gpu.MultiprocessorCount > 0 { + fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount) + } + if gpu.PowerLimitW > 0 { + fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW) + } + if gpu.LockedGraphicsClockMHz > 0 { + fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz) } b.WriteString("\n") + + // Steady-state telemetry + fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec)) + b.WriteString("| | Avg | P95 |\n|---|---|---|\n") + fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW) + fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC) + fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz) + fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz) + fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct) + b.WriteString("\n") + + // Throttle + throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec) + if throttle != "none" { + fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle) + } + + // Precision results + if len(gpu.PrecisionResults) > 0 { + b.WriteString("**Precision results:**\n\n") + b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n") + for _, p := range gpu.PrecisionResults { + if p.Supported { + fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations) + } else { + fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name) + } + } + b.WriteString("\n") + } + + // Degradation / Notes + if len(gpu.DegradationReasons) > 0 { + fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", ")) + } + if len(gpu.Notes) > 0 { + b.WriteString("**Notes:**\n\n") + for _, note := range gpu.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + b.WriteString("\n") + } } + // ── Interconnect ────────────────────────────────────────────────────────── if result.Interconnect != nil { - fmt.Fprintf(&b, "Interconnect\n") - fmt.Fprintf(&b, "------------\n") - fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status) + b.WriteString("## Interconnect (NCCL)\n\n") + fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status) if result.Interconnect.Supported { - fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps) - fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps) + b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n") + fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps) + fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps) + b.WriteString("\n") } for _, note := range result.Interconnect.Notes { fmt.Fprintf(&b, "- %s\n", note) } - b.WriteString("\n") + if len(result.Interconnect.Notes) > 0 { + b.WriteString("\n") + } } + // ── Server Power (IPMI) ─────────────────────────────────────────────────── + if sp := result.ServerPower; sp != nil { + b.WriteString("## Server Power (IPMI)\n\n") + if !sp.Available { + b.WriteString("IPMI power measurement unavailable.\n\n") + } else { + b.WriteString("| | Value |\n|---|---|\n") + fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW) + fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW) + fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW) + fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW) + if sp.ReportingRatio > 0 { + fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio) + } + b.WriteString("\n") + } + for _, note := range sp.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + if len(sp.Notes) > 0 { + b.WriteString("\n") + } + } + + // ── Terminal charts (steady-state only) ─────────────────────────────────── if len(charts) > 0 { - fmt.Fprintf(&b, "Terminal Charts\n") - fmt.Fprintf(&b, "---------------\n") + b.WriteString("## Steady-State Charts\n\n") for _, chart := range charts { content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content)) if content == "" { continue } - fmt.Fprintf(&b, "%s\n", chart.Title) - fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title))) - fmt.Fprintf(&b, "%s\n\n", content) + fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content) } } - if sp := result.ServerPower; sp != nil { - fmt.Fprintf(&b, "Server Power (IPMI)\n") - fmt.Fprintf(&b, "-------------------\n") - if !sp.Available { - fmt.Fprintf(&b, "Unavailable\n") - } else { - fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW) - fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW) - fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW) - fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW) - if sp.ReportingRatio > 0 { - fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio) - } - } - for _, note := range sp.Notes { - fmt.Fprintf(&b, " Note: %s\n", note) - } - b.WriteString("\n") - } + // ── Methodology ─────────────────────────────────────────────────────────── + b.WriteString("## Methodology\n\n") + fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile) + b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n") + b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n") + b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n") - fmt.Fprintf(&b, "Methodology\n") - fmt.Fprintf(&b, "-----------\n") - fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile) - fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n") - fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n") - fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n") - - fmt.Fprintf(&b, "Raw Files\n") - fmt.Fprintf(&b, "---------\n") - fmt.Fprintf(&b, "- result.json\n") - fmt.Fprintf(&b, "- report.txt\n") - fmt.Fprintf(&b, "- summary.txt\n") - fmt.Fprintf(&b, "- verbose.log\n") - fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n") - fmt.Fprintf(&b, "- gpu-*-warmup.log\n") - fmt.Fprintf(&b, "- gpu-*-steady.log\n") - fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n") - fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n") + // ── Raw files ───────────────────────────────────────────────────────────── + b.WriteString("## Raw Files\n\n") + b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") + b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n") + b.WriteString("- `gpu-*-warmup.log`\n") + b.WriteString("- `gpu-*-steady.log`\n") + b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n") + b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n") if result.Interconnect != nil { - fmt.Fprintf(&b, "- nccl-all-reduce.log\n") + b.WriteString("- `nccl-all-reduce.log`\n") } return b.String() } +// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and +// cooldown charts are not useful for human review). func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart { - phases := []struct { - name string - label string - }{ - {name: "baseline", label: "Baseline"}, - {name: "steady", label: "Steady State"}, - {name: "cooldown", label: "Cooldown"}, - } var charts []benchmarkReportChart for _, idx := range gpuIndices { - for _, phase := range phases { - path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name)) - raw, err := os.ReadFile(path) - if err != nil || len(raw) == 0 { - continue - } - charts = append(charts, benchmarkReportChart{ - Title: fmt.Sprintf("GPU %d %s", idx, phase.label), - Content: string(raw), - }) + path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx)) + raw, err := os.ReadFile(path) + if err != nil || len(raw) == 0 { + continue } + charts = append(charts, benchmarkReportChart{ + Title: fmt.Sprintf("GPU %d — Steady State", idx), + Content: string(raw), + }) } return charts } diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 3422e7a..12463c7 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { for _, needle := range []string{ "Executive Summary", "GPU 0 spent measurable time under SW power cap.", - "Composite score: 1176.00", - "fp16_tensor: 700.00 TOPS", + "1176.00", + "fp16_tensor", + "700.00", } { if !strings.Contains(report, needle) { t.Fatalf("report missing %q\n%s", needle, report) @@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) { }) for _, needle := range []string{ - "Terminal Charts", + "Steady-State Charts", "GPU 0 Steady State", "GPU 0 chart", "42┤───", diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 2351418..ca06bc5 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -531,9 +531,13 @@ func memoryStressSizeArg() string { return fmt.Sprintf("%dM", targetMB) } -func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) - passes := envInt("BEE_MEMTESTER_PASSES", 1) +func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) { + if sizeMB <= 0 { + sizeMB = 256 + } + if passes <= 0 { + passes = 1 + } return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, @@ -590,7 +594,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat }, logFunc) } -func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { +func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } @@ -622,7 +626,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l break } prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath)) - commands := storageSATCommands(devPath) + commands := storageSATCommands(devPath, extended) for cmdIndex, job := range commands { if ctx.Err() != nil { break @@ -1086,17 +1090,25 @@ func listStorageDevices() ([]string, error) { return parseStorageDevices(string(out)), nil } -func storageSATCommands(devPath string) []satJob { +func storageSATCommands(devPath string, extended bool) []satJob { if strings.Contains(filepath.Base(devPath), "nvme") { + selfTestLevel := "1" + if extended { + selfTestLevel = "2" + } return []satJob{ {name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}}, {name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}}, - {name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}}, + {name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}}, } } + smartTestType := "short" + if extended { + smartTestType = "long" + } return []satJob{ {name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}}, - {name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}}, + {name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}}, } } diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go index 4186f12..71ae50b 100644 --- a/audit/internal/platform/sat_test.go +++ b/audit/internal/platform/sat_test.go @@ -14,12 +14,12 @@ import ( func TestStorageSATCommands(t *testing.T) { t.Parallel() - nvme := storageSATCommands("/dev/nvme0n1") + nvme := storageSATCommands("/dev/nvme0n1", false) if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" { t.Fatalf("unexpected nvme commands: %#v", nvme) } - sata := storageSATCommands("/dev/sda") + sata := storageSATCommands("/dev/sda", false) if len(sata) != 2 || sata[0].cmd[0] != "smartctl" { t.Fatalf("unexpected sata commands: %#v", sata) } diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 76ab0e3..3d9ff94 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -222,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string { } func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) { - if !shouldSplitHomogeneousNvidiaTarget(target) { + if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs { + // Parallel mode (or non-splittable target): one task for all selected GPUs. + if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) { + // Resolve the selected GPU indices so ExcludeGPUIndices is applied. + gpus, err := apiListNvidiaGPUs(appRef) + if err != nil { + return nil, err + } + resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices) + if err != nil { + return nil, err + } + params.GPUIndices = resolved + params.ExcludeGPUIndices = nil + } t := &Task{ ID: newJobID(idPrefix), Name: baseName, @@ -262,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params return tasks, nil } +// expandSelectedGPUIndices returns the sorted list of selected GPU indices after +// applying include/exclude filters, without splitting by model. +func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) { + indexed := make(map[int]struct{}, len(gpus)) + allIndices := make([]int, 0, len(gpus)) + for _, gpu := range gpus { + indexed[gpu.Index] = struct{}{} + allIndices = append(allIndices, gpu.Index) + } + sort.Ints(allIndices) + + selected := allIndices + if len(include) > 0 { + selected = make([]int, 0, len(include)) + seen := make(map[int]struct{}, len(include)) + for _, idx := range include { + if _, ok := indexed[idx]; !ok { + continue + } + if _, dup := seen[idx]; dup { + continue + } + seen[idx] = struct{}{} + selected = append(selected, idx) + } + sort.Ints(selected) + } + if len(exclude) > 0 { + skip := make(map[int]struct{}, len(exclude)) + for _, idx := range exclude { + skip[idx] = struct{}{} + } + filtered := selected[:0] + for _, idx := range selected { + if _, ok := skip[idx]; ok { + continue + } + filtered = append(filtered, idx) + } + selected = filtered + } + if len(selected) == 0 { + return nil, fmt.Errorf("no NVIDIA GPUs selected") + } + return selected, nil +} + // ── SSE helpers ─────────────────────────────────────────────────────────────── func sseWrite(w http.ResponseWriter, event, data string) bool { @@ -423,7 +484,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { var body struct { Duration int `json:"duration"` - DiagLevel int `json:"diag_level"` + StressMode bool `json:"stress_mode"` GPUIndices []int `json:"gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"` Loader string `json:"loader"` @@ -444,7 +505,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } params := taskParams{ Duration: body.Duration, - DiagLevel: body.DiagLevel, + StressMode: body.StressMode, GPUIndices: body.GPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices, Loader: body.Loader, diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index e0ec1b3..b169ff2 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -693,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { for _, needle := range []string{ `Benchmark Results`, `Composite score by saved benchmark run and GPU.`, - `NVIDIA H100 PCIe / GPU 0`, - `NVIDIA H100 PCIe / GPU 1`, + `GPU #0 — NVIDIA H100 PCIe`, + `GPU #1 — NVIDIA H100 PCIe`, `#1`, wantTime, `1176.25`, diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index 3aa0744..68c430d 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -115,10 +115,11 @@ type Task struct { // taskParams holds optional parameters parsed from the run request. type taskParams struct { Duration int `json:"duration,omitempty"` - DiagLevel int `json:"diag_level,omitempty"` + StressMode bool `json:"stress_mode,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` SizeMB int `json:"size_mb,omitempty"` + Passes int `json:"passes,omitempty"` Loader string `json:"loader,omitempty"` BurnProfile string `json:"burn_profile,omitempty"` BenchmarkProfile string `json:"benchmark_profile,omitempty"` @@ -215,11 +216,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)} const maxTaskHistory = 50 var ( - runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc) + runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) { + return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc) } - runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc) + runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) { + return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc) } runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc) @@ -552,7 +553,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { err = fmt.Errorf("app not configured") break } - diagLevel := t.params.DiagLevel + diagLevel := 2 + if t.params.StressMode { + diagLevel = 3 + } if len(t.params.GPUIndices) > 0 || diagLevel > 0 { result, e := a.RunNvidiaAcceptancePackWithOptions( ctx, "", diagLevel, t.params.GPUIndices, j.append, @@ -658,13 +662,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { err = fmt.Errorf("app not configured") break } - archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append) + sizeMB, passes := 256, 1 + if t.params.StressMode { + sizeMB, passes = 1024, 3 + } + archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append) case "storage": if a == nil { err = fmt.Errorf("app not configured") break } - archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append) + archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append) case "cpu": if a == nil { err = fmt.Errorf("app not configured") @@ -675,7 +683,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { dur = resolveBurnPreset(t.params.BurnProfile).DurationSec } if dur <= 0 { - dur = 60 + if t.params.StressMode { + dur = 1800 + } else { + dur = 60 + } } j.append(fmt.Sprintf("CPU stress duration: %ds", dur)) archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append) diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go index 59fe740..fe37d96 100644 --- a/audit/internal/webui/tasks_test.go +++ b/audit/internal/webui/tasks_test.go @@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) { for _, needle := range []string{ `Benchmark Results`, `Composite score for this benchmark task.`, - `NVIDIA H100 PCIe / GPU 0`, + `GPU #0 — NVIDIA H100 PCIe`, `1176.25`, } { if !strings.Contains(html, needle) {