From 95124d228fd868cff0bbc9849ad416038a5af242 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Tue, 14 Apr 2026 17:33:13 +0300 Subject: [PATCH] Split bee-bench into perf and power workflows --- audit/internal/app/app.go | 6 +- audit/internal/platform/benchmark.go | 426 ++++++++++++++---- audit/internal/platform/benchmark_report.go | 25 +- audit/internal/platform/benchmark_test.go | 37 ++ audit/internal/platform/benchmark_types.go | 8 +- audit/internal/platform/gpu_metrics.go | 17 +- audit/internal/webui/api.go | 241 +++++----- audit/internal/webui/api_test.go | 58 ++- audit/internal/webui/kmsg_watcher.go | 2 +- audit/internal/webui/pages.go | 62 ++- audit/internal/webui/server.go | 3 +- audit/internal/webui/server_test.go | 9 +- audit/internal/webui/task_report.go | 6 +- audit/internal/webui/tasks.go | 30 +- audit/internal/webui/tasks_test.go | 10 +- .../docs/benchmark-clock-calibration.md | 29 ++ iso/builder/bee-gpu-stress.c | 8 + 17 files changed, 718 insertions(+), 259 deletions(-) diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index ca1c94d..b17eda6 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -30,7 +30,9 @@ var ( DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultSATBaseDir = DefaultExportDir + "/bee-sat" - DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark" + DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench" + DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf" + DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power" ) type App struct { @@ -567,7 +569,7 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultBenchmarkBaseDir + baseDir = DefaultBeeBenchPerfDir } return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) } diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 6ad9754..4ddac4c 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -16,7 +16,7 @@ import ( "time" ) -const benchmarkVersion = "1" +const benchmarkVersion = "2" type benchmarkProfileSpec struct { Name string @@ -41,6 +41,15 @@ type benchmarkGPUInfo struct { MultiprocessorCount int } +type benchmarkPowerCalibrationResult struct { + Summary BenchmarkTelemetrySummary + AppliedPowerLimitW float64 + Attempts int + Derated bool + Completed bool + Notes []string +} + type benchmarkBurnProfile struct { name string category string @@ -78,7 +87,36 @@ var ( // to highest power draw so thermal ramp-up is gradual. 
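// fp4 is additionally filtered out at runtime on pre-Blackwell GPUs
// (compute capability below 10.0); see benchmarkSupportedPrecisions.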
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} -func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { +func computeCapabilityCode(raw string) int { + raw = strings.TrimSpace(raw) + if raw == "" { + return 0 + } + parts := strings.SplitN(raw, ".", 2) + major, _ := strconv.Atoi(strings.TrimSpace(parts[0])) + minor := 0 + if len(parts) > 1 { + minor, _ = strconv.Atoi(strings.TrimSpace(parts[1])) + } + return major*10 + minor +} + +func benchmarkSupportedPrecisions(computeCapability string) []string { + cc := computeCapabilityCode(computeCapability) + out := make([]string, 0, len(benchmarkPrecisionPhases)) + for _, prec := range benchmarkPrecisionPhases { + if prec == "fp4" && cc > 0 && cc < 100 { + continue + } + out = append(out, prec) + } + return out +} + +func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { + if len(precisions) == 0 { + precisions = append([]string(nil), benchmarkPrecisionPhases...) + } switch spec.Name { case NvidiaBenchmarkProfileStandard: basePhaseSec = 60 @@ -90,7 +128,7 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string basePhaseSec = 3600 mixedPhaseSec = 14400 default: - totalWeight := len(benchmarkPrecisionPhases) + 5 + totalWeight := len(precisions) + 5 if totalWeight <= 0 { return nil, nil, 0, 0 } @@ -100,9 +138,9 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string } mixedPhaseSec = basePhaseSec * 5 } - planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1) - planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1) - for _, prec := range benchmarkPrecisionPhases { + planLabels = make([]string, 0, len(precisions)+1) + planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1) + for _, prec := range precisions { planLabels = append(planLabels, prec) planPhases = append(planPhases, benchmarkPlannedPhase{ PlanLabel: prec, @@ -127,6 +165,53 @@ func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string { return strings.Join(values, ",") } +func benchmarkPlannedPhaseStatus(raw []byte) (string, string) { + text := strings.ToLower(strings.TrimSpace(string(raw))) + switch { + case text == "": + return "FAILED", "phase produced no output" + case strings.Contains(text, "phase_error="): + if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "cublaslt_profiles=unsupported") { + return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path" + } + return "FAILED", "precision phase failed" + case strings.Contains(text, "status=failed"): + if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") { + return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path" + } + return "FAILED", "precision phase failed" + default: + return "OK", "" + } +} + +func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string { + diff := diffThrottleCounters(before, after) + switch { + case diff.HWThermalSlowdownUS > 0: + return "hw_thermal" + case diff.SWThermalSlowdownUS > 0: + return "sw_thermal" + case diff.HWPowerBrakeSlowdownUS > 0: + return "hw_power_brake" + default: + return "" + } +} + +func setBenchmarkPowerLimit(ctx 
context.Context, verboseLog string, gpuIndex, powerLimitW int) error { + if powerLimitW <= 0 { + return fmt.Errorf("invalid power limit %d", powerLimitW) + } + out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{ + "nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW), + }, nil, nil) + if err != nil { + return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out))) + } + return nil +} + func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { if ctx == nil { ctx = context.Background() @@ -135,7 +220,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv logFunc = func(string) {} } if strings.TrimSpace(baseDir) == "" { - baseDir = "/var/log/bee-benchmark" + baseDir = "/var/log/bee-bench/perf" } spec := resolveBenchmarkProfile(opts.Profile) opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) @@ -149,7 +234,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } ts := time.Now().UTC().Format("20060102-150405") - runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) + runDir := filepath.Join(baseDir, "perf-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", fmt.Errorf("mkdir %s: %w", runDir, err) } @@ -175,6 +260,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) var metricRows []GPUMetricRow + metricTimelineSec := 0.0 gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") // Server power characterization state — populated during per-GPU phases. @@ -215,14 +301,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. - calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc) + calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) + restoreActions = append(restoreActions, powerRestoreActions...) + for _, idx := range selected { + if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 { + result.Warnings = append(result.Warnings, fmt.Sprintf( + "GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.", + idx, calib.AppliedPowerLimitW, + )) + } + } // Start background CPU load sampler — samples every 10s during GPU phases. 
cpuStopCh := make(chan struct{}) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) if opts.ParallelGPUs { - runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog) + runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog) } else { for _, idx := range selected { @@ -242,8 +337,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz } - if w, ok := calibPowerByIndex[idx]; ok && w > 0 { - gpuResult.CalibratedPeakPowerW = w + if calib, ok := calibByIndex[idx]; ok { + gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW + gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC + gpuResult.PowerCalibrationTries = calib.Attempts + gpuResult.PowerLimitDerated = calib.Derated + gpuResult.Notes = append(gpuResult.Notes, calib.Notes...) } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz @@ -255,7 +354,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) } gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) - appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx)) + appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec)) // Sample server idle power once (first GPU only — server state is global). if !serverIdleOK { @@ -274,18 +373,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc) - appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx)) + appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec)) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut) if warmupErr != nil { gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) continue } + warmupParse := parseBenchmarkBurnLog(string(warmupOut)) + if gpuResult.ComputeCapability == "" { + gpuResult.ComputeCapability = warmupParse.ComputeCapability + } // Run synthetic precision phases and the combined steady phase as one // uninterrupted command so the GPU stays hot between windows. 
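	// Snapshot ECC counters before the plan starts so the full-run ECC delta
	// can be computed once the precision and mixed phases complete.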
eccBase, _ := queryECCCounters(idx) - planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { + supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability) + planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string { if label == "mixed" { return fmt.Sprintf("gpu-%d-steady", idx) } @@ -299,24 +403,27 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv "--precision-plan", strings.Join(planLabels, ","), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } - logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) + logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { - appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage) + appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec)) } appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) } - for _, prec := range benchmarkPrecisionPhases { + for _, prec := range supportedPrecisions { stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) phaseRows := phaseRowsByStage[stageName] - if len(phaseRows) == 0 { - continue - } phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, + Status: "OK", Steady: summarizeBenchmarkTelemetry(phaseRows), } + if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" { + phase.Status = status + phase.Notes = note + gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status) + } for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles { if p.Supported { phase.TeraOpsPerSec += p.TeraOpsPerSec @@ -396,13 +503,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) } gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) - appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) + appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec)) } gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) if planErr != nil { gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) + } else if len(gpuResult.PrecisionFailures) > 0 { + gpuResult.Status = "PARTIAL" } else if parseResult.Fallback { gpuResult.Status = "PARTIAL" } else { @@ -929,35 +1038,34 @@ func sampleBenchmarkCoolingSample() benchmarkCoolingSample { } } -func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { +func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow { if len(rows) == 0 { return nil } + stageEnd := offset + durationSec + if 
stageEnd <= offset { + stageEnd = offset + for _, row := range rows { + if row.ElapsedSec+offset > stageEnd { + stageEnd = row.ElapsedSec + offset + } + } + } out := make([]GPUMetricRow, len(rows)) for i, row := range rows { row.Stage = stage row.ElapsedSec += offset + row.StageStartSec = offset + row.StageEndSec = stageEnd out[i] = row } return out } -func benchmarkMetricOffset(rows []GPUMetricRow) float64 { - if len(rows) == 0 { - return 0 - } - var maxElapsed float64 - for _, row := range rows { - if row.ElapsedSec > maxElapsed { - maxElapsed = row.ElapsedSec - } - } - return maxElapsed -} - -func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) { - annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows)) +func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) { + annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec) *allRows = append(*allRows, annotated...) + *cursor += durationSec } func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { @@ -1308,6 +1416,9 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat if normalizationStatus != "full" { reasons = append(reasons, "normalization_partial") } + if gpu.PowerLimitDerated { + reasons = append(reasons, "power_limit_derated") + } if gpu.ECC.Uncorrected > 0 { reasons = append(reasons, "ecc_uncorrected_errors") } @@ -1522,12 +1633,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) case "normalization_partial": findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) + case "power_limit_derated": + findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW)) case "ecc_uncorrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) case "ecc_corrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) } } + if len(gpu.PrecisionFailures) > 0 { + findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", "))) + } if gpu.Backend == "driver-ptx" { findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) } @@ -1896,10 +2012,11 @@ func runNvidiaBenchmarkParallel( spec benchmarkProfileSpec, logFunc func(string), result *NvidiaBenchmarkResult, - calibPowerByIndex map[int]float64, + calibByIndex map[int]benchmarkPowerCalibrationResult, serverIdleW *float64, serverLoadedWSum *float64, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, allMetricRows *[]GPUMetricRow, + metricTimelineSec *float64, gpuBurnLog string, ) { allDevices := joinIndexList(selected) @@ -1920,8 +2037,12 @@ func runNvidiaBenchmarkParallel( r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz } - if w, ok := calibPowerByIndex[idx]; ok && w > 0 { - r.CalibratedPeakPowerW = w + if calib, ok := calibByIndex[idx]; ok { + r.CalibratedPeakPowerW = calib.Summary.P95PowerW + 
r.CalibratedPeakTempC = calib.Summary.P95TempC + r.PowerCalibrationTries = calib.Attempts + r.PowerLimitDerated = calib.Derated + r.Notes = append(r.Notes, calib.Notes...) } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { r.LockedGraphicsClockMHz = norm.GPUClockLockMHz @@ -1941,7 +2062,7 @@ func runNvidiaBenchmarkParallel( perGPU := filterRowsByGPU(baselineRows, idx) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) } - appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline") + appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec)) // Sample server idle power once. if !*serverIdleOK { @@ -1961,13 +2082,25 @@ func runNvidiaBenchmarkParallel( } logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) - appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup") + appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec)) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) if warmupErr != nil { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) } } + warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut)) + supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...) + for _, idx := range selected { + if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" { + if gpuResults[idx].ComputeCapability == "" { + gpuResults[idx].ComputeCapability = pr.ComputeCapability + } + if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) { + supportedPrecisions = ccPrecisions + } + } + } // Run synthetic precision phases and the combined steady phase as one // uninterrupted command so the GPUs stay hot between windows. 
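	// The shared precision plan must be valid for every selected GPU, so the
	// narrowest supported set wins (fp4 drops out if any GPU predates Blackwell).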
@@ -1975,7 +2108,7 @@ func runNvidiaBenchmarkParallel( for _, idx := range selected { eccBase[idx], _ = queryECCCounters(idx) } - planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { + planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string { if label == "mixed" { return "steady" } @@ -1989,30 +2122,30 @@ func runNvidiaBenchmarkParallel( "--precision-plan", strings.Join(planLabels, ","), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } - logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) + logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { - appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage) + appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec)) } appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) } - for _, prec := range benchmarkPrecisionPhases { + for _, prec := range supportedPrecisions { phaseLogName := "gpu-all-steady-" + prec phaseRows := phaseRowsByStage[phaseLogName] - if len(phaseRows) == 0 { - continue - } parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) for _, idx := range selected { perGPU := filterRowsByGPU(phaseRows, idx) - if len(perGPU) == 0 { - continue - } phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, + Status: "OK", Steady: summarizeBenchmarkTelemetry(perGPU), } + if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" { + phase.Status = status + phase.Notes = note + gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status) + } if pr, ok := parseByGPU[idx]; ok { for _, p := range pr.Profiles { if p.Supported { @@ -2113,7 +2246,7 @@ func runNvidiaBenchmarkParallel( perGPU := filterRowsByGPU(cooldownRows, idx) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) } - appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") + appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec)) } // Score and finalize each GPU. @@ -2125,6 +2258,8 @@ func runNvidiaBenchmarkParallel( switch { case planErr != nil: r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) + case len(r.PrecisionFailures) > 0: + r.Status = "PARTIAL" case pr.Fallback: r.Status = "PARTIAL" default: @@ -2299,59 +2434,172 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad { return cl } -// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while -// collecting nvidia-smi power samples in parallel. It returns a map from GPU -// index to p95 observed power (watts), which is used as the reference for -// PowerSustainScore instead of the hardware default limit. -// -// If dcgmi is unavailable or the run fails the function returns an empty map -// and the caller falls back to DefaultPowerLimitW. 
The calibration is skipped -// gracefully — it must never block or fail the main benchmark. +// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches +// throttle counters. If a GPU starts throttling, the current targeted_power run +// is canceled immediately, the power limit is reduced, and a fresh full cycle +// is started again from the beginning. The selected reduced power limit stays +// active for the main benchmark and is restored by the caller afterwards. func runBenchmarkPowerCalibration( ctx context.Context, verboseLog, runDir string, gpuIndices []int, + infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), -) map[int]float64 { +) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 + const derateStepW = 25 + const maxDerateW = 150 - // dcgmi must be present. if _, err := exec.LookPath("dcgmi"); err != nil { logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") - return map[int]float64{} + return map[int]benchmarkPowerCalibrationResult{}, nil } - logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) - - cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) - out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc) - _ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644) - if err != nil { - logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err)) - return map[int]float64{} + canDerate := os.Geteuid() == 0 + if !canDerate { + logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled") } - // Group rows by GPU index and compute p95 power for each. 
- result := make(map[int]float64, len(gpuIndices)) + type calibrationAttemptResult struct { + out []byte + rows []GPUMetricRow + err error + } + + results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) + var restore []benchmarkRestoreAction for _, idx := range gpuIndices { - perGPU := filterRowsByGPU(rows, idx) - if len(perGPU) == 0 { - continue + info := infoByIndex[idx] + originalLimitW := int(math.Round(info.PowerLimitW)) + if originalLimitW <= 0 { + originalLimitW = int(math.Round(info.DefaultPowerLimitW)) } - powers := make([]float64, 0, len(perGPU)) - for _, r := range perGPU { - if r.PowerW > 0 { - powers = append(powers, r.PowerW) + defaultLimitW := int(math.Round(info.DefaultPowerLimitW)) + if defaultLimitW <= 0 { + defaultLimitW = originalLimitW + } + appliedLimitW := originalLimitW + if appliedLimitW <= 0 { + appliedLimitW = defaultLimitW + } + minLimitW := appliedLimitW + switch { + case defaultLimitW > 0: + minLimitW = defaultLimitW - maxDerateW + floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70)) + if minLimitW < floorByRatio { + minLimitW = floorByRatio } + case appliedLimitW > 0: + minLimitW = appliedLimitW - maxDerateW } - if len(powers) == 0 { - continue + if minLimitW < derateStepW { + minLimitW = derateStepW } - p95 := benchmarkPercentile(powers, 95) - if p95 > 0 { - result[idx] = p95 - logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers))) + + calib := benchmarkPowerCalibrationResult{ + AppliedPowerLimitW: float64(appliedLimitW), + } + if canDerate && originalLimitW > 0 { + idxCopy := idx + orig := originalLimitW + restore = append(restore, benchmarkRestoreAction{ + name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy), + fn: func() { + _ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig) + }, + }) + } + + for { + calib.Attempts++ + logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) + + beforeThrottle, _ := queryThrottleCounters(idx) + attemptCtx, cancel := context.WithCancel(ctx) + doneCh := make(chan calibrationAttemptResult, 1) + logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts) + cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx}) + go func() { + out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc) + doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err} + }() + + ticker := time.NewTicker(time.Second) + var ( + attempt calibrationAttemptResult + throttleReason string + ) + attemptLoop: + for { + select { + case attempt = <-doneCh: + break attemptLoop + case <-ticker.C: + afterThrottle, err := queryThrottleCounters(idx) + if err != nil { + continue + } + if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" { + throttleReason = reason + cancel() + } + case <-ctx.Done(): + cancel() + attempt = <-doneCh + break attemptLoop + } + } + ticker.Stop() + cancel() + _ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644) + + perGPU := filterRowsByGPU(attempt.rows, idx) + summary := summarizeBenchmarkTelemetry(perGPU) + if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { + calib.Summary = summary + calib.Completed = true + calib.AppliedPowerLimitW = float64(appliedLimitW) + logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", 
idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) + break + } + + switch { + case throttleReason != "": + calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) + case attempt.err != nil: + calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err)) + logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err)) + default: + calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW)) + } + + if !canDerate || appliedLimitW <= 0 { + break + } + nextLimitW := appliedLimitW - derateStepW + if nextLimitW < minLimitW { + calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW)) + break + } + if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { + calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error()) + logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err)) + break + } + appliedLimitW = nextLimitW + calib.AppliedPowerLimitW = float64(appliedLimitW) + calib.Derated = true + info.PowerLimitW = float64(appliedLimitW) + infoByIndex[idx] = info + calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW)) + } + + if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { + results[idx] = calib } } - return result + return results, restore } diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index b75fa5d..d482c03 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", ")) } fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) - fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion) + fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) if result.RampStep > 0 && result.RampTotal > 0 { fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal) @@ -83,7 +83,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // ── Methodology ─────────────────────────────────────────────────────────── b.WriteString("## Methodology\n\n") - fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile) b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n") b.WriteString("- Thermal and power limits are 
inferred from NVIDIA clock-event counters plus sustained telemetry.\n") b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") @@ -170,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if gpu.PowerLimitW > 0 { fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW) } + if gpu.PowerLimitDerated { + fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries) + } + if gpu.CalibratedPeakPowerW > 0 { + if gpu.CalibratedPeakTempC > 0 { + fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC) + } else { + fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW) + } + } if gpu.LockedGraphicsClockMHz > 0 { fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz) } @@ -188,7 +198,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Per-precision stability phases. if len(gpu.PrecisionSteady) > 0 { b.WriteString("**Per-precision stability:**\n\n") - b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n") + b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n") for _, p := range gpu.PrecisionSteady { eccCorr := "—" eccUncorr := "—" @@ -196,8 +206,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { eccCorr = fmt.Sprintf("%d", p.ECC.Corrected) eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected) } - fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", - p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, + status := p.Status + if strings.TrimSpace(status) == "" { + status = "OK" + } + fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", + p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, eccCorr, eccUncorr) } b.WriteString("\n") @@ -364,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { var b strings.Builder fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) + fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 57219d9..7c0d540 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -46,6 +46,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) { labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480}, + benchmarkPrecisionPhases, func(label string) string { return label }, ) if len(labels) != 7 || len(phases) != 7 { @@ -70,6 +71,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) { _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( 
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600}, + benchmarkPrecisionPhases, func(label string) string { return label }, ) if basePhaseSec != 300 { @@ -88,6 +90,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) { _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000}, + benchmarkPrecisionPhases, func(label string) string { return label }, ) if basePhaseSec != 3600 { @@ -127,6 +130,40 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) { } } +func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) { + t.Parallel() + + if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" { + t.Fatalf("supported=%v", got) + } + if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" { + t.Fatalf("supported=%v", got) + } +} + +func TestBenchmarkPlannedPhaseStatus(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + raw string + wantStatus string + }{ + {name: "ok", raw: "status=OK\n", wantStatus: "OK"}, + {name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"}, + {name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"}, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw)) + if got != tc.wantStatus { + t.Fatalf("status=%q want %q", got, tc.wantStatus) + } + }) + } +} + func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { t.Parallel() diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 2c1544b..080a257 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -104,6 +104,7 @@ type BenchmarkGPUResult struct { Backend string `json:"backend,omitempty"` Status string `json:"status"` PowerLimitW float64 `json:"power_limit_w,omitempty"` + PowerLimitDerated bool `json:"power_limit_derated,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` // CalibratedPeakPowerW is the p95 power measured during a short @@ -111,6 +112,8 @@ type BenchmarkGPUResult struct { // Used as the reference denominator for PowerSustainScore instead of // the hardware default limit, which bee-gpu-burn cannot reach. CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` + CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"` + PowerCalibrationTries int `json:"power_calibration_tries,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` @@ -119,6 +122,7 @@ type BenchmarkGPUResult struct { Baseline BenchmarkTelemetrySummary `json:"baseline"` Steady BenchmarkTelemetrySummary `json:"steady"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` + PrecisionFailures []string `json:"precision_failures,omitempty"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"` // ECC error delta accumulated over the full benchmark (all phases combined). 
@@ -225,13 +229,15 @@ type BenchmarkServerPower struct { // type runs at a time the PowerCVPct here is a genuine stability signal. type BenchmarkPrecisionSteadyPhase struct { Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32" + Status string `json:"status,omitempty"` Steady BenchmarkTelemetrySummary `json:"steady"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` // ECC errors accumulated during this precision phase only. // Non-zero corrected = stress-induced DRAM errors for this kernel type. // Any uncorrected = serious fault triggered by this precision workload. - ECC BenchmarkECCCounters `json:"ecc,omitempty"` + ECC BenchmarkECCCounters `json:"ecc,omitempty"` + Notes string `json:"notes,omitempty"` } type BenchmarkInterconnectResult struct { diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index 6d16393..8d215bc 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -14,6 +14,8 @@ import ( // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. type GPUMetricRow struct { Stage string `json:"stage,omitempty"` + StageStartSec float64 `json:"stage_start_sec,omitempty"` + StageEndSec float64 `json:"stage_end_sec,omitempty"` ElapsedSec float64 `json:"elapsed_sec"` GPUIndex int `json:"index"` TempC float64 `json:"temp_c"` @@ -509,11 +511,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan { if name == "" { name = "run" } + start := row.StageStartSec + end := row.StageEndSec + if end <= start { + start = row.ElapsedSec + end = row.ElapsedSec + } if len(spans) == 0 || spans[len(spans)-1].Name != name { - spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec}) + spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end}) continue } - spans[len(spans)-1].End = row.ElapsedSec + if start < spans[len(spans)-1].Start { + spans[len(spans)-1].Start = start + } + if end > spans[len(spans)-1].End { + spans[len(spans)-1].End = end + } } for i := range spans { if spans[i].End <= spans[i].Start { diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 3216bb6..3ea975e 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) { func shouldSplitHomogeneousNvidiaTarget(target string) bool { switch strings.TrimSpace(target) { - case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", + case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress": return true @@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int { return taskPriorityInstallToRAM case "audit": return taskPriorityAudit - case "nvidia-benchmark": + case "nvidia-bench-perf", "nvidia-bench-power": return taskPriorityBenchmark case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute": return taskPriorityBurn @@ -573,131 +573,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } } -func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { - if h.opts.App == nil { - writeError(w, http.StatusServiceUnavailable, "app not configured") - return - } - - var body 
struct { - Profile string `json:"profile"` - SizeMB int `json:"size_mb"` - GPUIndices []int `json:"gpu_indices"` - ExcludeGPUIndices []int `json:"exclude_gpu_indices"` - RunNCCL *bool `json:"run_nccl"` - ParallelGPUs *bool `json:"parallel_gpus"` - RampUp *bool `json:"ramp_up"` - DisplayName string `json:"display_name"` - } - if r.Body != nil { - if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) { - writeError(w, http.StatusBadRequest, "invalid request body") +func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") return } - } - runNCCL := true - if body.RunNCCL != nil { - runNCCL = *body.RunNCCL - } - parallelGPUs := false - if body.ParallelGPUs != nil { - parallelGPUs = *body.ParallelGPUs - } - rampUp := false - if body.RampUp != nil { - rampUp = *body.RampUp - } - // Build a descriptive base name that includes profile and mode so the task - // list is self-explanatory without opening individual task detail pages. - profile := strings.TrimSpace(body.Profile) - if profile == "" { - profile = "standard" - } - name := taskDisplayName("nvidia-benchmark", "", "") - if strings.TrimSpace(body.DisplayName) != "" { - name = body.DisplayName - } - // Append profile tag. - name = fmt.Sprintf("%s · %s", name, profile) + var body struct { + Profile string `json:"profile"` + SizeMB int `json:"size_mb"` + GPUIndices []int `json:"gpu_indices"` + ExcludeGPUIndices []int `json:"exclude_gpu_indices"` + RunNCCL *bool `json:"run_nccl"` + ParallelGPUs *bool `json:"parallel_gpus"` + RampUp *bool `json:"ramp_up"` + DisplayName string `json:"display_name"` + } + if r.Body != nil { + if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + } - if rampUp && len(body.GPUIndices) > 1 { - // Ramp-up mode: resolve GPU list, then create one task per prefix - // [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. - gpus, err := apiListNvidiaGPUs(h.opts.App) - if err != nil { - writeError(w, http.StatusBadRequest, err.Error()) + runNCCL := true + if body.RunNCCL != nil { + runNCCL = *body.RunNCCL + } + parallelGPUs := false + if body.ParallelGPUs != nil { + parallelGPUs = *body.ParallelGPUs + } + rampUp := false + if body.RampUp != nil { + rampUp = *body.RampUp + } + // Build a descriptive base name that includes profile and mode so the task + // list is self-explanatory without opening individual task detail pages. + profile := strings.TrimSpace(body.Profile) + if profile == "" { + profile = "standard" + } + name := taskDisplayName(target, "", "") + if strings.TrimSpace(body.DisplayName) != "" { + name = body.DisplayName + } + // Append profile tag. + name = fmt.Sprintf("%s · %s", name, profile) + + if target == "nvidia-bench-power" && parallelGPUs { + writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only") return } - resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices) - if err != nil { - writeError(w, http.StatusBadRequest, err.Error()) - return - } - if len(resolved) < 2 { - // Fall through to normal single-task path. 
- rampUp = false - } else { - now := time.Now() - rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) - var allTasks []*Task - for step := 1; step <= len(resolved); step++ { - subset := resolved[:step] - stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) - t := &Task{ - ID: newJobID("benchmark-nvidia"), - Name: stepName, - Target: "nvidia-benchmark", - Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}), - Status: TaskPending, - CreatedAt: now, - params: taskParams{ - GPUIndices: append([]int(nil), subset...), - SizeMB: body.SizeMB, - BenchmarkProfile: body.Profile, - RunNCCL: runNCCL && step == len(resolved), - ParallelGPUs: true, - RampStep: step, - RampTotal: len(resolved), - RampRunID: rampRunID, - DisplayName: stepName, - }, + + if rampUp && len(body.GPUIndices) > 1 { + // Ramp-up mode: resolve GPU list, then create one task per prefix + // [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. + gpus, err := apiListNvidiaGPUs(h.opts.App) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + if len(resolved) < 2 { + // Fall through to normal single-task path. + rampUp = false + } else { + now := time.Now() + rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) + var allTasks []*Task + for step := 1; step <= len(resolved); step++ { + subset := resolved[:step] + stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) + t := &Task{ + ID: newJobID("bee-bench-nvidia"), + Name: stepName, + Target: target, + Priority: defaultTaskPriority(target, taskParams{}), + Status: TaskPending, + CreatedAt: now, + params: taskParams{ + GPUIndices: append([]int(nil), subset...), + SizeMB: body.SizeMB, + BenchmarkProfile: body.Profile, + RunNCCL: runNCCL && step == len(resolved), + ParallelGPUs: true, + RampStep: step, + RampTotal: len(resolved), + RampRunID: rampRunID, + DisplayName: stepName, + }, + } + allTasks = append(allTasks, t) } - allTasks = append(allTasks, t) + for _, t := range allTasks { + globalQueue.enqueue(t) + } + writeTaskRunResponse(w, allTasks) + return } - for _, t := range allTasks { - globalQueue.enqueue(t) - } - writeTaskRunResponse(w, allTasks) + } + + // For non-ramp tasks append mode tag. + if parallelGPUs { + name = fmt.Sprintf("%s · parallel", name) + } else { + name = fmt.Sprintf("%s · sequential", name) + } + + params := taskParams{ + GPUIndices: body.GPUIndices, + ExcludeGPUIndices: body.ExcludeGPUIndices, + SizeMB: body.SizeMB, + BenchmarkProfile: body.Profile, + RunNCCL: runNCCL, + ParallelGPUs: parallelGPUs, + DisplayName: body.DisplayName, + } + tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia") + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) return } + for _, t := range tasks { + globalQueue.enqueue(t) + } + writeTaskRunResponse(w, tasks) } +} - // For non-ramp tasks append mode tag. 
- if parallelGPUs { - name = fmt.Sprintf("%s · parallel", name) - } else { - name = fmt.Sprintf("%s · sequential", name) - } - - params := taskParams{ - GPUIndices: body.GPUIndices, - ExcludeGPUIndices: body.ExcludeGPUIndices, - SizeMB: body.SizeMB, - BenchmarkProfile: body.Profile, - RunNCCL: runNCCL, - ParallelGPUs: parallelGPUs, - DisplayName: body.DisplayName, - } - tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia") - if err != nil { - writeError(w, http.StatusBadRequest, err.Error()) - return - } - for _, t := range tasks { - globalQueue.enqueue(t) - } - writeTaskRunResponse(w, tasks) +func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { + h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r) } func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { diff --git a/audit/internal/webui/api_test.go b/audit/internal/webui/api_test.go index 5a78970..3ac9f90 100644 --- a/audit/internal/webui/api_test.go +++ b/audit/internal/webui/api_test.go @@ -64,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) { t.Cleanup(func() { apiListNvidiaGPUs = prevList }) h := &handler{opts: HandlerOptions{App: &app.App{}}} - req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`)) + req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`)) rec := httptest.NewRecorder() h.handleAPIBenchmarkNvidiaRun(rec, req) @@ -78,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) { t.Fatalf("tasks=%d want 1", len(globalQueue.tasks)) } task := globalQueue.tasks[0] - if task.Target != "nvidia-benchmark" { - t.Fatalf("target=%q want nvidia-benchmark", task.Target) + if task.Target != "nvidia-bench-perf" { + t.Fatalf("target=%q want nvidia-bench-perf", task.Target) } if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 { t.Fatalf("gpu indices=%v want [1 3]", got) @@ -113,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) { t.Cleanup(func() { apiListNvidiaGPUs = prevList }) h := &handler{opts: HandlerOptions{App: &app.App{}}} - req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`)) + req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`)) rec := httptest.NewRecorder() h.handleAPIBenchmarkNvidiaRun(rec, req) @@ -147,6 +147,50 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) { } } +func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) { + globalQueue.mu.Lock() + originalTasks := globalQueue.tasks + globalQueue.tasks = nil + globalQueue.mu.Unlock() + t.Cleanup(func() { + globalQueue.mu.Lock() + globalQueue.tasks = originalTasks + globalQueue.mu.Unlock() + }) + prevList := apiListNvidiaGPUs + apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) { + return []platform.NvidiaGPU{ + {Index: 0, Name: "NVIDIA H100 PCIe"}, + {Index: 1, Name: "NVIDIA H100 PCIe"}, + {Index: 2, Name: "NVIDIA H100 PCIe"}, + }, nil + } + t.Cleanup(func() { apiListNvidiaGPUs = prevList }) + + h := &handler{opts: HandlerOptions{App: 
&app.App{}}} + req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`)) + rec := httptest.NewRecorder() + + h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req) + + if rec.Code != 200 { + t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String()) + } + globalQueue.mu.Lock() + defer globalQueue.mu.Unlock() + if len(globalQueue.tasks) != 3 { + t.Fatalf("tasks=%d want 3", len(globalQueue.tasks)) + } + for i, task := range globalQueue.tasks { + if task.Target != "nvidia-bench-power" { + t.Fatalf("task[%d] target=%q", i, task.Target) + } + if task.Priority != taskPriorityBenchmark { + t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark) + } + } +} + func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { globalQueue.mu.Lock() originalTasks := globalQueue.tasks @@ -202,7 +246,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) { defaultTaskPriority("cpu", taskParams{}), defaultTaskPriority("cpu", taskParams{StressMode: true}), defaultTaskPriority("nvidia-stress", taskParams{}), - defaultTaskPriority("nvidia-benchmark", taskParams{}), + defaultTaskPriority("nvidia-bench-perf", taskParams{}), + defaultTaskPriority("nvidia-bench-power", taskParams{}), } want := []int{ taskPriorityInstallToRAM, @@ -211,13 +256,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) { taskPriorityValidateStress, taskPriorityBurn, taskPriorityBenchmark, + taskPriorityBenchmark, } for i := range want { if got[i] != want[i] { t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i]) } } - if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) { + if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) { t.Fatalf("priority order=%v", got) } } diff --git a/audit/internal/webui/kmsg_watcher.go b/audit/internal/webui/kmsg_watcher.go index d67472b..42201c5 100644 --- a/audit/internal/webui/kmsg_watcher.go +++ b/audit/internal/webui/kmsg_watcher.go @@ -232,7 +232,7 @@ func truncate(s string, max int) string { // isSATTarget returns true for task targets that run hardware acceptance tests. func isSATTarget(target string) bool { switch target { - case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse", + case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "platform-stress": diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 6e24398..b6bc7dd 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1946,7 +1946,7 @@ func renderBenchmark(opts HandlerOptions) string {
-        NVIDIA Benchmark
+        Benchmark Setup
@@ -1979,21 +1979,25 @@ func renderBenchmark(opts HandlerOptions) string { Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)

Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.

-        <button id="benchmark-run-btn" onclick="runNvidiaBenchmark()">Run Benchmark</button>
+        <div>
+          <button id="benchmark-run-performance-btn" onclick="runNvidiaBenchmark('performance')">Run Performance Benchmark</button>
+          <button id="benchmark-run-power-fit-btn" onclick="runNvidiaBenchmark('power-fit')">Run Power / Thermal Fit</button>
+        </div>
+
-        Method
+        Method Split
-        Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.
+        The benchmark page exposes two fundamentally different test families, so the compute score and the server power fit are not mixed into one number.
-          <tr><th>Profile</th><th>Purpose</th></tr>
-          <tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
-          <tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
-          <tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
+          <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
+          <tr><td>Performance Benchmark</td><td>bee-gpu-burn</td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
+          <tr><td>Power / Thermal Fit</td><td>dcgmi targeted_power</td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
+        Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive the server's total score and watts-per-GPU curves.
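Reviewer note: the ramp-up fan-out that the page copy above describes is the core of the power-fit workflow, so a compact sketch may help. This is illustrative Go, not code from this patch — `rampStep` and `buildRampSteps` are hypothetical names; in the real code the handlers populate task params (`RampStep`, `RampTotal`) and enqueue one task per step.

```go
// Illustrative sketch only (hypothetical helper, not part of this patch):
// expand a GPU selection into ramp-up steps, 1 GPU -> 2 GPUs -> ... -> all.
package main

import "fmt"

type rampStep struct {
	GPUIndices []int // GPUs active in this step
	RampStep   int   // 1-based step number
	RampTotal  int   // total number of steps
	RunNCCL    bool  // perf runs include NCCL only on the final multi-GPU step
}

func buildRampSteps(selected []int, perf bool) []rampStep {
	steps := make([]rampStep, 0, len(selected))
	for n := 1; n <= len(selected); n++ {
		steps = append(steps, rampStep{
			GPUIndices: append([]int(nil), selected[:n]...),
			RampStep:   n,
			RampTotal:  len(selected),
			RunNCCL:    perf && n > 1 && n == len(selected),
		})
	}
	return steps
}

func main() {
	for _, s := range buildRampSteps([]int{0, 1, 2}, true) {
		fmt.Printf("step %d/%d gpus=%v nccl=%v\n", s.RampStep, s.RampTotal, s.GPUIndices, s.RunNCCL)
	}
}
```

Watts-per-GPU at step n is then the measured server draw divided by n, which is what makes the 1 → N sequence useful for capacity planning.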
@@ -2036,21 +2040,24 @@ function benchmarkMode() { function benchmarkUpdateSelectionNote() { const selected = benchmarkSelectedGPUIndices(); - const btn = document.getElementById('benchmark-run-btn'); + const perfBtn = document.getElementById('benchmark-run-performance-btn'); + const fitBtn = document.getElementById('benchmark-run-power-fit-btn'); const note = document.getElementById('benchmark-selection-note'); if (!selected.length) { - btn.disabled = true; + perfBtn.disabled = true; + fitBtn.disabled = true; note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.'; return; } - btn.disabled = false; + perfBtn.disabled = false; + fitBtn.disabled = false; const mode = benchmarkMode(); if (mode === 'ramp-up') { - note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.'; + note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.'; } else if (mode === 'parallel') { - note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : ''); + note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.'; } else { - note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : ''); + note.textContent = 'Sequential: each selected GPU benchmarked separately.'; } } @@ -2124,7 +2131,7 @@ function benchmarkSelectNone() { benchmarkUpdateSelectionNote(); } -function runNvidiaBenchmark() { +function runNvidiaBenchmark(kind) { const selected = benchmarkSelectedGPUIndices(); const status = document.getElementById('benchmark-run-status'); if (!selected.length) { @@ -2134,21 +2141,26 @@ function runNvidiaBenchmark() { if (benchmarkES) { benchmarkES.close(); benchmarkES = null; } const mode = benchmarkMode(); const rampUp = mode === 'ramp-up' && selected.length > 1; - const parallelGPUs = mode === 'parallel'; + const parallelGPUs = mode === 'parallel' && kind === 'performance'; + if (kind === 'power-fit' && mode === 'parallel') { + status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.'; + return; + } const body = { profile: document.getElementById('benchmark-profile').value || 'standard', gpu_indices: selected, - run_nccl: selected.length > 1, + run_nccl: kind === 'performance' && selected.length > 1, parallel_gpus: parallelGPUs, ramp_up: rampUp, - display_name: 'NVIDIA Benchmark' + display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark' }; document.getElementById('benchmark-output').style.display = 'block'; - document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']'; + document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']'; const term = document.getElementById('benchmark-terminal'); - term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n'; + term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n'; status.textContent = 'Queueing...'; - fetch('/api/benchmark/nvidia/run', { + const endpoint = kind === 'power-fit' ? 
'/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run'; + fetch(endpoint, { method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify(body) @@ -2202,7 +2214,7 @@ benchmarkLoadGPUs(); func renderBenchmarkResultsCard(exportDir string) string { maxIdx, runs := loadBenchmarkHistory(exportDir) return renderBenchmarkResultsCardFromRuns( - "Benchmark Results", + "Perf Results", "Composite score by saved benchmark run and GPU.", "No saved benchmark runs yet.", maxIdx, @@ -2244,11 +2256,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, } func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { - baseDir := app.DefaultBenchmarkBaseDir + baseDir := app.DefaultBeeBenchPerfDir if strings.TrimSpace(exportDir) != "" { - baseDir = filepath.Join(exportDir, "bee-benchmark") + baseDir = filepath.Join(exportDir, "bee-bench", "perf") } - paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json")) + paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json")) if err != nil || len(paths) == 0 { return -1, nil } diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index 248fc9f..47670ac 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -261,7 +261,8 @@ func NewHandler(opts HandlerOptions) http.Handler { mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress")) mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) - mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun) + mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf")) + mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power")) // Tasks mux.HandleFunc("GET /api/tasks", h.handleAPITasksList) diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index 3bbb57b..e822955 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -648,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) { `href="/benchmark"`, `id="benchmark-gpu-list"`, `/api/gpu/nvidia`, - `/api/benchmark/nvidia/run`, + `/api/bee-bench/nvidia/perf/run`, + `/api/bee-bench/nvidia/power/run`, `benchmark-run-nccl`, + `Run Performance Benchmark`, + `Run Power / Thermal Fit`, } { if !strings.Contains(body, needle) { t.Fatalf("benchmark page missing %q: %s", needle, body) @@ -660,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) { func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { dir := t.TempDir() exportDir := filepath.Join(dir, "export") - runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000") + runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000") if err := os.MkdirAll(runDir, 0755); err != nil { t.Fatal(err) } @@ -702,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { body := rec.Body.String() wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05") for _, needle := range []string{ - `Benchmark Results`, + `Perf Results`, `Composite score by saved benchmark run and GPU.`, `GPU 0`, `GPU 1`, diff --git a/audit/internal/webui/task_report.go b/audit/internal/webui/task_report.go index d13c6bc..8ec87e6 100644 --- a/audit/internal/webui/task_report.go +++ 
b/audit/internal/webui/task_report.go
@@ -251,7 +251,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 }
 
 func renderTaskBenchmarkResultsCard(target, logText string) string {
-	if strings.TrimSpace(target) != "nvidia-benchmark" {
+	switch strings.TrimSpace(target) {
+	case "nvidia-bench-perf":
+	default:
 		return ""
 	}
 	resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +265,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
 		return ""
 	}
 	return renderBenchmarkResultsCardFromRuns(
-		"Benchmark Results",
+		"Perf Results",
 		"Composite score for this benchmark task.",
 		"No benchmark results were saved for this task.",
 		columns,
diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go
index e7e449b..b4665f5 100644
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -32,7 +32,8 @@ const (
 var taskNames = map[string]string{
 	"nvidia":                 "NVIDIA SAT",
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
-	"nvidia-benchmark":       "NVIDIA Benchmark",
+	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
+	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = 300
 		}
 		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
-	case "nvidia-benchmark":
+	case "nvidia-bench-perf":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
@@ -644,6 +645,31 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:  t.params.RampTotal,
 			RampRunID:  t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
+		if planErr != nil {
+			err = planErr
+			break
+		}
+		dur := t.params.Duration
+		if dur <= 0 && t.params.RampTotal > 0 && t.params.RampStep > 0 {
+			dur = rampPlan.DurationSec // ramp steps inherit the plan duration
+		}
+		if dur <= 0 {
+			switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
+			case platform.NvidiaBenchmarkProfileStability:
+				dur = 300
+			case platform.NvidiaBenchmarkProfileOvernight:
+				dur = 600
+			default:
+				dur = 120
+			}
+		}
+		archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go
index 807d079..5f39830 100644
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
 
-	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
 	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
 		t.Fatal(err)
 	}
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	}
 	task := &Task{
 		ID:           "task-bench",
-		Name:         "NVIDIA Benchmark",
-		Target:       "nvidia-benchmark",
+		Name:         "NVIDIA Bee Bench Perf",
Perf", + Target: "nvidia-bench-perf", Status: TaskDone, CreatedAt: time.Now().UTC().Add(-time.Minute), ArtifactsDir: artifactsDir, } ensureTaskReportPaths(task) - logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n" + logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n" if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil { t.Fatal(err) } @@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) { } html := string(body) for _, needle := range []string{ - `Benchmark Results`, + `Perf Results`, `Composite score for this benchmark task.`, `GPU 0`, `1176.25`, diff --git a/bible-local/docs/benchmark-clock-calibration.md b/bible-local/docs/benchmark-clock-calibration.md index 4437467..74ccc40 100644 --- a/bible-local/docs/benchmark-clock-calibration.md +++ b/bible-local/docs/benchmark-clock-calibration.md @@ -1,5 +1,34 @@ # Benchmark clock calibration research +## Benchmark methodology versioning + +Every benchmark methodology change must bump the benchmark version constant in +source code by exactly `+1`. + +Methodology change means any change that affects comparability of benchmark +results, including for example: +- phase durations or phase order +- enabled/disabled precisions +- fallback rules +- normalization rules +- score formulas or weights +- degradation thresholds +- power calibration logic +- thermal/power penalty logic + +Requirements: +- benchmark version must be stored in source code as an explicit version + constant, not inferred from git tag or build metadata +- benchmark report must always print the benchmark version +- `result.json` must always include the benchmark version +- results from different benchmark versions must be treated as non-comparable by + default + +Purpose: +- prevent accidental comparison of runs produced by different methodologies +- make historical benchmark archives self-describing even when detached from git +- force deliberate version bumps whenever scoring or execution semantics change + ## Status In progress. Baseline data from production servers pending. diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index 3bba297..01cba57 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -796,6 +796,9 @@ static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) { if (desc->compute_type == CUBLAS_COMPUTE_32I) { return CUDA_R_32I; } + if (desc->compute_type == CUBLAS_COMPUTE_64F) { + return CUDA_R_64F; + } return CUDA_R_32F; } @@ -1120,6 +1123,8 @@ static int run_cublas_profile(cublasLtHandle_t handle, struct prepared_profile *profile) { int32_t alpha_i32 = 1; int32_t beta_i32 = 0; + double alpha_f64 = 1.0; + double beta_f64 = 0.0; float alpha = 1.0f; float beta = 0.0f; const void *alpha_ptr = α @@ -1127,6 +1132,9 @@ static int run_cublas_profile(cublasLtHandle_t handle, if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) { alpha_ptr = &alpha_i32; beta_ptr = &beta_i32; + } else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) { + alpha_ptr = &alpha_f64; + beta_ptr = &beta_f64; } return check_cublas(profile->desc.name, cublas->cublasLtMatmul(handle,