Split bee-bench into perf and power workflows

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:33:13 +03:00
parent 54338dbae5
commit 95124d228f
17 changed files with 718 additions and 259 deletions

View File

@@ -30,7 +30,9 @@ var (
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat" DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark" DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
) )
type App struct { type App struct {
@@ -567,7 +569,7 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBenchmarkBaseDir baseDir = DefaultBeeBenchPerfDir
} }
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }

View File

@@ -16,7 +16,7 @@ import (
"time" "time"
) )
const benchmarkVersion = "1" const benchmarkVersion = "2"
type benchmarkProfileSpec struct { type benchmarkProfileSpec struct {
Name string Name string
@@ -41,6 +41,15 @@ type benchmarkGPUInfo struct {
MultiprocessorCount int MultiprocessorCount int
} }
// benchmarkPowerCalibrationResult captures the outcome of the adaptive
// targeted_power calibration for a single GPU.
type benchmarkPowerCalibrationResult struct {
	Summary BenchmarkTelemetrySummary // telemetry summarized over the calibration window
	AppliedPowerLimitW float64 // power limit (W) left active for the main benchmark; presumably 0 when the limit was untouched — confirm with caller
	Attempts int // number of calibration cycles that were run
	Derated bool // true when the power limit was reduced below the default budget
	Completed bool // true when a calibration cycle finished
	Notes []string // human-readable notes appended to the per-GPU result
}
type benchmarkBurnProfile struct { type benchmarkBurnProfile struct {
name string name string
category string category string
@@ -78,7 +87,36 @@ var (
// to highest power draw so thermal ramp-up is gradual. // to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { func computeCapabilityCode(raw string) int {
raw = strings.TrimSpace(raw)
if raw == "" {
return 0
}
parts := strings.SplitN(raw, ".", 2)
major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
minor := 0
if len(parts) > 1 {
minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
}
return major*10 + minor
}
// benchmarkSupportedPrecisions returns the ordered list of precision phases
// this GPU can run, derived from its compute capability string. fp4 is dropped
// when the capability is known and below 10.0 (NOTE(review): presumably fp4
// hardware only exists on CC >= 10.0 parts — confirm). An unknown/empty
// capability (code 0) keeps every phase.
func benchmarkSupportedPrecisions(computeCapability string) []string {
	code := computeCapabilityCode(computeCapability)
	skipFP4 := code > 0 && code < 100
	supported := make([]string, 0, len(benchmarkPrecisionPhases))
	for _, precision := range benchmarkPrecisionPhases {
		if skipFP4 && precision == "fp4" {
			continue
		}
		supported = append(supported, precision)
	}
	return supported
}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
if len(precisions) == 0 {
precisions = append([]string(nil), benchmarkPrecisionPhases...)
}
switch spec.Name { switch spec.Name {
case NvidiaBenchmarkProfileStandard: case NvidiaBenchmarkProfileStandard:
basePhaseSec = 60 basePhaseSec = 60
@@ -90,7 +128,7 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
basePhaseSec = 3600 basePhaseSec = 3600
mixedPhaseSec = 14400 mixedPhaseSec = 14400
default: default:
totalWeight := len(benchmarkPrecisionPhases) + 5 totalWeight := len(precisions) + 5
if totalWeight <= 0 { if totalWeight <= 0 {
return nil, nil, 0, 0 return nil, nil, 0, 0
} }
@@ -100,9 +138,9 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
} }
mixedPhaseSec = basePhaseSec * 5 mixedPhaseSec = basePhaseSec * 5
} }
planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1) planLabels = make([]string, 0, len(precisions)+1)
planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1) planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
for _, prec := range benchmarkPrecisionPhases { for _, prec := range precisions {
planLabels = append(planLabels, prec) planLabels = append(planLabels, prec)
planPhases = append(planPhases, benchmarkPlannedPhase{ planPhases = append(planPhases, benchmarkPlannedPhase{
PlanLabel: prec, PlanLabel: prec,
@@ -127,6 +165,53 @@ func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
return strings.Join(values, ",") return strings.Join(values, ",")
} }
// benchmarkPlannedPhaseStatus classifies the raw log output of one planned
// precision phase. It returns a status string ("OK", "FAILED", or
// "UNSUPPORTED") and a short human-readable note; the note is empty when the
// phase succeeded. Matching is case-insensitive.
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	if text == "" {
		return "FAILED", "phase produced no output"
	}
	// Both failure markers share the same "unsupported" refinement, so the
	// check is computed once instead of being duplicated per branch. (The
	// former extra match on "cublaslt_profiles=unsupported" was redundant:
	// any text containing it already contains "unsupported".)
	unsupported := strings.Contains(text, "unsupported") || strings.Contains(text, "not supported")
	switch {
	case strings.Contains(text, "phase_error="), strings.Contains(text, "status=failed"):
		if unsupported {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	default:
		return "OK", ""
	}
}
// benchmarkCalibrationThrottleReason inspects the throttle-counter delta over a
// calibration window and names the first throttle class that fired, checked in
// priority order: hardware thermal, software thermal, hardware power brake.
// An empty string means none of those counters advanced.
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
	delta := diffThrottleCounters(before, after)
	if delta.HWThermalSlowdownUS > 0 {
		return "hw_thermal"
	}
	if delta.SWThermalSlowdownUS > 0 {
		return "sw_thermal"
	}
	if delta.HWPowerBrakeSlowdownUS > 0 {
		return "hw_power_brake"
	}
	return ""
}
// setBenchmarkPowerLimit applies a software power limit (in watts) to a single
// GPU via `nvidia-smi -pl`. Non-positive limits are rejected up front. On
// failure, the tool's trimmed output is folded into the returned error so
// callers can surface nvidia-smi's own diagnostics.
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
	if powerLimitW <= 0 {
		return fmt.Errorf("invalid power limit %d", powerLimitW)
	}
	label := fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW)
	args := []string{"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW)}
	out, err := runSATCommandCtx(ctx, verboseLog, label, args, nil, nil)
	if err != nil {
		return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
	}
	return nil
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
@@ -135,7 +220,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc = func(string) {} logFunc = func(string) {}
} }
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-benchmark" baseDir = "/var/log/bee-bench/perf"
} }
spec := resolveBenchmarkProfile(opts.Profile) spec := resolveBenchmarkProfile(opts.Profile)
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
@@ -149,7 +234,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
ts := time.Now().UTC().Format("20060102-150405") ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) runDir := filepath.Join(baseDir, "perf-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil { if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err) return "", fmt.Errorf("mkdir %s: %w", runDir, err)
} }
@@ -175,6 +260,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow var metricRows []GPUMetricRow
metricTimelineSec := 0.0
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases. // Server power characterization state — populated during per-GPU phases.
@@ -215,14 +301,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc) calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
restoreActions = append(restoreActions, powerRestoreActions...)
for _, idx := range selected {
if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
result.Warnings = append(result.Warnings, fmt.Sprintf(
"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
idx, calib.AppliedPowerLimitW,
))
}
}
// Start background CPU load sampler — samples every 10s during GPU phases. // Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{}) cpuStopCh := make(chan struct{})
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs { if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog) runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
} else { } else {
for _, idx := range selected { for _, idx := range selected {
@@ -242,8 +337,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if calib, ok := calibByIndex[idx]; ok {
gpuResult.CalibratedPeakPowerW = w gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
gpuResult.PowerCalibrationTries = calib.Attempts
gpuResult.PowerLimitDerated = calib.Derated
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -255,7 +354,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
} }
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx)) appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once (first GPU only — server state is global). // Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK { if !serverIdleOK {
@@ -274,18 +373,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx)) appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil { if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue continue
} }
warmupParse := parseBenchmarkBurnLog(string(warmupOut))
if gpuResult.ComputeCapability == "" {
gpuResult.ComputeCapability = warmupParse.ComputeCapability
}
// Run synthetic precision phases and the combined steady phase as one // Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPU stays hot between windows. // uninterrupted command so the GPU stays hot between windows.
eccBase, _ := queryECCCounters(idx) eccBase, _ := queryECCCounters(idx)
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" { if label == "mixed" {
return fmt.Sprintf("gpu-%d-steady", idx) return fmt.Sprintf("gpu-%d-steady", idx)
} }
@@ -299,24 +403,27 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
"--precision-plan", strings.Join(planLabels, ","), "--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
} }
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
for _, phaseSpec := range planPhases { for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage) appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
} }
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
} }
for _, prec := range benchmarkPrecisionPhases { for _, prec := range supportedPrecisions {
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
phaseRows := phaseRowsByStage[stageName] phaseRows := phaseRowsByStage[stageName]
if len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(phaseRows), Steady: summarizeBenchmarkTelemetry(phaseRows),
} }
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
}
for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles { for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
if p.Supported { if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec phase.TeraOpsPerSec += p.TeraOpsPerSec
@@ -396,13 +503,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
} }
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
} }
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if planErr != nil { if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
gpuResult.Status = "PARTIAL"
} else if parseResult.Fallback { } else if parseResult.Fallback {
gpuResult.Status = "PARTIAL" gpuResult.Status = "PARTIAL"
} else { } else {
@@ -929,35 +1038,34 @@ func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
} }
} }
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
if len(rows) == 0 { if len(rows) == 0 {
return nil return nil
} }
stageEnd := offset + durationSec
if stageEnd <= offset {
stageEnd = offset
for _, row := range rows {
if row.ElapsedSec+offset > stageEnd {
stageEnd = row.ElapsedSec + offset
}
}
}
out := make([]GPUMetricRow, len(rows)) out := make([]GPUMetricRow, len(rows))
for i, row := range rows { for i, row := range rows {
row.Stage = stage row.Stage = stage
row.ElapsedSec += offset row.ElapsedSec += offset
row.StageStartSec = offset
row.StageEndSec = stageEnd
out[i] = row out[i] = row
} }
return out return out
} }
func benchmarkMetricOffset(rows []GPUMetricRow) float64 { func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
if len(rows) == 0 { annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
return 0
}
var maxElapsed float64
for _, row := range rows {
if row.ElapsedSec > maxElapsed {
maxElapsed = row.ElapsedSec
}
}
return maxElapsed
}
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows))
*allRows = append(*allRows, annotated...) *allRows = append(*allRows, annotated...)
*cursor += durationSec
} }
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
@@ -1308,6 +1416,9 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat
if normalizationStatus != "full" { if normalizationStatus != "full" {
reasons = append(reasons, "normalization_partial") reasons = append(reasons, "normalization_partial")
} }
if gpu.PowerLimitDerated {
reasons = append(reasons, "power_limit_derated")
}
if gpu.ECC.Uncorrected > 0 { if gpu.ECC.Uncorrected > 0 {
reasons = append(reasons, "ecc_uncorrected_errors") reasons = append(reasons, "ecc_uncorrected_errors")
} }
@@ -1522,12 +1633,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
case "normalization_partial": case "normalization_partial":
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
case "power_limit_derated":
findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
case "ecc_uncorrected_errors": case "ecc_uncorrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
case "ecc_corrected_errors": case "ecc_corrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
} }
} }
if len(gpu.PrecisionFailures) > 0 {
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
}
if gpu.Backend == "driver-ptx" { if gpu.Backend == "driver-ptx" {
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
} }
@@ -1896,10 +2012,11 @@ func runNvidiaBenchmarkParallel(
spec benchmarkProfileSpec, spec benchmarkProfileSpec,
logFunc func(string), logFunc func(string),
result *NvidiaBenchmarkResult, result *NvidiaBenchmarkResult,
calibPowerByIndex map[int]float64, calibByIndex map[int]benchmarkPowerCalibrationResult,
serverIdleW *float64, serverLoadedWSum *float64, serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow, allMetricRows *[]GPUMetricRow,
metricTimelineSec *float64,
gpuBurnLog string, gpuBurnLog string,
) { ) {
allDevices := joinIndexList(selected) allDevices := joinIndexList(selected)
@@ -1920,8 +2037,12 @@ func runNvidiaBenchmarkParallel(
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if calib, ok := calibByIndex[idx]; ok {
r.CalibratedPeakPowerW = w r.CalibratedPeakPowerW = calib.Summary.P95PowerW
r.CalibratedPeakTempC = calib.Summary.P95TempC
r.PowerCalibrationTries = calib.Attempts
r.PowerLimitDerated = calib.Derated
r.Notes = append(r.Notes, calib.Notes...)
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1941,7 +2062,7 @@ func runNvidiaBenchmarkParallel(
perGPU := filterRowsByGPU(baselineRows, idx) perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
} }
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline") appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once. // Sample server idle power once.
if !*serverIdleOK { if !*serverIdleOK {
@@ -1961,13 +2082,25 @@ func runNvidiaBenchmarkParallel(
} }
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup") appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
if warmupErr != nil { if warmupErr != nil {
for _, idx := range selected { for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
} }
} }
warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
for _, idx := range selected {
if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
if gpuResults[idx].ComputeCapability == "" {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
}
if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
supportedPrecisions = ccPrecisions
}
}
}
// Run synthetic precision phases and the combined steady phase as one // Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPUs stay hot between windows. // uninterrupted command so the GPUs stay hot between windows.
@@ -1975,7 +2108,7 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx) eccBase[idx], _ = queryECCCounters(idx)
} }
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" { if label == "mixed" {
return "steady" return "steady"
} }
@@ -1989,30 +2122,30 @@ func runNvidiaBenchmarkParallel(
"--precision-plan", strings.Join(planLabels, ","), "--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
} }
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
for _, phaseSpec := range planPhases { for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage) appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
} }
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
} }
for _, prec := range benchmarkPrecisionPhases { for _, prec := range supportedPrecisions {
phaseLogName := "gpu-all-steady-" + prec phaseLogName := "gpu-all-steady-" + prec
phaseRows := phaseRowsByStage[phaseLogName] phaseRows := phaseRowsByStage[phaseLogName]
if len(phaseRows) == 0 {
continue
}
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx) perGPU := filterRowsByGPU(phaseRows, idx)
if len(perGPU) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(perGPU), Steady: summarizeBenchmarkTelemetry(perGPU),
} }
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
}
if pr, ok := parseByGPU[idx]; ok { if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles { for _, p := range pr.Profiles {
if p.Supported { if p.Supported {
@@ -2113,7 +2246,7 @@ func runNvidiaBenchmarkParallel(
perGPU := filterRowsByGPU(cooldownRows, idx) perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
} }
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
} }
// Score and finalize each GPU. // Score and finalize each GPU.
@@ -2125,6 +2258,8 @@ func runNvidiaBenchmarkParallel(
switch { switch {
case planErr != nil: case planErr != nil:
r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
case len(r.PrecisionFailures) > 0:
r.Status = "PARTIAL"
case pr.Fallback: case pr.Fallback:
r.Status = "PARTIAL" r.Status = "PARTIAL"
default: default:
@@ -2299,59 +2434,172 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl return cl
} }
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while // runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// collecting nvidia-smi power samples in parallel. It returns a map from GPU // throttle counters. If a GPU starts throttling, the current targeted_power run
// index to p95 observed power (watts), which is used as the reference for // is canceled immediately, the power limit is reduced, and a fresh full cycle
// PowerSustainScore instead of the hardware default limit. // is started again from the beginning. The selected reduced power limit stays
// // active for the main benchmark and is restored by the caller afterwards.
// If dcgmi is unavailable or the run fails the function returns an empty map
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
// gracefully — it must never block or fail the main benchmark.
func runBenchmarkPowerCalibration( func runBenchmarkPowerCalibration(
ctx context.Context, ctx context.Context,
verboseLog, runDir string, verboseLog, runDir string,
gpuIndices []int, gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string), logFunc func(string),
) map[int]float64 { ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120 const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// dcgmi must be present.
if _, err := exec.LookPath("dcgmi"); err != nil { if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]float64{} return map[int]benchmarkPowerCalibrationResult{}, nil
} }
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) canDerate := os.Geteuid() == 0
if !canDerate {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
if err != nil {
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
return map[int]float64{}
} }
// Group rows by GPU index and compute p95 power for each. type calibrationAttemptResult struct {
result := make(map[int]float64, len(gpuIndices)) out []byte
rows []GPUMetricRow
err error
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
perGPU := filterRowsByGPU(rows, idx) info := infoByIndex[idx]
if len(perGPU) == 0 { originalLimitW := int(math.Round(info.PowerLimitW))
continue if originalLimitW <= 0 {
originalLimitW = int(math.Round(info.DefaultPowerLimitW))
} }
powers := make([]float64, 0, len(perGPU)) defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
for _, r := range perGPU { if defaultLimitW <= 0 {
if r.PowerW > 0 { defaultLimitW = originalLimitW
powers = append(powers, r.PowerW) }
appliedLimitW := originalLimitW
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
} }
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
} }
if len(powers) == 0 { if minLimitW < derateStepW {
continue minLimitW = derateStepW
} }
p95 := benchmarkPercentile(powers, 95)
if p95 > 0 { calib := benchmarkPowerCalibrationResult{
result[idx] = p95 AppliedPowerLimitW: float64(appliedLimitW),
logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers))) }
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
restore = append(restore, benchmarkRestoreAction{
name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
fn: func() {
_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
},
})
}
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
beforeThrottle, _ := queryThrottleCounters(idx)
attemptCtx, cancel := context.WithCancel(ctx)
doneCh := make(chan calibrationAttemptResult, 1)
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
if err != nil {
continue
}
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
throttleReason = reason
cancel()
}
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
}
}
ticker.Stop()
cancel()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
break
}
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
}
if !canDerate || appliedLimitW <= 0 {
break
}
nextLimitW := appliedLimitW - derateStepW
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
break
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = true
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
results[idx] = calib
} }
} }
return result return results, restore
} }

View File

@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", ")) fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
} }
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
if result.RampStep > 0 && result.RampTotal > 0 { if result.RampStep > 0 && result.RampTotal > 0 {
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal) fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
@@ -83,7 +83,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// ── Methodology ─────────────────────────────────────────────────────────── // ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n") b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile) fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n") b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n") b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
@@ -170,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if gpu.PowerLimitW > 0 { if gpu.PowerLimitW > 0 {
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW) fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
} }
if gpu.PowerLimitDerated {
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
}
if gpu.CalibratedPeakPowerW > 0 {
if gpu.CalibratedPeakTempC > 0 {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
} else {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
}
}
if gpu.LockedGraphicsClockMHz > 0 { if gpu.LockedGraphicsClockMHz > 0 {
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz) fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
} }
@@ -188,7 +198,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Per-precision stability phases. // Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 { if len(gpu.PrecisionSteady) > 0 {
b.WriteString("**Per-precision stability:**\n\n") b.WriteString("**Per-precision stability:**\n\n")
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n") b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
for _, p := range gpu.PrecisionSteady { for _, p := range gpu.PrecisionSteady {
eccCorr := "—" eccCorr := "—"
eccUncorr := "—" eccUncorr := "—"
@@ -196,8 +206,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected) eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected) eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
} }
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", status := p.Status
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, if strings.TrimSpace(status) == "" {
status = "OK"
}
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
eccCorr, eccUncorr) eccCorr, eccUncorr)
} }
b.WriteString("\n") b.WriteString("\n")
@@ -364,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
var b strings.Builder var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))

View File

@@ -46,6 +46,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if len(labels) != 7 || len(phases) != 7 { if len(labels) != 7 || len(phases) != 7 {
@@ -70,6 +71,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if basePhaseSec != 300 { if basePhaseSec != 300 {
@@ -88,6 +90,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if basePhaseSec != 3600 { if basePhaseSec != 3600 {
@@ -127,6 +130,40 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
} }
} }
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
t.Parallel()
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
t.Fatalf("supported=%v", got)
}
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
t.Fatalf("supported=%v", got)
}
}
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
t.Parallel()
cases := []struct {
name string
raw string
wantStatus string
}{
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
if got != tc.wantStatus {
t.Fatalf("status=%q want %q", got, tc.wantStatus)
}
})
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -104,6 +104,7 @@ type BenchmarkGPUResult struct {
Backend string `json:"backend,omitempty"` Backend string `json:"backend,omitempty"`
Status string `json:"status"` Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"` PowerLimitW float64 `json:"power_limit_w,omitempty"`
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short // CalibratedPeakPowerW is the p95 power measured during a short
@@ -111,6 +112,8 @@ type BenchmarkGPUResult struct {
// Used as the reference denominator for PowerSustainScore instead of // Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach. // the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
@@ -119,6 +122,7 @@ type BenchmarkGPUResult struct {
Baseline BenchmarkTelemetrySummary `json:"baseline"` Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
PrecisionFailures []string `json:"precision_failures,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined). // ECC error delta accumulated over the full benchmark (all phases combined).
@@ -225,13 +229,15 @@ type BenchmarkServerPower struct {
// type runs at a time the PowerCVPct here is a genuine stability signal. // type runs at a time the PowerCVPct here is a genuine stability signal.
type BenchmarkPrecisionSteadyPhase struct { type BenchmarkPrecisionSteadyPhase struct {
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32" Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
Status string `json:"status,omitempty"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
// ECC errors accumulated during this precision phase only. // ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type. // Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload. // Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
Notes string `json:"notes,omitempty"`
} }
type BenchmarkInterconnectResult struct { type BenchmarkInterconnectResult struct {

View File

@@ -14,6 +14,8 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct { type GPUMetricRow struct {
Stage string `json:"stage,omitempty"` Stage string `json:"stage,omitempty"`
StageStartSec float64 `json:"stage_start_sec,omitempty"`
StageEndSec float64 `json:"stage_end_sec,omitempty"`
ElapsedSec float64 `json:"elapsed_sec"` ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"` GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"` TempC float64 `json:"temp_c"`
@@ -509,11 +511,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
if name == "" { if name == "" {
name = "run" name = "run"
} }
start := row.StageStartSec
end := row.StageEndSec
if end <= start {
start = row.ElapsedSec
end = row.ElapsedSec
}
if len(spans) == 0 || spans[len(spans)-1].Name != name { if len(spans) == 0 || spans[len(spans)-1].Name != name {
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec}) spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
continue continue
} }
spans[len(spans)-1].End = row.ElapsedSec if start < spans[len(spans)-1].Start {
spans[len(spans)-1].Start = start
}
if end > spans[len(spans)-1].End {
spans[len(spans)-1].End = end
}
} }
for i := range spans { for i := range spans {
if spans[i].End <= spans[i].Start { if spans[i].End <= spans[i].Start {

View File

@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
func shouldSplitHomogeneousNvidiaTarget(target string) bool { func shouldSplitHomogeneousNvidiaTarget(target string) bool {
switch strings.TrimSpace(target) { switch strings.TrimSpace(target) {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect", "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
"nvidia-bandwidth", "nvidia-stress": "nvidia-bandwidth", "nvidia-stress":
return true return true
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM return taskPriorityInstallToRAM
case "audit": case "audit":
return taskPriorityAudit return taskPriorityAudit
case "nvidia-benchmark": case "nvidia-bench-perf", "nvidia-bench-power":
return taskPriorityBenchmark return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute": case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn return taskPriorityBurn
@@ -573,131 +573,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
} }
} }
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
if h.opts.App == nil { return func(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusServiceUnavailable, "app not configured") if h.opts.App == nil {
return writeError(w, http.StatusServiceUnavailable, "app not configured")
}
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
ParallelGPUs *bool `json:"parallel_gpus"`
RampUp *bool `json:"ramp_up"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return return
} }
}
runNCCL := true var body struct {
if body.RunNCCL != nil { Profile string `json:"profile"`
runNCCL = *body.RunNCCL SizeMB int `json:"size_mb"`
} GPUIndices []int `json:"gpu_indices"`
parallelGPUs := false ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
if body.ParallelGPUs != nil { RunNCCL *bool `json:"run_nccl"`
parallelGPUs = *body.ParallelGPUs ParallelGPUs *bool `json:"parallel_gpus"`
} RampUp *bool `json:"ramp_up"`
rampUp := false DisplayName string `json:"display_name"`
if body.RampUp != nil { }
rampUp = *body.RampUp if r.Body != nil {
} if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
// Build a descriptive base name that includes profile and mode so the task writeError(w, http.StatusBadRequest, "invalid request body")
// list is self-explanatory without opening individual task detail pages. return
profile := strings.TrimSpace(body.Profile) }
if profile == "" { }
profile = "standard"
}
name := taskDisplayName("nvidia-benchmark", "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
if rampUp && len(body.GPUIndices) > 1 { runNCCL := true
// Ramp-up mode: resolve GPU list, then create one task per prefix if body.RunNCCL != nil {
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. runNCCL = *body.RunNCCL
gpus, err := apiListNvidiaGPUs(h.opts.App) }
if err != nil { parallelGPUs := false
writeError(w, http.StatusBadRequest, err.Error()) if body.ParallelGPUs != nil {
parallelGPUs = *body.ParallelGPUs
}
rampUp := false
if body.RampUp != nil {
rampUp = *body.RampUp
}
// Build a descriptive base name that includes profile and mode so the task
// list is self-explanatory without opening individual task detail pages.
profile := strings.TrimSpace(body.Profile)
if profile == "" {
profile = "standard"
}
name := taskDisplayName(target, "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
if target == "nvidia-bench-power" && parallelGPUs {
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
return return
} }
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
if err != nil { if rampUp && len(body.GPUIndices) > 1 {
writeError(w, http.StatusBadRequest, err.Error()) // Ramp-up mode: resolve GPU list, then create one task per prefix
return // [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
} gpus, err := apiListNvidiaGPUs(h.opts.App)
if len(resolved) < 2 { if err != nil {
// Fall through to normal single-task path. writeError(w, http.StatusBadRequest, err.Error())
rampUp = false return
} else { }
now := time.Now() resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) if err != nil {
var allTasks []*Task writeError(w, http.StatusBadRequest, err.Error())
for step := 1; step <= len(resolved); step++ { return
subset := resolved[:step] }
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) if len(resolved) < 2 {
t := &Task{ // Fall through to normal single-task path.
ID: newJobID("benchmark-nvidia"), rampUp = false
Name: stepName, } else {
Target: "nvidia-benchmark", now := time.Now()
Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}), rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
Status: TaskPending, var allTasks []*Task
CreatedAt: now, for step := 1; step <= len(resolved); step++ {
params: taskParams{ subset := resolved[:step]
GPUIndices: append([]int(nil), subset...), stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
SizeMB: body.SizeMB, t := &Task{
BenchmarkProfile: body.Profile, ID: newJobID("bee-bench-nvidia"),
RunNCCL: runNCCL && step == len(resolved), Name: stepName,
ParallelGPUs: true, Target: target,
RampStep: step, Priority: defaultTaskPriority(target, taskParams{}),
RampTotal: len(resolved), Status: TaskPending,
RampRunID: rampRunID, CreatedAt: now,
DisplayName: stepName, params: taskParams{
}, GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
} }
allTasks = append(allTasks, t) for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
return
} }
for _, t := range allTasks { }
globalQueue.enqueue(t)
} // For non-ramp tasks append mode tag.
writeTaskRunResponse(w, allTasks) if parallelGPUs {
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return return
} }
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
} }
}
// For non-ramp tasks append mode tag. func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
if parallelGPUs { h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
} }
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {

View File

@@ -64,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList }) t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}} h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`)) req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req) h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -78,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks)) t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
} }
task := globalQueue.tasks[0] task := globalQueue.tasks[0]
if task.Target != "nvidia-benchmark" { if task.Target != "nvidia-bench-perf" {
t.Fatalf("target=%q want nvidia-benchmark", task.Target) t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
} }
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 { if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
t.Fatalf("gpu indices=%v want [1 3]", got) t.Fatalf("gpu indices=%v want [1 3]", got)
@@ -113,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList }) t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}} h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`)) req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req) h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -147,6 +147,50 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
} }
} }
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
	// Swap the shared queue's task list for an empty one and restore it afterwards.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})
	// Stub GPU discovery with three identical H100 cards; restore on cleanup.
	savedList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		gpus := []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H100 PCIe"},
		}
		return gpus, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = savedList })
	hnd := &handler{opts: HandlerOptions{App: &app.App{}}}
	// A ramp-up power run over three GPUs should enqueue one power task per step.
	payload := strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`)
	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", payload)
	rec := httptest.NewRecorder()
	hnd.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 3 {
		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
	}
	// Every queued task must target the power workflow at benchmark priority.
	for i := range globalQueue.tasks {
		task := globalQueue.tasks[i]
		if task.Target != "nvidia-bench-power" {
			t.Fatalf("task[%d] target=%q", i, task.Target)
		}
		if task.Priority != taskPriorityBenchmark {
			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
		}
	}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock() globalQueue.mu.Lock()
originalTasks := globalQueue.tasks originalTasks := globalQueue.tasks
@@ -202,7 +246,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
defaultTaskPriority("cpu", taskParams{}), defaultTaskPriority("cpu", taskParams{}),
defaultTaskPriority("cpu", taskParams{StressMode: true}), defaultTaskPriority("cpu", taskParams{StressMode: true}),
defaultTaskPriority("nvidia-stress", taskParams{}), defaultTaskPriority("nvidia-stress", taskParams{}),
defaultTaskPriority("nvidia-benchmark", taskParams{}), defaultTaskPriority("nvidia-bench-perf", taskParams{}),
defaultTaskPriority("nvidia-bench-power", taskParams{}),
} }
want := []int{ want := []int{
taskPriorityInstallToRAM, taskPriorityInstallToRAM,
@@ -211,13 +256,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
taskPriorityValidateStress, taskPriorityValidateStress,
taskPriorityBurn, taskPriorityBurn,
taskPriorityBenchmark, taskPriorityBenchmark,
taskPriorityBenchmark,
} }
for i := range want { for i := range want {
if got[i] != want[i] { if got[i] != want[i] {
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i]) t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
} }
} }
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) { if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
t.Fatalf("priority order=%v", got) t.Fatalf("priority order=%v", got)
} }
} }

View File

@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests. // isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool { func isSATTarget(target string) bool {
switch target { switch target {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse", case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage", "nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress": "platform-stress":

View File

@@ -1946,7 +1946,7 @@ func renderBenchmark(opts HandlerOptions) string {
<div class="grid2"> <div class="grid2">
<div class="card"> <div class="card">
<div class="card-head">NVIDIA Benchmark</div> <div class="card-head">Benchmark Setup</div>
<div class="card-body"> <div class="card-body">
<div class="form-row"> <div class="form-row">
<label>Profile</label> <label>Profile</label>
@@ -1979,21 +1979,25 @@ func renderBenchmark(opts HandlerOptions) string {
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span> <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
</label> </label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p> <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button> <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
</div>
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span> <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
</div> </div>
</div> </div>
<div class="card"> <div class="card">
<div class="card-head">Method</div> <div class="card-head">Method Split</div>
<div class="card-body"> <div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p> <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table> <table>
<tr><th>Profile</th><th>Purpose</th></tr> <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr> <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr> <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
</table> </table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div> </div>
</div> </div>
</div> </div>
@@ -2036,21 +2040,24 @@ function benchmarkMode() {
function benchmarkUpdateSelectionNote() { function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices(); const selected = benchmarkSelectedGPUIndices();
const btn = document.getElementById('benchmark-run-btn'); const perfBtn = document.getElementById('benchmark-run-performance-btn');
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
const note = document.getElementById('benchmark-selection-note'); const note = document.getElementById('benchmark-selection-note');
if (!selected.length) { if (!selected.length) {
btn.disabled = true; perfBtn.disabled = true;
fitBtn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.'; note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return; return;
} }
btn.disabled = false; perfBtn.disabled = false;
fitBtn.disabled = false;
const mode = benchmarkMode(); const mode = benchmarkMode();
if (mode === 'ramp-up') { if (mode === 'ramp-up') {
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.'; note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
} else if (mode === 'parallel') { } else if (mode === 'parallel') {
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : ''); note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
} else { } else {
note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : ''); note.textContent = 'Sequential: each selected GPU benchmarked separately.';
} }
} }
@@ -2124,7 +2131,7 @@ function benchmarkSelectNone() {
benchmarkUpdateSelectionNote(); benchmarkUpdateSelectionNote();
} }
function runNvidiaBenchmark() { function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices(); const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status'); const status = document.getElementById('benchmark-run-status');
if (!selected.length) { if (!selected.length) {
@@ -2134,21 +2141,26 @@ function runNvidiaBenchmark() {
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; } if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode(); const mode = benchmarkMode();
const rampUp = mode === 'ramp-up' && selected.length > 1; const rampUp = mode === 'ramp-up' && selected.length > 1;
const parallelGPUs = mode === 'parallel'; const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
const body = { const body = {
profile: document.getElementById('benchmark-profile').value || 'standard', profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected, gpu_indices: selected,
run_nccl: selected.length > 1, run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs, parallel_gpus: parallelGPUs,
ramp_up: rampUp, ramp_up: rampUp,
display_name: 'NVIDIA Benchmark' display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
}; };
document.getElementById('benchmark-output').style.display = 'block'; document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']'; document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal'); const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n'; term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...'; status.textContent = 'Queueing...';
fetch('/api/benchmark/nvidia/run', { const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST', method: 'POST',
headers: {'Content-Type':'application/json'}, headers: {'Content-Type':'application/json'},
body: JSON.stringify(body) body: JSON.stringify(body)
@@ -2202,7 +2214,7 @@ benchmarkLoadGPUs();
func renderBenchmarkResultsCard(exportDir string) string { func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir) maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns( return renderBenchmarkResultsCardFromRuns(
"Benchmark Results", "Perf Results",
"Composite score by saved benchmark run and GPU.", "Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.", "No saved benchmark runs yet.",
maxIdx, maxIdx,
@@ -2244,11 +2256,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
} }
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBenchmarkBaseDir baseDir := app.DefaultBeeBenchPerfDir
if strings.TrimSpace(exportDir) != "" { if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-benchmark") baseDir = filepath.Join(exportDir, "bee-bench", "perf")
} }
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json")) paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
if err != nil || len(paths) == 0 { if err != nil || len(paths) == 0 {
return -1, nil return -1, nil
} }

View File

@@ -261,7 +261,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress")) mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream) mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun) mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
// Tasks // Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList) mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)

View File

@@ -648,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`href="/benchmark"`, `href="/benchmark"`,
`id="benchmark-gpu-list"`, `id="benchmark-gpu-list"`,
`/api/gpu/nvidia`, `/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`, `/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`,
`benchmark-run-nccl`, `benchmark-run-nccl`,
`Run Performance Benchmark`,
`Run Power / Thermal Fit`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body) t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -660,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
exportDir := filepath.Join(dir, "export") exportDir := filepath.Join(dir, "export")
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000") runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(runDir, 0755); err != nil { if err := os.MkdirAll(runDir, 0755); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -702,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
body := rec.Body.String() body := rec.Body.String()
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05") wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Perf Results`,
`Composite score by saved benchmark run and GPU.`, `Composite score by saved benchmark run and GPU.`,
`GPU 0`, `GPU 0`,
`GPU 1`, `GPU 1`,

View File

@@ -251,7 +251,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
} }
func renderTaskBenchmarkResultsCard(target, logText string) string { func renderTaskBenchmarkResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-benchmark" { switch strings.TrimSpace(target) {
case "nvidia-bench-perf":
default:
return "" return ""
} }
resultPath := taskBenchmarkResultPath(logText) resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +265,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
return "" return ""
} }
return renderBenchmarkResultsCardFromRuns( return renderBenchmarkResultsCardFromRuns(
"Benchmark Results", "Perf Results",
"Composite score for this benchmark task.", "Composite score for this benchmark task.",
"No benchmark results were saved for this task.", "No benchmark results were saved for this task.",
columns, columns,

View File

@@ -32,7 +32,8 @@ const (
var taskNames = map[string]string{ var taskNames = map[string]string{
"nvidia": "NVIDIA SAT", "nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark", "nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = 300 dur = 300
} }
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append) archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark": case "nvidia-bench-perf":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
@@ -644,6 +645,31 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal, RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID, RampRunID: t.params.RampRunID,
}, j.append) }, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
case platform.NvidiaBenchmarkProfileStability:
dur = 300
case platform.NvidiaBenchmarkProfileOvernight:
dur = 600
default:
dur = 120
}
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
dur = rampPlan.DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
case "nvidia-compute": case "nvidia-compute":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
taskReportMetricsDBPath = metricsPath taskReportMetricsDBPath = metricsPath
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath }) t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000") benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(benchmarkDir, 0755); err != nil { if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
} }
task := &Task{ task := &Task{
ID: "task-bench", ID: "task-bench",
Name: "NVIDIA Benchmark", Name: "NVIDIA Bee Bench Perf",
Target: "nvidia-benchmark", Target: "nvidia-bench-perf",
Status: TaskDone, Status: TaskDone,
CreatedAt: time.Now().UTC().Add(-time.Minute), CreatedAt: time.Now().UTC().Add(-time.Minute),
ArtifactsDir: artifactsDir, ArtifactsDir: artifactsDir,
} }
ensureTaskReportPaths(task) ensureTaskReportPaths(task)
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n" logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil { if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
} }
html := string(body) html := string(body)
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Perf Results`,
`Composite score for this benchmark task.`, `Composite score for this benchmark task.`,
`GPU 0`, `GPU 0`,
`1176.25`, `1176.25`,

View File

@@ -1,5 +1,34 @@
# Benchmark clock calibration research
## Benchmark methodology versioning
Every benchmark methodology change must bump the benchmark version constant in
source code by exactly `+1`.
Methodology change means any change that affects comparability of benchmark
results, including for example:
- phase durations or phase order
- enabled/disabled precisions
- fallback rules
- normalization rules
- score formulas or weights
- degradation thresholds
- power calibration logic
- thermal/power penalty logic
Requirements:
- benchmark version must be stored in source code as an explicit version
constant, not inferred from git tag or build metadata
- benchmark report must always print the benchmark version
- `result.json` must always include the benchmark version
- results from different benchmark versions must be treated as non-comparable by
default
Purpose:
- prevent accidental comparison of runs produced by different methodologies
- make historical benchmark archives self-describing even when detached from git
- force deliberate version bumps whenever scoring or execution semantics change
## Status
In progress. Baseline data from production servers pending.

View File

@@ -796,6 +796,9 @@ static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
if (desc->compute_type == CUBLAS_COMPUTE_32I) { if (desc->compute_type == CUBLAS_COMPUTE_32I) {
return CUDA_R_32I; return CUDA_R_32I;
} }
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
return CUDA_R_64F;
}
return CUDA_R_32F; return CUDA_R_32F;
} }
@@ -1120,6 +1123,8 @@ static int run_cublas_profile(cublasLtHandle_t handle,
struct prepared_profile *profile) { struct prepared_profile *profile) {
int32_t alpha_i32 = 1; int32_t alpha_i32 = 1;
int32_t beta_i32 = 0; int32_t beta_i32 = 0;
double alpha_f64 = 1.0;
double beta_f64 = 0.0;
float alpha = 1.0f; float alpha = 1.0f;
float beta = 0.0f; float beta = 0.0f;
const void *alpha_ptr = &alpha; const void *alpha_ptr = &alpha;
@@ -1127,6 +1132,9 @@ static int run_cublas_profile(cublasLtHandle_t handle,
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) { if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
alpha_ptr = &alpha_i32; alpha_ptr = &alpha_i32;
beta_ptr = &beta_i32; beta_ptr = &beta_i32;
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
alpha_ptr = &alpha_f64;
beta_ptr = &beta_f64;
} }
return check_cublas(profile->desc.name, return check_cublas(profile->desc.name,
cublas->cublasLtMatmul(handle, cublas->cublasLtMatmul(handle,