Compare commits


8 Commits
v7.20 ... v8.5

Author SHA1 Message Date
19dbabd71d Simplify power calibration: pure binary search, no telemetry guessing
Remove telemetry-guided initial candidate; use strict binary search
midpoint at every step. Clean and predictable convergence in O(log N)
attempts within the allowed power range [minLimitW, startingLimitW].

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:12:45 +03:00
a6a07f2626 Replace linear power derate with binary search + telemetry-guided jump
Power calibration previously stepped down 25 W at a time (linear),
requiring up to 6 attempts to find a stable limit within a 150 W range.

New strategy:
- Binary search between minLimitW (lo, assumed stable floor) and the
  starting/failed limit (hi, confirmed unstable), converging within a
  10 W tolerance in ~4 attempts.
- For thermal throttle: the first-quarter telemetry rows estimate the
  GPU's pre-throttle power draw. nextLimit = round5W(onset - 10 W) is
  used as the initial candidate instead of the binary midpoint, landing
  much closer to the true limit on the first step.
- On success: lo is updated and a higher level is tried (binary search
  upward) until hi-lo ≤ tolerance, ensuring the highest stable limit is
  found rather than the first stable one.
- Let targeted_power run to natural completion on throttle (no mid-run
  SIGKILL) so nv-hostengine releases its diagnostic slot cleanly before
  the next attempt.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:05:23 +03:00
f87461ee4a Detect thermal throttle with fans below 100% as cooling misconfiguration
During power calibration: if a thermal throttle (sw_thermal/hw_thermal)
causes ≥20% clock drop while server fans are below 98% P95 duty cycle,
record a CoolingWarning on the GPU result and emit an actionable finding
telling the operator to rerun with fans manually fixed at 100%.

During steady-state benchmark: same signal enriches the existing
thermal_limited finding with fan duty cycle and clock drift values.

Covers both the main benchmark (buildBenchmarkFindings) and the power
bench (NvidiaPowerBenchResult.Findings).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 21:44:57 +03:00
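The detection rule from this commit can be sketched as a small predicate. The thresholds (≥20% clock drop, fan duty P95 below 98%) are from the commit message; the function name, signature, and warning wording are illustrative, not taken from the codebase.

```go
package main

import "fmt"

// coolingWarning flags a cooling misconfiguration: a thermal throttle that
// caused a significant clock drop while the server fans still had headroom
// points at fan policy, not at the GPU itself (illustrative sketch).
func coolingWarning(thermalThrottled bool, clockDropPct, fanDutyP95Pct float64) string {
	if !thermalThrottled || clockDropPct < 20 || fanDutyP95Pct >= 98 {
		return ""
	}
	return fmt.Sprintf(
		"thermal throttle caused %.0f%% clock drop while fan duty P95 was %.0f%%; rerun with fans manually fixed at 100%%",
		clockDropPct, fanDutyP95Pct)
}

func main() {
	fmt.Println(coolingWarning(true, 25, 80))  // misconfiguration: warning emitted
	fmt.Println(coolingWarning(true, 25, 100)) // fans already maxed: genuine thermal limit
}
```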
a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the next back-off delay would exceed 300s, the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00
Mikhail Chusavitin
303de2df04 Add slot-aware ramp sequence to bee-bench power 2026-04-14 17:47:40 +03:00
Mikhail Chusavitin
95124d228f Split bee-bench into perf and power workflows 2026-04-14 17:33:13 +03:00
Mikhail Chusavitin
54338dbae5 Unify live RAM runtime state 2026-04-14 16:18:33 +03:00
Mikhail Chusavitin
2be7ae6d28 Refine NVIDIA benchmark phase timing 2026-04-14 14:12:06 +03:00
24 changed files with 1951 additions and 450 deletions


@@ -30,7 +30,9 @@ var (
 	DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
 	DefaultTechDumpDir = DefaultExportDir + "/techdump"
 	DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
-	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
+	DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
+	DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
+	DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
 )
 type App struct {
@@ -84,6 +86,7 @@ type installer interface {
 	InstallToDisk(ctx context.Context, device string, logFile string) error
 	IsLiveMediaInRAM() bool
 	LiveBootSource() platform.LiveBootSource
+	LiveMediaRAMState() platform.LiveMediaRAMState
 	RunInstallToRAM(ctx context.Context, logFunc func(string)) error
 }
@@ -108,6 +111,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
 	return a.installer.LiveBootSource()
 }
+func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
+	return a.installer.LiveMediaRAMState()
+}
 func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 	return a.installer.RunInstallToRAM(ctx, logFunc)
 }
@@ -117,6 +124,7 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -562,11 +570,18 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
 func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
-		baseDir = DefaultBenchmarkBaseDir
+		baseDir = DefaultBeeBenchPerfDir
 	}
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }
+func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBeeBenchPowerDir
+	}
+	return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
+}
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir


@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 type fakeSAT struct {
 	runNvidiaFn func(string) (string, error)
 	runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn func(string, int, []int) (string, error)
 	runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
 	return f.runNvidiaFn(baseDir)
 }
+func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaPowerBenchFn != nil {
+		return f.runNvidiaPowerBenchFn(baseDir, opts)
+	}
+	return f.runNvidiaFn(baseDir)
+}
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)

(File diff suppressed because it is too large.)


@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
 	}
 	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
-	fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
 	if result.RampStep > 0 && result.RampTotal > 0 {
 		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
@@ -83,15 +83,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	// ── Methodology ───────────────────────────────────────────────────────────
 	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
 	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
 	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
 	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
 	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
+	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
 	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
 	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
+	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
 	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
 	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
 	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
@@ -170,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if gpu.PowerLimitW > 0 {
 		fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 	}
+	if gpu.PowerLimitDerated {
+		fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
+	}
+	if gpu.CalibratedPeakPowerW > 0 {
+		if gpu.CalibratedPeakTempC > 0 {
+			fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+		} else {
+			fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+		}
+	}
 	if gpu.LockedGraphicsClockMHz > 0 {
 		fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
 	}
@@ -188,7 +198,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	// Per-precision stability phases.
 	if len(gpu.PrecisionSteady) > 0 {
 		b.WriteString("**Per-precision stability:**\n\n")
-		b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
+		b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
 		for _, p := range gpu.PrecisionSteady {
 			eccCorr := "—"
 			eccUncorr := "—"
@@ -196,8 +206,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 				eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
 				eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
 			}
-			fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
-				p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
+			status := p.Status
+			if strings.TrimSpace(status) == "" {
+				status = "OK"
+			}
+			fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
+				p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
 				eccCorr, eccUncorr)
 		}
 		b.WriteString("\n")
@@ -364,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
 func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
 	var b strings.Builder
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))


@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
 		{
 			name:    "default",
 			profile: "",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
 		},
 		{
 			name:    "stability",
 			profile: "stability",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
 		},
 		{
 			name:    "overnight",
 			profile: "overnight",
-			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
 		},
 	}
@@ -41,6 +41,129 @@
 	}
 }
+func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
+	t.Parallel()
+	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if len(labels) != 7 || len(phases) != 7 {
+		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	}
+	if basePhaseSec != 60 {
+		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
+	}
+	if mixedPhaseSec != 300 {
+		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
+	}
+	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
+		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
+	}
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
+	t.Parallel()
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 300 {
+		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
+	}
+	if mixedPhaseSec != 3600 {
+		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
+	t.Parallel()
+	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
+		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
+		benchmarkPrecisionPhases,
+		func(label string) string { return label },
+	)
+	if basePhaseSec != 3600 {
+		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
+	}
+	if mixedPhaseSec != 14400 {
+		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
+	}
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
+	}
+}
+func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
+	t.Parallel()
+	phases := []benchmarkPlannedPhase{
+		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
+		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
+		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
+	}
+	rows := []GPUMetricRow{
+		{ElapsedSec: 5},
+		{ElapsedSec: 15},
+		{ElapsedSec: 25},
+		{ElapsedSec: 65},
+	}
+	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
+	if len(got["fp8"]) != 1 {
+		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
+	}
+	if len(got["fp16"]) != 1 {
+		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
+	}
+	if len(got["mixed"]) != 2 {
+		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
+	}
+}
+func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
+	t.Parallel()
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+		t.Fatalf("supported=%v", got)
+	}
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+		t.Fatalf("supported=%v", got)
+	}
+}
+func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		name       string
+		raw        string
+		wantStatus string
+	}{
+		{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
+		{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
+		{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
+	}
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
+			if got != tc.wantStatus {
+				t.Fatalf("status=%q want %q", got, tc.wantStatus)
+			}
+		})
+	}
+}
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()
@@ -65,8 +188,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 		"[gpu 0] compute_capability=9.0",
 		"[gpu 0] backend=cublasLt",
 		"[gpu 0] duration_s=10",
+		"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
 		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
 		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] int8_tensor_iterations=80",
 		"[gpu 0] fp16_tensor_iterations=200",
 		"[gpu 0] fp8_e4m3_iterations=50",
 		"[gpu 0] status=OK",
@@ -79,15 +204,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
 	if got.ComputeCapability != "9.0" {
 		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
 	}
-	if len(got.Profiles) != 2 {
-		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	if len(got.Profiles) != 3 {
+		t.Fatalf("profiles=%d want 3", len(got.Profiles))
 	}
 	if got.Profiles[0].TeraOpsPerSec <= 0 {
 		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
 	}
+	if got.Profiles[0].Category != "fp16_bf16" {
+		t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
+	}
 	if got.Profiles[1].Category != "fp8" {
 		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
 	}
+	if got.Profiles[2].Category != "int8" {
+		t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
+	}
+	if got.Profiles[2].Weight != 0.25 {
+		t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
+	}
 }
 func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {


@@ -104,6 +104,7 @@ type BenchmarkGPUResult struct {
Backend string `json:"backend,omitempty"` Backend string `json:"backend,omitempty"`
Status string `json:"status"` Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"` PowerLimitW float64 `json:"power_limit_w,omitempty"`
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short // CalibratedPeakPowerW is the p95 power measured during a short
@@ -111,6 +112,8 @@ type BenchmarkGPUResult struct {
// Used as the reference denominator for PowerSustainScore instead of // Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach. // the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
@@ -119,6 +122,7 @@ type BenchmarkGPUResult struct {
Baseline BenchmarkTelemetrySummary `json:"baseline"` Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
PrecisionFailures []string `json:"precision_failures,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined). // ECC error delta accumulated over the full benchmark (all phases combined).
@@ -127,6 +131,9 @@ type BenchmarkGPUResult struct {
Scores BenchmarkScorecard `json:"scores"` Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"` DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
// CoolingWarning is non-empty when a thermal throttle event occurred with
// a clock drop ≥20% while server fans were not at 100% duty cycle.
CoolingWarning string `json:"cooling_warning,omitempty"`
} }
type BenchmarkTelemetrySummary struct { type BenchmarkTelemetrySummary struct {
@@ -179,7 +186,7 @@ type BenchmarkPrecisionResult struct {
Iterations uint64 `json:"iterations,omitempty"` Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
// Weight is the fp32-equivalence factor for this precision category. // Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"` Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
@@ -225,6 +232,7 @@ type BenchmarkServerPower struct {
// type runs at a time the PowerCVPct here is a genuine stability signal. // type runs at a time the PowerCVPct here is a genuine stability signal.
type BenchmarkPrecisionSteadyPhase struct { type BenchmarkPrecisionSteadyPhase struct {
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32" Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
Status string `json:"status,omitempty"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
@@ -232,6 +240,7 @@ type BenchmarkPrecisionSteadyPhase struct {
// Non-zero corrected = stress-induced DRAM errors for this kernel type. // Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload. // Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
Notes string `json:"notes,omitempty"`
} }
type BenchmarkInterconnectResult struct { type BenchmarkInterconnectResult struct {
@@ -245,3 +254,47 @@ type BenchmarkInterconnectResult struct {
 	MaxBusBWGBps float64  `json:"max_busbw_gbps,omitempty"`
 	Notes        []string `json:"notes,omitempty"`
 }
+
+type NvidiaPowerBenchResult struct {
+	BenchmarkVersion     string                 `json:"benchmark_version"`
+	GeneratedAt          time.Time              `json:"generated_at"`
+	Hostname             string                 `json:"hostname,omitempty"`
+	ServerModel          string                 `json:"server_model,omitempty"`
+	BenchmarkProfile     string                 `json:"benchmark_profile"`
+	SelectedGPUIndices   []int                  `json:"selected_gpu_indices"`
+	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
+	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
+	OverallStatus        string                 `json:"overall_status"`
+	Findings             []string               `json:"findings,omitempty"`
+	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+}
+
+type NvidiaPowerBenchGPU struct {
+	Index               int      `json:"index"`
+	Name                string   `json:"name,omitempty"`
+	BusID               string   `json:"bus_id,omitempty"`
+	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
+	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
+	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
+	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
+	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
+	Notes               []string `json:"notes,omitempty"`
+	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
+	CoolingWarning string `json:"cooling_warning,omitempty"`
+}
+
+type NvidiaPowerBenchStep struct {
+	StepIndex              int      `json:"step_index"`
+	GPUIndices             []int    `json:"gpu_indices"`
+	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
+	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
+	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
+	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
+	Status                 string   `json:"status"`
+	Notes                  []string `json:"notes,omitempty"`
+}
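
The commit message above describes the calibration strategy behind `CalibrationAttempts` and `Derated`: a pure binary search between `minLimitW` (assumed-stable floor) and the starting limit (confirmed unstable), converging within a tolerance. A minimal self-contained sketch of that convergence — the function names, the 5 W rounding step, and the `stableAt` probe are illustrative assumptions, not the repository's code:

```go
package main

import (
	"fmt"
	"math"
)

// stableAt stands in for one targeted_power attempt at the given limit;
// in the real workflow this runs dcgmi and checks for throttling.
// Here we assume, hypothetically, the GPU is stable at or below 545 W.
func stableAt(limitW float64) bool { return limitW <= 545 }

// calibrate performs a strict binary search: lo is an assumed-stable
// floor, hi a confirmed-unstable limit, and the search stops once
// hi-lo is within the tolerance, returning the highest stable limit.
func calibrate(minLimitW, startingLimitW, toleranceW float64) (float64, int) {
	lo, hi := minLimitW, startingLimitW
	attempts := 0
	for hi-lo > toleranceW {
		mid := math.Round((lo+hi)/2/5) * 5 // snap candidates to 5 W steps
		attempts++
		if stableAt(mid) {
			lo = mid // stable: search upward for a higher limit
		} else {
			hi = mid // unstable: back off
		}
	}
	return lo, attempts
}

func main() {
	limit, n := calibrate(450, 600, 10)
	fmt.Printf("stable limit %.0f W after %d attempts\n", limit, n)
	// prints "stable limit 545 W after 4 attempts"
}
```

This matches the O(log N) claim in the commit log: a 150 W range with 10 W tolerance converges in four probes instead of the six linear 25 W steps it replaced.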


@@ -14,6 +14,8 @@ import (
 // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
 type GPUMetricRow struct {
 	Stage string `json:"stage,omitempty"`
+	StageStartSec float64 `json:"stage_start_sec,omitempty"`
+	StageEndSec float64 `json:"stage_end_sec,omitempty"`
 	ElapsedSec float64 `json:"elapsed_sec"`
 	GPUIndex int `json:"index"`
 	TempC float64 `json:"temp_c"`
@@ -509,11 +511,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
 		if name == "" {
 			name = "run"
 		}
+		start := row.StageStartSec
+		end := row.StageEndSec
+		if end <= start {
+			start = row.ElapsedSec
+			end = row.ElapsedSec
+		}
 		if len(spans) == 0 || spans[len(spans)-1].Name != name {
-			spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
+			spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
 			continue
 		}
-		spans[len(spans)-1].End = row.ElapsedSec
+		if start < spans[len(spans)-1].Start {
+			spans[len(spans)-1].Start = start
+		}
+		if end > spans[len(spans)-1].End {
+			spans[len(spans)-1].End = end
+		}
 	}
 	for i := range spans {
 		if spans[i].End <= spans[i].Start {
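
The widening logic in this hunk (prefer explicit stage bounds, fall back to the sample timestamp, and grow an existing span rather than overwrite its end) can be exercised in isolation. A minimal sketch with stand-in types — `row`, `span`, and the sample values are hypothetical; only the merge behavior mirrors the patch:

```go
package main

import "fmt"

// Minimal stand-ins for the telemetry types in the diff.
type row struct {
	Stage                      string
	StageStartSec, StageEndSec float64
	ElapsedSec                 float64
}

type span struct {
	Name       string
	Start, End float64
}

// buildSpans mirrors the patched logic: use explicit stage bounds when
// present, fall back to the sample timestamp otherwise, and widen the
// current span instead of unconditionally moving its end.
func buildSpans(rows []row) []span {
	var spans []span
	for _, r := range rows {
		name := r.Stage
		if name == "" {
			name = "run"
		}
		start, end := r.StageStartSec, r.StageEndSec
		if end <= start {
			start, end = r.ElapsedSec, r.ElapsedSec
		}
		if len(spans) == 0 || spans[len(spans)-1].Name != name {
			spans = append(spans, span{Name: name, Start: start, End: end})
			continue
		}
		last := &spans[len(spans)-1]
		if start < last.Start {
			last.Start = start
		}
		if end > last.End {
			last.End = end
		}
	}
	return spans
}

func main() {
	rows := []row{
		{Stage: "warmup", StageStartSec: 0, StageEndSec: 30, ElapsedSec: 5},
		{Stage: "warmup", ElapsedSec: 12}, // no bounds: falls back to ElapsedSec
		{Stage: "steady", StageStartSec: 30, StageEndSec: 120, ElapsedSec: 35},
	}
	fmt.Println(buildSpans(rows))
}
```

With the old code a late sample inside a stage would shrink nothing but a declared stage end would be lost; the new code keeps the declared `[0, 30]` warmup window even though samples only reach 12 s.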


@@ -11,20 +11,10 @@ import (
 	"strings"
 )
 
+const installToRAMDir = "/dev/shm/bee-live"
+
 func (s *System) IsLiveMediaInRAM() bool {
-	fsType := mountFSType("/run/live/medium")
-	if fsType == "" {
-		// No medium mount at all — fall back to toram kernel parameter.
-		return toramActive()
-	}
-	if strings.EqualFold(fsType, "tmpfs") {
-		return true
-	}
-	// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
-	// mount of /run/live/medium fails (common for CD-ROM boots), the medium
-	// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
-	files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
-	return len(files) > 0
+	return s.LiveMediaRAMState().InRAM
 }
 
 func (s *System) LiveBootSource() LiveBootSource {
@@ -56,14 +46,95 @@ func (s *System) LiveBootSource() LiveBootSource {
 	return status
 }
 
-func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
+func (s *System) LiveMediaRAMState() LiveMediaRAMState {
+	return evaluateLiveMediaRAMState(
+		s.LiveBootSource(),
+		toramActive(),
+		globPaths("/run/live/medium/live/*.squashfs"),
+		globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
+	)
+}
+
+func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
+	state := LiveMediaRAMState{
+		LiveBootSource: status,
+		ToramActive:    toram,
+		CopyPresent:    len(copiedSquashfs) > 0,
+	}
+	if status.InRAM {
+		state.State = "in_ram"
+		state.Status = "ok"
+		state.CopyComplete = true
+		state.Message = "Running from RAM — installation media can be safely disconnected."
+		return state
+	}
+	expected := pathBaseSet(sourceSquashfs)
+	copied := pathBaseSet(copiedSquashfs)
+	state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
+	switch {
+	case state.CopyComplete:
+		state.State = "partial"
+		state.Status = "partial"
+		state.CanStartCopy = true
+		state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
+	case state.CopyPresent:
+		state.State = "partial"
+		state.Status = "partial"
+		state.CanStartCopy = true
+		state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
+	case toram:
+		state.State = "toram_failed"
+		state.Status = "failed"
+		state.CanStartCopy = true
+		state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
+	default:
+		state.State = "not_in_ram"
+		state.Status = "warning"
+		state.CanStartCopy = true
+		state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
+	}
+	return state
+}
+
+func globPaths(pattern string) []string {
+	matches, _ := filepath.Glob(pattern)
+	return matches
+}
+
+func pathBaseSet(paths []string) map[string]struct{} {
+	out := make(map[string]struct{}, len(paths))
+	for _, path := range paths {
+		base := strings.TrimSpace(filepath.Base(path))
+		if base != "" {
+			out[base] = struct{}{}
+		}
+	}
+	return out
+}
+
+func setContainsAll(have, want map[string]struct{}) bool {
+	if len(want) == 0 {
+		return false
+	}
+	for name := range want {
+		if _, ok := have[name]; !ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
 	log := func(msg string) {
 		if logFunc != nil {
 			logFunc(msg)
 		}
 	}
-	if s.IsLiveMediaInRAM() {
+	state := s.LiveMediaRAMState()
+	if state.InRAM {
 		log("Already running from RAM — installation media can be safely disconnected.")
 		return nil
 	}
@@ -88,10 +159,21 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 			humanBytes(needed+headroom), humanBytes(free))
 	}
 
-	dstDir := "/dev/shm/bee-live"
+	dstDir := installToRAMDir
+	if state.CopyPresent {
+		log("Removing stale partial RAM copy before retry...")
+	}
+	_ = os.RemoveAll(dstDir)
 	if err := os.MkdirAll(dstDir, 0755); err != nil {
 		return fmt.Errorf("create tmpfs dir: %v", err)
 	}
+	defer func() {
+		if retErr == nil {
+			return
+		}
+		_ = os.RemoveAll(dstDir)
+		log("Removed incomplete RAM copy.")
+	}()
 	for _, sf := range squashfsFiles {
 		if err := ctx.Err(); err != nil {


@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
 		t.Fatalf("got %q want /run/live/medium", got)
 	}
 }
+
+func TestEvaluateLiveMediaRAMState(t *testing.T) {
+	t.Parallel()
+
+	t.Run("in_ram", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
+			false,
+			nil,
+			nil,
+		)
+		if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+	})
+
+	t.Run("partial_copy_after_cancel", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
+			false,
+			[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
+			[]string{"/dev/shm/bee-live/filesystem.squashfs"},
+		)
+		if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+		if state.CopyComplete {
+			t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
+		}
+	})
+
+	t.Run("toram_failed", func(t *testing.T) {
+		state := evaluateLiveMediaRAMState(
+			LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
+			true,
+			nil,
+			nil,
+		)
+		if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
+			t.Fatalf("state=%+v", state)
+		}
+	})
+}
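
The copy-completeness signal these tests exercise compares squashfs basenames only: every source file under the live medium must have a same-named counterpart in the RAM copy. A small standalone sketch using `pathBaseSet` and `setContainsAll` exactly as they appear in the diff (the paths are illustrative):

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// pathBaseSet reduces a path list to a set of trimmed basenames,
// so the comparison is independent of the mount point.
func pathBaseSet(paths []string) map[string]struct{} {
	out := make(map[string]struct{}, len(paths))
	for _, p := range paths {
		if base := strings.TrimSpace(filepath.Base(p)); base != "" {
			out[base] = struct{}{}
		}
	}
	return out
}

// setContainsAll reports whether every wanted name is present;
// an empty want-set is treated as "not complete" by design.
func setContainsAll(have, want map[string]struct{}) bool {
	if len(want) == 0 {
		return false
	}
	for name := range want {
		if _, ok := have[name]; !ok {
			return false
		}
	}
	return true
}

func main() {
	src := []string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"}
	ram := []string{"/dev/shm/bee-live/filesystem.squashfs"} // firmware.squashfs missing
	fmt.Println(setContainsAll(pathBaseSet(ram), pathBaseSet(src))) // prints "false"
}
```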


@@ -171,25 +171,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
 	return ToolStatus{Name: display}
 }
 
-// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
-// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
-// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
+// collectToRAMHealth evaluates whether the live system is fully running from RAM.
+// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
+// incomplete RAM copy exists but runtime still depends on the boot medium,
+// "failed" = toram was requested but medium is not in RAM.
 func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
-	inRAM := s.IsLiveMediaInRAM()
-	active := toramActive()
-	switch {
-	case inRAM:
-		health.ToRAMStatus = "ok"
-	case active:
-		// toram was requested but medium is not yet/no longer in RAM
-		health.ToRAMStatus = "failed"
+	state := s.LiveMediaRAMState()
+	health.ToRAMStatus = state.Status
+	switch state.Status {
+	case "ok":
+		return
+	case "failed":
 		health.Issues = append(health.Issues, schema.RuntimeIssue{
 			Code:        "toram_copy_failed",
 			Severity:    "warning",
-			Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
+			Description: state.Message,
+		})
+	case "partial":
+		health.Issues = append(health.Issues, schema.RuntimeIssue{
+			Code:        "toram_copy_partial",
+			Severity:    "warning",
+			Description: state.Message,
 		})
-	default:
-		health.ToRAMStatus = "warning"
 	}
 }


@@ -9,6 +9,17 @@ type LiveBootSource struct {
 	Device string `json:"device,omitempty"`
 }
 
+type LiveMediaRAMState struct {
+	LiveBootSource
+	State        string `json:"state"`
+	Status       string `json:"status"`
+	ToramActive  bool   `json:"toram_active,omitempty"`
+	CopyPresent  bool   `json:"copy_present,omitempty"`
+	CopyComplete bool   `json:"copy_complete,omitempty"`
+	CanStartCopy bool   `json:"can_start_copy,omitempty"`
+	Message      string `json:"message,omitempty"`
+}
+
 type InterfaceInfo struct {
 	Name  string
 	State string


@@ -22,7 +22,7 @@ type RuntimeHealth struct {
 	CUDAReady     bool   `json:"cuda_ready,omitempty"`
 	NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
 	NetworkStatus string `json:"network_status,omitempty"`
-	// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
+	// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
 	ToRAMStatus string `json:"toram_status,omitempty"`
 	// USBExportPath: mount point of the first writable USB drive found, empty if none.
 	USBExportPath string `json:"usb_export_path,omitempty"`


@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
 func shouldSplitHomogeneousNvidiaTarget(target string) bool {
 	switch strings.TrimSpace(target) {
-	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
 		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
 		"nvidia-bandwidth", "nvidia-stress":
 		return true
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstallToRAM
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-benchmark":
+	case "nvidia-bench-perf", "nvidia-bench-power":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -573,7 +573,8 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 	}
 }
 
-func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
+func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
@@ -614,13 +615,18 @@
 	if profile == "" {
 		profile = "standard"
 	}
-	name := taskDisplayName("nvidia-benchmark", "", "")
+	name := taskDisplayName(target, "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
 	}
 	// Append profile tag.
 	name = fmt.Sprintf("%s · %s", name, profile)
+	if target == "nvidia-bench-power" && parallelGPUs {
+		writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
+		return
+	}
 	if rampUp && len(body.GPUIndices) > 1 {
 		// Ramp-up mode: resolve GPU list, then create one task per prefix
 		// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
@@ -645,10 +651,10 @@
 			subset := resolved[:step]
 			stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
 			t := &Task{
-				ID:        newJobID("benchmark-nvidia"),
+				ID:        newJobID("bee-bench-nvidia"),
 				Name:      stepName,
-				Target:    "nvidia-benchmark",
-				Priority:  defaultTaskPriority("nvidia-benchmark", taskParams{}),
+				Target:    target,
+				Priority:  defaultTaskPriority(target, taskParams{}),
 				Status:    TaskPending,
 				CreatedAt: now,
 				params: taskParams{
@@ -689,7 +695,7 @@
 		ParallelGPUs: parallelGPUs,
 		DisplayName:  body.DisplayName,
 	}
-	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
+	tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
 	if err != nil {
 		writeError(w, http.StatusBadRequest, err.Error())
 		return
@@ -698,6 +704,11 @@
 		globalQueue.enqueue(t)
 	}
 	writeTaskRunResponse(w, tasks)
+	}
+}
+
+func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
+	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }
 
 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -1072,18 +1083,55 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	status := h.opts.App.LiveBootSource()
+	status := h.currentRAMStatus()
 	w.Header().Set("Content-Type", "application/json")
 	_ = json.NewEncoder(w).Encode(status)
 }
 
+type ramStatusResponse struct {
+	platform.LiveMediaRAMState
+	InstallTaskActive bool   `json:"install_task_active,omitempty"`
+	CopyTaskActive    bool   `json:"copy_task_active,omitempty"`
+	CanStartTask      bool   `json:"can_start_task,omitempty"`
+	BlockedReason     string `json:"blocked_reason,omitempty"`
+}
+
+func (h *handler) currentRAMStatus() ramStatusResponse {
+	state := h.opts.App.LiveMediaRAMState()
+	resp := ramStatusResponse{LiveMediaRAMState: state}
+	if globalQueue.hasActiveTarget("install") {
+		resp.InstallTaskActive = true
+		resp.BlockedReason = "install to disk is already running"
+		return resp
+	}
+	if globalQueue.hasActiveTarget("install-to-ram") {
+		resp.CopyTaskActive = true
+		resp.BlockedReason = "install to RAM task is already pending or running"
+		return resp
+	}
+	if state.InRAM {
+		resp.BlockedReason = "system is already running from RAM"
+		return resp
+	}
+	resp.CanStartTask = state.CanStartCopy
+	if !resp.CanStartTask && resp.BlockedReason == "" {
+		resp.BlockedReason = state.Message
+	}
+	return resp
+}
+
 func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	if globalQueue.hasActiveTarget("install") {
-		writeError(w, http.StatusConflict, "install to disk is already running")
+	status := h.currentRAMStatus()
+	if !status.CanStartTask {
+		msg := strings.TrimSpace(status.BlockedReason)
+		if msg == "" {
+			msg = "install to RAM is not available"
+		}
+		writeError(w, http.StatusConflict, msg)
 		return
 	}
 	t := &Task{


@@ -64,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
-	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
 	rec := httptest.NewRecorder()
 	h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -78,8 +78,8 @@
 		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
 	}
 	task := globalQueue.tasks[0]
-	if task.Target != "nvidia-benchmark" {
-		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
+	if task.Target != "nvidia-bench-perf" {
+		t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
 	}
 	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
 		t.Fatalf("gpu indices=%v want [1 3]", got)
@@ -113,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
 	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
 
 	h := &handler{opts: HandlerOptions{App: &app.App{}}}
-	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
 	rec := httptest.NewRecorder()
 	h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -147,6 +147,50 @@
 	}
 }
 
+func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+
+	prevList := apiListNvidiaGPUs
+	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
+		return []platform.NvidiaGPU{
+			{Index: 0, Name: "NVIDIA H100 PCIe"},
+			{Index: 1, Name: "NVIDIA H100 PCIe"},
+			{Index: 2, Name: "NVIDIA H100 PCIe"},
+		}, nil
+	}
+	t.Cleanup(func() { apiListNvidiaGPUs = prevList })
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
+	rec := httptest.NewRecorder()
+	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 3 {
+		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	}
+	for i, task := range globalQueue.tasks {
+		if task.Target != "nvidia-bench-power" {
+			t.Fatalf("task[%d] target=%q", i, task.Target)
+		}
+		if task.Priority != taskPriorityBenchmark {
+			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
+		}
+	}
+}
+
 func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
@@ -202,7 +246,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
 		defaultTaskPriority("cpu", taskParams{}),
 		defaultTaskPriority("cpu", taskParams{StressMode: true}),
 		defaultTaskPriority("nvidia-stress", taskParams{}),
-		defaultTaskPriority("nvidia-benchmark", taskParams{}),
+		defaultTaskPriority("nvidia-bench-perf", taskParams{}),
+		defaultTaskPriority("nvidia-bench-power", taskParams{}),
 	}
 	want := []int{
 		taskPriorityInstallToRAM,
@@ -211,13 +256,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
 		taskPriorityValidateStress,
 		taskPriorityBurn,
 		taskPriorityBenchmark,
+		taskPriorityBenchmark,
 	}
 	for i := range want {
 		if got[i] != want[i] {
 			t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
 		}
 	}
-	if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
+	if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
 		t.Fatalf("priority order=%v", got)
 	}
 }


@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
 // isSATTarget returns true for task targets that run hardware acceptance tests.
 func isSATTarget(target string) bool {
 	switch target {
-	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
 		"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
 		"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
 		"platform-stress":


@@ -845,6 +845,13 @@ func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
 			Source: "live-boot / /proc/mounts",
 			Issue:  "",
 		}
+	case "partial":
+		return runtimeHealthRow{
+			Title:  "LiveCD in RAM",
+			Status: "WARNING",
+			Source: "live-boot / /proc/mounts / /dev/shm/bee-live",
+			Issue:  "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.",
+		}
 	case "failed":
 		return runtimeHealthRow{
 			Title: "LiveCD in RAM",
@@ -1939,7 +1946,7 @@ func renderBenchmark(opts HandlerOptions) string {
 <div class="grid2">
   <div class="card">
-    <div class="card-head">NVIDIA Benchmark</div>
+    <div class="card-head">Benchmark Setup</div>
     <div class="card-body">
       <div class="form-row">
         <label>Profile</label>
@@ -1972,21 +1979,25 @@
         <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
       </label>
       <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
-      <button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
+      <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
+        <button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
+        <button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
+      </div>
+      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
       <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
     </div>
   </div>
   <div class="card">
-    <div class="card-head">Method</div>
+    <div class="card-head">Method Split</div>
     <div class="card-body">
-      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
       <table>
-        <tr><th>Profile</th><th>Purpose</th></tr>
-        <tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
-        <tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
-        <tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
       </table>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
     </div>
   </div>
 </div>
@@ -2029,21 +2040,24 @@ function benchmarkMode() {
 function benchmarkUpdateSelectionNote() {
 	const selected = benchmarkSelectedGPUIndices();
-	const btn = document.getElementById('benchmark-run-btn');
+	const perfBtn = document.getElementById('benchmark-run-performance-btn');
+	const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
 	const note = document.getElementById('benchmark-selection-note');
 	if (!selected.length) {
-		btn.disabled = true;
+		perfBtn.disabled = true;
+		fitBtn.disabled = true;
 		note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
 		return;
 	}
-	btn.disabled = false;
+	perfBtn.disabled = false;
+	fitBtn.disabled = false;
 	const mode = benchmarkMode();
 	if (mode === 'ramp-up') {
-		note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.';
+		note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
 	} else if (mode === 'parallel') {
-		note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : '');
+		note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
 	} else {
-		note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : '');
+		note.textContent = 'Sequential: each selected GPU benchmarked separately.';
 	}
 }
@@ -2117,7 +2131,7 @@ function benchmarkSelectNone() {
 	benchmarkUpdateSelectionNote();
 }
 
-function runNvidiaBenchmark() {
+function runNvidiaBenchmark(kind) {
 	const selected = benchmarkSelectedGPUIndices();
 	const status = document.getElementById('benchmark-run-status');
 	if (!selected.length) {
@@ -2127,21 +2141,26 @@
 	if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
 	const mode = benchmarkMode();
 	const rampUp = mode === 'ramp-up' && selected.length > 1;
-	const parallelGPUs = mode === 'parallel';
+	const parallelGPUs = mode === 'parallel' && kind === 'performance';
+	if (kind === 'power-fit' && mode === 'parallel') {
+		status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
+		return;
+	}
 	const body = {
 		profile: document.getElementById('benchmark-profile').value || 'standard',
 		gpu_indices: selected,
-		run_nccl: selected.length > 1,
+		run_nccl: kind === 'performance' && selected.length > 1,
 		parallel_gpus: parallelGPUs,
 		ramp_up: rampUp,
-		display_name: 'NVIDIA Benchmark'
+		display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
 	};
 	document.getElementById('benchmark-output').style.display = 'block';
-	document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
+	document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
 	const term = document.getElementById('benchmark-terminal');
-	term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
+	term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
 	status.textContent = 'Queueing...';
-	fetch('/api/benchmark/nvidia/run', {
+	const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
+	fetch(endpoint, {
 		method: 'POST',
 		headers: {'Content-Type':'application/json'},
 		body: JSON.stringify(body)
@@ -2195,7 +2214,7 @@ benchmarkLoadGPUs();
func renderBenchmarkResultsCard(exportDir string) string { func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir) maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns( return renderBenchmarkResultsCardFromRuns(
"Benchmark Results", "Perf Results",
"Composite score by saved benchmark run and GPU.", "Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.", "No saved benchmark runs yet.",
maxIdx, maxIdx,
@@ -2237,11 +2256,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
} }
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBenchmarkBaseDir baseDir := app.DefaultBeeBenchPerfDir
if strings.TrimSpace(exportDir) != "" { if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-benchmark") baseDir = filepath.Join(exportDir, "bee-bench", "perf")
} }
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json")) paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
if err != nil || len(paths) == 0 { if err != nil || len(paths) == 0 {
return -1, nil return -1, nil
} }
@@ -2280,7 +2299,6 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs return maxGPUIndex, runs
} }
// ── Burn ────────────────────────────────────────────────────────────────────── // ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string { func renderBurn() string {
@@ -3245,12 +3263,19 @@ fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
else if (kind === 'disk') label = 'disk (' + source + ')'; else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source; else label = source;
boot.textContent = 'Current boot source: ' + label + '.'; boot.textContent = 'Current boot source: ' + label + '.';
if (d.in_ram) { txt.textContent = d.message || 'Checking...';
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.'; if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)'; txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else { } else {
txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.'; txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = ''; btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
} }
}); });
function installToRAM() { function installToRAM() {

View File

@@ -261,7 +261,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
-	mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
+	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)

View File

@@ -11,6 +11,7 @@ import (
 	"time"
 	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
 )
 func TestChartLegendNumber(t *testing.T) {
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
 	}
 }
+func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
+	row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
+	if row.Status != "WARNING" {
+		t.Fatalf("status=%q want WARNING", row.Status)
+	}
+	if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
+		t.Fatalf("issue=%q", row.Issue)
+	}
+}
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -637,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 		`href="/benchmark"`,
 		`id="benchmark-gpu-list"`,
 		`/api/gpu/nvidia`,
-		`/api/benchmark/nvidia/run`,
+		`/api/bee-bench/nvidia/perf/run`,
+		`/api/bee-bench/nvidia/power/run`,
 		`benchmark-run-nccl`,
+		`Run Performance Benchmark`,
+		`Run Power / Thermal Fit`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -649,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	dir := t.TempDir()
 	exportDir := filepath.Join(dir, "export")
-	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		t.Fatal(err)
 	}
@@ -691,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
 	body := rec.Body.String()
 	wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
 	for _, needle := range []string{
-		`Benchmark Results`,
+		`Perf Results`,
 		`Composite score by saved benchmark run and GPU.`,
 		`GPU 0`,
 		`GPU 1`,

View File

@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
 		b.WriteString(benchmarkCard)
 	}
+	if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
+		b.WriteString(powerCard)
+	}
 	if len(report.Charts) > 0 {
 		for _, chart := range report.Charts {
@@ -251,7 +254,9 @@
 }
 func renderTaskBenchmarkResultsCard(target, logText string) string {
-	if strings.TrimSpace(target) != "nvidia-benchmark" {
+	switch strings.TrimSpace(target) {
+	case "nvidia-bench-perf":
+	default:
 		return ""
 	}
 	resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +268,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
 		return ""
 	}
 	return renderBenchmarkResultsCardFromRuns(
-		"Benchmark Results",
+		"Perf Results",
 		"Composite score for this benchmark task.",
 		"No benchmark results were saved for this task.",
 		columns,
@@ -271,15 +276,42 @@
 	)
 }
+func renderTaskPowerResultsCard(target, logText string) string {
+	if strings.TrimSpace(target) != "nvidia-bench-power" {
+		return ""
+	}
+	resultPath := taskBenchmarkResultPath(logText)
+	if strings.TrimSpace(resultPath) == "" {
+		return ""
+	}
+	raw, err := os.ReadFile(resultPath)
+	if err != nil {
+		return ""
+	}
+	var result platform.NvidiaPowerBenchResult
+	if err := json.Unmarshal(raw, &result); err != nil {
+		return ""
+	}
+	var b strings.Builder
+	b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
+	if len(result.RecommendedSlotOrder) > 0 {
+		b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
+	}
+	b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
+			gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
+	}
+	b.WriteString(`</table></div></div>`)
+	return b.String()
+}
 func taskBenchmarkResultPath(logText string) string {
 	archivePath := taskArchivePathFromLog(logText)
 	if archivePath == "" {
 		return ""
 	}
 	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	if runDir == archivePath {
+		return ""
+	}
 	return filepath.Join(runDir, "result.json")
 }

View File

@@ -32,7 +32,8 @@ const (
 var taskNames = map[string]string{
 	"nvidia":                 "NVIDIA SAT",
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
-	"nvidia-benchmark":       "NVIDIA Benchmark",
+	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
+	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = 300
 		}
 		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
-	case "nvidia-benchmark":
+	case "nvidia-bench-perf":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
 			break
@@ -644,6 +645,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal: t.params.RampTotal,
 			RampRunID: t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RampStep:          t.params.RampStep,
+			RampTotal:         t.params.RampTotal,
+			RampRunID:         t.params.RampRunID,
+		}, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")

View File

@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	taskReportMetricsDBPath = metricsPath
 	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
-	benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
+	benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
 	if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
 		t.Fatal(err)
 	}
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	}
 	task := &Task{
 		ID:           "task-bench",
-		Name:         "NVIDIA Benchmark",
-		Target:       "nvidia-benchmark",
+		Name:         "NVIDIA Bee Bench Perf",
+		Target:       "nvidia-bench-perf",
 		Status:       TaskDone,
 		CreatedAt:    time.Now().UTC().Add(-time.Minute),
 		ArtifactsDir: artifactsDir,
 	}
 	ensureTaskReportPaths(task)
-	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
+	logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
 	if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
 	}
 	html := string(body)
 	for _, needle := range []string{
-		`Benchmark Results`,
+		`Perf Results`,
 		`Composite score for this benchmark task.`,
 		`GPU 0`,
 		`1176.25`,

View File

@@ -1,5 +1,34 @@
 # Benchmark clock calibration research
+
+## Benchmark methodology versioning
+
+Every benchmark methodology change must bump the benchmark version constant in
+source code by exactly `+1`.
+
+Methodology change means any change that affects comparability of benchmark
+results, including for example:
+- phase durations or phase order
+- enabled/disabled precisions
+- fallback rules
+- normalization rules
+- score formulas or weights
+- degradation thresholds
+- power calibration logic
+- thermal/power penalty logic
+
+Requirements:
+- benchmark version must be stored in source code as an explicit version
+  constant, not inferred from git tag or build metadata
+- benchmark report must always print the benchmark version
+- `result.json` must always include the benchmark version
+- results from different benchmark versions must be treated as non-comparable by
+  default
+
+Purpose:
+- prevent accidental comparison of runs produced by different methodologies
+- make historical benchmark archives self-describing even when detached from git
+- force deliberate version bumps whenever scoring or execution semantics change
 ## Status
 In progress. Baseline data from production servers pending.
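
The versioning rules in the new document section lend themselves to a small sketch. This is an illustrative Go fragment, not the repo's actual code; `BenchVersion`, `resultJSON`, and `comparableRuns` are hypothetical names.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// BenchVersion is the explicit methodology version constant the rules
// require; bump by exactly +1 on every methodology change. The value 7
// here is illustrative.
const BenchVersion = 7

type resultJSON struct {
	BenchmarkVersion int     `json:"benchmark_version"` // always persisted in result.json
	CompositeScore   float64 `json:"composite_score"`
}

// comparableRuns treats results from different benchmark versions as
// non-comparable by default.
func comparableRuns(a, b resultJSON) bool {
	return a.BenchmarkVersion == b.BenchmarkVersion
}

func main() {
	r := resultJSON{BenchmarkVersion: BenchVersion, CompositeScore: 1176.25}
	raw, _ := json.Marshal(r) // the saved result is self-describing
	fmt.Println(string(raw))
	fmt.Println(comparableRuns(r, resultJSON{BenchmarkVersion: BenchVersion - 1}))
}
```

Because the constant lives in source rather than in a git tag, an archived `result.json` stays self-describing even when detached from the repository.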

View File

@@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
         CUDA_R_16F,
         CUBLAS_COMPUTE_32F_FAST_16F,
     },
+    {
+        "int8_tensor",
+        "int8",
+        75,
+        1,
+        0,
+        0,
+        128,
+        CUDA_R_8I,
+        CUDA_R_8I,
+        CUDA_R_32I,
+        CUDA_R_32I,
+        CUBLAS_COMPUTE_32I,
+    },
     {
         "fp8_e4m3",
         "fp8",
@@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
 static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
     switch (type) {
     case CUDA_R_32F:
+    case CUDA_R_32I:
         return (size_t)(elements * 4u);
     case CUDA_R_16F:
     case CUDA_R_16BF:
         return (size_t)(elements * 2u);
+    case CUDA_R_8I:
     case CUDA_R_8F_E4M3:
     case CUDA_R_8F_E5M2:
         return (size_t)(elements);
@@ -776,6 +792,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
     }
 }
+static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
+    if (desc->compute_type == CUBLAS_COMPUTE_32I) {
+        return CUDA_R_32I;
+    }
+    if (desc->compute_type == CUBLAS_COMPUTE_64F) {
+        return CUDA_R_64F;
+    }
+    return CUDA_R_32F;
+}
 static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
     uint64_t row_tiles = (rows + 127u) / 128u;
     uint64_t col_tiles = (cols + 63u) / 64u;
@@ -944,8 +970,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
         return 0;
     }
+    cudaDataType_t scale_type = matmul_scale_type(desc);
     if (!check_cublas("cublasLtMatmulDescCreate",
-            cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
+            cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
         destroy_profile(cublas, cuda, out);
         return 0;
     }
@@ -1094,17 +1121,30 @@ static int prepare_profile(struct cublaslt_api *cublas,
 static int run_cublas_profile(cublasLtHandle_t handle,
                               struct cublaslt_api *cublas,
                               struct prepared_profile *profile) {
+    int32_t alpha_i32 = 1;
+    int32_t beta_i32 = 0;
+    double alpha_f64 = 1.0;
+    double beta_f64 = 0.0;
     float alpha = 1.0f;
     float beta = 0.0f;
+    const void *alpha_ptr = &alpha;
+    const void *beta_ptr = &beta;
+    if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
+        alpha_ptr = &alpha_i32;
+        beta_ptr = &beta_i32;
+    } else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
+        alpha_ptr = &alpha_f64;
+        beta_ptr = &beta_f64;
+    }
     return check_cublas(profile->desc.name,
         cublas->cublasLtMatmul(handle,
             profile->op_desc,
-            &alpha,
+            alpha_ptr,
             (const void *)(uintptr_t)profile->a_dev,
             profile->a_layout,
             (const void *)(uintptr_t)profile->b_dev,
             profile->b_layout,
-            &beta,
+            beta_ptr,
             (const void *)(uintptr_t)profile->c_dev,
             profile->c_layout,
             (void *)(uintptr_t)profile->d_dev,
@@ -1359,11 +1399,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
 }
 #endif
+static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
+    printf("device=%s\n", report->device);
+    printf("device_index=%d\n", device_index);
+    printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
+    printf("backend=%s\n", report->backend);
+    printf("duration_s=%d\n", seconds);
+    printf("buffer_mb=%d\n", report->buffer_mb);
+    printf("streams=%d\n", report->stream_count);
+    printf("iterations=%lu\n", report->iterations);
+    printf("checksum=%llu\n", (unsigned long long)report->checksum);
+    if (report->details[0] != '\0') {
+        printf("%s", report->details);
+    }
+    printf("status=OK\n");
+}
 int main(int argc, char **argv) {
     int seconds = 5;
     int size_mb = 64;
     int device_index = 0;
     const char *precision_filter = NULL; /* NULL = all; else block_label to match */
+    const char *precision_plan = NULL;
+    const char *precision_plan_seconds = NULL;
     for (int i = 1; i < argc; i++) {
         if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
             seconds = atoi(argv[++i]);
@@ -1373,9 +1431,13 @@ int main(int argc, char **argv) {
             device_index = atoi(argv[++i]);
         } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
             precision_filter = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
+            precision_plan = argv[++i];
+        } else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
+            precision_plan_seconds = argv[++i];
         } else {
             fprintf(stderr,
-                "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
+                "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
                 argv[0]);
             return 2;
         }
@@ -1436,6 +1498,76 @@ int main(int argc, char **argv) {
     int ok = 0;
 #if HAVE_CUBLASLT_HEADERS
+    if (precision_plan != NULL && precision_plan[0] != '\0') {
+        char *plan_copy = strdup(precision_plan);
+        char *plan_seconds_copy = NULL;
+        int phase_seconds[32] = {0};
+        int phase_seconds_count = 0;
+        int phase_ok = 0;
+        if (plan_copy == NULL) {
+            fprintf(stderr, "failed to allocate precision plan buffer\n");
+            return 1;
+        }
+        if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
+            plan_seconds_copy = strdup(precision_plan_seconds);
+            if (plan_seconds_copy == NULL) {
+                free(plan_copy);
+                fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
+                return 1;
+            }
+            for (char *sec_token = strtok(plan_seconds_copy, ",");
+                 sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
+                 sec_token = strtok(NULL, ",")) {
+                while (*sec_token == ' ' || *sec_token == '\t') {
+                    sec_token++;
+                }
+                if (*sec_token == '\0') {
+                    continue;
+                }
+                phase_seconds[phase_seconds_count++] = atoi(sec_token);
+            }
+        }
+        int phase_idx = 0;
+        for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
+            while (*token == ' ' || *token == '\t') {
+                token++;
+            }
+            if (*token == '\0') {
+                continue;
+            }
+            const char *phase_name = token;
+            const char *phase_filter = token;
+            if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
+                phase_filter = NULL;
+            }
+            int phase_duration = seconds;
+            if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
+                phase_duration = phase_seconds[phase_idx];
+            }
+            printf("phase_begin=%s\n", phase_name);
+            fflush(stdout);
+            memset(&report, 0, sizeof(report));
+            ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
+            if (ok) {
+                print_stress_report(&report, device_index, phase_duration);
+                phase_ok = 1;
+            } else {
+                printf("phase_error=%s\n", phase_name);
+                if (report.details[0] != '\0') {
+                    printf("%s", report.details);
+                    if (report.details[strlen(report.details) - 1] != '\n') {
+                        printf("\n");
+                    }
+                }
+                printf("status=FAILED\n");
+            }
+            printf("phase_end=%s\n", phase_name);
+            fflush(stdout);
+        }
+        free(plan_seconds_copy);
+        free(plan_copy);
+        return phase_ok ? 0 : 1;
+    }
     ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
 #endif
     if (!ok) {
@@ -1454,18 +1586,6 @@ int main(int argc, char **argv) {
         }
     }
-    printf("device=%s\n", report.device);
-    printf("device_index=%d\n", device_index);
-    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
-    printf("backend=%s\n", report.backend);
-    printf("duration_s=%d\n", seconds);
-    printf("buffer_mb=%d\n", report.buffer_mb);
-    printf("streams=%d\n", report.stream_count);
-    printf("iterations=%lu\n", report.iterations);
-    printf("checksum=%llu\n", (unsigned long long)report.checksum);
-    if (report.details[0] != '\0') {
-        printf("%s", report.details);
-    }
-    printf("status=OK\n");
+    print_stress_report(&report, device_index, seconds);
     return 0;
 }

View File

@@ -7,10 +7,12 @@ SIZE_MB=0
 DEVICES=""
 EXCLUDE=""
 PRECISION=""
+PRECISION_PLAN=""
+PRECISION_PLAN_SECONDS=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
 usage() {
-  echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
+  echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
   exit 2
 }
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
     --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
     --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
     --precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
+    --precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
+    --precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
     *) usage ;;
   esac
 done
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
   echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
   precision_arg=""
   [ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
+  precision_plan_arg=""
+  [ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
+  precision_plan_seconds_arg=""
+  [ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
   CUDA_VISIBLE_DEVICES="${id}" \
-    "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
+    "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
   pid=$!
   WORKERS="${WORKERS} ${pid}:${id}:${log}"
   if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then