Estimate fan duty from observed RPM maxima

Disable fp64/fp4 in mixed gpu burn
Disable unstable fp4/fp64 benchmark phases
2026-04-16 10:10:18 +03:00 · 2026-04-16 10:00:03 +03:00 · 2026-04-16 09:58:02 +03:00 · 2026-04-16 07:21:02 +03:00 · 2026-04-16 06:57:23 +03:00 · 2026-04-16 06:54:13 +03:00
15 changed files with 1555 additions and 427 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
+	if result.PlatformPowerScore > 0 {
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n", result.PlatformPowerScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -81,41 +84,92 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
-	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
-	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
-	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
-	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
-	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
-	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
-	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
-	b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
-	b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
-	b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
+	// ── Balanced Scorecard ────────────────────────────────────────────────────
+	b.WriteString("## Balanced Scorecard\n\n")

-	// ── Scorecard table ───────────────────────────────────────────────────────
-	b.WriteString("## Scorecard\n\n")
-	b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
-	b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
+	// Perspective 1: Compatibility — hard stops
+	b.WriteString("### 1. Compatibility\n\n")
+	b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
+	b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
 	for _, gpu := range result.GPUs {
-		name := strings.TrimSpace(gpu.Name)
-		if name == "" {
-			name = "Unknown GPU"
+		thermalThrottle := "-"
+		if gpu.Scores.ThermalThrottlePct > 0 {
+			thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
 		}
-		interconnect := "-"
-		if gpu.Scores.InterconnectScore > 0 {
-			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
+		fanAtThrottle := "-"
+		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+			fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
 		}
-		topsPerSM := "-"
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+		ecc := "-"
+		if gpu.ECC.Uncorrected > 0 {
+			ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
 		}
+		compatStatus := "✓ OK"
+		if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+			compatStatus = "⛔ HARD STOP"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
+			gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 2: Thermal headroom
+	b.WriteString("### 2. Thermal Headroom\n\n")
+	b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
+	b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
+	for _, gpu := range result.GPUs {
+		shutdownTemp := gpu.ShutdownTempC
+		if shutdownTemp <= 0 {
+			shutdownTemp = 90
+		}
+		slowdownTemp := gpu.SlowdownTempC
+		if slowdownTemp <= 0 {
+			slowdownTemp = 80
+		}
+		headroom := gpu.Scores.TempHeadroomC
+		thermalStatus := "✓ OK"
+		switch {
+		case headroom < 10:
+			thermalStatus = "⛔ CRITICAL"
+		case gpu.Steady.P95TempC >= slowdownTemp:
+			thermalStatus = "⚠ WARNING"
+		}
+		throttlePct := "-"
+		if gpu.Scores.ThermalThrottlePct > 0 {
+			throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+		}
+		fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
+			gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 3: Power delivery
+	b.WriteString("### 3. Power Delivery\n\n")
+	b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
+	b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
+	for _, gpu := range result.GPUs {
+		powerCap := "-"
+		if gpu.Scores.PowerCapThrottlePct > 0 {
+			powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+		}
+		fanDuty := "-"
+		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+			fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+		}
+		powerStatus := "✓ OK"
+		if gpu.Scores.PowerCapThrottlePct > 5 {
+			powerStatus = "⚠ POWER LIMITED"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
+			gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
+	}
+	b.WriteString("\n")
+
+	// Perspective 4: Performance
+	b.WriteString("### 4. Performance\n\n")
+	b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
+	b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
+	for _, gpu := range result.GPUs {
 		synthetic := "-"
 		if gpu.Scores.SyntheticScore > 0 {
 			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
@@ -128,20 +182,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		if gpu.Scores.MixedEfficiency > 0 {
 			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
 		}
-		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
-			gpu.Index, name,
-			gpu.Status,
-			gpu.Scores.CompositeScore,
-			gpu.Scores.ComputeScore,
-			synthetic,
-			mixed,
-			mixedEff,
-			topsPerSM,
-			gpu.Scores.PowerSustainScore,
-			gpu.Scores.ThermalSustainScore,
-			gpu.Scores.StabilityScore,
-			interconnect,
-		)
+		topsPerSM := "-"
+		if gpu.Scores.TOPSPerSMPerGHz > 0 {
+			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+		}
+		fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
+			gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
+	}
+	if len(result.PerformanceRampSteps) > 0 {
+		fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
+	}
+	b.WriteString("\n")
+
+	// Perspective 5: Anomaly flags
+	b.WriteString("### 5. Anomalies\n\n")
+	b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
+	b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
+	for _, gpu := range result.GPUs {
+		eccCorr := "-"
+		if gpu.ECC.Corrected > 0 {
+			eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+		}
+		syncBoost := "-"
+		if gpu.Scores.SyncBoostThrottlePct > 0 {
+			syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+		}
+		powerVar := "OK"
+		if gpu.Scores.PowerSustainScore < 70 {
+			powerVar = "⚠ unstable"
+		}
+		thermalVar := "OK"
+		if gpu.Scores.ThermalSustainScore < 70 {
+			thermalVar = "⚠ unstable"
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
+			gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
 	}
 	b.WriteString("\n")

@@ -171,13 +246,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 		}
 		if gpu.PowerLimitDerated {
-			fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
+			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
 		}
 		if gpu.CalibratedPeakPowerW > 0 {
 			if gpu.CalibratedPeakTempC > 0 {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
 			} else {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
 			}
 		}
 		if gpu.LockedGraphicsClockMHz > 0 {
@@ -186,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")

 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-		b.WriteString("\n")
+		if benchmarkTelemetryAvailable(gpu.Steady) {
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+			b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
+			fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
+			fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
+			fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
+			fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
+			fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+			b.WriteString("\n")
+		} else {
+			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+		}

 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
@@ -329,6 +408,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── Platform Scalability ──────────────────────────────────────────────────
+	if len(result.PerformanceRampSteps) > 0 {
+		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
+		b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
+		b.WriteString("|--------|-------------|----------------------|-------------|\n")
+		for _, step := range result.PerformanceRampSteps {
+			fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
+				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
+		}
+		b.WriteString("\n")
+	}
+
 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 		benchmarkPrecisionPhases,
 		func(label string) string { return label },
 	)
-	if len(labels) != 7 || len(phases) != 7 {
-		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	if len(labels) != 5 || len(phases) != 5 {
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
 	}
 	if basePhaseSec != 60 {
 		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
 		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
 	}
-	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
 	if mixedPhaseSec != 3600 {
 		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
 	if mixedPhaseSec != 14400 {
 		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
 	t.Parallel()

-	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
-	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
 }
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	}
 }

+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+	t.Parallel()
+
+	score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
+		PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
+			{Precision: "fp16", WeightedTeraOpsPerSec: 100},
+			{Precision: "fp64", WeightedTeraOpsPerSec: 999},
+			{Precision: "fp4", WeightedTeraOpsPerSec: 999},
+		},
+		PrecisionResults: []BenchmarkPrecisionResult{
+			{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+			{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+			{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+		},
+	})
+
+	if score.SyntheticScore != 100 {
+		t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
+	}
+	if score.MixedScore != 50 {
+		t.Fatalf("MixedScore=%f want 50", score.MixedScore)
+	}
+}
+
 func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -55,27 +56,32 @@ type NvidiaBenchmarkOptions struct {
 }

 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	BenchmarkVersion string    `json:"benchmark_version"`
+	GeneratedAt      time.Time `json:"generated_at"`
+	Hostname         string    `json:"hostname,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
+	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
+	// 100% = each added GPU contributes exactly its single-card throughput.
+	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
+	OverallStatus        string                       `json:"overall_status"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
+	Findings             []string                     `json:"findings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -107,6 +113,12 @@ type BenchmarkGPUResult struct {
 	PowerLimitDerated   bool    `json:"power_limit_derated,omitempty"`
 	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
+	// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
+	ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
+	// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
+	// Fallback: 80°C.
+	SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// Used as the reference denominator for PowerSustainScore instead of
@@ -206,9 +218,30 @@ type BenchmarkScorecard struct {
 	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
-	StabilityScore      float64 `json:"stability_score"`
-	InterconnectScore   float64 `json:"interconnect_score"`
-	CompositeScore      float64 `json:"composite_score"`
+	// StabilityScore falls as the share of steady-state time the GPU spent throttling
+	// (thermal + power cap combined) rises. 0% throttle = 100; 100% throttle = 0.
+	StabilityScore float64 `json:"stability_score"`
+
+	// Throttle breakdown — percentage of steady-state time in each throttle type.
+	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
+	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
+
+	// Temperature headroom: distance to the 100°C destruction threshold.
+	// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
+	// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
+	TempHeadroomC float64 `json:"temp_headroom_c"`
+
+	InterconnectScore float64 `json:"interconnect_score"`
+	// ServerQualityScore (0–100) reflects server infrastructure quality independent
+	// of GPU model. Combines throttle time, power variance, and temp variance.
+	// Use this to compare servers with the same GPU, or to flag a bad server
+	// that throttles an otherwise fast GPU.
+	ServerQualityScore float64 `json:"server_quality_score"`
+	// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
+	// A throttling GPU will score lower here automatically — no quality multiplier.
+	CompositeScore float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }
@@ -265,16 +298,31 @@ type NvidiaPowerBenchResult struct {
 	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
 	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
 	OverallStatus        string                 `json:"overall_status"`
-	Findings             []string               `json:"findings,omitempty"`
-	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
+	// cumulative thermal ramp. Represents the actual sustained power budget of
+	// this server under full GPU load. Use for rack power planning.
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
 }

 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
-	Name                string   `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	Index              int     `json:"index"`
+	Name               string  `json:"name,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
+	// AppliedPowerLimitW is the stable limit found during single-card calibration.
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
+	// StablePowerLimitW is the final fixed limit for this GPU after the
+	// cumulative thermal ramp. This is the limit at which the GPU operated
+	// stably with all other GPUs running simultaneously at their own limits.
+	// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
+	// additional derating.
+	StablePowerLimitW   float64  `json:"stable_power_limit_w,omitempty"`
 	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
 	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
@@ -286,13 +334,31 @@ type NvidiaPowerBenchGPU struct {
 }

 type NvidiaPowerBenchStep struct {
-	StepIndex              int      `json:"step_index"`
-	GPUIndices             []int    `json:"gpu_indices"`
-	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
-	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
-	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
-	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
-	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
-	Status                 string   `json:"status"`
-	Notes                  []string `json:"notes,omitempty"`
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// NewGPUIndex is the GPU whose stable limit was searched in this step.
+	NewGPUIndex int `json:"new_gpu_index"`
+	// NewGPUStableLimitW is the stable power limit found for the new GPU.
+	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
+	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW   float64  `json:"avg_observed_power_w,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+}
+
+// NvidiaPerformanceRampStep holds per-step performance data for the
+// scalability ramp-up phase of the performance benchmark.
+type NvidiaPerformanceRampStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
+	// TOPS from dedicated per-precision phases) across all GPUs in this step.
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
+	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
+	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
+	ScalabilityPct float64  `json:"scalability_pct"`
+	Status         string   `json:"status"`
+	Notes          []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		dutyEstimated := 0
+		if r.FanDutyCycleEstimated {
+			dutyEstimated = 1
+		}
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
+			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -426,6 +426,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -443,6 +450,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -460,6 +474,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -552,10 +573,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
-	// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
-	// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
-	// controller can cause memtester to spin forever on a single subtest.
-	timeoutSec := sizeMB*passes*150/100 + 120
+	// Keep Validate Memory bounded to a quick diagnostic window: ~20 s per 100 MB
+	// per pass plus 60 s, clamped to 3-15 minutes. This is meant to be enough for healthy
+	// systems while avoiding the prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
+	timeoutSec := sizeMB*passes*20/100 + 60
+	if timeoutSec < 180 {
+		timeoutSec = 180
+	}
+	if timeoutSec > 900 {
+		timeoutSec = 900
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
 	UpdatedAt time.Time
 }

+type fanObservationState struct {
+	MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+type fanPeakCandidate struct {
+	FirstSeen time.Time
+	RPM       float64
+}
+
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
+	fanObservationMu   sync.Mutex
+	fanObservation     fanObservationState
+	fanObservationInit bool
+	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )

 const systemPowerHoldTTL = 15 * time.Second

+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+const fanObservationMinPeakHold = time.Second
+
+// normalizeObservedFanMaxRPM rounds a positive RPM up to the next multiple of
+// 1000 and maps zero or negative readings to 0. math.Max(0, ...) provides that
+// clamp, since Ceil of a non-positive input never exceeds zero.
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+	return math.Max(0, math.Ceil(rpm/1000.0)*1000.0)
+}
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
+		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }

+// loadFanObservationLocked initialises the in-memory maxima once and seeds
+// them from fanObservationStatePath when that file parses as JSON; blank names
+// and non-positive RPMs are dropped. Call with fanObservationMu held.
+func loadFanObservationLocked() {
+	if fanObservationInit {
+		return
+	}
+	fanObservationInit = true
+	fanObservation.MaxRPM = make(map[string]float64)
+	var persisted fanObservationState
+	raw, err := os.ReadFile(fanObservationStatePath)
+	if err != nil || len(raw) == 0 || json.Unmarshal(raw, &persisted) != nil {
+		return
+	}
+	for rawName, rpm := range persisted.MaxRPM {
+		name := strings.TrimSpace(rawName)
+		if name == "" || rpm <= 0 {
+			continue
+		}
+		fanObservation.MaxRPM[name] = rpm
+	}
+}
+
+// saveFanObservationLocked writes the observed maxima as indented JSON to
+// fanObservationStatePath, creating its parent directory first. Best-effort:
+// errors are dropped and an empty map is never written. Call with the lock held.
+func saveFanObservationLocked() {
+	if len(fanObservation.MaxRPM) == 0 {
+		return
+	}
+	// Create the directory the file really lives in (filepath.Dir is never "").
+	if err := os.MkdirAll(filepath.Dir(fanObservationStatePath), 0755); err != nil {
+		return
+	}
+	raw, err := json.MarshalIndent(fanObservation, "", "  ")
+	if err != nil {
+		return
+	}
+	_ = os.WriteFile(fanObservationStatePath, raw, 0644)
+}
+
+// updateFanObservation folds a batch of readings into the per-fan observed
+// maxima. A reading above the stored maximum first opens a peak candidate; the
+// maximum is raised (rounded by normalizeObservedFanMaxRPM) only by a later
+// reading arriving at least fanObservationMinPeakHold after the candidate, so a
+// single sample cannot set it. Changed maxima are persisted before returning.
+func updateFanObservation(fans []FanReading, now time.Time) {
+	if len(fans) == 0 {
+		return
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	dirty := false
+	for _, reading := range fans {
+		key := strings.TrimSpace(reading.Name)
+		if key == "" || reading.RPM <= 0 {
+			continue
+		}
+		stored := fanObservation.MaxRPM[key]
+		pending, tracking := fanPeakCandidates[key]
+		switch {
+		case reading.RPM <= stored:
+			delete(fanPeakCandidates, key)
+		case !tracking:
+			fanPeakCandidates[key] = fanPeakCandidate{FirstSeen: now, RPM: reading.RPM}
+		case now.Sub(pending.FirstSeen) >= fanObservationMinPeakHold:
+			if peak := math.Max(pending.RPM, reading.RPM); peak > stored {
+				fanObservation.MaxRPM[key] = normalizeObservedFanMaxRPM(peak)
+				dirty = true
+			}
+			delete(fanPeakCandidates, key)
+		case reading.RPM > pending.RPM:
+			fanPeakCandidates[key] = fanPeakCandidate{FirstSeen: pending.FirstSeen, RPM: reading.RPM}
+		}
+	}
+	if dirty {
+		saveFanObservationLocked()
+	}
+}
+
+// estimateFanDutyCyclePctFromObservation approximates duty cycle as the mean of
+// RPM/observed-max percentages (each clamped to 0..100) over fans with a positive
+// reading and a recorded maximum; the bool is false when no fan qualifies.
+func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
+	if len(fans) == 0 {
+		return 0, false
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	var pcts []float64
+	for _, reading := range fans {
+		key := strings.TrimSpace(reading.Name)
+		ceiling := fanObservation.MaxRPM[key]
+		if key == "" || reading.RPM <= 0 || ceiling <= 0 {
+			continue
+		}
+		ratio := reading.RPM / ceiling * 100.0
+		switch {
+		case ratio > 100:
+			ratio = 100
+		case ratio < 0:
+			ratio = 0
+		}
+		pcts = append(pcts, ratio)
+	}
+	if len(pcts) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(pcts), true
+}
+
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {

 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
+		if fanErr != nil {
+			return 0, false, false
+		}
+		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
+	return pct, ok, false
+}
+
+// sampleFanDutyCyclePctFromFans returns (pct, true, true) when an RPM-based estimate exists, else (0, false, false).
+func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
+	if len(fans) > 0 {
+		if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
+			return pct, true, true
+		}
+	}
+	return 0, false, false
+}

 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }

+func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
+	// Deliberately not parallel: this test swaps package-level fan-observation
+	// state, which would race with any other test that samples fan speeds.
+	oldPath := fanObservationStatePath
+	oldState := fanObservation
+	oldInit := fanObservationInit
+	oldCandidates := fanPeakCandidates
+	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	t.Cleanup(func() {
+		fanObservationStatePath = oldPath
+		fanObservation = oldState
+		fanObservationInit = oldInit
+		fanPeakCandidates = oldCandidates
+	})
+
+	start := time.Unix(100, 0)
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
+	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
+		t.Fatalf("single-sample spike should not establish observed max")
+	}
+
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
+
+	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected estimated duty cycle from persisted observed max")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("got=%v want ~43.3", got)
+	}
+
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected persisted observed max to be reloaded from disk")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("reloaded got=%v want ~43.3", got)
+	}
+}
+
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -737,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			now := time.Now()
 			t.DoneAt = &now
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -72,6 +72,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
 .badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
 .badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
 .badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
 /* Output terminal */
 .terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
 .terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
@@ -363,23 +370,25 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 			html.EscapeString(label), html.EscapeString(value), badgeHTML))
 	}

-	cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
-	writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
+	writeRow("CPU", hwDescribeCPU(hw),
+		renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))

-	memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
-	writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
+	writeRow("Memory", hwDescribeMemory(hw),
+		renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))

-	storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
-	writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
+	writeRow("Storage", hwDescribeStorage(hw),
+		renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))

-	gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
-	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
+	writeRow("GPU", hwDescribeGPU(hw),
+		renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))

-	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
-	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
-		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
+	psuMatched := matchedRecords(records, nil, []string{"psu:"})
+	if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
+		// No PSU records yet — synthesise a single chip from IPMI status.
+		psuStatus := hwPSUStatus(hw.PowerSupplies)
+		psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
 	}
-	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
+	writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))

 	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
 		writeRow("Network", nicDesc, "")
@@ -892,6 +901,31 @@ func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
 	}
 }

+// firstNonEmpty returns the first non-empty string in vals, or "" if there is none.
+func firstNonEmpty(vals ...string) string {
+	for _, v := range vals {
+		if v != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+// matchedRecords returns all ComponentStatusRecord entries whose key matches any exact key or any of the given prefixes (per-device chip rendering).
+func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
+	var matched []app.ComponentStatusRecord
+	for _, rec := range records {
+		key := strings.TrimSpace(rec.ComponentKey)
+		if key == "" {
+			continue
+		}
+		if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) {
+			matched = append(matched, rec)
+		}
+	}
+	return matched
+}
+
 func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
 	matched := make([]app.ComponentStatusRecord, 0)
 	for _, rec := range records {
@@ -1034,6 +1068,52 @@ func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) str
 	return strings.Join(messages, "; ")
 }

+// chipLetterClass maps a status (trimmed, case-insensitive) to a chip letter and CSS class; unknown values give "?".
+func chipLetterClass(status string) (letter, cls string) {
+	letter, cls = "?", "chip-unknown"
+	switch strings.ToUpper(strings.TrimSpace(status)) {
+	case "OK":
+		letter, cls = "O", "chip-ok"
+	case "WARNING", "WARN", "PARTIAL":
+		letter, cls = "W", "chip-warn"
+	case "CRITICAL", "FAIL", "FAILED", "ERROR":
+		letter, cls = "F", "chip-fail"
+	}
+	return letter, cls
+}
+
+// renderComponentChips renders one 20×20 chip per ComponentStatusRecord.
+// Hover tooltip shows component key, status, error summary and last check time.
+// Falls back to a single unknown chip when no records are available.
+// Note that the slice is sorted in place by component key, so the caller's
+// ordering changes as a side effect.
+func renderComponentChips(matched []app.ComponentStatusRecord) string {
+	if len(matched) == 0 {
+		return `<span class="chips"><span class="chip chip-unknown" title="No data">?</span></span>`
+	}
+	sort.Slice(matched, func(a, b int) bool {
+		return matched[a].ComponentKey < matched[b].ComponentKey
+	})
+	var out strings.Builder
+	out.WriteString(`<span class="chips">`)
+	for _, rec := range matched {
+		// Tooltip layout: "<key>: <status>[ — <error>][ (checked HH:MM:SS)]".
+		tip := rec.ComponentKey + ": " + firstNonEmpty(rec.Status, "UNKNOWN")
+		if rec.ErrorSummary != "" {
+			tip += " — " + rec.ErrorSummary
+		}
+		if !rec.LastCheckedAt.IsZero() {
+			tip += fmt.Sprintf(" (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
+		}
+		letter, cls := chipLetterClass(rec.Status)
+		// Only the tooltip carries record data, so it is the only part escaped.
+		fmt.Fprintf(&out, `<span class="chip %s" title="%s">%s</span>`,
+			cls, html.EscapeString(tip), letter)
+	}
+	out.WriteString(`</span>`)
+	return out.String()
+}
+
 func runtimeStatusBadge(status string) string {
 	status = strings.ToUpper(strings.TrimSpace(status))
 	badge := "badge-unknown"
@@ -1339,7 +1419,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.Memory,
 			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
+			`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -162,6 +162,32 @@ type nvidiaRampSpec struct {
 	TotalDurationSec int
 }

+// resolveMemoryValidatePreset picks the memtester size (MB) and pass count. A known
+// burn profile wins; otherwise stress mode selects 512 MB and the default is 256 MB.
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+	sizeMB, passes = 256, 1
+	switch p := strings.TrimSpace(strings.ToLower(profile)); {
+	case p == "overnight":
+		sizeMB, passes = 1024, 2
+	case p == "acceptance":
+		sizeMB = 1024
+	case p != "smoke" && stress:
+		sizeMB = 512
+	}
+	return sizeMB, passes
+}
+
+// taskMayLeaveOrphanWorkers reports whether target (trimmed, case-insensitive) is a task type that may leave stale workers.
+func taskMayLeaveOrphanWorkers(target string) bool {
+	switch strings.TrimSpace(strings.ToLower(target)) {
+	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-bandwidth", "nvidia-stress", "nvidia-compute", "nvidia-bench-perf",
+		"memory", "memory-stress", "cpu", "sat-stress", "platform-stress":
+		return true
+	}
+	return false
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -751,10 +777,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		sizeMB, passes := 256, 1
-		if t.params.StressMode {
-			sizeMB, passes = 1024, 3
-		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
@@ -1010,6 +1034,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1037,6 +1064,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1141,10 +1171,13 @@ func (q *taskQueue) loadLocked() {
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
 			// The task was interrupted by a bee-web restart. Child processes
-			// (e.g. bee-gpu-burn-worker) survive the restart in their own
-			// process groups and cannot be cancelled retroactively. Mark the
-			// task as failed so the user can decide whether to re-run it
-			// rather than blindly re-launching duplicate workers.
+			// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
+			// their own process groups. Kill any matching stale workers before
+			// marking the task failed so the next GPU test does not inherit a
+			// busy DCGM slot or duplicate workers.
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				_ = platform.KillTestWorkers()
+			}
 			now := time.Now()
 			t.Status = TaskFailed
 			t.DoneAt = &now
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
 	}
 }

+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+	var gotSizeMB, gotPasses int
+	q := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	tk := &Task{
+		ID:        "mem-validate-1",
+		Name:      "Memory SAT",
+		Target:    "memory",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{StressMode: true},
+	}
+	j := &jobState{}
+	// Swap in a stub runner that only records the preset it is called with.
+	orig := runMemoryAcceptancePackCtx
+	runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+		gotSizeMB = sizeMB
+		gotPasses = passes
+		return "/tmp/memory-validate.tar.gz", nil
+	}
+	defer func() { runMemoryAcceptancePackCtx = orig }()
+	// Run the task synchronously; the stub above captures size and pass count.
+	q.runTask(tk, j, context.Background())
+	// StressMode with no burn profile is expected to resolve to 512 MB x 1 pass.
+	if gotSizeMB != 512 || gotPasses != 1 {
+		t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
+	}
+}
+
 func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -35,6 +35,8 @@ typedef void *CUstream;
 #define MAX_STRESS_STREAMS 16
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define MAX_SINGLE_PRECISION_STREAMS 4
+#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)

 static const char *ptx_source =
    ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
    return stream_count;
 }

+/* Return profile_budget_bytes capped at MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES. */
+static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
+    const size_t cap = MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+
+    return profile_budget_bytes > cap ? cap : profile_budget_bytes;
+}
+
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    if (!api->cuStreamDestroy) {
        return;
@@ -704,6 +713,19 @@ static const struct profile_desc k_profiles[] = {

 #define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))

+/* A profile runs only when enabled and cc >= min_cc. With a precision filter
+ * the block label must match it exactly; mixed/all runs skip fp64 and fp4,
+ * which are unstable on the current fleet and can abort the whole pass. */
+static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
+    if (!desc->enabled || cc < desc->min_cc) {
+        return 0;
+    }
+    if (precision_filter == NULL) {
+        return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
+    }
+    return strcmp(desc->block_label, precision_filter) == 0;
+}
+
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -908,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
-    memset(out, 0, sizeof(*out));
-    out->desc = *desc;
-    out->stream = stream;
-
    size_t bytes_per_cell = 0;
+    size_t attempt_budget = profile_budget_bytes;
+
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,106 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
        return 0;
    }

-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
-    out->m = dim;
-    out->n = dim;
-    out->k = dim;
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
+        memset(out, 0, sizeof(*out));
+        out->desc = *desc;
+        out->stream = stream;

-    size_t desired_workspace = profile_budget_bytes / 8u;
-    if (desired_workspace > 32u * 1024u * 1024u) {
-        desired_workspace = 32u * 1024u * 1024u;
-    }
-    desired_workspace = round_down_size(desired_workspace, 256u);
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
+        out->m = dim;
+        out->n = dim;
+        out->k = dim;

-    size_t a_bytes = 0;
-    size_t b_bytes = 0;
-    size_t c_bytes = 0;
-    size_t d_bytes = 0;
-    size_t scale_bytes = 0;
-    while (1) {
-        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
-        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
-        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
-        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
-        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+        size_t desired_workspace = attempt_budget / 8u;
+        if (desired_workspace > 32u * 1024u * 1024u) {
+            desired_workspace = 32u * 1024u * 1024u;
+        }
+        desired_workspace = round_down_size(desired_workspace, 256u);

-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
-        if (matrix_bytes <= profile_budget_bytes) {
-            size_t remaining = profile_budget_bytes - matrix_bytes;
-            out->workspace_size = desired_workspace;
-            if (out->workspace_size > remaining) {
-                out->workspace_size = round_down_size(remaining, 256u);
+        size_t a_bytes = 0;
+        size_t b_bytes = 0;
+        size_t c_bytes = 0;
+        size_t d_bytes = 0;
+        size_t scale_bytes = 0;
+        while (1) {
+            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+            if (matrix_bytes <= attempt_budget) {
+                size_t remaining = attempt_budget - matrix_bytes;
+                out->workspace_size = desired_workspace;
+                if (out->workspace_size > remaining) {
+                    out->workspace_size = round_down_size(remaining, 256u);
+                }
+                break;
            }
-            break;
+
+            if (out->m <= (uint64_t)desc->min_multiple) {
+                break;
+            }
+            out->m -= (uint64_t)desc->min_multiple;
+            out->n = out->m;
+            out->k = out->m;
+        }
+        if (out->m < (uint64_t)desc->min_multiple) {
+            attempt_budget /= 2u;
+            continue;
        }

-        if (out->m <= (uint64_t)desc->min_multiple) {
-            return 0;
-        }
-        out->m -= (uint64_t)desc->min_multiple;
-        out->n = out->m;
-        out->k = out->m;
-    }
-
-    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cudaDataType_t scale_type = matmul_scale_type(desc);
-    if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cublasOperation_t transa = CUBLAS_OP_T;
-    cublasOperation_t transb = CUBLAS_OP_N;
-    if (!check_cublas("set TRANSA",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSA,
-                                                             &transa,
-                                                             sizeof(transa))) ||
-        !check_cublas("set TRANSB",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSB,
-                                                             &transb,
-                                                             sizeof(transb)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (desc->needs_scalar_scale) {
-        float one = 1.0f;
-        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
-            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+
+        cudaDataType_t scale_type = matmul_scale_type(desc);
+        if (!check_cublas("cublasLtMatmulDescCreate",
+                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
-        if (!check_cublas("set A scale ptr",
+
+        cublasOperation_t transa = CUBLAS_OP_T;
+        cublasOperation_t transb = CUBLAS_OP_N;
+        if (!check_cublas("set TRANSA",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                                 &a_scale_ptr,
-                                                                 sizeof(a_scale_ptr))) ||
-            !check_cublas("set B scale ptr",
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                                 &transa,
+                                                                 sizeof(transa))) ||
+            !check_cublas("set TRANSB",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                                 &b_scale_ptr,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
+                                                                 &transb,
+                                                                 sizeof(transb)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-    }
+
+        if (desc->needs_scalar_scale) {
+            float one = 1.0f;
+            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+            if (!check_cublas("set A scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                     &a_scale_ptr,
+                                                                     sizeof(a_scale_ptr))) ||
+                !check_cublas("set B scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                     &b_scale_ptr,
+                                                                     sizeof(b_scale_ptr)))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }

 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
    if (desc->needs_block_scale) {
@@ -1060,62 +1089,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
    }
 #endif

-    if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (out->workspace_size > 0) {
-        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+        if (!check_cublas("create A layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+            !check_cublas("create B layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+            !check_cublas("create C layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+            !check_cublas("create D layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
+
+        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        if (out->workspace_size > 0) {
+            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
+
+        if (!check_cublas("set workspace",
+                          cublas->cublasLtMatmulPreferenceSetAttribute(
+                              out->preference,
+                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                              &out->workspace_size,
+                              sizeof(out->workspace_size)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        int found = 0;
+        if (check_cublas("heuristic",
+                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                                out->op_desc,
+                                                                out->a_layout,
+                                                                out->b_layout,
+                                                                out->c_layout,
+                                                                out->d_layout,
+                                                                out->preference,
+                                                                1,
+                                                                &out->heuristic,
+                                                                &found)) &&
+            found > 0) {
+            out->ready = 1;
+            return 1;
+        }
+
+        destroy_profile(cublas, cuda, out);
+        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
+        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
+            break;
+        }
    }

-    if (!check_cublas("set workspace",
-                      cublas->cublasLtMatmulPreferenceSetAttribute(
-                          out->preference,
-                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                          &out->workspace_size,
-                          sizeof(out->workspace_size)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    int found = 0;
-    if (!check_cublas("heuristic",
-                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
-                                                             out->op_desc,
-                                                             out->a_layout,
-                                                             out->b_layout,
-                                                             out->c_layout,
-                                                             out->d_layout,
-                                                             out->preference,
-                                                             1,
-                                                             &out->heuristic,
-                                                             &found))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-    if (found <= 0) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    out->ready = 1;
-    return 1;
+    return 0;
 }

 static int run_cublas_profile(cublasLtHandle_t handle,
@@ -1180,6 +1212,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
+    int budget_profiles = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1202,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,

    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
-            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned++;
        }
    }
@@ -1215,30 +1247,41 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    }

    /* Count all profiles active on this GPU regardless of filter.
-     * Used as the budget divisor so matrix sizes stay consistent whether
-     * running all precisions together or a single-precision phase. */
+     * Mixed phases still divide budget across the full precision set, while
+     * single-precision benchmark phases dedicate budget only to active
+     * profiles matching precision_filter. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned_total++;
        }
    }
    if (planned_total < planned) {
        planned_total = planned;
    }
+    budget_profiles = planned_total;
+    if (precision_filter != NULL) {
+        budget_profiles = planned;
+    }
+    if (budget_profiles <= 0) {
+        budget_profiles = planned_total;
+    }

    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
+    }
+    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
+        stream_count = MAX_SINGLE_PRECISION_STREAMS;
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1251,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
+    if (precision_filter != NULL) {
+        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
+    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
                  mp_count,
+                  budget_profiles,
                  per_profile_budget / (1024u * 1024u));

    for (int i = 0; i < profile_count; i++) {
@@ -1275,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
+        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
            append_detail(report->details,
                          sizeof(report->details),
-                          "%s=SKIPPED precision_filter\n",
+                          "%s=SKIPPED benchmark_disabled\n",
                          desc->name);
            continue;
        }
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -9,9 +9,9 @@ xset s noblank

 # Set desktop background.
 if [ -f /usr/share/bee/wallpaper.png ]; then
-    feh --bg-fill /usr/share/bee/wallpaper.png
+    feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
 else
-    xsetroot -solid '#f6c90e'
+    xsetroot -solid '#000000'
 fi

 tint2 &
--- a/iso/overlay/usr/share/bee/wallpaper.png
+++ b/iso/overlay/usr/share/bee/wallpaper.png
Author	SHA1	Message	Date
Mikhail Chusavitin	18e24a9aa5	Estimate fan duty from observed RPM maxima	2026-04-16 10:10:18 +03:00
Mikhail Chusavitin	e306250da7	Disable fp64/fp4 in mixed gpu burn	2026-04-16 10:00:03 +03:00
Mikhail Chusavitin	c5b2081ac9	Disable unstable fp4/fp64 benchmark phases	2026-04-16 09:58:02 +03:00
Michael Chus	434528083e	Power bench: compare GPU-reported TDP vs IPMI server power delta - NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower - RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via background goroutine throughout Phase 2 ramp - renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report) - renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 07:21:02 +03:00
Michael Chus	30aa30cd67	LiveCD: set Baby Bee wallpaper centered on black background 400×400px PNG centered via feh --bg-center --image-bg '#000000'. Fallback solid fill also changed to black. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:57:23 +03:00
Michael Chus	4f76e1de21	Dashboard: per-device status chips with hover tooltips Replace single aggregated badge per hardware category with individual colored chips (O/W/F/?) for each ComponentStatusRecord. Added helper functions: matchedRecords, firstNonEmpty. CSS classes: chip-ok/warn/fail/unknown. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:54:13 +03:00
Michael Chus	3732e64a4a	Add slowdown temperature exceedance detector to benchmark detectSlowdownTempExceedance scans steady-state metric rows per GPU and emits a [WARNING] note + PARTIAL status if any sample >= SlowdownTempC. Uses per-GPU threshold from nvidia-smi -q, fallback 80°C. Distinct from p95-based TempHeadroomC check: catches even a single spike above the slowdown threshold that would be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:46:45 +03:00
Michael Chus	0d925299ff	Use per-GPU temperature limits from nvidia-smi -q for headroom calculation Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from nvidia-smi -q verbose output in enrichGPUInfoWithMaxClocks. Store as ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult. Fallback: 90°C shutdown / 80°C slowdown when not available. TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C). Warning threshold: p95 >= SlowdownTempC. Critical: headroom < 10°C. Report table shows both limits alongside headroom and p95 temp. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:45:15 +03:00
Michael Chus	a8d5e019a5	Translate report to English; add power anomaly detector All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage >50% as [HARD STOP] — indicates bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:42:00 +03:00
Michael Chus	72ec086568	Restructure benchmark report as balanced scorecard (5 perspectives) Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct, SyncBoostThrottlePct. Add TempHeadroomC (100 - p95_temp) as independent thermal headroom metric; warning < 20°C (>80°C), critical < 10°C (>90°C). Hard stop findings: thermal throttle with fans < 95%, ECC uncorrected errors, p95 temp > 90°C. Throttle findings now include per-type percentages and diagnostic context. Replace flat scorecard table with BSC 5-perspective layout: 1. Compatibility (hard stops: thermal+fan, ECC) 2. Thermal headroom (p95 temp, delta to 100°C, throttle %) 3. Power delivery (power cap throttle, power CV, fan duty) 4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz) 5. Anomalies (ECC corrected, sync boost, power/thermal variance) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:40:06 +03:00
Michael Chus	7a0b0934df	Separate compute score from server quality score CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower automatically — no quality multiplier distorting the compute signal. Add ServerQualityScore (0-100): server infrastructure quality independent of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×Thermal. Use to compare servers with the same GPU or flag bad server conditions. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:45:55 +03:00
Michael Chus	d8ca0dca2c	Redesign scoring metrics: variance-based sustain scores, throttle stability PowerSustainScore: power draw variance (CV) during load, not deviation from TDP. ThermalSustainScore: temperature variance (CV) during load. StabilityScore: fraction of time spent in thermal+power-cap throttling. Remove NCCL bonus from quality_factor. quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain, cap 1.00. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:39:59 +03:00
Michael Chus	d90250f80a	Fix DCGM cleanup and shorten memory validate	2026-04-16 00:39:37 +03:00
Michael Chus	8d6eaef5de	Update perf benchmark report methodology to reflect new design Remove references to pre-benchmark power calibration and dcgmi targeted_power. Document platform_power_score ramp-up methodology, PowerSustainScore fallback to steady-state power, and full-budget single-precision phases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:31:58 +03:00
Michael Chus	732bf4cbab	Redesign power and performance benchmarks with new methodology Power/Thermal Fit: cumulative fixed-limit ramp where each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of stable limits across all GPUs. Remove PlatformPowerScore from power test. Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute k=2..N parallel ramp-up steps and compute PlatformPowerScore = mean compute scalability vs best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration absent. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:30:50 +03:00
Michael Chus	fa6d905a10	Tune bee-gpu-burn single-precision benchmark phases	2026-04-16 00:05:47 +03:00