Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections

- Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (previously discarded)
- Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in the ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU
- Rewrite renderPowerBenchReport: replace Per-Slot Results with a Single GPU section, rework Ramp Sequence (rows = runs, columns = GPUs), add a PSU Performance section (conditional on IPMI data), and add a transposed Single vs All-GPU comparison table in the per-GPU sections
- Add fmtMDTable helper (benchmark_table.go) and apply it to all tables in both the power and performance reports so columns align in plain-text view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Stability hardening, build script fixes, GRUB bee logo
2026-04-19 18:04:12 +03:00 · 2026-04-19 13:08:31 +03:00 · 2026-04-19 13:07:48 +03:00 · 2026-04-19 08:42:17 +03:00 · 2026-04-18 15:28:05 +03:00 · 2026-04-18 15:08:41 +03:00
50 changed files with 4005 additions and 889 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -146,7 +146,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }

 type runtimeChecker interface {
@@ -304,7 +304,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	}
 	filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
 	tmpPath := filepath.Join(os.TempDir(), filename)
-	data, err := os.ReadFile(DefaultAuditJSONPath)
+	data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
 	if err != nil {
 		return "", err
 	}
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }

+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -128,6 +128,7 @@ type fakeSAT struct {
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }

-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNCCLFn != nil {
+		return f.runNCCLFn(baseDir, gpuIndices)
+	}
 	return "", nil
 }

+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	var gotBaseDir string
+	var gotGPUIndices []int
+	a := &App{
+		sat: fakeSAT{
+			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
+				gotBaseDir = baseDir
+				gotGPUIndices = append([]int(nil), gpuIndices...)
+				return "/tmp/nccl-tests.tar.gz", nil
+			},
+		},
+	}
+
+	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
+	if err != nil {
+		t.Fatalf("RunNCCLTests error: %v", err)
+	}
+	if path != "/tmp/nccl-tests.tar.gz" {
+		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
+	}
+	if gotBaseDir != "/tmp/sat" {
+		t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
+	}
+	if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
+		t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
+	}
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -2,10 +2,29 @@ package app

 import (
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 )

+// readFileLimited reads path into memory, refusing files larger than maxBytes.
+// Prevents OOM on corrupted or unexpectedly large data files.
+func readFileLimited(path string, maxBytes int64) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
+	if err != nil {
+		return nil, err
+	}
+	if int64(len(data)) > maxBytes {
+		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
+	}
+	return data, nil
+}
+
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
-	data, err := os.ReadFile(path)
+	data, err := readFileLimited(path, 10<<20)
 	if err != nil && !os.IsNotExist(err) {
 		return nil, err
 	}
--- a/audit/internal/collector/psu.go
+++ b/audit/internal/collector/psu.go
@@ -160,11 +160,54 @@ type psuSDR struct {
 }

 var psuSlotPatterns = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
+	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                    // PSU1, PS1, ps 2
+	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                      // PS 6, PS6
+	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                     // PWS1
+	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
+	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                     // Bay 1
+	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
+	// Must be last: "power supply N" is already caught by the pattern above.
+	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
+}
+
+// isPSUInputPower reports whether a lower-cased sensor name is an AC-input power sensor:
+//   MSI:     PSU1_POWER_IN, PSU1_PIN
+//   MLT:     PSU1_PIN
+//   xFusion: (matched via default fallback — no explicit keyword)
+//   HPE:     PS1 Input Power, PS1 Input Watts
+func isPSUInputPower(name string) bool {
+	return strings.Contains(name, "input power") ||
+		strings.Contains(name, "input watts") ||
+		strings.Contains(name, "_pin") ||
+		strings.Contains(name, " pin") ||
+		strings.Contains(name, "_power_in") ||
+		strings.Contains(name, "power_in")
+}
+
+// isPSUOutputPower matches DC-output power sensor names across vendors:
+//   MSI:     PSU1_POWER_OUT
+//   MLT:     PSU1_POUT
+//   xFusion: PS1 POut
+func isPSUOutputPower(name string) bool {
+	return strings.Contains(name, "output power") ||
+		strings.Contains(name, "output watts") ||
+		strings.Contains(name, "_pout") ||
+		strings.Contains(name, " pout") ||
+		strings.Contains(name, "_power_out") ||
+		strings.Contains(name, "power_out") ||
+		strings.Contains(name, "power supply bay") ||
+		strings.Contains(name, "psu bay")
+}
+
+// parseBoundedFloat parses a numeric value from an SDR value field and
+// validates it is within (0, max]. Returns nil for zero, negative, or
+// out-of-range values — these indicate missing/off/fault sensor readings.
+func parseBoundedFloat(raw string, max float64) *float64 {
+	v := parseFloatPtr(raw)
+	if v == nil || *v <= 0 || *v > max {
+		return nil
+	}
+	return v
 }

 func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +237,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {

 		lowerName := strings.ToLower(name)
 		switch {
-		case strings.Contains(lowerName, "input power"):
-			entry.inputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "output power"):
-			entry.outputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
-			entry.outputPowerW = parseFloatPtr(value)
+		case isPSUInputPower(lowerName):
+			entry.inputPowerW = parseBoundedFloat(value, 6000)
+		case isPSUOutputPower(lowerName):
+			entry.outputPowerW = parseBoundedFloat(value, 6000)
 		case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
 			entry.inputVoltage = parseFloatPtr(value)
 		case strings.Contains(lowerName, "temp"):
 			entry.temperatureC = parseFloatPtr(value)
 		case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
 			entry.healthPct = parsePercentPtr(value)
+		default:
+			// Generic PSU power reading: sensor matched a slot pattern but carries
+			// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
+			// AC input if the value looks like wattage and no better data is set yet.
+			if entry.inputPowerW == nil {
+				entry.inputPowerW = parseBoundedFloat(value, 6000)
+			}
 		}
 		out[slot] = entry
 	}
 	return out
 }

+// PSUSlotPower holds SDR power readings for one PSU slot.
+// Slot key used by PSUSlotsFromSDR is the 0-based index string,
+// matching HardwarePowerSupply.Slot in the audit schema.
+type PSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`
+	OutputW *float64 `json:"output_w,omitempty"`
+	Status  string   `json:"status,omitempty"`
+}
+
+// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
+// using the same battle-tested slot patterns as the hardware audit collector.
+// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
+// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
+func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
+	sdr := parsePSUSDR(sdrOutput)
+	if len(sdr) == 0 {
+		return nil
+	}
+	out := make(map[string]PSUSlotPower, len(sdr))
+	for slot, entry := range sdr {
+		key := strconv.Itoa(slot - 1) // audit uses 0-based slot
+		out[key] = PSUSlotPower{
+			InputW:  entry.inputPowerW,
+			OutputW: entry.outputPowerW,
+			Status:  entry.status,
+		}
+	}
+	return out
+}
+
 func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
 	if len(sdr) == 0 {
 		return nil
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if result.ScalabilityScore > 0 {
 		fmt.Fprintf(&b, "**Scalability score:** %.1f%%  \n", result.ScalabilityScore)
 	}
+	if result.PlatformPowerScore > 0 {
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n", result.PlatformPowerScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	b.WriteString("\n")

@@ -81,69 +84,164 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
-	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
-	b.WriteString("**Compute score** is derived from two phases:\n\n")
-	b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
-	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
-	b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
-	b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
-	b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
-	b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
-	b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
-	b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
-	b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
-	b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
+	// ── Balanced Scorecard ────────────────────────────────────────────────────
+	b.WriteString("## Balanced Scorecard\n\n")

-	// ── Scorecard table ───────────────────────────────────────────────────────
-	b.WriteString("## Scorecard\n\n")
-	b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
-	b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
-	for _, gpu := range result.GPUs {
-		name := strings.TrimSpace(gpu.Name)
-		if name == "" {
-			name = "Unknown GPU"
+	// Perspective 1: Compatibility — hard stops
+	b.WriteString("### 1. Compatibility\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			thermalThrottle := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			fanAtThrottle := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			ecc := "-"
+			if gpu.ECC.Uncorrected > 0 {
+				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
+			}
+			compatStatus := "✓ OK"
+			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+				compatStatus = "⛔ HARD STOP"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
 		}
-		interconnect := "-"
-		if gpu.Scores.InterconnectScore > 0 {
-			interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
-		}
-		topsPerSM := "-"
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
-		}
-		synthetic := "-"
-		if gpu.Scores.SyntheticScore > 0 {
-			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
-		}
-		mixed := "-"
-		if gpu.Scores.MixedScore > 0 {
-			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
-		}
-		mixedEff := "-"
-		if gpu.Scores.MixedEfficiency > 0 {
-			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
-		}
-		fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
-			gpu.Index, name,
-			gpu.Status,
-			gpu.Scores.CompositeScore,
-			gpu.Scores.ComputeScore,
-			synthetic,
-			mixed,
-			mixedEff,
-			topsPerSM,
-			gpu.Scores.PowerSustainScore,
-			gpu.Scores.ThermalSustainScore,
-			gpu.Scores.StabilityScore,
-			interconnect,
-		)
+		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 2: Thermal headroom
+	b.WriteString("### 2. Thermal Headroom\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			shutdownTemp := gpu.ShutdownTempC
+			if shutdownTemp <= 0 {
+				shutdownTemp = 90
+			}
+			slowdownTemp := gpu.SlowdownTempC
+			if slowdownTemp <= 0 {
+				slowdownTemp = 80
+			}
+			headroom := gpu.Scores.TempHeadroomC
+			thermalStatus := "✓ OK"
+			switch {
+			case headroom < 10:
+				thermalStatus = "⛔ CRITICAL"
+			case gpu.Steady.P95TempC >= slowdownTemp:
+				thermalStatus = "⚠ WARNING"
+			}
+			throttlePct := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
+				fmt.Sprintf("%.0f°C", slowdownTemp),
+				fmt.Sprintf("%.0f°C", shutdownTemp),
+				fmt.Sprintf("%.1f°C", headroom),
+				throttlePct,
+				thermalStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 3: Power delivery
+	b.WriteString("### 3. Power Delivery\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			powerCap := "-"
+			if gpu.Scores.PowerCapThrottlePct > 0 {
+				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+			}
+			fanDuty := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			powerStatus := "✓ OK"
+			if gpu.Scores.PowerCapThrottlePct > 5 {
+				powerStatus = "⚠ POWER LIMITED"
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				powerCap,
+				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
+				fanDuty,
+				powerStatus,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
+		b.WriteString("\n")
+	}
+
+	// Perspective 4: Performance
+	b.WriteString("### 4. Performance\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			synthetic := "-"
+			if gpu.Scores.SyntheticScore > 0 {
+				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+			}
+			mixed := "-"
+			if gpu.Scores.MixedScore > 0 {
+				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+			}
+			mixedEff := "-"
+			if gpu.Scores.MixedEfficiency > 0 {
+				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
+			}
+			topsPerSM := "-"
+			if gpu.Scores.TOPSPerSMPerGHz > 0 {
+				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
+				synthetic, mixed, mixedEff, topsPerSM,
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
+		if len(result.PerformanceRampSteps) > 0 {
+			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
+		}
+		b.WriteString("\n")
+	}
+
+	// Perspective 5: Anomaly flags
+	b.WriteString("### 5. Anomalies\n\n")
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			eccCorr := "-"
+			if gpu.ECC.Corrected > 0 {
+				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+			}
+			syncBoost := "-"
+			if gpu.Scores.SyncBoostThrottlePct > 0 {
+				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+			}
+			powerVar := "OK"
+			if gpu.Scores.PowerSustainScore < 70 {
+				powerVar = "⚠ unstable"
+			}
+			thermalVar := "OK"
+			if gpu.Scores.ThermalSustainScore < 70 {
+				thermalVar = "⚠ unstable"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
+		}
+		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// ── Per GPU detail ────────────────────────────────────────────────────────
 	b.WriteString("## Per-GPU Details\n\n")
@@ -171,13 +269,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
 		}
 		if gpu.PowerLimitDerated {
-			fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
+			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
 		}
 		if gpu.CalibratedPeakPowerW > 0 {
 			if gpu.CalibratedPeakTempC > 0 {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
 			} else {
-				fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
+				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
 			}
 		}
 		if gpu.LockedGraphicsClockMHz > 0 {
@@ -186,19 +284,27 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")

 		// Steady-state telemetry
-		fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-		b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-		fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-		fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-		fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-		fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-		fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
-		b.WriteString("\n")
+		if benchmarkTelemetryAvailable(gpu.Steady) {
+			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
+			b.WriteString(fmtMDTable(
+				[]string{"", "Avg", "P95"},
+				[][]string{
+					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
+					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
+					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
+					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
+					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
+				},
+			))
+			b.WriteString("\n")
+		} else {
+			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
+		}

 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
 			b.WriteString("**Per-precision stability:**\n\n")
-			b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
+			var precRows [][]string
 			for _, p := range gpu.PrecisionSteady {
 				eccCorr := "—"
 				eccUncorr := "—"
@@ -210,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 				if strings.TrimSpace(status) == "" {
 					status = "OK"
 				}
-				fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
-					p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
-					eccCorr, eccUncorr)
+				precRows = append(precRows, []string{
+					p.Precision, status,
+					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
+					eccCorr, eccUncorr,
+				})
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
 			b.WriteString("\n")
 		} else {
 			// Legacy: show combined-window variance.
@@ -236,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Precision results
 		if len(gpu.PrecisionResults) > 0 {
 			b.WriteString("**Precision results:**\n\n")
-			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
+			var presRows [][]string
 			for _, p := range gpu.PrecisionResults {
 				if p.Supported {
-					weightStr := fmt.Sprintf("×%.3g", p.Weight)
-					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
-						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
+					presRows = append(presRows, []string{
+						p.Name,
+						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
+						fmt.Sprintf("×%.3g", p.Weight),
+						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
+						fmt.Sprintf("%d", p.Lanes),
+						fmt.Sprintf("%d", p.Iterations),
+					})
 				} else {
-					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
+					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
 				}
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
 			b.WriteString("\n")
 		}

@@ -267,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("## Interconnect (NCCL)\n\n")
 		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
-			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
-			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Avg", "Max"},
+				[][]string{
+					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
+					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
+				},
+			))
 			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
@@ -286,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		if !sp.Available {
 			b.WriteString("IPMI power measurement unavailable.\n\n")
 		} else {
-			b.WriteString("| | Value |\n|---|---|\n")
-			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
-			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
-			if sp.ReportingRatio > 0 {
-				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			spRows := [][]string{
+				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
+				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
+				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
+				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
 			}
+			if sp.ReportingRatio > 0 {
+				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
+			}
+			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
 			b.WriteString("\n")
 		}
 		for _, note := range sp.Notes {
@@ -304,19 +427,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── PSU Issues ────────────────────────────────────────────────────────────
+	if len(result.PSUIssues) > 0 {
+		b.WriteString("## PSU Issues\n\n")
+		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
+		for _, issue := range result.PSUIssues {
+			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
+		}
+		b.WriteString("\n")
+	}
+
 	// ── Cooling ───────────────────────────────────────────────────────────────
 	if cooling := result.Cooling; cooling != nil {
 		b.WriteString("## Cooling\n\n")
 		if cooling.Available {
-			b.WriteString("| Metric | Value |\n|--------|-------|\n")
-			fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
+			dutyAvg, dutyP95 := "N/A", "N/A"
 			if cooling.FanDutyCycleAvailable {
-				fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
-				fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
-			} else {
-				b.WriteString("| Average fan duty cycle | N/A |\n")
-				b.WriteString("| P95 fan duty cycle | N/A |\n")
+				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
+				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
 			}
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Value"},
+				[][]string{
+					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
+					{"Average fan duty cycle", dutyAvg},
+					{"P95 fan duty cycle", dutyP95},
+				},
+			))
 			b.WriteString("\n")
 		} else {
 			b.WriteString("Cooling telemetry unavailable.\n\n")
@@ -329,6 +466,23 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── Platform Scalability ──────────────────────────────────────────────────
+	if len(result.PerformanceRampSteps) > 0 {
+		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
+		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
+		var scalRows [][]string
+		for _, step := range result.PerformanceRampSteps {
+			scalRows = append(scalRows, []string{
+				fmt.Sprintf("%d", step.StepIndex),
+				joinIndexList(step.GPUIndices),
+				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
+				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
+			})
+		}
+		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
+		b.WriteString("\n")
+	}
+
 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
--- a/audit/internal/platform/benchmark_table.go
+++ b/audit/internal/platform/benchmark_table.go
@@ -0,0 +1,75 @@
+package platform
+
+import (
+	"strings"
+)
+
+// fmtMDTable renders a markdown table whose columns are padded to a common
+// width, so the table also reads cleanly as plain text.
+//
+// headers holds the column titles; an empty headers slice yields "".
+// rows holds the data rows. A row with fewer cells than headers is padded with
+// empty cells; cells beyond len(headers) are ignored.
+//
+// Widths are measured in runes rather than bytes so that non-ASCII cells
+// (e.g. "85°C", "—") do not push the following columns out of alignment.
+func fmtMDTable(headers []string, rows [][]string) string {
+	ncols := len(headers)
+	if ncols == 0 {
+		return ""
+	}
+
+	// cellAt returns row[i], or "" when the row is too short.
+	cellAt := func(row []string, i int) string {
+		if i < len(row) {
+			return row[i]
+		}
+		return ""
+	}
+	// runeLen measures a cell in runes, the unit used for padding.
+	runeLen := func(s string) int { return len([]rune(s)) }
+
+	// Compute max width per column across the header and all rows.
+	widths := make([]int, ncols)
+	for i, h := range headers {
+		widths[i] = runeLen(h)
+	}
+	for _, row := range rows {
+		for i := range widths {
+			if w := runeLen(cellAt(row, i)); w > widths[i] {
+				widths[i] = w
+			}
+		}
+	}
+
+	var b strings.Builder
+
+	// writeRow emits one "| a | b |" line, right-padding each cell with spaces.
+	writeRow := func(row []string) {
+		b.WriteByte('|')
+		for i, w := range widths {
+			cell := cellAt(row, i)
+			b.WriteByte(' ')
+			b.WriteString(cell)
+			b.WriteString(strings.Repeat(" ", w-runeLen(cell)))
+			b.WriteString(" |")
+		}
+		b.WriteByte('\n')
+	}
+
+	writeRow(headers)
+
+	// Separator row: dashes spanning each column plus its two padding spaces.
+	b.WriteByte('|')
+	for _, w := range widths {
+		b.WriteString(strings.Repeat("-", w+2))
+		b.WriteByte('|')
+	}
+	b.WriteByte('\n')
+
+	for _, row := range rows {
+		writeRow(row)
+	}
+
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 		benchmarkPrecisionPhases,
 		func(label string) string { return label },
 	)
-	if len(labels) != 7 || len(phases) != 7 {
-		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
+	if len(labels) != 5 || len(phases) != 5 {
+		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
 	}
 	if basePhaseSec != 60 {
 		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
 	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
 		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
 	}
-	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
+	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
 	if mixedPhaseSec != 3600 {
 		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
+	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
 	if mixedPhaseSec != 14400 {
 		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
 	}
-	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
+	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
 		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
 	}
 }
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
 func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
 	t.Parallel()

-	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
+	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
-	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
+	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
 		t.Fatalf("supported=%v", got)
 	}
 }
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
 	}
 }

+// TestScoreBenchmarkGPUIgnoresDisabledPrecisions feeds the scorer fp64 and fp4
+// entries worth 999 and expects the scores to equal the fp16 and fp32_tf32 values.
+func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
+	t.Parallel()
+
+	steady := []BenchmarkPrecisionSteadyPhase{
+		{Precision: "fp16", WeightedTeraOpsPerSec: 100},
+		{Precision: "fp64", WeightedTeraOpsPerSec: 999},
+		{Precision: "fp4", WeightedTeraOpsPerSec: 999},
+	}
+	results := []BenchmarkPrecisionResult{
+		{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
+		{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
+		{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
+	}
+	got := scoreBenchmarkGPUResult(BenchmarkGPUResult{PrecisionSteady: steady, PrecisionResults: results})
+	switch {
+	case got.SyntheticScore != 100:
+		t.Fatalf("SyntheticScore=%f want 100", got.SyntheticScore)
+	case got.MixedScore != 50:
+		t.Fatalf("MixedScore=%f want 50", got.MixedScore)
+	}
+}
+
 func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -42,6 +43,31 @@ const (
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )

+// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
+// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
+// re-measure from actual task logs and update the constants here.
+//
+// Sources:
+//   - BenchmarkEstimatedPerfStandardSec:   MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
+//   - BenchmarkEstimatedPerfStabilitySec:  xFusion v8.22 ramp 1-8: 5532 s
+//   - BenchmarkEstimatedPerfOvernightSec:  derived from profile phases (SteadySec=27000)
+//   - BenchmarkEstimatedPowerStandardSec:  MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
+//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
+const (
+	// Performance Benchmark (bee-gpu-burn).
+	// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
+	// Sequential per-GPU mode scales approximately linearly.
+	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
+	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
+	BenchmarkEstimatedPerfOvernightSec = 8 * 3600
+
+	// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
+	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
+	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
+	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
+	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
+)
+
 type NvidiaBenchmarkOptions struct {
 	Profile           string
 	SizeMB            int
@@ -55,27 +81,36 @@ type NvidiaBenchmarkOptions struct {
 }

 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	BenchmarkVersion string    `json:"benchmark_version"`
+	GeneratedAt      time.Time `json:"generated_at"`
+	Hostname         string    `json:"hostname,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
+	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
+	// 100% = each added GPU contributes exactly its single-card throughput.
+	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
+	OverallStatus        string                       `json:"overall_status"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
+	Findings             []string                     `json:"findings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -107,6 +142,12 @@ type BenchmarkGPUResult struct {
 	PowerLimitDerated   bool    `json:"power_limit_derated,omitempty"`
 	MultiprocessorCount int     `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW  float64 `json:"default_power_limit_w,omitempty"`
+	// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
+	// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
+	ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
+	// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
+	// Fallback: 80°C.
+	SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
 	// CalibratedPeakPowerW is the p95 power measured during a short
 	// dcgmi targeted_power calibration run before the main benchmark.
 	// Used as the reference denominator for PowerSustainScore instead of
@@ -206,25 +247,83 @@ type BenchmarkScorecard struct {
 	MixedEfficiency     float64 `json:"mixed_efficiency,omitempty"`
 	PowerSustainScore   float64 `json:"power_sustain_score"`
 	ThermalSustainScore float64 `json:"thermal_sustain_score"`
-	StabilityScore      float64 `json:"stability_score"`
-	InterconnectScore   float64 `json:"interconnect_score"`
-	CompositeScore      float64 `json:"composite_score"`
+	// StabilityScore is derived from the fraction of steady-state time the GPU
+	// spent throttling (thermal + power cap combined): 0% throttle = 100; 100% throttle = 0.
+	StabilityScore float64 `json:"stability_score"`
+
+	// Throttle breakdown — percentage of steady-state time in each throttle type.
+	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
+	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
+
+	// Temperature headroom: distance to the 100°C destruction threshold.
+	// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
+	// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
+	TempHeadroomC float64 `json:"temp_headroom_c"`
+
+	InterconnectScore float64 `json:"interconnect_score"`
+	// ServerQualityScore (0–100) reflects server infrastructure quality independent
+	// of GPU model. Combines throttle time, power variance, and temp variance.
+	// Use this to compare servers with the same GPU, or to flag a bad server
+	// that throttles an otherwise fast GPU.
+	ServerQualityScore float64 `json:"server_quality_score"`
+	// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
+	// A throttling GPU will score lower here automatically — no quality multiplier.
+	CompositeScore float64 `json:"composite_score"`
 	// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

-// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
-// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
-// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
-// over-reporting its power consumption.
+// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
+// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
+// so benchmark and audit data can be correlated by slot.
+type BenchmarkPSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
+	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
+	Status  string   `json:"status,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power from multiple independent
+// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
+// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
+// covers only a subset of installed PSUs (partial coverage).
+//
+// Source legend:
+//   - DCMI      — `ipmitool dcmi power reading`; fast but may miss PSUs
+//   - SDR       — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
+//   - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
 type BenchmarkServerPower struct {
-	Available       bool     `json:"available"`
-	IdleW           float64  `json:"idle_w,omitempty"`
-	LoadedW         float64  `json:"loaded_w,omitempty"`
-	DeltaW          float64  `json:"delta_w,omitempty"`
-	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
-	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
-	Notes           []string `json:"notes,omitempty"`
+	Available       bool    `json:"available"`
+	IdleW           float64 `json:"idle_w,omitempty"`   // DCMI at idle
+	LoadedW         float64 `json:"loaded_w,omitempty"` // DCMI at peak load
+	DeltaW          float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
+	GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio  float64 `json:"reporting_ratio,omitempty"`
+
+	// PSU AC input sum — sampled at idle and at peak load using collector's
+	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
+	PSUInputIdleW   float64 `json:"psu_input_idle_w,omitempty"`
+	PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
+
+	// PSU DC output sum — power delivered to server internals after conversion.
+	PSUOutputIdleW   float64 `json:"psu_output_idle_w,omitempty"`
+	PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
+
+	// Per-slot PSU readings at idle and at peak load.
+	// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
+	PSUSlotReadingsIdle   map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
+	PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
+
+	// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
+	// PCIe slot delivery only (excludes 16-pin connector power).
+	GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
+
+	// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
+	// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
+	DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
+
+	Notes []string `json:"notes,omitempty"`
 }

 // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
@@ -265,16 +364,35 @@ type NvidiaPowerBenchResult struct {
 	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
 	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
 	OverallStatus        string                 `json:"overall_status"`
-	Findings             []string               `json:"findings,omitempty"`
-	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
+	// cumulative thermal ramp. Represents the actual sustained power budget of
+	// this server under full GPU load. Use for rack power planning.
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
+	// ServerPower captures IPMI server power delta (idle→loaded) measured in
+	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
+	// actual wall-power draw as seen by the server's power supply.
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the power benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
-	Name                string   `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	Index              int     `json:"index"`
+	Name               string  `json:"name,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
+	// AppliedPowerLimitW is the stable limit found during single-card calibration.
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
+	// StablePowerLimitW is the final fixed limit for this GPU after the
+	// cumulative thermal ramp. This is the limit at which the GPU operated
+	// stably with all other GPUs running simultaneously at their own limits.
+	// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
+	// additional derating.
+	StablePowerLimitW   float64  `json:"stable_power_limit_w,omitempty"`
 	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
 	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
@@ -283,16 +401,55 @@ type NvidiaPowerBenchGPU struct {
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// Telemetry holds the aggregated stats from the final converged calibration
+	// attempt for this GPU (temperature, power, fan, clock percentiles).
+	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
+	// Fan state sampled at the end of single-card calibration.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
-	StepIndex              int      `json:"step_index"`
-	GPUIndices             []int    `json:"gpu_indices"`
-	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
-	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
-	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
-	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
-	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
-	Status                 string   `json:"status"`
-	Notes                  []string `json:"notes,omitempty"`
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// NewGPUIndex is the GPU whose stable limit was searched in this step.
+	NewGPUIndex int `json:"new_gpu_index"`
+	// NewGPUStableLimitW is the stable power limit found for the new GPU.
+	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
+	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW   float64  `json:"avg_observed_power_w,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	Notes               []string `json:"notes,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// PSU slot readings sampled at end of this ramp step.
+	PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
+	// Fan state at end of this ramp step.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
+	// Per-GPU telemetry from this step's calibration, keyed by GPU index.
+	PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
+}
+
+// NvidiaPerformanceRampStep holds per-step performance data for the
+// scalability ramp-up phase of the performance benchmark.
+type NvidiaPerformanceRampStep struct {
+	StepIndex  int   `json:"step_index"`
+	GPUIndices []int `json:"gpu_indices"`
+	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
+	// TOPS from dedicated single-precision phases) across all GPUs in this step.
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
+	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
+	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
+	ScalabilityPct float64  `json:"scalability_pct"`
+	Status         string   `json:"status"`
+	Notes          []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
+	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		dutyEstimated := 0
+		if r.FanDutyCycleEstimated {
+			dutyEstimated = 1
+		}
+		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
+			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 	}

 	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
-	if err != nil || len(squashfsFiles) == 0 {
-		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
-	}
-
-	free := freeMemBytes()
-	var needed int64
-	for _, sf := range squashfsFiles {
-		fi, err2 := os.Stat(sf)
-		if err2 != nil {
-			return fmt.Errorf("stat %s: %v", sf, err2)
-		}
-		needed += fi.Size()
-	}
-	const headroom = 256 * 1024 * 1024
-	if free > 0 && needed+headroom > free {
-		return fmt.Errorf("insufficient RAM: need %s, available %s",
-			humanBytes(needed+headroom), humanBytes(free))
-	}
+	sourceAvailable := err == nil && len(squashfsFiles) > 0

 	dstDir := installToRAMDir
+
+	// If the source medium is unavailable, check whether a previous run already
+	// produced a complete copy in RAM. If so, skip the copy phase and proceed
+	// directly to the loop-rebind / bind-mount steps.
+	if !sourceAvailable {
+		copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+		if len(copiedFiles) > 0 {
+			log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
+			// Proceed to rebind with the already-copied files.
+			for _, dst := range copiedFiles {
+				base := filepath.Base(dst)
+				// Re-associate the loop device that was originally backed by the
+				// source file (now gone); find it by the old source path pattern.
+				srcGuess := "/run/live/medium/live/" + base
+				loopDev, lerr := findLoopForFile(srcGuess)
+				if lerr != nil {
+					log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
+					continue
+				}
+				if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
+					log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
+				} else {
+					log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+				}
+			}
+			goto bindMedium
+		}
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
+	}
+
+	{
+		free := freeMemBytes()
+		var needed int64
+		for _, sf := range squashfsFiles {
+			fi, err2 := os.Stat(sf)
+			if err2 != nil {
+				return fmt.Errorf("stat %s: %v", sf, err2)
+			}
+			needed += fi.Size()
+		}
+		const headroom = 256 * 1024 * 1024
+		if free > 0 && needed+headroom > free {
+			return fmt.Errorf("insufficient RAM: need %s, available %s",
+				humanBytes(needed+headroom), humanBytes(free))
+		}
+	}
+
 	if state.CopyPresent {
 		log("Removing stale partial RAM copy before retry...")
 	}
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		}
 	}

+bindMedium:
 	log("Copying remaining medium files...")
 	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
 		log(fmt.Sprintf("Warning: partial copy: %v", err))
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -1,11 +1,14 @@
 package platform

 import (
+	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"strconv"
 	"strings"
 	"syscall"
+	"time"
 )

 // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
@@ -30,7 +33,12 @@ type KilledProcess struct {
 // KillTestWorkers scans /proc for running test worker processes and sends
 // SIGKILL to each one found. It returns a list of killed processes.
 // Errors for individual processes (e.g. already exited) are silently ignored.
+// The scan is abandoned after 5 seconds, returning the processes killed so far,
+// so a very large process table (e.g. thousands of stress-test children) cannot block it.
 func KillTestWorkers() []KilledProcess {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
 	entries, err := os.ReadDir("/proc")
 	if err != nil {
 		return nil
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {

 	var killed []KilledProcess
 	for _, e := range entries {
+		select {
+		case <-ctx.Done():
+			slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
+			return killed
+		default:
+		}
+
 		if !e.IsDir() {
 			continue
 		}
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
 	Fans       []FanReading   `json:"fans"`
 	Temps      []TempReading  `json:"temps"`
 	PowerW     float64        `json:"power_w"`
+	PSUs       []PSUReading   `json:"psus,omitempty"`
 	CPULoadPct float64        `json:"cpu_load_pct"`
 	MemLoadPct float64        `json:"mem_load_pct"`
 	GPUs       []GPUMetricRow `json:"gpus"`
 }

+// PSUReading is a per-slot power supply input power reading, as built by samplePSUPower.
+type PSUReading struct {
+	Slot   int     `json:"slot"`    // N from the sensor's IPMI entity ID "10.N"
+	Name   string  `json:"name"`    // IPMI SDR sensor name
+	PowerW float64 `json:"power_w"` // highest positive Watt reading seen for the slot
+}
+
 // TempReading is a named temperature sensor value.
 type TempReading struct {
 	Name    string  `json:"name"`
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
 	// System power — returns 0 if unavailable
 	s.PowerW = sampleSystemPower()

+	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
+	s.PSUs = samplePSUPower()
+
 	// CPU load — from /proc/stat
 	s.CPULoadPct = sampleCPULoadPct()

@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
 	}
 	return chip + " / " + name
 }
+
+// samplePSUPower reads per-PSU input power via IPMI SDR.
+// It parses `ipmitool sdr elist full` output looking for Power Supply entity
+// sensors (entity ID "10.N") that report a value in Watts.
+// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
+func samplePSUPower() []PSUReading {
+	out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
+	if err != nil || len(out) == 0 {
+		return nil
+	}
+	// map slot → reading (keep highest-watt value per slot in case of duplicates)
+	type entry struct {
+		name   string
+		powerW float64
+	}
+	bySlot := map[int]entry{}
+	for _, line := range strings.Split(string(out), "\n") {
+		parts := strings.Split(line, "|")
+		if len(parts) < 5 {
+			continue
+		}
+		entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
+		if !strings.HasPrefix(entityID, "10.") {
+			continue // not a Power Supply entity
+		}
+		slotStr := strings.TrimPrefix(entityID, "10.")
+		slot, err := strconv.Atoi(slotStr)
+		if err != nil {
+			continue
+		}
+		valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
+		if !strings.Contains(strings.ToLower(valueField), "watts") {
+			continue
+		}
+		valueFields := strings.Fields(valueField)
+		if len(valueFields) < 2 {
+			continue
+		}
+		w, err := strconv.ParseFloat(valueFields[0], 64)
+		if err != nil || w <= 0 {
+			continue
+		}
+		sensorName := strings.TrimSpace(parts[0])
+		if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
+			bySlot[slot] = entry{name: sensorName, powerW: w}
+		}
+	}
+	if len(bySlot) == 0 {
+		return nil
+	}
+	slots := make([]int, 0, len(bySlot))
+	for s := range bySlot {
+		slots = append(slots, s)
+	}
+	sort.Ints(slots)
+	psus := make([]PSUReading, 0, len(slots))
+	for _, s := range slots {
+		e := bySlot[s]
+		psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
+	}
+	return psus
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -20,6 +20,54 @@ import (
 	"time"
 )

+// Estimated wall-clock durations for each SAT/validate test, derived from real
+// production logs in _benchmark/_v8/.
+//
+// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
+// the corresponding Run*Pack function change, re-measure the wall-clock duration
+// from actual task logs and update the matching constant here.
+//
+// Sources:
+//   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
+//   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
+//   - SATEstimatedNvidiaGPUValidatePerGPUSec:      xFusion v8.6/v8.22 — 77–87 s/GPU
+//   - SATEstimatedNvidiaGPUStressPerGPUSec:        xFusion v8.6/v8.22 — 444–448 s/GPU
+//   - SATEstimatedNvidiaTargetedStressPerGPUSec:   xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
+//   - SATEstimatedNvidiaTargetedPowerPerGPUSec:    MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
+//   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
+const (
+	// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
+	SATEstimatedCPUValidateSec = 65
+	// CPU stress: stress-ng 1800 s (stress mode default).
+	SATEstimatedCPUStressSec = 1800
+
+	// RAM: memtester 256 MB / 1 pass.
+	SATEstimatedMemoryValidateSec = 70
+	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
+	SATEstimatedMemoryStressSec = 140
+
+	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
+	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
+	SATEstimatedNvidiaGPUStressPerGPUSec = 450
+
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
+	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
+	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
+
+	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
+	SATEstimatedNvidiaPulseTestSec = 5000
+
+	// NCCL all_reduce_perf, all GPUs simultaneously.
+	SATEstimatedNvidiaInterconnectSec = 300
+	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
+	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
+	SATEstimatedNvidiaBandwidthSec = 2700
+)
+
 var (
 	satExecCommand  = exec.Command
 	satLookPath     = exec.LookPath
@@ -366,12 +414,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	return string(raw), err
 }

-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
-	// detect GPU count
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }

@@ -426,6 +476,13 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -443,6 +500,13 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -460,6 +524,13 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
 	if err != nil {
 		return "", err
 	}
+	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
+	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
 		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		satJob{
@@ -552,10 +623,16 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
 	if passes <= 0 {
 		passes = 1
 	}
-	// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
-	// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
-	// controller can cause memtester to spin forever on a single subtest.
-	timeoutSec := sizeMB*passes*150/100 + 120
+	// Bound memtester with a hard wall-clock timeout so a subtest spinning on
+	// bad memory cannot hang for 30-80 minutes as before: ~20 s per 100 MB per
+	// pass plus a 60 s buffer, clamped to the range [180 s, 900 s].
+	timeoutSec := sizeMB*passes*20/100 + 60
+	if timeoutSec < 180 {
+		timeoutSec = 180
+	}
+	if timeoutSec > 900 {
+		timeoutSec = 900
+	}
 	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
 	UpdatedAt time.Time
 }

+type fanObservationState struct {
+	MaxRPM map[string]float64 `json:"max_rpm"`
+}
+
+type fanPeakCandidate struct {
+	FirstSeen time.Time
+	RPM       float64
+}
+
 var (
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
+	fanObservationMu   sync.Mutex
+	fanObservation     fanObservationState
+	fanObservationInit bool
+	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )

 const systemPowerHoldTTL = 15 * time.Second

+var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
+
+const fanObservationMinPeakHold = time.Second
+
+func normalizeObservedFanMaxRPM(rpm float64) float64 {
+	if rpm <= 0 {
+		return 0
+	}
+	return math.Ceil(rpm/1000.0) * 1000.0
+}
+
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
+			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
+		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }

+func loadFanObservationLocked() {
+	if fanObservationInit {
+		return
+	}
+	fanObservationInit = true
+	fanObservation.MaxRPM = make(map[string]float64)
+	raw, err := os.ReadFile(fanObservationStatePath)
+	if err != nil || len(raw) == 0 {
+		return
+	}
+	var persisted fanObservationState
+	if json.Unmarshal(raw, &persisted) != nil {
+		return
+	}
+	for name, rpm := range persisted.MaxRPM {
+		name = strings.TrimSpace(name)
+		if name == "" || rpm <= 0 {
+			continue
+		}
+		fanObservation.MaxRPM[name] = rpm
+	}
+}
+
+func saveFanObservationLocked() {
+	if len(fanObservation.MaxRPM) == 0 {
+		return
+	}
+	dir := filepath.Dir(fanObservationStatePath)
+	if dir == "" || dir == "." {
+		dir = "/var/log/bee-sat"
+	}
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return
+	}
+	raw, err := json.MarshalIndent(fanObservation, "", "  ")
+	if err != nil {
+		return
+	}
+	_ = os.WriteFile(fanObservationStatePath, raw, 0644)
+}
+
+func updateFanObservation(fans []FanReading, now time.Time) {
+	if len(fans) == 0 {
+		return
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	changed := false
+	for _, fan := range fans {
+		name := strings.TrimSpace(fan.Name)
+		if name == "" || fan.RPM <= 0 {
+			continue
+		}
+		currentMax := fanObservation.MaxRPM[name]
+		if fan.RPM <= currentMax {
+			delete(fanPeakCandidates, name)
+			continue
+		}
+		if cand, ok := fanPeakCandidates[name]; ok {
+			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
+				newMax := math.Max(cand.RPM, fan.RPM)
+				if newMax > currentMax {
+					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
+					changed = true
+				}
+				delete(fanPeakCandidates, name)
+				continue
+			}
+			if fan.RPM > cand.RPM {
+				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
+			}
+			continue
+		}
+		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
+	}
+	if changed {
+		saveFanObservationLocked()
+	}
+}
+
+func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
+	if len(fans) == 0 {
+		return 0, false
+	}
+	fanObservationMu.Lock()
+	defer fanObservationMu.Unlock()
+	loadFanObservationLocked()
+	var samples []float64
+	for _, fan := range fans {
+		name := strings.TrimSpace(fan.Name)
+		if name == "" || fan.RPM <= 0 {
+			continue
+		}
+		maxRPM := fanObservation.MaxRPM[name]
+		if maxRPM <= 0 {
+			continue
+		}
+		pct := fan.RPM / maxRPM * 100.0
+		if pct > 100 {
+			pct = 100
+		}
+		if pct < 0 {
+			pct = 0
+		}
+		samples = append(samples, pct)
+	}
+	if len(samples) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(samples), true
+}
+
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {

 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
+		if fanErr != nil {
+			return 0, false, false
+		}
+		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
+	return pct, ok, false
+}
+
+func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
+	if len(fans) == 0 {
+		return 0, false, false
+	}
+	if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
+		return pct, true, true
+	}
+	return 0, false, false
 }

 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform

 import (
+	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }

+func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
+	// Not parallel: this test swaps package-level fan observation state.
+
+	oldPath := fanObservationStatePath
+	oldState := fanObservation
+	oldInit := fanObservationInit
+	oldCandidates := fanPeakCandidates
+	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	t.Cleanup(func() {
+		fanObservationStatePath = oldPath
+		fanObservation = oldState
+		fanObservationInit = oldInit
+		fanPeakCandidates = oldCandidates
+	})
+
+	start := time.Unix(100, 0)
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
+	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
+		t.Fatalf("single-sample spike should not establish observed max")
+	}
+
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
+	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
+
+	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected estimated duty cycle from persisted observed max")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("got=%v want ~43.3", got)
+	}
+
+	fanObservation = fanObservationState{}
+	fanObservationInit = false
+	fanPeakCandidates = make(map[string]fanPeakCandidate)
+	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
+	if !ok {
+		t.Fatalf("expected persisted observed max to be reloaded from disk")
+	}
+	if got < 43 || got > 44 {
+		t.Fatalf("reloaded got=%v want ~43.3", got)
+	}
+}
+
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }

+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	if len(cmd) != len(want) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
+	}
+	for i := range want {
+		if cmd[i] != want[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
+		}
+	}
+}
+
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -737,6 +737,9 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			now := time.Now()
 			t.DoneAt = &now
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -178,16 +178,20 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
 	}
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
-	if len(globalQueue.tasks) != 3 {
-		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
+	// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
 	}
-	for i, task := range globalQueue.tasks {
-		if task.Target != "nvidia-bench-power" {
-			t.Fatalf("task[%d] target=%q", i, task.Target)
-		}
-		if task.Priority != taskPriorityBenchmark {
-			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
-		}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-bench-power" {
+		t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
+	}
+	if task.Priority != taskPriorityBenchmark {
+		t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
+	}
+	if task.params.RampTotal != 3 {
+		t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
 	}
 }

--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
 	return out
 }

+// renderStackedMetricChartSVG renders a stacked area chart where each dataset
+// is visually "stacked" on top of the previous one. Intended for multi-PSU
+// power charts where the filled area of each PSU shows its individual
+// contribution and the total height equals the combined draw.
+func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	for i := range datasets {
+		if len(datasets[i]) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	times, datasets = downsampleTimeSeries(times, datasets, 1400)
+	pointCount = len(times)
+
+	// Build cumulative sums per time point.
+	cumulative := make([][]float64, len(datasets)+1)
+	for i := range cumulative {
+		cumulative[i] = make([]float64, pointCount)
+	}
+	for i, ds := range datasets {
+		for j, v := range ds {
+			cumulative[i+1][j] = cumulative[i][j] + v
+		}
+	}
+
+	// Scale is based on the total (top cumulative row).
+	total := cumulative[len(cumulative)-1]
+	yMin := floatPtr(0)
+	if yMax == nil {
+		yMax = autoMax120(total)
+	}
+	scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
+
+	legendItems := make([]metricChartSeries, len(datasets))
+	for i := range datasets { // one legend entry per dataset, whatever len(names) is
+		legendItems[i] = metricChartSeries{Color: metricChartPalette[i%len(metricChartPalette)], Values: datasets[i]}
+		if i < len(names) {
+			legendItems[i].Name = names[i]
+		}
+	}
+	statsLabel := chartStatsLabel([][]float64{total}) // stats label from totals
+
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+
+	// Draw stacked areas from top to bottom so lower layers are visible.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
+	}
+	// Draw border polylines on top.
+	for i := len(datasets) - 1; i >= 0; i-- {
+		writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
+	}
+
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// writeStackedArea draws a filled polygon between two cumulative value arrays
+// (baseline and top), using the given color at 55% opacity.
+func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
+	n := len(top)
+	if n == 0 {
+		return
+	}
+	if len(baseline) < n {
+		baseline = make([]float64, n)
+	}
+
+	// Forward path along top values, then backward along baseline values.
+	var points strings.Builder
+	for i := 0; i < n; i++ {
+		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
+		y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
+		if i > 0 {
+			points.WriteByte(' ')
+		}
+		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
+		points.WriteByte(',')
+		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
+	}
+	for i := n - 1; i >= 0; i-- {
+		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
+		y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
+		points.WriteByte(' ')
+		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
+		points.WriteByte(',')
+		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
+	}
+	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
+}
+
 func writeSVGOpen(b *strings.Builder, width, height int) {
 	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
 }
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -1,6 +1,9 @@
 package webui

 import (
+	"bufio"
+	"fmt"
+	"io"
 	"os"
 	"strings"
 	"sync"
@@ -17,6 +20,25 @@ type jobState struct {
 	cancel       func() // optional cancel function; nil if job is not cancellable
 	logPath      string
 	serialPrefix string
+	logFile      *os.File    // kept open for the task lifetime to avoid per-line open/close
+	logBuf       *bufio.Writer
+}
+
+// readTaskLogFile reads a task log, refusing files over 50 MB.
+func readTaskLogFile(path string) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
+	if err != nil {
+		return nil, err
+	}
+	if int64(len(data)) > 50<<20 {
+		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
+	}
+	return data, nil
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
 	defer j.mu.Unlock()
 	j.lines = append(j.lines, line)
 	if j.logPath != "" {
-		appendJobLog(j.logPath, line)
+		j.writeLogLineLocked(line)
 	}
 	if j.serialPrefix != "" {
 		taskSerialWriteLine(j.serialPrefix + line)
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
 	}
 }

+// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
+// Must be called with j.mu held. Uses a buffered writer kept open for the task
+// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
+func (j *jobState) writeLogLineLocked(line string) {
+	if j.logFile == nil {
+		f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+		if err != nil {
+			return
+		}
+		j.logFile = f
+		j.logBuf = bufio.NewWriterSize(f, 64*1024)
+	}
+	_, _ = j.logBuf.WriteString(line + "\n")
+}
+
+// closeLog flushes and closes the log file. Called after all task output is done.
+func (j *jobState) closeLog() {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if j.logBuf != nil {
+		_ = j.logBuf.Flush()
+	}
+	if j.logFile != nil {
+		_ = j.logFile.Close()
+		j.logFile = nil
+		j.logBuf = nil
+	}
+}
+
 func (j *jobState) finish(errMsg string) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	if logPath == "" {
 		return j
 	}
-	data, err := os.ReadFile(logPath)
+	data, err := readTaskLogFile(logPath)
 	if err != nil || len(data) == 0 {
 		return j
 	}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -161,6 +161,56 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	return tx.Commit()
 }

+// Downsample reduces density of old metrics rows to 1 sample per minute.
+// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
+// affected — rows newer than downsampleBefore keep full 5-second resolution.
+// For each 60-second bucket the row with the smallest ts is kept; the rest
+// are deleted. This trims ~92 % of rows in that window while preserving
+// the overall shape of every chart.
+//
+// Called hourly by the metrics collector background goroutine.
+func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	start := deleteOlderThan.Unix()
+	end := downsampleBefore.Unix()
+	if end <= start {
+		return nil
+	}
+	// For each table: delete rows in [start, end) whose ts is NOT the minimum
+	// ts in its 60-second bucket (ts/60 integer division = bucket ID).
+	for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
+		_, err := m.db.Exec(`
+DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
+  AND ts NOT IN (
+    SELECT MIN(ts) FROM `+table+`
+    WHERE ts >= ? AND ts < ?
+    GROUP BY ts / 60
+  )`, start, end, start, end)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Prune deletes all rows older than the given cutoff from every metrics table, then
+// truncates the WAL (best effort). Called hourly by the metrics collector to bound DB size.
+func (m *MetricsDB) Prune(before time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	cutTS := before.Unix()
+	for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
+		if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
+			return err
+		}
+	}
+	_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
+	return nil
+}
+
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
 	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -72,6 +72,13 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
 .badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
 .badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
 .badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
 /* Output terminal */
 .terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
 .terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
@@ -363,23 +370,25 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
 			html.EscapeString(label), html.EscapeString(value), badgeHTML))
 	}

-	cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
-	writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
+	writeRow("CPU", hwDescribeCPU(hw),
+		renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))

-	memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
-	writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
+	writeRow("Memory", hwDescribeMemory(hw),
+		renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))

-	storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
-	writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
+	writeRow("Storage", hwDescribeStorage(hw),
+		renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))

-	gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
-	writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
+	writeRow("GPU", hwDescribeGPU(hw),
+		renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))

-	psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
-	if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
-		psuRow.Status = hwPSUStatus(hw.PowerSupplies)
+	psuMatched := matchedRecords(records, nil, []string{"psu:"})
+	if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
+		// No PSU records yet — synthesise a single chip from IPMI status.
+		psuStatus := hwPSUStatus(hw.PowerSupplies)
+		psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
 	}
-	writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
+	writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))

 	if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
 		writeRow("Network", nicDesc, "")
@@ -892,6 +901,31 @@ func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
 	}
 }

+// firstNonEmpty returns the first non-empty string in vals, or "" if there is none.
+func firstNonEmpty(vals ...string) string {
+	for _, v := range vals {
+		if v != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+// matchedRecords returns all ComponentStatusRecord entries whose key matches any exact key or any of the given prefixes (blank keys are skipped); used for per-device chip rendering.
+func matchedRecords(records []app.ComponentStatusRecord, exact []string, prefixes []string) []app.ComponentStatusRecord {
+	var matched []app.ComponentStatusRecord
+	for _, rec := range records {
+		key := strings.TrimSpace(rec.ComponentKey)
+		if key == "" {
+			continue
+		}
+		if containsExactKey(key, exact) || hasAnyPrefix(key, prefixes) {
+			matched = append(matched, rec)
+		}
+	}
+	return matched
+}
+
 func aggregateComponentStatus(title string, records []app.ComponentStatusRecord, exact []string, prefixes []string) runtimeHealthRow {
 	matched := make([]app.ComponentStatusRecord, 0)
 	for _, rec := range records {
@@ -1034,6 +1068,52 @@ func runtimeIssueDescriptions(issues []schema.RuntimeIssue, codes ...string) str
 	return strings.Join(messages, "; ")
 }

+// chipLetterClass maps a component status (trimmed, case-insensitive) to a single display letter and CSS class; unrecognised values yield "?" and "chip-unknown".
+func chipLetterClass(status string) (letter, cls string) {
+	switch strings.ToUpper(strings.TrimSpace(status)) {
+	case "OK":
+		return "O", "chip-ok"
+	case "WARNING", "WARN", "PARTIAL":
+		return "W", "chip-warn"
+	case "CRITICAL", "FAIL", "FAILED", "ERROR":
+		return "F", "chip-fail"
+	default:
+		return "?", "chip-unknown"
+	}
+}
+
+// renderComponentChips renders one 20×20 chip per ComponentStatusRecord, ordered by
+// ComponentKey (matched is sorted in place). Hover tooltip shows component key, status,
+// error summary and last check time. Falls back to a single unknown chip when matched is empty.
+func renderComponentChips(matched []app.ComponentStatusRecord) string {
+	if len(matched) == 0 {
+		return `<span class="chips"><span class="chip chip-unknown" title="No data">?</span></span>`
+	}
+	sort.Slice(matched, func(i, j int) bool {
+		return matched[i].ComponentKey < matched[j].ComponentKey
+	})
+	var b strings.Builder
+	b.WriteString(`<span class="chips">`)
+	for _, rec := range matched {
+		letter, cls := chipLetterClass(rec.Status)
+		var tooltip strings.Builder
+		tooltip.WriteString(rec.ComponentKey)
+		tooltip.WriteString(": ")
+		tooltip.WriteString(firstNonEmpty(rec.Status, "UNKNOWN"))
+		if rec.ErrorSummary != "" {
+			tooltip.WriteString(" — ")
+			tooltip.WriteString(rec.ErrorSummary)
+		}
+		if !rec.LastCheckedAt.IsZero() {
+			fmt.Fprintf(&tooltip, " (checked %s)", rec.LastCheckedAt.Format("15:04:05"))
+		}
+		fmt.Fprintf(&b, `<span class="chip %s" title="%s">%s</span>`,
+			cls, html.EscapeString(tooltip.String()), letter)
+	}
+	b.WriteString(`</span>`)
+	return b.String()
+}
+
 func runtimeStatusBadge(status string) string {
 	status = strings.ToUpper(strings.TrimSpace(status))
 	badge := "badge-unknown"
@@ -1298,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
 // ── Validate (Acceptance Tests) ───────────────────────────────────────────────

 type validateInventory struct {
-	CPU     string
-	Memory  string
-	Storage string
-	NVIDIA  string
-	AMD     string
+	CPU            string
+	Memory         string
+	Storage        string
+	NVIDIA         string
+	AMD            string
+	NvidiaGPUCount int // total NVIDIA GPU count, set by loadValidateInventory
+	AMDGPUCount    int // total AMD GPU count, set by loadValidateInventory
+}
+
+// validateFmtDur formats a duration in seconds as "~N s" (below 120 s) or "~N min" (nearest minute; an exact half minute rounds down).
+func validateFmtDur(secs int) string {
+	if secs < 120 {
+		return fmt.Sprintf("~%d s", secs)
+	}
+	mins := (secs + 29) / 60
+	return fmt.Sprintf("~%d min", mins)
+}
+
+// validateTotalValidateSec returns the estimated wall-clock duration (seconds) of "Validate one by one"
+// in Validate mode for n NVIDIA GPUs; a negative n is treated as 0. Storage time is not part of the sum.
+func validateTotalValidateSec(n int) int {
+	if n < 0 {
+		n = 0
+	}
+	total := platform.SATEstimatedCPUValidateSec +
+		platform.SATEstimatedMemoryValidateSec +
+		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
+		platform.SATEstimatedNvidiaInterconnectSec +
+		platform.SATEstimatedNvidiaBandwidthSec
+	return total
+}
+
+// validateTotalStressSec returns the estimated wall-clock duration (seconds) of "Validate one by one"
+// in Stress mode for n NVIDIA GPUs; a negative n is treated as 0. Storage time is not part of the sum.
+func validateTotalStressSec(n int) int {
+	if n < 0 {
+		n = 0
+	}
+	total := platform.SATEstimatedCPUStressSec +
+		platform.SATEstimatedMemoryStressSec +
+		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
+		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
+		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
+		platform.SATEstimatedNvidiaPulseTestSec +
+		platform.SATEstimatedNvidiaInterconnectSec +
+		platform.SATEstimatedNvidiaBandwidthSec
+	return total
 }

 func renderValidate(opts HandlerOptions) string {
 	inv := loadValidateInventory(opts)
+	n := inv.NvidiaGPUCount
+	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
+	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
+	gpuNote := ""
+	if n > 0 {
+		gpuNote = fmt.Sprintf(" (%d GPU)", n)
+	}
 	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

@@ -1316,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
 	    <div class="validate-profile-col">
 	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
 	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
-	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
 	    </div>
 	    <div class="validate-profile-col validate-profile-action">
-	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
 	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
 	      <div style="margin-top:12px">
 	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
@@ -1333,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
 		inv.CPU,
 		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
 		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
-		`60s in Validate, 30 min in Stress.`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
 	)) +
 		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
 			inv.Memory,
 			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
 			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
 			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
-			`Short self-test in Validate, extended self-test in Stress.`,
+			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
 		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1370,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
 		inv.NVIDIA,
 		`Runs NVIDIA diagnostics and board inventory checks.`,
 		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-		`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
+		func() string {
+			perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
+			perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
+			if n > 0 {
+				return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
+					validateFmtDur(perV), n, validateFmtDur(perV*n),
+					validateFmtDur(perS), n, validateFmtDur(perS*n))
+			}
+			return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
+				validateFmtDur(perV), validateFmtDur(perS))
+		}(),
 	)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			func() string {
+				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
+				s := "Skipped in Validate. "
+				if n > 0 {
+					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
+				} else {
+					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
+				}
+				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
+			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -1385,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			func() string {
+				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
+				s := "Skipped in Validate. "
+				if n > 0 {
+					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
+				} else {
+					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
+				}
+				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
+			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -1393,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
 			`<code>dcgmi diag pulse_test</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-interconnect">` +
@@ -1401,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
 			`<code>all_reduce_perf</code> (NCCL tests)`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-bandwidth">` +
@@ -1409,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
 			`<code>nvbandwidth</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
 		)) +
 		`</div>` +
 		`</div>
@@ -1447,8 +1604,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
-    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
-    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1696,7 +1851,7 @@ function runAllSAT() {
  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -1844,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
 	out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
 	out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
 	out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
+	out.NvidiaGPUCount = nvidiaTotal
+	out.AMDGPUCount = amdTotal
 	return out
 }

@@ -1936,9 +2093,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────

 type benchmarkHistoryRun struct {
-	generatedAt time.Time
-	displayTime string
-	gpuScores   map[int]float64 // GPU index → composite score
+	generatedAt   time.Time
+	displayTime   string
+	gpuScores     map[int]float64 // GPU index → composite score
+	gpuStatuses   map[int]string  // GPU index → status ("OK", "WARNING", "FAILED", …)
+	overallStatus string          // OverallStatus of the loaded benchmark result
 }

 func renderBenchmark(opts HandlerOptions) string {
@@ -1951,9 +2110,9 @@ func renderBenchmark(opts HandlerOptions) string {
      <div class="form-row">
        <label>Profile</label>
        <select id="benchmark-profile">
-          <option value="standard" selected>Standard — about 15 minutes</option>
-          <option value="stability">Stability — 1 to 2 hours</option>
-          <option value="overnight">Overnight — 8 hours</option>
+          <option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
+          <option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
+          <option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
        </select>
      </div>
      <div class="form-row">
@@ -1993,16 +2152,16 @@ func renderBenchmark(opts HandlerOptions) string {
    <div class="card-body">
      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
      <table>
-        <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
-        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
-        <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
      </table>
-      <p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
    </div>
  </div>
 </div>

-`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `

 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2226,7 +2385,7 @@ function benchmarkRefreshResults() {
 func renderBenchmarkResultsCard(exportDir string) string {
 	maxIdx, runs := loadBenchmarkHistory(exportDir)
 	perf := renderBenchmarkResultsCardFromRuns(
-		"Performance Results",
+		"Perf Results",
 		"Composite score by saved benchmark run and GPU.",
 		"No saved performance benchmark runs yet.",
 		maxIdx,
@@ -2246,7 +2405,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
 	}
 	b.WriteString(`<div style="overflow-x:auto">`)
-	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
+	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
 	for i := 0; i <= maxGPUIndex; i++ {
 		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
 	}
@@ -2255,13 +2414,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<tr>`)
 		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+		overallColor := "var(--ok)"
+		overallLabel := run.overallStatus
+		if overallLabel == "" {
+			overallLabel = "OK"
+		}
+		if overallLabel == "FAILED" {
+			overallColor = "var(--crit-fg,#9f3a38)"
+		} else if overallLabel != "OK" {
+			overallColor = "var(--warn)"
+		}
+		b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
 		for idx := 0; idx <= maxGPUIndex; idx++ {
 			score, ok := run.gpuScores[idx]
 			if !ok {
 				b.WriteString(`<td style="color:var(--muted)">-</td>`)
 				continue
 			}
-			b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
+			gpuStatus := run.gpuStatuses[idx]
+			scoreColor := ""
+			switch gpuStatus {
+			case "FAILED":
+				scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+			case "WARNING", "PARTIAL":
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			case "", "OK":
+				// no override
+			default:
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			}
+			b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
 		}
 		b.WriteString(`</tr>`)
 	}
@@ -2295,12 +2477,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
 			continue
 		}
 		run := benchmarkHistoryRun{
-			generatedAt: result.GeneratedAt,
-			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
-			gpuScores:   make(map[int]float64),
+			generatedAt:   result.GeneratedAt,
+			displayTime:   result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     make(map[int]float64),
+			gpuStatuses:   make(map[int]string),
+			overallStatus: result.OverallStatus,
 		}
 		for _, gpu := range result.GPUs {
 			run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
+			run.gpuStatuses[gpu.Index] = gpu.Status
 			if gpu.Index > maxGPUIndex {
 				maxGPUIndex = gpu.Index
 			}
@@ -2369,31 +2554,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {

 	if len(latest.GPUs) > 0 {
 		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
-		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
 		b.WriteString(`</tr></thead><tbody>`)
 		for _, gpu := range latest.GPUs {
-			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
+			// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
+			// falling back to single-card applied limit if the ramp hasn't run.
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
 			rowStyle := ""
-			achievedStyle := ""
+			finalStyle := ""
 			if derated {
 				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
-				achievedStyle = ` style="color:#e6a000;font-weight:600"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
 			}
 			statusLabel := gpu.Status
 			if statusLabel == "" {
 				statusLabel = "OK"
 			}
 			statusColor := "var(--ok)"
-			if statusLabel != "OK" {
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
 				statusColor = "var(--warn)"
 			}
 			nominalStr := "-"
 			if gpu.DefaultPowerLimitW > 0 {
 				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
 			}
-			achievedStr := "-"
+			singleStr := "-"
 			if gpu.AppliedPowerLimitW > 0 {
-				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			multiStr := "-"
+			if gpu.StablePowerLimitW > 0 {
+				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
 			}
 			p95Str := "-"
 			if gpu.MaxObservedPowerW > 0 {
@@ -2403,7 +2602,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
 			b.WriteString(`<td>` + nominalStr + `</td>`)
-			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
 			b.WriteString(`<td>` + p95Str + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
 			b.WriteString(`</tr>`)
@@ -2437,7 +2637,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
@@ -2445,13 +2645,13 @@ func renderBurn() string {
  <div class="card-body burn-profile-body">
    <div class="burn-profile-col">
      <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
-      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — quick check (~5 min)</span></label>
-      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 hour</span></label>
-      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
    </div>
    <div class="burn-profile-col burn-profile-action">
      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
-      <p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
+      <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
    </div>
    <div class="burn-profile-col burn-profile-action">
      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
 // At metricsCollectInterval = 5 s this covers 30 minutes of live history.
 const metricsChartWindow = 360

+// metricsDownsampleAge is the age after which old metrics rows are downsampled
+// to 1 sample per minute. Data fresher than this is kept at full resolution.
+const metricsDownsampleAge = 2 * time.Hour
+
+// metricsRetainWindow is the total retention period for metrics rows.
+// Rows older than this are deleted entirely by the background compactor.
+const metricsRetainWindow = 48 * time.Hour
+
 var metricsCollectInterval = 5 * time.Second

 // pendingNetChange tracks a network state change awaiting confirmation.
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
 	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
-		for range ticker.C {
-			sample := platform.SampleLiveMetrics()
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+		pruneTicker := time.NewTicker(time.Hour)
+		defer pruneTicker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				sample := platform.SampleLiveMetrics()
+				if h.metricsDB != nil {
+					_ = h.metricsDB.Write(sample)
+				}
+				h.feedRings(sample)
+				h.setLatestMetric(sample)
+			case <-pruneTicker.C:
+				if h.metricsDB != nil {
+					now := time.Now().UTC()
+					_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
+					_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
+				}
 			}
-			h.feedRings(sample)
-			h.setLatestMetric(sample)
 		}
 	})
 }
@@ -575,12 +594,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	}
 	timeline := metricsTimelineSegments(samples, time.Now())
 	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
-		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		var overviewOk bool
+		var buf []byte
+		buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 			return
 		}
-		if !ok {
+		if !overviewOk {
 			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 			return
 		}
@@ -589,23 +610,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		_, _ = w.Write(buf)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}

-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	} else {
+		buf, err = renderMetricChartSVG(
+			title,
+			labels,
+			sampleTimes(samples),
+			datasets,
+			names,
+			yMin,
+			yMax,
+			chartCanvasHeightForPath(path, len(names)),
+			timeline,
+		)
+	}
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -615,12 +650,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

-func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
-	var datasets [][]float64
-	var names []string
-	var title string
-	var yMin, yMax *float64
-	labels := sampleTimeLabels(samples)
+func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
+	labels = sampleTimeLabels(samples)

 	switch {
 	case path == "server-load":
@@ -656,15 +687,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][

 	case path == "server-power":
 		title = "System Power"
-		power := make([]float64, len(samples))
-		for i, s := range samples {
-			power[i] = s.PowerW
+		// Use a per-PSU stacked chart when PSU SDR data covers more than one slot;
+		// slots are the union of PSU slots seen across all samples.
+		psuSlots := psuSlotsFromSamples(samples)
+		if len(psuSlots) > 1 {
+			// Build one dataset per PSU slot.
+			psuDatasets := make([][]float64, len(psuSlots))
+			psuNames := make([]string, len(psuSlots))
+			for si, slot := range psuSlots {
+				ds := make([]float64, len(samples))
+				for i, s := range samples {
+					for _, psu := range s.PSUs {
+						if psu.Slot == slot {
+							ds[i] = psu.PowerW
+							break
+						}
+					}
+				}
+				psuDatasets[si] = normalizePowerSeries(ds)
+				psuNames[si] = fmt.Sprintf("PSU %d", slot)
+			}
+			datasets = psuDatasets
+			names = psuNames
+			stacked = true
+			yMax = autoMax120(psuStackedTotal(psuDatasets))
+		} else {
+			power := make([]float64, len(samples))
+			for i, s := range samples {
+				power[i] = s.PowerW
+			}
+			power = normalizePowerSeries(power)
+			datasets = [][]float64{power}
+			names = []string{"Power W"}
+			yMin = floatPtr(0)
+			yMax = autoMax120(power)
 		}
-		power = normalizePowerSeries(power)
-		datasets = [][]float64{power}
-		names = []string{"Power W"}
-		yMin = floatPtr(0)
-		yMax = autoMax120(power)

 	case path == "server-fans":
 		title = "Fan RPM"
@@ -707,7 +764,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case strings.HasPrefix(path, "gpu/"):
 		idx, sub, ok := parseGPUChartPath(path)
 		if !ok {
-			return nil, nil, nil, "", nil, nil, false
+			return nil, nil, nil, "", nil, nil, false, false
 		}
 		switch sub {
 		case "load":
@@ -715,7 +772,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
 			names = []string{"Load %", "Mem %"}
@@ -725,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{temp}
 			names = []string{"Temp °C"}
@@ -735,7 +792,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Core Clock MHz"}
@@ -744,7 +801,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Memory Clock MHz"}
@@ -753,7 +810,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{power}
 			names = []string{"Power W"}
@@ -761,10 +818,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		}

 	default:
-		return nil, nil, nil, "", nil, nil, false
+		return nil, nil, nil, "", nil, nil, false, false
 	}

-	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
+	return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
 }

 func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -930,6 +987,37 @@ func normalizePowerSeries(ds []float64) []float64 {
 	return out
 }

+// psuSlotsFromSamples returns the distinct PSU slot numbers found in samples, in ascending order.
+func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
+	present := make(map[int]struct{})
+	for i := range samples {
+		for _, psu := range samples[i].PSUs {
+			present[psu.Slot] = struct{}{}
+		}
+	}
+	ordered := make([]int, 0, len(present))
+	for slot := range present {
+		ordered = append(ordered, slot)
+	}
+	sort.Ints(ordered)
+	return ordered
+}
+
+// psuStackedTotal returns the point-by-point sum of all PSU datasets, sized to the longest one (for scale calculation).
+func psuStackedTotal(datasets [][]float64) []float64 {
+	if len(datasets) == 0 {
+		return nil
+	}
+	total := make([]float64, len(datasets[0]))
+	for _, ds := range datasets {
+		// Grow to the longest dataset so ragged input cannot index out of range.
+		total = append(total, make([]float64, max(0, len(ds)-len(total)))...)
+		for i, v := range ds {
+			total[i] += v
+		}
+	}
+	return total
+}
+
 func normalizeFanSeries(ds []float64) []float64 {
 	if len(ds) == 0 {
 		return nil
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 		},
 	}

-	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 		},
 	}

-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
 	if !ok {
 		t.Fatal("gpu-all-clock returned ok=false")
 	}
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }

+// TestValidatePageRendersNvidiaFabricCardsInValidateMode expects GET /validate to return 200 with the NCCL and NVBandwidth card text.
+func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
+	rec := httptest.NewRecorder()
+	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	page := rec.Body.String()
+	for _, want := range []string{
+		`NVIDIA Interconnect (NCCL)`,
+		`Validate and Stress:`,
+		`NVIDIA Bandwidth (NVBandwidth)`,
+		`nvbandwidth runs all built-in tests without a time limit`,
+	} {
+		if !strings.Contains(page, want) {
+			t.Fatalf("validate page missing %q: %s", want, page)
+		}
+	}
+}
+
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -7,14 +7,43 @@ import (
 	"time"
 )

+const (
+	recoverLoopMaxDelay   = 60 * time.Second
+	recoverLoopResetAfter = 30 * time.Second
+)
+
+// goRecoverLoop starts fn in a goroutine and restarts it after each panic; the
+// loop ends once fn returns without panicking. restartDelay is the initial delay;
+// successive panics double it up to recoverLoopMaxDelay. The delay resets to
+// restartDelay when fn ran for at least recoverLoopResetAfter before panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	go func() {
+		delay := restartDelay
+		consecutive := 0
 		for {
-			if !runRecoverable(name, fn) {
+			start := time.Now()
+			panicked := runRecoverable(name, fn)
+			if !panicked {
 				return
 			}
-			if restartDelay > 0 {
-				time.Sleep(restartDelay)
+			consecutive++
+			if time.Since(start) >= recoverLoopResetAfter {
+				delay = restartDelay
+				consecutive = 1
+			}
+			slog.Warn("goroutine restarting after panic",
+				"component", name,
+				"consecutive_panics", consecutive,
+				"next_delay", delay,
+			)
+			if delay > 0 {
+				time.Sleep(delay)
+			}
+			if delay < recoverLoopMaxDelay {
+				delay *= 2
+				if delay > recoverLoopMaxDelay {
+					delay = recoverLoopMaxDelay
+				}
 			}
 		}
 	}()
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
 		}
 		return gpuDisplayLabel(idx) + " Overview", buf, true
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		return "", nil, false
 	}
-	buf, err := renderMetricChartSVG(
-		title,
-		labels,
-		sampleTimes(samples),
-		datasets,
-		names,
-		yMin,
-		yMax,
-		chartCanvasHeightForPath(path, len(names)),
-		timeline,
-	)
+	var buf []byte
+	var err error
+	if stacked {
+		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	} else {
+		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
+	}
 	if err != nil {
 		return "", nil, false
 	}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -162,6 +162,32 @@ type nvidiaRampSpec struct {
 	TotalDurationSec int
 }

+// resolveMemoryValidatePreset returns the memory test size (MB) and pass count.
+// A recognised profile (trimmed, case-insensitive) wins; otherwise stress decides.
+func resolveMemoryValidatePreset(profile string, stress bool) (sizeMB, passes int) {
+	key := strings.TrimSpace(strings.ToLower(profile))
+	switch {
+	case key == "overnight":
+		return 1024, 2
+	case key == "acceptance":
+		return 1024, 1
+	case key == "smoke" || !stress:
+		return 256, 1
+	}
+	return 512, 1
+}
+
+// taskMayLeaveOrphanWorkers reports whether target (trimmed, case-insensitive) is one of the listed worker-spawning task targets.
+func taskMayLeaveOrphanWorkers(target string) bool {
+	switch strings.TrimSpace(strings.ToLower(target)) {
+	case "cpu", "memory", "memory-stress", "sat-stress", "platform-stress",
+		"nvidia", "nvidia-bandwidth", "nvidia-bench-perf", "nvidia-compute", "nvidia-pulse",
+		"nvidia-stress", "nvidia-targeted-power", "nvidia-targeted-stress":
+		return true
+	}
+	return false
+}
+
 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
@@ -559,6 +585,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	if err := writeTaskReportArtifacts(t); err != nil {
 		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
 	}
+	j.closeLog()
 	if t.ErrMsg != "" {
 		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
 		return
@@ -587,8 +614,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	}
 	a := q.opts.App

+	recovered := len(j.lines) > 0
 	j.append(fmt.Sprintf("Starting %s...", t.Name))
-	if len(j.lines) > 0 {
+	if recovered {
 		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
 	}

@@ -710,15 +738,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
-		if t.params.BurnProfile != "" && dur <= 0 {
-			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
-		}
-		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
-			DurationSec: dur,
-			Loader:      platform.NvidiaStressLoaderNCCL,
-			GPUIndices:  t.params.GPUIndices,
-		}, j.append)
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -751,10 +771,8 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		sizeMB, passes := 256, 1
-		if t.params.StressMode {
-			sizeMB, passes = 1024, 3
-		}
+		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
+		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
 	case "storage":
 		if a == nil {
@@ -1010,6 +1028,9 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1037,6 +1058,9 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			if t.job != nil {
 				t.job.abort()
 			}
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				platform.KillTestWorkers()
+			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
 			taskSerialEvent(t, "finished with status="+t.Status)
@@ -1141,10 +1165,13 @@ func (q *taskQueue) loadLocked() {
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
 			// The task was interrupted by a bee-web restart. Child processes
-			// (e.g. bee-gpu-burn-worker) survive the restart in their own
-			// process groups and cannot be cancelled retroactively. Mark the
-			// task as failed so the user can decide whether to re-run it
-			// rather than blindly re-launching duplicate workers.
+			// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
+			// their own process groups. Kill any matching stale workers before
+			// marking the task failed so the next GPU test does not inherit a
+			// busy DCGM slot or duplicate workers.
+			if taskMayLeaveOrphanWorkers(t.Target) {
+				_ = platform.KillTestWorkers()
+			}
 			now := time.Now()
 			t.Status = TaskFailed
 			t.DoneAt = &now
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -672,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
 	}
 }

+// TestRunTaskUsesQuickPresetForMemoryValidate runs a "memory" task with StressMode set and
+// no burn profile, and expects the stubbed memory pack to receive 512 MB x 1 pass.
+func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
+	saved := runMemoryAcceptancePackCtx
+	defer func() { runMemoryAcceptancePackCtx = saved }()
+	var gotSizeMB, gotPasses int
+	runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
+		gotSizeMB, gotPasses = sizeMB, passes
+		return "/tmp/memory-validate.tar.gz", nil
+	}
+
+	queue := &taskQueue{
+		opts: &HandlerOptions{App: &app.App{}},
+	}
+	task := &Task{
+		ID:        "mem-validate-1",
+		Name:      "Memory SAT",
+		Target:    "memory",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+		params:    taskParams{StressMode: true},
+	}
+	job := &jobState{}
+	queue.runTask(task, job, context.Background())
+
+	if gotSizeMB != 512 || gotPasses != 1 {
+		t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
+	}
+}
+
 func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
 	dir := t.TempDir()
 	q := &taskQueue{
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)

 ---

-## What Needs Fixing
+## Fixed Issues

-1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
-2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
-3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
+All previously open items are resolved, except item 5, which cannot be fixed retroactively:
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
+2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
+3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
+4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
+5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
--- a/bible-local/docs/iso-build-rules.md
+++ b/bible-local/docs/iso-build-rules.md
@@ -15,6 +15,41 @@ This applies to:
 - `iso/builder/config/package-lists/*.list.chroot`
 - Any package referenced in bootloader configs, hooks, or overlay scripts

+## Bootloader sync rule
+
+The ISO has two independent bootloader configs that must be kept in sync manually:
+
+| File | Used by |
+|------|---------|
+| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
+| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
+
+live-build does NOT derive one from the other. Any new boot entry, kernel parameter
+change, or new mode added to one file must be manually mirrored in the other.
+
+**Canonical entry list** (both files must have all of these):
+
+| Label | Key params |
+|-------|-----------|
+| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
+| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
+| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
+| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
+| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
+| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
+
+**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
+```
+net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
+numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+nowatchdog nosoftlockup
+```
+(fail-safe is the exception — it deliberately uses minimal params.)
+
+**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
+live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
+configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
+
 ## Memtest rule

 Do not assume live-build's built-in memtest integration is sufficient for `bee`.
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -33,7 +33,7 @@ lb config noauto \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
-    --bootstrap-packages "ca-certificates" \
+    --debootstrap-options "--include=ca-certificates" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -35,6 +35,8 @@ typedef void *CUstream;
 #define MAX_STRESS_STREAMS 16
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define MAX_SINGLE_PRECISION_STREAMS 4
+#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)

 static const char *ptx_source =
    ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
    return stream_count;
 }

+/* Limit profile_budget_bytes to MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+ * budgets at or below the limit are returned unchanged. */
+static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
+    const size_t limit = MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+    return profile_budget_bytes > limit ? limit : profile_budget_bytes;
+}
+
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    if (!api->cuStreamDestroy) {
        return;
@@ -704,6 +713,19 @@ static const struct profile_desc k_profiles[] = {

 #define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))

+/* Return non-zero when desc may run at compute capability cc under the optional precision_filter. */
+static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
+    if (!desc->enabled || cc < desc->min_cc) {
+        return 0;
+    }
+    if (precision_filter == NULL) {
+        /* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are unstable on the current
+         * benchmark fleet and can abort the whole mixed pass after earlier phases already collected telemetry. */
+        return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
+    }
+    return strcmp(desc->block_label, precision_filter) == 0;
+}
+
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -908,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
-    memset(out, 0, sizeof(*out));
-    out->desc = *desc;
-    out->stream = stream;
-
    size_t bytes_per_cell = 0;
+    size_t attempt_budget = profile_budget_bytes;
+
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,106 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
        return 0;
    }

-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
-    out->m = dim;
-    out->n = dim;
-    out->k = dim;
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
+        memset(out, 0, sizeof(*out));
+        out->desc = *desc;
+        out->stream = stream;

-    size_t desired_workspace = profile_budget_bytes / 8u;
-    if (desired_workspace > 32u * 1024u * 1024u) {
-        desired_workspace = 32u * 1024u * 1024u;
-    }
-    desired_workspace = round_down_size(desired_workspace, 256u);
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
+        out->m = dim;
+        out->n = dim;
+        out->k = dim;

-    size_t a_bytes = 0;
-    size_t b_bytes = 0;
-    size_t c_bytes = 0;
-    size_t d_bytes = 0;
-    size_t scale_bytes = 0;
-    while (1) {
-        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
-        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
-        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
-        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
-        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+        size_t desired_workspace = attempt_budget / 8u;
+        if (desired_workspace > 32u * 1024u * 1024u) {
+            desired_workspace = 32u * 1024u * 1024u;
+        }
+        desired_workspace = round_down_size(desired_workspace, 256u);

-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
-        if (matrix_bytes <= profile_budget_bytes) {
-            size_t remaining = profile_budget_bytes - matrix_bytes;
-            out->workspace_size = desired_workspace;
-            if (out->workspace_size > remaining) {
-                out->workspace_size = round_down_size(remaining, 256u);
+        size_t a_bytes = 0;
+        size_t b_bytes = 0;
+        size_t c_bytes = 0;
+        size_t d_bytes = 0;
+        size_t scale_bytes = 0;
+        while (1) {
+            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+            if (matrix_bytes <= attempt_budget) {
+                size_t remaining = attempt_budget - matrix_bytes;
+                out->workspace_size = desired_workspace;
+                if (out->workspace_size > remaining) {
+                    out->workspace_size = round_down_size(remaining, 256u);
+                }
+                break;
            }
-            break;
+
+            if (out->m <= (uint64_t)desc->min_multiple) {
+                break;
+            }
+            out->m -= (uint64_t)desc->min_multiple;
+            out->n = out->m;
+            out->k = out->m;
+        }
+        if (out->m < (uint64_t)desc->min_multiple) {
+            attempt_budget /= 2u;
+            continue;
        }

-        if (out->m <= (uint64_t)desc->min_multiple) {
-            return 0;
-        }
-        out->m -= (uint64_t)desc->min_multiple;
-        out->n = out->m;
-        out->k = out->m;
-    }
-
-    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cudaDataType_t scale_type = matmul_scale_type(desc);
-    if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cublasOperation_t transa = CUBLAS_OP_T;
-    cublasOperation_t transb = CUBLAS_OP_N;
-    if (!check_cublas("set TRANSA",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSA,
-                                                             &transa,
-                                                             sizeof(transa))) ||
-        !check_cublas("set TRANSB",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSB,
-                                                             &transb,
-                                                             sizeof(transb)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (desc->needs_scalar_scale) {
-        float one = 1.0f;
-        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
-            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+
+        cudaDataType_t scale_type = matmul_scale_type(desc);
+        if (!check_cublas("cublasLtMatmulDescCreate",
+                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
-        if (!check_cublas("set A scale ptr",
+
+        cublasOperation_t transa = CUBLAS_OP_T;
+        cublasOperation_t transb = CUBLAS_OP_N;
+        if (!check_cublas("set TRANSA",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                                 &a_scale_ptr,
-                                                                 sizeof(a_scale_ptr))) ||
-            !check_cublas("set B scale ptr",
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                                 &transa,
+                                                                 sizeof(transa))) ||
+            !check_cublas("set TRANSB",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                                 &b_scale_ptr,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
+                                                                 &transb,
+                                                                 sizeof(transb)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-    }
+
+        if (desc->needs_scalar_scale) {
+            float one = 1.0f;
+            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+            if (!check_cublas("set A scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                     &a_scale_ptr,
+                                                                     sizeof(a_scale_ptr))) ||
+                !check_cublas("set B scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                     &b_scale_ptr,
+                                                                     sizeof(b_scale_ptr)))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }

 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
    if (desc->needs_block_scale) {
@@ -1060,62 +1089,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
    }
 #endif

-    if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (out->workspace_size > 0) {
-        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+        if (!check_cublas("create A layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+            !check_cublas("create B layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+            !check_cublas("create C layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+            !check_cublas("create D layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
+
+        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        if (out->workspace_size > 0) {
+            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
+
+        if (!check_cublas("set workspace",
+                          cublas->cublasLtMatmulPreferenceSetAttribute(
+                              out->preference,
+                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                              &out->workspace_size,
+                              sizeof(out->workspace_size)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        int found = 0;
+        if (check_cublas("heuristic",
+                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                                out->op_desc,
+                                                                out->a_layout,
+                                                                out->b_layout,
+                                                                out->c_layout,
+                                                                out->d_layout,
+                                                                out->preference,
+                                                                1,
+                                                                &out->heuristic,
+                                                                &found)) &&
+            found > 0) {
+            out->ready = 1;
+            return 1;
+        }
+
+        destroy_profile(cublas, cuda, out);
+        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
+        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
+            break;
+        }
    }

-    if (!check_cublas("set workspace",
-                      cublas->cublasLtMatmulPreferenceSetAttribute(
-                          out->preference,
-                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                          &out->workspace_size,
-                          sizeof(out->workspace_size)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    int found = 0;
-    if (!check_cublas("heuristic",
-                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
-                                                             out->op_desc,
-                                                             out->a_layout,
-                                                             out->b_layout,
-                                                             out->c_layout,
-                                                             out->d_layout,
-                                                             out->preference,
-                                                             1,
-                                                             &out->heuristic,
-                                                             &found))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-    if (found <= 0) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    out->ready = 1;
-    return 1;
+    return 0;
 }

 static int run_cublas_profile(cublasLtHandle_t handle,
@@ -1180,6 +1212,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
+    int budget_profiles = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1202,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,

    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
-            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned++;
        }
    }
@@ -1215,30 +1247,41 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    }

    /* Count all profiles active on this GPU regardless of filter.
-     * Used as the budget divisor so matrix sizes stay consistent whether
-     * running all precisions together or a single-precision phase. */
+     * NOTE: the loop below now passes precision_filter too, so planned_total
+     * equals planned. Unfiltered (mixed) phases split the budget across every
+     * allowed profile; filtered phases split it across matching ones only. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
            planned_total++;
        }
    }
    if (planned_total < planned) {
        planned_total = planned;
    }
+    budget_profiles = planned_total;
+    if (precision_filter != NULL) {
+        budget_profiles = planned;
+    }
+    if (budget_profiles <= 0) {
+        budget_profiles = planned_total;
+    }

    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
+    }
+    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
+        stream_count = MAX_SINGLE_PRECISION_STREAMS;
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1251,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
+    if (precision_filter != NULL) {
+        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
+    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
                  mp_count,
+                  budget_profiles,
                  per_profile_budget / (1024u * 1024u));

    for (int i = 0; i < profile_count; i++) {
@@ -1275,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
+        if (!profile_allowed_for_run(desc, cc, precision_filter)) {
            append_detail(report->details,
                          sizeof(report->details),
-                          "%s=SKIPPED precision_filter\n",
+                          "%s=SKIPPED benchmark_disabled\n",
                          desc->name);
            continue;
        }
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -203,7 +203,7 @@ dump_memtest_debug() {

        echo "-- source bootloader templates --"
        for cfg in \
-            "${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
+            "${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
            "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
            if [ -f "$cfg" ]; then
                echo "  file: $cfg"
@@ -954,86 +954,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi

-if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
-source /boot/grub/config.cfg
-
-echo ""
-echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
-echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
-echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
-echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
-echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
-echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
-echo "  Hardware Audit LiveCD"
-echo ""
-
-menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-submenu "EASY-BEE (advanced options) -->" {
-    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-        initrd  @INITRD_LIVE@
-    }
-
-    menuentry "EASY-BEE — fail-safe" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-        initrd  @INITRD_LIVE@
-    }
-}
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "Memory Test (memtest86+)" {
-        chainloader /boot/memtest86+x64.efi
-    }
-else
-    menuentry "Memory Test (memtest86+)" {
-        linux16 /boot/memtest86+x64.bin
-    }
-fi
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "UEFI Firmware Settings" {
-        fwsetup
-    }
-fi
-EOF
-
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
-label live-@FLAVOUR@-normal
-    menu label ^EASY-BEE
-    menu default
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@
-
-label live-@FLAVOUR@-kms
-    menu label EASY-BEE (^graphics/KMS)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms
-
-label live-@FLAVOUR@-toram
-    menu label EASY-BEE (^load to RAM)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ toram
-
-label live-@FLAVOUR@-failsafe
-    menu label EASY-BEE (^fail-safe)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
-
-label memtest
-    menu label ^Memory Test (memtest86+)
-    linux /boot/memtest86+x64.bin
-EOF
-fi
-
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -1305,7 +1225,7 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
 export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
-run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
+run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
 run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
 dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
--- a/iso/builder/config/bootloaders/grub-efi/config.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/config.cfg
--- a/iso/builder/config/bootloaders/grub-efi/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/grub.cfg
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
 }

 submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — load to RAM (toram)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }
+
    menuentry "EASY-BEE — GSP=off" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
@@ -26,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
        initrd  @INITRD_LIVE@
    }

+    menuentry "EASY-BEE — KMS + GSP=off" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }
+
    menuentry "EASY-BEE — fail-safe" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
        initrd  @INITRD_LIVE@
--- a/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
+++ b/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
--- a/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
+++ b/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
@@ -5,6 +5,15 @@ title-text: ""
 message-font: "Unifont Regular 16"
 terminal-font: "Unifont Regular 16"

+#bee logo — centered, upper third of screen
++ image {
+        top = 4%
+        left = 50%-200
+        width = 400
+        height = 400
+        file = "bee-logo.png"
+}
+
 #help bar at the bottom
 + label {
        top = 100%-50
@@ -21,8 +30,8 @@ terminal-font: "Unifont Regular 16"
 + boot_menu {
        left = 20%
        width = 60%
-        top = 62%
-        height = 38%-80
+        top = 65%
+        height = 35%-80
        item_color = "#c88000"
        item_font = "Unifont Regular 16"
        selected_item_color= "#f5a800"
--- a/iso/builder/config/bootloaders/grub-efi/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/theme.cfg
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
    menu default
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
-
-label live-@FLAVOUR@-kms
-    menu label EASY-BEE (^graphics/KMS)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

-label live-@FLAVOUR@-kms-gsp-off
-    menu label EASY-BEE (g^raphics/KMS, GSP=off)
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^KMS, no nomodeset)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms-gsp-off
+    menu label EASY-BEE (KMS, ^GSP=off)
+    linux @LINUX@
+    initrd @INITRD@
+    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0

 label memtest
    menu label ^Memory Test (memtest86+)
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
-chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
-chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal        2>/dev/null || true
+chmod +x /usr/local/bin/bee-boot-status    2>/dev/null || true
+chmod +x /usr/local/bin/bee-install        2>/dev/null || true
+chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -1,117 +0,0 @@
-#!/bin/sh
-# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
-set -e
-echo "=== generating bee wallpaper ==="
-mkdir -p /usr/share/bee
-
-python3 - <<'PYEOF'
-from PIL import Image, ImageDraw, ImageFont, ImageFilter
-import os
-
-W, H = 1920, 1080
-
-ASCII_ART = [
-    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
-    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
-    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
-    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
-    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
-    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
-]
-SUBTITLE = "  Hardware Audit LiveCD"
-
-FG = (0xF6, 0xD0, 0x47)
-FG_DIM = (0xD4, 0xA9, 0x1C)
-SHADOW = (0x5E, 0x47, 0x05)
-SUB = (0x96, 0x7A, 0x17)
-BG = (0x05, 0x05, 0x05)
-
-MONO_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
-]
-SUB_FONT_CANDIDATES = [
-    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
-    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
-]
-
-
-def load_font(candidates, size):
-    for path in candidates:
-        if os.path.exists(path):
-            return ImageFont.truetype(path, size)
-    return ImageFont.load_default()
-
-
-def mono_metrics(font):
-    probe = Image.new('L', (W, H), 0)
-    draw = ImageDraw.Draw(probe)
-    char_w = int(round(draw.textlength("M", font=font)))
-    bb = draw.textbbox((0, 0), "Mg", font=font)
-    char_h = bb[3] - bb[1]
-    return char_w, char_h
-
-
-def render_ascii_mask(font, lines, char_w, char_h, line_gap):
-    width = max(len(line) for line in lines) * char_w
-    height = len(lines) * char_h + line_gap * (len(lines) - 1)
-    mask = Image.new('L', (width, height), 0)
-    draw = ImageDraw.Draw(mask)
-    for row, line in enumerate(lines):
-        y = row * (char_h + line_gap)
-        for col, ch in enumerate(line):
-            if ch == ' ':
-                continue
-            x = col * char_w
-            draw.text((x, y), ch, font=font, fill=255)
-    return mask
-
-
-img = Image.new('RGB', (W, H), BG)
-draw = ImageDraw.Draw(img)
-
-# Soft amber glow under the logo without depending on font rendering.
-glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
-glow_draw = ImageDraw.Draw(glow)
-glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
-glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
-glow = glow.filter(ImageFilter.GaussianBlur(60))
-img = Image.alpha_composite(img.convert('RGBA'), glow)
-
-TARGET_LOGO_W = 400
-max_chars = max(len(line) for line in ASCII_ART)
-_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
-_probe_cw, _ = mono_metrics(_probe_font)
-font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
-font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
-char_w, char_h = mono_metrics(font_logo)
-logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
-logo_w, logo_h = logo_mask.size
-logo_x = (W - logo_w) // 2
-logo_y = 380
-
-sh_off = max(1, font_size_logo // 6)
-shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
-img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
-img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
-img.paste(FG, (logo_x, logo_y), logo_mask)
-
-font_sub = load_font(SUB_FONT_CANDIDATES, 30)
-sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
-sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
-sub_y = logo_y + logo_h + 48
-draw = ImageDraw.Draw(img)
-draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
-draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
-
-img = img.convert('RGB')
-
-img.save('/usr/share/bee/wallpaper.png', optimize=True)
-print('wallpaper written: /usr/share/bee/wallpaper.png')
-PYEOF
-
-echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
+++ b/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
@@ -0,0 +1,46 @@
+#!/bin/sh
+# 9011-toram-rsync.hook.chroot
+#
+# Adds rsync to the initramfs so that live-boot's toram code takes the
+# rsync --progress path instead of the silent "cp -a" fallback.
+#
+# live-boot's 9990-toram-todisk.sh already contains:
+#   if [ -x /bin/rsync ]; then
+#       rsync -a --progress ... 1>/dev/console
+#   else
+#       cp -a ...   # no output
+#   fi
+#
+# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
+# which copies the binary + all shared-library dependencies into the initrd.
+
+set -e
+
+HOOK_DIR="/etc/initramfs-tools/hooks"
+HOOK="${HOOK_DIR}/bee-rsync"
+
+mkdir -p "${HOOK_DIR}"
+
+cat > "${HOOK}" << 'EOF'
+#!/bin/sh
+# initramfs hook: include rsync for live-boot toram progress output
+PREREQ=""
+prereqs() { echo "$PREREQ"; }
+case "$1" in prereqs) prereqs; exit 0 ;; esac
+
+. /usr/share/initramfs-tools/hook-functions
+
+if [ -x /usr/bin/rsync ]; then
+    copy_exec /usr/bin/rsync /bin
+fi
+EOF
+
+chmod +x "${HOOK}"
+
+echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
+
+# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
+KVER=$(ls /lib/modules | sort -V | tail -1)
+echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
+update-initramfs -u -k "${KVER}"
+echo "9011-toram-rsync: done"
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -3,6 +3,7 @@ dmidecode
 smartmontools
 nvme-cli
 pciutils
+rsync
 ipmitool
 util-linux
 e2fsprogs
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -10,6 +10,7 @@ RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
+MemoryMax=3G
 # Keep the web server responsive during GPU/CPU stress (children inherit nice+10
 # via Setpriority in runCmdJob, but the bee-web parent stays at 0).
 Nice=0
--- a/iso/overlay/usr/local/bin/bee-install
+++ b/iso/overlay/usr/local/bin/bee-install
@@ -65,6 +65,9 @@ done
 SQUASHFS="/run/live/medium/live/filesystem.squashfs"
 if [ ! -f "$SQUASHFS" ]; then
    echo "ERROR: squashfs not found at $SQUASHFS" >&2
+    echo "  The live medium may have been disconnected." >&2
+    echo "  Reconnect the disc and run:  bee-remount-medium --wait" >&2
+    echo "  Then re-run bee-install." >&2
    exit 1
 fi

@@ -162,10 +165,81 @@ log "  Mounted."
 log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
 log "  Source: $SQUASHFS"
 log "  Target: $MOUNT_ROOT"
-unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
-    grep -E '^\[|^inod|^created|^extract' | \
-    while read -r line; do log "  $line"; done || true
-log "  Unpack complete."
+
+# unsquashfs does not support resume, so retry the entire unpack step if the
+# source medium disappears mid-copy (e.g. CD physically disconnected).
+# A pipeline only reports the status of its last command, so the unsquashfs
+# exit status is handed out of the logging pipeline through a temp file.
+UNPACK_ATTEMPTS=0
+UNPACK_MAX=5
+UNPACK_RC_FILE=$(mktemp /tmp/bee-install-unpack.XXXXXX) || die "Cannot create temp file for unpack status."
+while true; do
+    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
+    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
+        rm -f "$UNPACK_RC_FILE"
+        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
+    fi
+    [ "$UNPACK_ATTEMPTS" -gt 1 ] && log "  Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
+
+    # Re-check squashfs is reachable before each attempt
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  SOURCE LOST: $SQUASHFS not found."
+        log "  Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
+        log "  then press Enter here to retry."
+        read -r _
+        continue
+    fi
+
+    # wipe partial unpack so unsquashfs starts clean
+    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
+        log "  Cleaning partial unpack from $MOUNT_ROOT ..."
+        # keep the mount point itself but remove its contents
+        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
+    fi
+
+    # Empty the status file so a missing status is read back as a failure.
+    : > "$UNPACK_RC_FILE"
+    { unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1; echo "$?" > "$UNPACK_RC_FILE"; } | \
+        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
+        while IFS= read -r line; do log "  $line"; done || true
+    read -r UNPACK_RC < "$UNPACK_RC_FILE" || UNPACK_RC=1
+
+    # Check squashfs is still reachable (gone = disc pulled during copy)
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  WARNING: source medium lost during unpack — will retry after remount."
+        log "  Run 'bee-remount-medium --wait' in another terminal, then press Enter."
+        read -r _
+        continue
+    fi
+
+    # Accept exit status 0, and 2 (NOTE(review): unsquashfs documents 2 as
+    # "non-fatal errors only", e.g. unsupported xattrs — confirm for the
+    # squashfs-tools version shipped in the image). Anything else is fatal.
+    case "$UNPACK_RC" in
+        0|2) ;;
+        *)
+            log "  WARNING: unsquashfs failed (exit status $UNPACK_RC)."
+            if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
+                log "  Retrying in 5 s ..."
+                sleep 5
+            fi
+            continue
+            ;;
+    esac
+
+    # Verify the unpack produced a usable root (presence of /etc is a basic check)
+    if [ -d "${MOUNT_ROOT}/etc" ]; then
+        log "  Unpack complete."
+        break
+    else
+        log "  WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
+        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
+            log "  Retrying in 5 s ..."
+            sleep 5
+        fi
+    fi
+done
+rm -f "$UNPACK_RC_FILE"

 # ------------------------------------------------------------------
 log "--- Step 6/7: Configuring installed system ---"
--- a/iso/overlay/usr/local/bin/bee-openbox-session
+++ b/iso/overlay/usr/local/bin/bee-openbox-session
@@ -9,9 +9,9 @@ xset s noblank

 # Set desktop background.
 if [ -f /usr/share/bee/wallpaper.png ]; then
-    feh --bg-fill /usr/share/bee/wallpaper.png
+    feh --bg-center --image-bg '#000000' /usr/share/bee/wallpaper.png
 else
-    xsetroot -solid '#f6c90e'
+    xsetroot -solid '#000000'
 fi

 tint2 &
--- a/iso/overlay/usr/local/bin/bee-remount-medium
+++ b/iso/overlay/usr/local/bin/bee-remount-medium
@@ -0,0 +1,100 @@
+#!/bin/bash
+# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
+#
+# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
+# was lost and /run/live/medium/live/filesystem.squashfs is missing.
+#
+# Usage: bee-remount-medium [--wait]
+#   --wait  keep retrying every 5 seconds until the medium is found (useful
+#           while physically reconnecting the device)
+
+set -euo pipefail
+
+MEDIUM_DIR="/run/live/medium"
+SQUASHFS_REL="live/filesystem.squashfs"
+WAIT_MODE=0
+
+for arg in "$@"; do
+    case "$arg" in
+        --wait|-w) WAIT_MODE=1 ;;
+        --help|-h)
+            echo "Usage: bee-remount-medium [--wait]"
+            echo "  Finds and remounts the live ISO medium to $MEDIUM_DIR"
+            echo "  --wait  retry every 5 s until a medium with squashfs is found"
+            exit 0 ;;
+    esac
+done
+
+log() { echo "[$(date +%H:%M:%S)] $*"; }
+die() { log "ERROR: $*" >&2; exit 1; }
+
+# Print candidate block devices, one per line (optical drives + removable sd*/vd* disks and partitions)
+find_candidates() {
+    # CD/DVD drives
+    for dev in /dev/sr* /dev/scd*; do
+        [ -b "$dev" ] && echo "$dev"
+    done
+    # USB/removable disks and partitions
+    for dev in /dev/sd* /dev/vd*; do
+        [ -b "$dev" ] || continue
+        # Keep only disks/partitions whose parent disk has removable=1 in sysfs
+        local removable
+        local base
+        base=$(basename "$dev")
+        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
+        [ "$removable" = "1" ] && echo "$dev"
+    done
+}
+
+# Probe-mount $1 read-only on a temp dir; if it holds the squashfs, remount it on $MEDIUM_DIR
+try_mount() {
+    local dev="$1"
+    local tmpdir
+    tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
+    if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
+        if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
+            # Unmount probe mount and mount properly onto live path
+            umount "$tmpdir" 2>/dev/null || true
+            rmdir "$tmpdir"  2>/dev/null || true
+            # Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
+            umount "$MEDIUM_DIR" 2>/dev/null || true
+            mkdir -p "$MEDIUM_DIR"
+            if mount -o ro "$dev" "$MEDIUM_DIR"; then
+                log "Mounted $dev on $MEDIUM_DIR"
+                return 0
+            else
+                log "Mount of $dev on $MEDIUM_DIR failed"
+                return 1
+            fi
+        fi
+        umount "$tmpdir" 2>/dev/null || true
+    fi
+    rmdir "$tmpdir" 2>/dev/null || true
+    return 1
+}
+
+attempt() {
+    log "Scanning for ISO medium..."
+    for dev in $(find_candidates); do
+        log "  Trying $dev ..."
+        if try_mount "$dev"; then
+            local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
+            log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
+            return 0
+        fi
+    done
+    return 1
+}
+
+if [ "$WAIT_MODE" = "1" ]; then
+    log "Waiting for live medium (press Ctrl+C to abort)..."
+    while true; do
+        if attempt; then
+            exit 0
+        fi
+        log "  Not found — retrying in 5 s (reconnect the disc now)"
+        sleep 5
+    done
+else
+    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
+fi
--- a/iso/overlay/usr/share/bee/wallpaper.png
+++ b/iso/overlay/usr/share/bee/wallpaper.png
Author	SHA1	Message	Date
Michael Chus	f8cd9a7376	Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections - Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (was discarded) - Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU - Rewrite renderPowerBenchReport: replace Per-Slot Results with Single GPU section, rework Ramp Sequence rows=runs/cols=GPUs, add PSU Performance section (conditional on IPMI data), add transposed Single vs All-GPU comparison table in per-GPU sections - Add fmtMDTable helper (benchmark_table.go) and apply to all tables in both power and performance reports so columns align in plain-text view Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:04:12 +03:00
Michael Chus	d52ec67f8f	Stability hardening, build script fixes, GRUB bee logo Stability hardening (webui/app): - readFileLimited(): защита от OOM при чтении audit JSON (100 MB), component-status DB (10 MB) и лога задачи (50 MB) - jobs.go: буферизованный лог задачи — один открытый fd на задачу вместо open/write/close на каждую строку (устраняет тысячи syscall/сек при GPU стресс-тестах) - stability.go: экспоненциальный backoff в goRecoverLoop (2s→4s→…→60s), сброс при успешном прогоне >30s, счётчик перезапусков в slog - kill_workers.go: таймаут 5s на скан /proc, warn при срабатывании - bee-web.service: MemoryMax=3G — OOM killer защищён Build script: - build.sh: удалён блок генерации grub-pc/grub.cfg + live.cfg.in — мёртвый код с v8.25; grub-pc игнорируется live-build, а генерируемый live.cfg.in перезаписывал правильный статический файл устаревшей версией без tuning-параметров ядра и пунктов gsp-off/kms+gsp-off - build.sh: dump_memtest_debug теперь логирует grub-efi/grub.cfg вместо grub-pc/grub.cfg (было всегда "missing") GRUB: - live-theme/bee-logo.png: логотип пчелы 400×400px на чёрном фоне - live-theme/theme.txt: + image компонент по центру в верхней трети экрана; меню сдвинуто с 62% до 65% Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 13:08:31 +03:00
Michael Chus	61c7abaa80	Add multi-source PSU power triangulation and per-slot distribution table - collector/psu.go: export PSUSlotsFromSDR() reusing slot regex patterns; add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE naming; add xFusion Power<N> slot pattern; parseBoundedFloat for self-healing (rejects zero/negative/out-of-range sensor readings); default fallback treats unclassified PSU sensors as AC input - benchmark_types.go: BenchmarkPSUSlotPower struct; BenchmarkServerPower gains PSUInputIdle/Loaded, PSUOutputIdle/Loaded, PSUSlotReadingsIdle/Loaded, GPUSlotTotalW, DCMICoverageRatio fields - benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead of custom classifier; detectDCMIPartialCoverage replaces ramp heuristic — compares DCMI idle vs SDR PSU sum, flags <0.70 ratio as partial coverage; detectIPMISaturationFallback kept for servers without SDR PSU sensors; report gains PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 13:07:48 +03:00
Michael Chus	d60f7758ba	Fix grub-pc directory missing before writing grub.cfg Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 08:42:17 +03:00
Michael Chus	52c3a24b76	Compact metrics DB in background to prevent CPU spin under load As metrics.db grew (1 sample/5 s × hours), handleMetricsChartSVG called LoadAll() on every chart request — loading all rows across 4 tables through a single SQLite connection. With ~10 charts auto-refreshing in parallel, requests queued behind each other, saturating the connection pool and pegging a CPU core. Fix: add a background compactor that runs every hour via the metrics collector: • Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts) per ts/60 bucket) — retains chart shape while cutting row count by ~92 %. • Prune: rows older than 48 h are deleted entirely. • After prune: WAL checkpoint/truncate to release disk space. LoadAll() in handleMetricsChartSVG is unchanged — it now stays fast because the DB is kept small rather than capping the query window. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:28:05 +03:00
Michael Chus	028bb30333	Detect PSU faults during perf and power benchmarks Snapshot IPMI "Power Supply" sensor states before and after each benchmark run. Compare before/after to surface only new anomalies (pre-existing faults are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the text benchmark report under a "PSU Issues" section. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:08:41 +03:00
Michael Chus	7d64e5d215	Fix two stale failing tests - TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks: ramp-up mode intentionally creates a single task (the runner handles 1→N internally to avoid redundant repetition of earlier ramp steps). Updated the test to expect 1 task and verify RampTotal=3 instead of asserting 3 separate tasks. - TestBenchmarkPageRendersSavedResultsTable: benchmark page used "Performance Results" as heading while the test looked for "Perf Results". Aligned the page heading with the shorter label used everywhere else (task reports, etc.). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:07:27 +03:00
Michael Chus	51b721aeb3	Add real-data duration estimates to benchmark and burn pages - Add BenchmarkEstimated* constants to benchmark_types.go from _v8 logs (Standard Perf ~16 min, Standard Power Fit ~43 min, Stability Perf ~92 min) - Update benchmark profile dropdown to show Perf / Power Fit timing per profile - Add timing columns to Method Split table (Standard vs Stability per run type) - Update burn preset labels to show "N min/GPU (sequential) or N min (parallel)" - Clarify burn "one by one" description with sequential vs parallel scaling Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:54:50 +03:00
Michael Chus	bac89bb6e5	Add real-data duration estimates to validate tab profiles - Add SATEstimated* constants to sat.go derived from _v8 production logs, with a rule to recalculate them whenever the script changes - Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware - Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU, targeted stress/power, pulse test, NCCL, nvbandwidth - Fix nvbandwidth description ("intended to stay short" → actual ~45 min) - Top-level profile labels show computed total including GPU count Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:51:15 +03:00
Michael Chus	7a618da1f9	Redesign system power chart as stacked per-PSU area chart - Add PSUReading struct and PSUs []PSUReading to LiveMetricSample - Sample per-PSU input watts from IPMI SDR entity 10.x (Power Supply) - Render stacked filled-area SVG chart (one layer per PSU, cumulative total) - Fall back to single-line chart on systems with ≤1 PSU in SDR Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:42:00 +03:00
Michael Chus	64ae1c0ff0	Sync GRUB and isolinux boot entries; document sync rule grub-efi/grub.cfg: add KMS+GSP=off entry (was in isolinux, missing in GRUB) isolinux/live.cfg.in: add full standard param set to all entries (net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup) to match grub-efi bible-local/docs/iso-build-rules.md: add bootloader sync rule documenting that grub-efi and isolinux must be kept in sync manually, listing canonical entries and standard param set, and noting the grub-pc/grub-efi history. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:32:16 +03:00
Michael Chus	49050ca717	Fix GRUB bootloader config dir: grub-pc → grub-efi Build uses --bootloaders "grub-efi,syslinux" so live-build reads config/bootloaders/grub-efi/ for the UEFI GRUB config. The directory was incorrectly named grub-pc, causing live-build to ignore our custom grub.cfg and generate a default one (missing toram, GSP-off entries). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:30:11 +03:00
Michael Chus	5ba72ab315	Add rsync to initramfs for toram progress output live-boot already uses rsync --progress when /bin/rsync exists; without it the copy falls back to silent cp -a. Add rsync to the ISO package list and install an initramfs-tools hook (bee-rsync) that copies the rsync binary + shared libs into the initrd via copy_exec. The hook then rebuilds the initramfs so the change takes effect in the ISO's initrd.img. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 23:52:47 +03:00
Michael Chus	63363e9629	Add toram boot entry and Install to RAM resume support - grub.cfg: add "load to RAM (toram)" entry to advanced submenu - install_to_ram.go: resume from existing /dev/shm/bee-live copy if source medium is unavailable after bee-web restart - tasks.go: fix "Recovered after bee-web restart" shown on every run (check j.lines before first append, not after) - bee-install: retry unsquashfs up to 5x with wait-for-remount on source loss; clear error message with bee-remount-medium hint - bee-remount-medium: new script to find and remount live ISO source after USB/CD reconnect; supports --wait polling mode - 9000-bee-setup: chmod +x for bee-install and bee-remount-medium Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 23:48:56 +03:00
Mikhail Chusavitin	5285c0d101	Capture per-run IPMI power and GPU telemetry in power benchmark - Sample IPMI loaded_w per single-card calibration and per ramp step instead of averaging over the entire Phase 2; top-level ServerPower uses the final (all-GPU) ramp step value - Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and NvidiaPowerBenchStep so external tooling can compare wall power per phase without re-parsing logs - Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/ subdir; aggregate all phases into a top-level gpu-metrics.csv/.html - Write 00-nvidia-smi-q.log at the start of every power run - Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in result.json from the converged calibration attempt - Power benchmark page: split "Achieved W" into Single-card W and Multi-GPU W (StablePowerLimitW); derate highlight and status color now reflect the final multi-GPU limit vs nominal - Performance benchmark page: add Status column and per-GPU score color coding (green/yellow/red) based on gpu.Status and OverallStatus Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 17:59:58 +03:00
Mikhail Chusavitin	dca4afb8d0	Seed power ramp with single-card TDP limits	2026-04-16 11:43:01 +03:00
Mikhail Chusavitin	b4280941f5	Move NCCL and NVBandwidth into validate mode	2026-04-16 11:02:30 +03:00
Mikhail Chusavitin	f74976ec4c	Use static overlay wallpaper in ISO build	2026-04-16 10:54:03 +03:00
Mikhail Chusavitin	18e24a9aa5	Estimate fan duty from observed RPM maxima	2026-04-16 10:10:18 +03:00
Mikhail Chusavitin	e306250da7	Disable fp64/fp4 in mixed gpu burn	2026-04-16 10:00:03 +03:00
Mikhail Chusavitin	c5b2081ac9	Disable unstable fp4/fp64 benchmark phases	2026-04-16 09:58:02 +03:00
Michael Chus	434528083e	Power bench: compare GPU-reported TDP vs IPMI server power delta - NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower - RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via background goroutine throughout Phase 2 ramp - renderPowerBenchReport: new "Server vs GPU Power Comparison" table with ratio annotation (✓ match / ⚠ minor / ✗ over-report) - renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w, server_reporting_ratio keys Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 07:21:02 +03:00
Michael Chus	30aa30cd67	LiveCD: set Baby Bee wallpaper centered on black background 400×400px PNG centered via feh --bg-center --image-bg '#000000'. Fallback solid fill also changed to black. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:57:23 +03:00
Michael Chus	4f76e1de21	Dashboard: per-device status chips with hover tooltips Replace single aggregated badge per hardware category with individual colored chips (O/W/F/?) for each ComponentStatusRecord. Added helper functions: matchedRecords, firstNonEmpty. CSS classes: chip-ok/warn/fail/unknown. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:54:13 +03:00
Michael Chus	3732e64a4a	Add slowdown temperature exceedance detector to benchmark detectSlowdownTempExceedance scans steady-state metric rows per GPU and emits a [WARNING] note + PARTIAL status if any sample >= SlowdownTempC. Uses per-GPU threshold from nvidia-smi -q, fallback 80°C. Distinct from p95-based TempHeadroomC check: catches even a single spike above the slowdown threshold that would be smoothed out in aggregates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:46:45 +03:00
Michael Chus	0d925299ff	Use per-GPU temperature limits from nvidia-smi -q for headroom calculation Parse "GPU Shutdown Temp" and "GPU Slowdown Temp" from nvidia-smi -q verbose output in enrichGPUInfoWithMaxClocks. Store as ShutdownTempC/SlowdownTempC on benchmarkGPUInfo and BenchmarkGPUResult. Fallback: 90°C shutdown / 80°C slowdown when not available. TempHeadroomC = ShutdownTempC - P95TempC (per-GPU, not hardcoded 100°C). Warning threshold: p95 >= SlowdownTempC. Critical: headroom < 10°C. Report table shows both limits alongside headroom and p95 temp. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:45:15 +03:00
Michael Chus	a8d5e019a5	Translate report to English; add power anomaly detector All report strings are now English only. Add detectPowerAnomaly: scans steady-state metric rows per GPU with a 5-sample rolling baseline; flags a sudden drop ≥30% while GPU usage >50% as [HARD STOP] — indicates bad cable contact or VRM fault. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:42:00 +03:00
Michael Chus	72ec086568	Restructure benchmark report as balanced scorecard (5 perspectives) Split throttle into separate signals: ThermalThrottlePct, PowerCapThrottlePct, SyncBoostThrottlePct. Add TempHeadroomC (100 - p95_temp) as independent thermal headroom metric; warning < 20°C (>80°C), critical < 10°C (>90°C). Hard stop findings: thermal throttle with fans < 95%, ECC uncorrected errors, p95 temp > 90°C. Throttle findings now include per-type percentages and diagnostic context. Replace flat scorecard table with BSC 5-perspective layout: 1. Compatibility (hard stops: thermal+fan, ECC) 2. Thermal headroom (p95 temp, delta to 100°C, throttle %) 3. Power delivery (power cap throttle, power CV, fan duty) 4. Performance (Compute TOPS, Synthetic, Mixed, TOPS/SM/GHz) 5. Anomalies (ECC corrected, sync boost, power/thermal variance) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 06:40:06 +03:00
Michael Chus	7a0b0934df	Separate compute score from server quality score CompositeScore = raw ComputeScore (TOPS). Throttling GPUs score lower automatically — no quality multiplier distorting the compute signal. Add ServerQualityScore (0-100): server infrastructure quality independent of GPU model. Formula: 0.40×Stability + 0.30×PowerSustain + 0.30×Thermal. Use to compare servers with the same GPU or flag bad server conditions. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:45:55 +03:00
Michael Chus	d8ca0dca2c	Redesign scoring metrics: variance-based sustain scores, throttle stability PowerSustainScore: power draw variance (CV) during load, not deviation from TDP. ThermalSustainScore: temperature variance (CV) during load. StabilityScore: fraction of time spent in thermal+power-cap throttling. Remove NCCL bonus from quality_factor. quality = 0.35 + 0.35×Stability + 0.15×PowerSustain + 0.15×ThermalSustain, cap 1.00. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:39:59 +03:00
Michael Chus	d90250f80a	Fix DCGM cleanup and shorten memory validate	2026-04-16 00:39:37 +03:00
Michael Chus	8d6eaef5de	Update perf benchmark report methodology to reflect new design Remove references to pre-benchmark power calibration and dcgmi targeted_power. Document platform_power_score ramp-up methodology, PowerSustainScore fallback to steady-state power, and full-budget single-precision phases. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:31:58 +03:00
Michael Chus	732bf4cbab	Redesign power and performance benchmarks with new methodology Power/Thermal Fit: cumulative fixed-limit ramp where each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of stable limits across all GPUs. Remove PlatformPowerScore from power test. Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute k=2..N parallel ramp-up steps and compute PlatformPowerScore = mean compute scalability vs best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration absent. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-16 00:30:50 +03:00
Michael Chus	fa6d905a10	Tune bee-gpu-burn single-precision benchmark phases	2026-04-16 00:05:47 +03:00
Mikhail Chusavitin	5c1862ce4c	Use lb clean --all to clear bootstrap cache on every build Prevents stale debootstrap cache from bypassing --debootstrap-options changes (e.g. --include=ca-certificates added in v8.15). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:37:08 +03:00
Mikhail Chusavitin	b65ef2ea1d	Fix: use --debootstrap-options to include ca-certificates in bootstrap --bootstrap-packages is not a valid lb config option (20230502). Use --debootstrap-options "--include=ca-certificates" instead to ensure ca-certificates is present when lb chroot_archives runs apt-get update against the NVIDIA CUDA HTTPS source. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-15 17:26:01 +03:00