Add rsync to initramfs for toram progress output

live-boot already uses rsync --progress when /bin/rsync exists; without it the copy falls back to silent cp -a. Add rsync to the ISO package list and add a live-build chroot hook (9011-toram-rsync) that installs an initramfs-tools hook (bee-rsync), which copies the rsync binary + shared libs into the initrd via copy_exec. The chroot hook then rebuilds the initramfs so the change takes effect in the ISO's initrd.img.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Add toram boot entry and Install to RAM resume support
2026-04-17 23:52:47 +03:00 · 2026-04-17 23:48:56 +03:00 · 2026-04-17 17:59:58 +03:00
11 changed files with 441 additions and 76 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
 	// ≥20% while server fans were below 100% duty cycle — a signal that the
 	// cooling system may not be correctly configured for full GPU load.
 	CoolingWarning string
+	// MetricRows holds the telemetry rows from the final (converged) attempt
+	// for this GPU. Used to build per-run gpu-metrics.csv.
+	MetricRows []GPUMetricRow
 }

 type benchmarkBurnProfile struct {
@@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration(
 	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
 	seedLimits map[int]int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
 	const calibDurationSec = 120
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
@@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration(

 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-		return map[int]benchmarkPowerCalibrationResult{}, nil
+		return map[int]benchmarkPowerCalibrationResult{}, nil, nil
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
 		for _, p := range killed {
@@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration(

 	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
 	var restore []benchmarkRestoreAction
+	var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
+	var calibCursor float64

 	// Initialise per-GPU state.
 	states := make([]*gpuCalibState, 0, len(gpuIndices))
@@ -2981,6 +2986,8 @@ calibDone:
 		ticker.Stop()
 		cancelAttempt()
 		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
+		// Accumulate telemetry rows with attempt stage label.
+		appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))

 		// Resource busy: retry with exponential back-off (shared — one DCGM session).
 		if ar.err != nil && isDCGMResourceBusy(ar.err) {
@@ -3065,6 +3072,7 @@ calibDone:
 						}
 					}
 				}
+				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3103,6 +3111,7 @@ calibDone:
 				} else {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				}
+				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3140,7 +3149,8 @@ calibDone:
 			results[s.idx] = s.calib
 		}
 	}
-	return results, restore
+	writeBenchmarkMetricsFiles(runDir, allCalibRows)
+	return results, restore, allCalibRows
 }

 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	}
 	if len(result.RampSteps) > 0 {
 		b.WriteString("## Ramp Sequence\n\n")
-		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
-		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
+		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
+		b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
 		for _, step := range result.RampSteps {
 			derated := "-"
 			if step.Derated {
 				derated = "⚠ yes"
 			}
-			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
-				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
+			serverDelta := "-"
+			if step.ServerDeltaW > 0 {
+				serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
+			}
+			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
+				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
 		}
 		b.WriteString("\n")
 	}
 	b.WriteString("## Per-Slot Results\n\n")
-	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
-	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
+	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
+	b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
 	for _, gpu := range result.GPUs {
 		stableLimit := "-"
 		if gpu.StablePowerLimitW > 0 {
@@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
 			}
 		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
-			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
+		serverDelta := "-"
+		if gpu.ServerDeltaW > 0 {
+			serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
+		}
+		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
+			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
 	}
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
@@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 		fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
 		fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
 		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
+		if step.ServerLoadedW > 0 {
+			fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
+			fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
+		}
 	}
 	for _, gpu := range result.GPUs {
 		if gpu.StablePowerLimitW > 0 {
 			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
 		}
+		if gpu.ServerLoadedW > 0 {
+			fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
+			fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
+		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available {
 		fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
@@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	if infoErr != nil {
 		return "", infoErr
 	}
+	// Capture full nvidia-smi -q snapshot at the start of the run.
+	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
+		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
+	}
 	hostname, _ := os.Hostname()
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
@@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
+	singleIPMILoadedW := make(map[int]float64, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
+	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
+	var allPowerRows []GPUMetricRow
+	var powerCursor float64
 	for _, idx := range selected {
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
-		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
+		ipmiSingleDone := make(chan float64, 1)
+		go func() {
+			defer close(ipmiSingleDone)
+			if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
+				ipmiSingleDone <- w
+			}
+		}()
+		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
+		ipmiSingleCancel()
+		if w, ok := <-ipmiSingleDone; ok {
+			singleIPMILoadedW[idx] = w
+			logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
+		}
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
@@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				result.OverallStatus = "PARTIAL"
 			}
 		}
-		gpus = append(gpus, NvidiaPowerBenchGPU{
+		gpu := NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
 			BusID:               info.BusID,
@@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Status:              status,
 			Notes:               append([]string(nil), calib.Notes...),
 			CoolingWarning:      calib.CoolingWarning,
-		})
+		}
+		if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
+			gpu.ServerLoadedW = w
+			gpu.ServerDeltaW = w - serverIdleW
+		}
+		if len(calib.MetricRows) > 0 {
+			t := summarizeBenchmarkTelemetry(calib.MetricRows)
+			gpu.Telemetry = &t
+		}
+		gpus = append(gpus, gpu)
 	}
 	sort.Slice(gpus, func(i, j int) bool {
 		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
@@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
 	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))

-	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
-	// server-side loaded power while GPUs are under stress. The goroutine is
-	// cancelled as soon as Phase 2 finishes, and the average is used to compare
-	// against PlatformMaxTDPW (GPU-reported stable limits sum).
+	// serverLoadedW tracks the IPMI server power from the final ramp step
+	// (all GPUs simultaneously loaded). Earlier steps' values are stored
+	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	var serverLoadedW float64
 	var serverLoadedOK bool
-	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
-	ipmiPhase2Done := make(chan float64, 1)
-	go func() {
-		defer close(ipmiPhase2Done)
-		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
-			ipmiPhase2Done <- w
-		}
-	}()

 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Derated:             firstCalib.Derated,
 			Status:              "OK",
 		}
+		if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
+			ramp.ServerLoadedW = w
+			ramp.ServerDeltaW = w - serverIdleW
+		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
@@ -3523,7 +3575,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))

 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
+		ipmiStepDone := make(chan float64, 1)
+		go func() {
+			defer close(ipmiStepDone)
+			if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
+				ipmiStepDone <- w
+			}
+		}()
+		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
+		ipmiStepCancel()
+		var stepIPMILoadedW float64
+		var stepIPMIOK bool
+		if w, ok := <-ipmiStepDone; ok {
+			stepIPMILoadedW = w
+			stepIPMIOK = true
+			logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
+		}
 		// Accumulate restore actions; they all run in the outer defer.
 		allRestoreActions = append(allRestoreActions, stepRestore...)

@@ -3586,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}

-		result.RampSteps = append(result.RampSteps, ramp)
-	}
+		if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
+			ramp.ServerLoadedW = stepIPMILoadedW
+			ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
+			// The last step has all GPUs loaded — use it as the top-level loaded_w.
+			if step == len(result.RecommendedSlotOrder) {
+				serverLoadedW = stepIPMILoadedW
+				serverLoadedOK = true
+			}
+		}

-	// Stop IPMI Phase 2 sampling and collect result.
-	ipmiPhase2Cancel()
-	if w, ok := <-ipmiPhase2Done; ok {
-		serverLoadedW = w
-		serverLoadedOK = true
-		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
+		result.RampSteps = append(result.RampSteps, ramp)
 	}

 	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
@@ -3624,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
 	_ = serverIdleOK // used implicitly via characterizeServerPower
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
+	// Write top-level gpu-metrics.csv/.html aggregating all phases.
+	writeBenchmarkMetricsFiles(runDir, allPowerRows)
 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct {
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// Telemetry holds the aggregated stats from the final converged calibration
+	// attempt for this GPU (temperature, power, fan, clock percentiles).
+	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
@@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct {
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
 	Notes               []string `json:"notes,omitempty"`
+	// ServerLoadedW is the IPMI server power reading captured during this
+	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
+	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
+	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
 }

 // NvidiaPerformanceRampStep holds per-step performance data for the
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 	}

 	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
-	if err != nil || len(squashfsFiles) == 0 {
-		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
-	}
-
-	free := freeMemBytes()
-	var needed int64
-	for _, sf := range squashfsFiles {
-		fi, err2 := os.Stat(sf)
-		if err2 != nil {
-			return fmt.Errorf("stat %s: %v", sf, err2)
-		}
-		needed += fi.Size()
-	}
-	const headroom = 256 * 1024 * 1024
-	if free > 0 && needed+headroom > free {
-		return fmt.Errorf("insufficient RAM: need %s, available %s",
-			humanBytes(needed+headroom), humanBytes(free))
-	}
+	sourceAvailable := err == nil && len(squashfsFiles) > 0

 	dstDir := installToRAMDir
+
+	// If the source medium is unavailable, check whether a previous run already
+	// produced a complete copy in RAM. If so, skip the copy phase and proceed
+	// directly to the loop-rebind / bind-mount steps.
+	if !sourceAvailable {
+		copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
+		if len(copiedFiles) > 0 {
+			log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
+			// Proceed to rebind with the already-copied files.
+			for _, dst := range copiedFiles {
+				base := filepath.Base(dst)
+				// Re-associate the loop device that was originally backed by the
+				// source file (now gone); find it by the old source path pattern.
+				srcGuess := "/run/live/medium/live/" + base
+				loopDev, lerr := findLoopForFile(srcGuess)
+				if lerr != nil {
+					log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
+					continue
+				}
+				if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
+					log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
+				} else {
+					log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
+				}
+			}
+			goto bindMedium
+		}
+		return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
+	}
+
+	{
+		free := freeMemBytes()
+		var needed int64
+		for _, sf := range squashfsFiles {
+			fi, err2 := os.Stat(sf)
+			if err2 != nil {
+				return fmt.Errorf("stat %s: %v", sf, err2)
+			}
+			needed += fi.Size()
+		}
+		const headroom = 256 * 1024 * 1024
+		if free > 0 && needed+headroom > free {
+			return fmt.Errorf("insufficient RAM: need %s, available %s",
+				humanBytes(needed+headroom), humanBytes(free))
+		}
+	}
+
 	if state.CopyPresent {
 		log("Removing stale partial RAM copy before retry...")
 	}
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		}
 	}

+bindMedium:
 	log("Copying remaining medium files...")
 	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
 		log(fmt.Sprintf("Warning: partial copy: %v", err))
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -2014,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────

 type benchmarkHistoryRun struct {
-	generatedAt time.Time
-	displayTime string
-	gpuScores   map[int]float64 // GPU index → composite score
+	generatedAt   time.Time       // timestamp taken from the stored benchmark result
+	displayTime   string          // generatedAt in local time, "2006-01-02 15:04:05" layout
+	gpuScores     map[int]float64 // GPU index → composite score
+	gpuStatuses   map[int]string  // GPU index → status ("OK", "WARNING", "FAILED", …)
+	overallStatus string          // run-level status; an empty value is rendered as "OK"
 }

 func renderBenchmark(opts HandlerOptions) string {
@@ -2324,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
 	}
 	b.WriteString(`<div style="overflow-x:auto">`)
-	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
+	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
 	for i := 0; i <= maxGPUIndex; i++ {
 		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
 	}
@@ -2333,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<tr>`)
 		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+		overallColor := "var(--ok)"
+		overallLabel := run.overallStatus
+		if overallLabel == "" {
+			overallLabel = "OK"
+		}
+		if overallLabel == "FAILED" {
+			overallColor = "var(--crit-fg,#9f3a38)"
+		} else if overallLabel != "OK" {
+			overallColor = "var(--warn)"
+		}
+		b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
 		for idx := 0; idx <= maxGPUIndex; idx++ {
 			score, ok := run.gpuScores[idx]
 			if !ok {
 				b.WriteString(`<td style="color:var(--muted)">-</td>`)
 				continue
 			}
-			b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
+			gpuStatus := run.gpuStatuses[idx]
+			scoreColor := ""
+			switch gpuStatus {
+			case "FAILED":
+				scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+			case "WARNING", "PARTIAL":
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			case "", "OK":
+				// no override
+			default:
+				scoreColor = ` style="color:var(--warn);font-weight:600"`
+			}
+			b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
 		}
 		b.WriteString(`</tr>`)
 	}
@@ -2373,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
 			continue
 		}
 		run := benchmarkHistoryRun{
-			generatedAt: result.GeneratedAt,
-			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
-			gpuScores:   make(map[int]float64),
+			generatedAt:   result.GeneratedAt,
+			displayTime:   result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     make(map[int]float64),
+			gpuStatuses:   make(map[int]string),
+			overallStatus: result.OverallStatus,
 		}
 		for _, gpu := range result.GPUs {
 			run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
+			run.gpuStatuses[gpu.Index] = gpu.Status
 			if gpu.Index > maxGPUIndex {
 				maxGPUIndex = gpu.Index
 			}
@@ -2447,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {

 	if len(latest.GPUs) > 0 {
 		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
-		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
 		b.WriteString(`</tr></thead><tbody>`)
 		for _, gpu := range latest.GPUs {
-			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
+			// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
+			// falling back to single-card applied limit if the ramp hasn't run.
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
 			rowStyle := ""
-			achievedStyle := ""
+			finalStyle := ""
 			if derated {
 				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
-				achievedStyle = ` style="color:#e6a000;font-weight:600"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
 			}
 			statusLabel := gpu.Status
 			if statusLabel == "" {
 				statusLabel = "OK"
 			}
 			statusColor := "var(--ok)"
-			if statusLabel != "OK" {
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
 				statusColor = "var(--warn)"
 			}
 			nominalStr := "-"
 			if gpu.DefaultPowerLimitW > 0 {
 				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
 			}
-			achievedStr := "-"
+			singleStr := "-"
 			if gpu.AppliedPowerLimitW > 0 {
-				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+			}
+			multiStr := "-"
+			if gpu.StablePowerLimitW > 0 {
+				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
 			}
 			p95Str := "-"
 			if gpu.MaxObservedPowerW > 0 {
@@ -2481,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
 			b.WriteString(`<td>` + nominalStr + `</td>`)
-			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
 			b.WriteString(`<td>` + p95Str + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
 			b.WriteString(`</tr>`)
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	}
 	a := q.opts.App

+	recovered := len(j.lines) > 0
 	j.append(fmt.Sprintf("Starting %s...", t.Name))
-	if len(j.lines) > 0 {
+	if recovered {
 		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
 	}

--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
 }

 submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — load to RAM (toram)" {
+        linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  @INITRD_LIVE@
+    }
+
    menuentry "EASY-BEE — GSP=off" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
-chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
-chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal        2>/dev/null || true
+chmod +x /usr/local/bin/bee-boot-status    2>/dev/null || true
+chmod +x /usr/local/bin/bee-install        2>/dev/null || true
+chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
+++ b/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
@@ -0,0 +1,46 @@
+#!/bin/sh
+# 9011-toram-rsync.hook.chroot
+#
+# Adds rsync to the initramfs so that live-boot's toram code takes the
+# rsync --progress path instead of the silent "cp -a" fallback.
+#
+# live-boot's 9990-toram-todisk.sh already contains:
+#   if [ -x /bin/rsync ]; then
+#       rsync -a --progress ... 1>/dev/console
+#   else
+#       cp -a ...   # no output
+#   fi
+#
+# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
+# which copies the binary + all shared-library dependencies into the initrd.
+
+set -e
+
+HOOK_DIR="/etc/initramfs-tools/hooks"
+HOOK="${HOOK_DIR}/bee-rsync"
+
+mkdir -p "${HOOK_DIR}"
+
+cat > "${HOOK}" << 'EOF'
+#!/bin/sh
+# initramfs hook: include rsync for live-boot toram progress output
+PREREQ=""
+prereqs() { echo "$PREREQ"; }
+case "$1" in prereqs) prereqs; exit 0 ;; esac
+
+. /usr/share/initramfs-tools/hook-functions
+
+if [ -x /usr/bin/rsync ]; then
+    copy_exec /usr/bin/rsync /bin
+fi
+EOF
+
+chmod +x "${HOOK}"
+
+echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
+
+# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
+KVER=$(ls /lib/modules | sort -V | tail -1)
+echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
+update-initramfs -u -k "${KVER}"
+echo "9011-toram-rsync: done"
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -3,6 +3,7 @@ dmidecode
 smartmontools
 nvme-cli
 pciutils
+rsync
 ipmitool
 util-linux
 e2fsprogs
--- a/iso/overlay/usr/local/bin/bee-install
+++ b/iso/overlay/usr/local/bin/bee-install
@@ -65,6 +65,9 @@ done
 SQUASHFS="/run/live/medium/live/filesystem.squashfs"
 if [ ! -f "$SQUASHFS" ]; then
    echo "ERROR: squashfs not found at $SQUASHFS" >&2
+    echo "  The live medium may have been disconnected." >&2
+    echo "  Reconnect the disc and run:  bee-remount-medium --wait" >&2
+    echo "  Then re-run bee-install." >&2
    exit 1
 fi

@@ -162,10 +165,59 @@ log "  Mounted."
 log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
 log "  Source: $SQUASHFS"
 log "  Target: $MOUNT_ROOT"
-unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
-    grep -E '^\[|^inod|^created|^extract' | \
-    while read -r line; do log "  $line"; done || true
-log "  Unpack complete."
+
+# unsquashfs does not support resume, so retry the entire unpack step if the
+# source medium disappears mid-copy (e.g. CD physically disconnected).
+UNPACK_ATTEMPTS=0
+UNPACK_MAX=5
+while true; do
+    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
+    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
+        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
+    fi
+    [ "$UNPACK_ATTEMPTS" -gt 1 ] && log "  Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
+
+    # Re-check squashfs is reachable before each attempt
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  SOURCE LOST: $SQUASHFS not found."
+        log "  Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
+        log "  then press Enter here to retry."
+        read -r _
+        continue
+    fi
+
+    # wipe partial unpack so unsquashfs starts clean
+    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
+        log "  Cleaning partial unpack from $MOUNT_ROOT ..."
+        # keep the mount point itself but remove its contents
+        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
+    fi
+
+    UNPACK_OK=0
+    unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
+        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
+        while IFS= read -r line; do log "  $line"; done || UNPACK_OK=$?
+
+    # Check squashfs is still reachable (gone = disc pulled during copy)
+    if [ ! -f "$SQUASHFS" ]; then
+        log "  WARNING: source medium lost during unpack — will retry after remount."
+        log "  Run 'bee-remount-medium --wait' in another terminal, then press Enter."
+        read -r _
+        continue
+    fi
+
+    # Verify the unpack produced a usable root (presence of /etc is a basic check)
+    if [ -d "${MOUNT_ROOT}/etc" ]; then
+        log "  Unpack complete."
+        break
+    else
+        log "  WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
+        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
+            log "  Retrying in 5 s ..."
+            sleep 5
+        fi
+    fi
+done

 # ------------------------------------------------------------------
 log "--- Step 6/7: Configuring installed system ---"
--- a/iso/overlay/usr/local/bin/bee-remount-medium
+++ b/iso/overlay/usr/local/bin/bee-remount-medium
@@ -0,0 +1,100 @@
+#!/bin/bash
+# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
+#
+# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
+# was lost and /run/live/medium/live/filesystem.squashfs is missing.
+#
+# Usage: bee-remount-medium [--wait]
+#   --wait  keep retrying every 5 seconds until the medium is found (useful
+#           while physically reconnecting the device)
+
+set -euo pipefail
+
+MEDIUM_DIR="/run/live/medium"              # mount point the medium is remounted on
+SQUASHFS_REL="live/filesystem.squashfs"    # file that identifies a live medium, relative to its root
+WAIT_MODE=0                                # set to 1 by --wait / -w
+
+for opt; do    # iterates over "$@"; arguments matching no pattern are ignored
+    case "$opt" in
+        -w|--wait) WAIT_MODE=1 ;;
+        -h|--help)
+            printf '%s\n' "Usage: bee-remount-medium [--wait]" \
+                "  Finds and remounts the live ISO medium to $MEDIUM_DIR" \
+                "  --wait  retry every 5 s until a medium with squashfs is found"
+            exit 0 ;;
+    esac
+done
+
+log() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"; }  # print "[HH:MM:SS] message" on stdout
+die() { >&2 log "ERROR: $*"; exit 1; }  # log the error on stderr, then exit with status 1
+
+# Return all candidate block devices (optical + removable USB mass storage)
+find_candidates() {
+    # CD/DVD drives
+    for dev in /dev/sr* /dev/scd*; do
+        [ -b "$dev" ] && echo "$dev"
+    done
+    # USB/removable disks and partitions
+    for dev in /dev/sd* /dev/vd*; do
+        [ -b "$dev" ] || continue
+        # Only whole disks or partitions — skip the same device we are running from
+        local removable
+        local base
+        base=$(basename "$dev")
+        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
+        [ "$removable" = "1" ] && echo "$dev"
+    done
+}
+
+# Probe-mount $1 read-only on a scratch dir; if it holds $SQUASHFS_REL, mount it
+# read-only on $MEDIUM_DIR instead. Returns 0 only when that final mount succeeds.
+try_mount() {
+    local dev="$1" probe
+    probe=$(mktemp -d /tmp/bee-probe-XXXXXX)
+    if ! mount -o ro "$dev" "$probe" 2>/dev/null; then
+        rmdir "$probe" 2>/dev/null || true
+        return 1
+    fi
+    if [ ! -f "${probe}/${SQUASHFS_REL}" ]; then    # mountable, but no squashfs on it
+        umount "$probe" 2>/dev/null || true
+        rmdir "$probe" 2>/dev/null || true
+        return 1
+    fi
+    umount "$probe" 2>/dev/null || true
+    rmdir "$probe"  2>/dev/null || true
+    # Best effort: release whatever is still mounted on the live path (may be stale)
+    umount "$MEDIUM_DIR" 2>/dev/null || true
+    mkdir -p "$MEDIUM_DIR"
+    if mount -o ro "$dev" "$MEDIUM_DIR"; then
+        log "Mounted $dev on $MEDIUM_DIR"
+        return 0
+    fi
+    log "Mount of $dev on $MEDIUM_DIR failed"
+    return 1
+}
+
+# Run one scan: probe candidates in order, return 0 at the first try_mount success.
+attempt() {
+    local candidate sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
+    log "Scanning for ISO medium..."
+    for candidate in $(find_candidates); do
+        log "  Trying $candidate ..."
+        try_mount "$candidate" || continue
+        log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
+        return 0
+    done
+    return 1
+}
+
+if [ "$WAIT_MODE" != "1" ]; then
+    # One-shot mode: a single scan; die (exit 1) when no medium is found.
+    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
+    exit 0
+fi
+# Wait mode: rescan every 5 s until attempt succeeds or the user interrupts.
+log "Waiting for live medium (press Ctrl+C to abort)..."
+until attempt; do
+    log "  Not found — retrying in 5 s (reconnect the disc now)"
+    sleep 5
+done
+exit 0