Add real-data duration estimates to validate tab profiles

- Add SATEstimated* constants to sat.go derived from _v8 production logs, with a rule to recalculate them whenever the script changes
- Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware
- Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU, targeted stress/power, pulse test, NCCL, nvbandwidth
- Fix nvbandwidth description ("intended to stay short" → actual ~45 min)
- Top-level profile labels show computed total including GPU count

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Redesign system power chart as stacked per-PSU area chart
2026-04-18 10:51:15 +03:00 · 2026-04-18 10:42:00 +03:00 · 2026-04-18 10:32:16 +03:00 · 2026-04-18 10:30:11 +03:00 · 2026-04-17 23:52:47 +03:00 · 2026-04-17 23:48:56 +03:00
29 changed files with 1324 additions and 353 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -146,7 +146,7 @@ type satRunner interface {
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
-	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 }
 type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
 	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
 }
 // RunNCCLTests delegates to the configured SAT runner's RunNCCLTests.
 //
 // A baseDir that is empty or contains only whitespace is replaced with
 // DefaultSATBaseDir; ctx, gpuIndices and logFunc are forwarded unchanged.
 // The runner's (path, error) result is returned as-is.
 func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
 	}
 	return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
 }
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
-	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+	path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
 	body := "Results: " + path
 	if err != nil && err != context.Canceled {
 		body += "\nERROR: " + err.Error()
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -128,6 +128,7 @@ type fakeSAT struct {
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
 	runNvidiaPulseFn          func(string, int, []int) (string, error)
 	runNvidiaBandwidthFn      func(string, []int) (string, error)
 	runNCCLFn                 func(string, []int) (string, error)
 	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
 	runMemoryFn               func(string) (string, error)
 	runStorageFn              func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 	return "", nil
 }
-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNCCLFn != nil {
 		return f.runNCCLFn(baseDir, gpuIndices)
 	}
 	return "", nil
 }
 // TestRunNCCLTestsPassesSelectedGPUs checks that App.RunNCCLTests hands the
 // caller's baseDir and GPU index list to the SAT runner and returns the
 // runner's result path unchanged.
 func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
 	t.Parallel()
 	var gotBaseDir string
 	var gotGPUIndices []int
 	a := &App{
 		sat: fakeSAT{
 			runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
 				gotBaseDir = baseDir
 				// Copy the indices so the assertions below inspect a snapshot
 				// rather than the caller's backing array.
 				gotGPUIndices = append([]int(nil), gpuIndices...)
 				return "/tmp/nccl-tests.tar.gz", nil
 			},
 		},
 	}
 	path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
 	if err != nil {
 		t.Fatalf("RunNCCLTests error: %v", err)
 	}
 	if path != "/tmp/nccl-tests.tar.gz" {
 		t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
 	}
 	if gotBaseDir != "/tmp/sat" {
 		t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
 	}
 	// The indices must arrive in the caller's order (3 then 1).
 	if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
 		t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
 	}
 }
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
 	// ≥20% while server fans were below 100% duty cycle — a signal that the
 	// cooling system may not be correctly configured for full GPU load.
 	CoolingWarning string
 	// MetricRows holds the telemetry rows from the final (converged) attempt
 	// for this GPU. Used to build per-run gpu-metrics.csv.
 	MetricRows []GPUMetricRow
 }
 type benchmarkBurnProfile struct {
@@ -1122,6 +1125,7 @@ type benchmarkCoolingSample struct {
 	AvgFanRPM             float64
 	AvgFanDutyCyclePct    float64
 	FanDutyCycleAvailable bool
 	FanDutyCycleEstimated bool
 }
 func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
@@ -1134,6 +1138,7 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
 		samples[i].FanAvgRPM = fanSample.AvgFanRPM
 		samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
 		samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
 		samples[i].FanDutyCycleEstimated = fanSample.FanDutyCycleEstimated
 	}
 	return samples, nil
 }
@@ -1141,11 +1146,12 @@ func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
 func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
 	fans, _ := sampleFanSpeeds()
 	avgRPM, _, _ := fanRPMStats(fans)
-	dutyPct, dutyAvailable := sampleFanDutyCyclePct()
+	dutyPct, dutyAvailable, dutyEstimated := sampleFanDutyCyclePctFromFans(fans)
 	return benchmarkCoolingSample{
 		AvgFanRPM:             avgRPM,
 		AvgFanDutyCyclePct:    dutyPct,
 		FanDutyCycleAvailable: dutyAvailable,
 		FanDutyCycleEstimated: dutyEstimated,
 	}
 }
@@ -1387,25 +1393,33 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
 	}
 	var rpmValues []float64
 	var dutyValues []float64
 	var dutyEstimated bool
 	for _, row := range rows {
 		if row.FanAvgRPM > 0 {
 			rpmValues = append(rpmValues, row.FanAvgRPM)
 		}
 		if row.FanDutyCycleAvailable {
 			dutyValues = append(dutyValues, row.FanDutyCyclePct)
 			if row.FanDutyCycleEstimated {
 				dutyEstimated = true
 			}
 		}
 	}
 	if len(rpmValues) == 0 && len(dutyValues) == 0 {
 		return nil
 	}
 	summary := &BenchmarkCoolingSummary{
-		Available: true,
+		Available:             true,
-		AvgFanRPM: benchmarkMean(rpmValues),
+		AvgFanRPM:             benchmarkMean(rpmValues),
 		FanDutyCycleEstimated: dutyEstimated,
 	}
 	if len(dutyValues) > 0 {
 		summary.FanDutyCycleAvailable = true
 		summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
 		summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
 		if summary.FanDutyCycleEstimated {
 			summary.Notes = append(summary.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
 		}
 	} else {
 		summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
 	}
@@ -2770,7 +2784,7 @@ func runBenchmarkPowerCalibration(
 	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
 	seedLimits map[int]int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
 	const calibDurationSec = 120
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
@@ -2784,7 +2798,7 @@ func runBenchmarkPowerCalibration(
 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-		return map[int]benchmarkPowerCalibrationResult{}, nil
+		return map[int]benchmarkPowerCalibrationResult{}, nil, nil
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
 		for _, p := range killed {
@@ -2818,6 +2832,8 @@ func runBenchmarkPowerCalibration(
 	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
 	var restore []benchmarkRestoreAction
 	var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
 	var calibCursor float64
 	// Initialise per-GPU state.
 	states := make([]*gpuCalibState, 0, len(gpuIndices))
@@ -2970,6 +2986,8 @@ calibDone:
 		ticker.Stop()
 		cancelAttempt()
 		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
 		// Accumulate telemetry rows with attempt stage label.
 		appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
 		// Resource busy: retry with exponential back-off (shared — one DCGM session).
 		if ar.err != nil && isDCGMResourceBusy(ar.err) {
@@ -3054,6 +3072,7 @@ calibDone:
 						}
 					}
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3092,6 +3111,7 @@ calibDone:
 				} else {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
 				continue
 			}
@@ -3129,7 +3149,8 @@ calibDone:
 			results[s.idx] = s.calib
 		}
 	}
-	return results, restore
+	writeBenchmarkMetricsFiles(runDir, allCalibRows)
 	return results, restore, allCalibRows
 }
 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3219,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	}
 	if len(result.RampSteps) > 0 {
 		b.WriteString("## Ramp Sequence\n\n")
-		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
+		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
-		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
+		b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
 		for _, step := range result.RampSteps {
 			derated := "-"
 			if step.Derated {
 				derated = "⚠ yes"
 			}
-			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
+			serverDelta := "-"
-				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
+			if step.ServerDeltaW > 0 {
 				serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
 			}
 			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
 				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
 		}
 		b.WriteString("\n")
 	}
 	b.WriteString("## Per-Slot Results\n\n")
-	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
+	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
-	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
+	b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
 	for _, gpu := range result.GPUs {
 		stableLimit := "-"
 		if gpu.StablePowerLimitW > 0 {
@@ -3243,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
 			}
 		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
+		serverDelta := "-"
-			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
+		if gpu.ServerDeltaW > 0 {
 			serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
 		}
 		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
 			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
 	}
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
@@ -3273,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 		fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
 		fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
 		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
 		if step.ServerLoadedW > 0 {
 			fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
 			fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
 		}
 	}
 	for _, gpu := range result.GPUs {
 		if gpu.StablePowerLimitW > 0 {
 			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
 		}
 		if gpu.ServerLoadedW > 0 {
 			fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
 			fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
 		}
 	}
 	if sp := result.ServerPower; sp != nil && sp.Available {
 		fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
@@ -3316,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	if infoErr != nil {
 		return "", infoErr
 	}
 	// Capture full nvidia-smi -q snapshot at the start of the run.
 	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
 		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
 	}
 	hostname, _ := os.Hostname()
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
@@ -3341,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	singleIPMILoadedW := make(map[int]float64, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
 	var allPowerRows []GPUMetricRow
 	var powerCursor float64
 	for _, idx := range selected {
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
-		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
 		ipmiSingleDone := make(chan float64, 1)
 		go func() {
 			defer close(ipmiSingleDone)
 			if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
 				ipmiSingleDone <- w
 			}
 		}()
 		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
 		appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
 		ipmiSingleCancel()
 		if w, ok := <-ipmiSingleDone; ok {
 			singleIPMILoadedW[idx] = w
 			logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
 		}
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
@@ -3372,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				result.OverallStatus = "PARTIAL"
 			}
 		}
-		gpus = append(gpus, NvidiaPowerBenchGPU{
+		gpu := NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
 			BusID:               info.BusID,
@@ -3385,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Status:              status,
 			Notes:               append([]string(nil), calib.Notes...),
 			CoolingWarning:      calib.CoolingWarning,
-		})
+		}
 		if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
 			gpu.ServerLoadedW = w
 			gpu.ServerDeltaW = w - serverIdleW
 		}
 		if len(calib.MetricRows) > 0 {
 			t := summarizeBenchmarkTelemetry(calib.MetricRows)
 			gpu.Telemetry = &t
 		}
 		gpus = append(gpus, gpu)
 	}
 	sort.Slice(gpus, func(i, j int) bool {
 		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
@@ -3434,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
 	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
-	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
+	// serverLoadedW tracks the IPMI server power from the final ramp step
-	// server-side loaded power while GPUs are under stress. The goroutine is
+	// (all GPUs simultaneously loaded). Earlier steps' values are stored
-	// cancelled as soon as Phase 2 finishes, and the average is used to compare
+	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	// against PlatformMaxTDPW (GPU-reported stable limits sum).
 	var serverLoadedW float64
 	var serverLoadedOK bool
 	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
 	ipmiPhase2Done := make(chan float64, 1)
 	go func() {
 		defer close(ipmiPhase2Done)
 		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
 			ipmiPhase2Done <- w
 		}
 	}()
 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3464,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			Derated:             firstCalib.Derated,
 			Status:              "OK",
 		}
 		if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
 			ramp.ServerLoadedW = w
 			ramp.ServerDeltaW = w - serverIdleW
 		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
@@ -3491,17 +3554,45 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		_ = os.MkdirAll(stepDir, 0755)
 		// Reuse the latest stable limits as starting points, but re-check every
-		// active GPU in this hotter configuration.
+		// active GPU in this hotter configuration. For the newly introduced GPU,
-		seedForStep := make(map[int]int, len(stableLimits))
+		// seed from its single-card calibration so we do not restart from the
-		for k, v := range stableLimits {
+		// default TDP when a prior derated limit is already known.
-			seedForStep[k] = v
+		seedForStep := make(map[int]int, len(subset))
 		for _, idx := range subset {
 			if lim, ok := stableLimits[idx]; ok && lim > 0 {
 				seedForStep[idx] = lim
 				continue
 			}
 			if base, ok := calibByIndex[idx]; ok {
 				lim := int(math.Round(base.AppliedPowerLimitW))
 				if lim > 0 {
 					seedForStep[idx] = lim
 				}
 			}
 		}
 		logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
 			step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
-		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
 		ipmiStepDone := make(chan float64, 1)
 		go func() {
 			defer close(ipmiStepDone)
 			if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
 				ipmiStepDone <- w
 			}
 		}()
 		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
 		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
 		ipmiStepCancel()
 		var stepIPMILoadedW float64
 		var stepIPMIOK bool
 		if w, ok := <-ipmiStepDone; ok {
 			stepIPMILoadedW = w
 			stepIPMIOK = true
 			logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
 		}
 		// Accumulate restore actions; they all run in the outer defer.
 		allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -3564,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}
-		result.RampSteps = append(result.RampSteps, ramp)
+		if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
-	}
+			ramp.ServerLoadedW = stepIPMILoadedW
 			ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
 			// The last step has all GPUs loaded — use it as the top-level loaded_w.
 			if step == len(result.RecommendedSlotOrder) {
 				serverLoadedW = stepIPMILoadedW
 				serverLoadedOK = true
 			}
 		}
-	// Stop IPMI Phase 2 sampling and collect result.
+		result.RampSteps = append(result.RampSteps, ramp)
 	ipmiPhase2Cancel()
 	if w, ok := <-ipmiPhase2Done; ok {
 		serverLoadedW = w
 		serverLoadedOK = true
 		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
 	}
 	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
@@ -3602,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	//   ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
 	_ = serverIdleOK // used implicitly via characterizeServerPower
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
 	// Write top-level gpu-metrics.csv/.html aggregating all phases.
 	writeBenchmarkMetricsFiles(runDir, allPowerRows)
 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -31,6 +31,7 @@ type BenchmarkCoolingSummary struct {
 	Available             bool     `json:"available"`
 	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
 	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
 	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
 	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
 	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
 	Notes                 []string `json:"notes,omitempty"`
@@ -55,32 +56,32 @@ type NvidiaBenchmarkOptions struct {
 }
 type NvidiaBenchmarkResult struct {
-	BenchmarkVersion   string                       `json:"benchmark_version"`
+	BenchmarkVersion string    `json:"benchmark_version"`
-	GeneratedAt        time.Time                    `json:"generated_at"`
+	GeneratedAt      time.Time `json:"generated_at"`
-	Hostname           string                       `json:"hostname,omitempty"`
+	Hostname         string    `json:"hostname,omitempty"`
-	ServerModel        string                       `json:"server_model,omitempty"`
+	ServerModel      string    `json:"server_model,omitempty"`
-	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	BenchmarkProfile string    `json:"benchmark_profile"`
-	ParallelGPUs       bool                         `json:"parallel_gpus,omitempty"`
+	ParallelGPUs     bool      `json:"parallel_gpus,omitempty"`
-	RampStep           int                          `json:"ramp_step,omitempty"`
+	RampStep         int       `json:"ramp_step,omitempty"`
-	RampTotal          int                          `json:"ramp_total,omitempty"`
+	RampTotal        int       `json:"ramp_total,omitempty"`
-	RampRunID          string                       `json:"ramp_run_id,omitempty"`
+	RampRunID        string    `json:"ramp_run_id,omitempty"`
-	ScalabilityScore   float64                      `json:"scalability_score,omitempty"`
+	ScalabilityScore float64   `json:"scalability_score,omitempty"`
 	// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
 	// 100% = each added GPU contributes exactly its single-card throughput.
 	// < 100% = throughput loss due to thermal throttle, power limits, or contention.
-	PlatformPowerScore   float64                    `json:"platform_power_score,omitempty"`
+	PlatformPowerScore   float64                      `json:"platform_power_score,omitempty"`
-	PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
+	PerformanceRampSteps []NvidiaPerformanceRampStep  `json:"performance_ramp_steps,omitempty"`
-	OverallStatus      string                       `json:"overall_status"`
+	OverallStatus        string                       `json:"overall_status"`
-	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	SelectedGPUIndices   []int                        `json:"selected_gpu_indices"`
-	Findings           []string                     `json:"findings,omitempty"`
+	Findings             []string                     `json:"findings,omitempty"`
-	Warnings           []string                     `json:"warnings,omitempty"`
+	Warnings             []string                     `json:"warnings,omitempty"`
-	Normalization      BenchmarkNormalization       `json:"normalization"`
+	Normalization        BenchmarkNormalization       `json:"normalization"`
-	HostConfig         *BenchmarkHostConfig         `json:"host_config,omitempty"`
+	HostConfig           *BenchmarkHostConfig         `json:"host_config,omitempty"`
-	CPULoad            *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
+	CPULoad              *BenchmarkCPULoad            `json:"cpu_load,omitempty"`
-	Cooling            *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
+	Cooling              *BenchmarkCoolingSummary     `json:"cooling,omitempty"`
-	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
-	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
-	ServerPower        *BenchmarkServerPower        `json:"server_power,omitempty"`
+	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
 }
 type BenchmarkNormalization struct {
@@ -223,8 +224,8 @@ type BenchmarkScorecard struct {
 	// Throttle breakdown — percentage of steady-state time in each throttle type.
 	// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
-	ThermalThrottlePct  float64 `json:"thermal_throttle_pct"`  // HW+SW thermal slowdown
+	ThermalThrottlePct   float64 `json:"thermal_throttle_pct"`   // HW+SW thermal slowdown
-	PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
+	PowerCapThrottlePct  float64 `json:"power_cap_throttle_pct"` // SW power cap
 	SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
 	// Temperature headroom: distance to the 100°C destruction threshold.
@@ -300,22 +301,22 @@ type NvidiaPowerBenchResult struct {
 	// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
 	// cumulative thermal ramp. Represents the actual sustained power budget of
 	// this server under full GPU load. Use for rack power planning.
-	PlatformMaxTDPW float64               `json:"platform_max_tdp_w"`
+	PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
 	// ServerPower captures IPMI server power delta (idle→loaded) measured in
 	// parallel with the thermal ramp. Use to compare GPU-reported TDP against
 	// actual wall-power draw as seen by the server's power supply.
-	ServerPower     *BenchmarkServerPower `json:"server_power,omitempty"`
+	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
-	Findings        []string              `json:"findings,omitempty"`
+	Findings    []string              `json:"findings,omitempty"`
-	GPUs            []NvidiaPowerBenchGPU `json:"gpus"`
+	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
 }
 type NvidiaPowerBenchGPU struct {
-	Index               int      `json:"index"`
+	Index              int     `json:"index"`
-	Name                string   `json:"name,omitempty"`
+	Name               string  `json:"name,omitempty"`
-	BusID               string   `json:"bus_id,omitempty"`
+	BusID              string  `json:"bus_id,omitempty"`
-	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
+	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
 	// AppliedPowerLimitW is the stable limit found during single-card calibration.
-	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
 	// StablePowerLimitW is the final fixed limit for this GPU after the
 	// cumulative thermal ramp. This is the limit at which the GPU operated
 	// stably with all other GPUs running simultaneously at their own limits.
@@ -330,13 +331,20 @@ type NvidiaPowerBenchGPU struct {
 	Notes               []string `json:"notes,omitempty"`
 	// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
 	CoolingWarning string `json:"cooling_warning,omitempty"`
 	// ServerLoadedW is the IPMI server power reading captured during this
 	// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
 	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
 	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
 	// Telemetry holds the aggregated stats from the final converged calibration
 	// attempt for this GPU (temperature, power, fan, clock percentiles).
 	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
 }
 type NvidiaPowerBenchStep struct {
-	StepIndex           int      `json:"step_index"`
+	StepIndex  int   `json:"step_index"`
-	GPUIndices          []int    `json:"gpu_indices"`
+	GPUIndices []int `json:"gpu_indices"`
 	// NewGPUIndex is the GPU whose stable limit was searched in this step.
-	NewGPUIndex         int      `json:"new_gpu_index"`
+	NewGPUIndex int `json:"new_gpu_index"`
 	// NewGPUStableLimitW is the stable power limit found for the new GPU.
 	NewGPUStableLimitW  float64  `json:"new_gpu_stable_limit_w,omitempty"`
 	TotalObservedPowerW float64  `json:"total_observed_power_w,omitempty"`
@@ -344,20 +352,24 @@ type NvidiaPowerBenchStep struct {
 	Derated             bool     `json:"derated,omitempty"`
 	Status              string   `json:"status"`
 	Notes               []string `json:"notes,omitempty"`
 	// ServerLoadedW is the IPMI server power reading captured during this
 	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
 	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
 	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
 }
 // NvidiaPerformanceRampStep holds per-step performance data for the
 // scalability ramp-up phase of the performance benchmark.
 type NvidiaPerformanceRampStep struct {
-	StepIndex          int      `json:"step_index"`
+	StepIndex  int   `json:"step_index"`
-	GPUIndices         []int    `json:"gpu_indices"`
+	GPUIndices []int `json:"gpu_indices"`
 	// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
 	// TOPS from dedicated single-precision phases) across all GPUs in this step.
-	TotalSyntheticTOPS float64  `json:"total_synthetic_tops"`
+	TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
-	TotalMixedTOPS     float64  `json:"total_mixed_tops,omitempty"`
+	TotalMixedTOPS     float64 `json:"total_mixed_tops,omitempty"`
 	// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
 	// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
-	ScalabilityPct     float64  `json:"scalability_pct"`
+	ScalabilityPct float64  `json:"scalability_pct"`
-	Status             string   `json:"status"`
+	Status         string   `json:"status"`
-	Notes              []string `json:"notes,omitempty"`
+	Notes          []string `json:"notes,omitempty"`
 }
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -27,6 +27,7 @@ type GPUMetricRow struct {
 	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
 	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
 	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
 	FanDutyCycleEstimated bool    `json:"fan_duty_cycle_estimated,omitempty"`
 }
 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -147,14 +148,18 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
+	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
 	for _, r := range rows {
 		dutyAvail := 0
 		if r.FanDutyCycleAvailable {
 			dutyAvail = 1
 		}
-		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
+		dutyEstimated := 0
-			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
+		if r.FanDutyCycleEstimated {
 			dutyEstimated = 1
 		}
 		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
 			strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -140,26 +140,56 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 	}
 	squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
-	if err != nil || len(squashfsFiles) == 0 {
+	sourceAvailable := err == nil && len(squashfsFiles) > 0
 		return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
 	}
 	free := freeMemBytes()
 	var needed int64
 	for _, sf := range squashfsFiles {
 		fi, err2 := os.Stat(sf)
 		if err2 != nil {
 			return fmt.Errorf("stat %s: %v", sf, err2)
 		}
 		needed += fi.Size()
 	}
 	const headroom = 256 * 1024 * 1024
 	if free > 0 && needed+headroom > free {
 		return fmt.Errorf("insufficient RAM: need %s, available %s",
 			humanBytes(needed+headroom), humanBytes(free))
 	}
 	dstDir := installToRAMDir
 	// If the source medium is unavailable, check whether a previous run already
 	// produced a complete copy in RAM. If so, skip the copy phase and proceed
 	// directly to the loop-rebind / bind-mount steps.
 	if !sourceAvailable {
 		copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
 		if len(copiedFiles) > 0 {
 			log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
 			// Proceed to rebind with the already-copied files.
 			for _, dst := range copiedFiles {
 				base := filepath.Base(dst)
 				// Re-associate the loop device that was originally backed by the
 				// source file (now gone); find it by the old source path pattern.
 				srcGuess := "/run/live/medium/live/" + base
 				loopDev, lerr := findLoopForFile(srcGuess)
 				if lerr != nil {
 					log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
 					continue
 				}
 				if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
 					log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
 				} else {
 					log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
 				}
 			}
 			goto bindMedium
 		}
 		return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
 	}
 	{
 		free := freeMemBytes()
 		var needed int64
 		for _, sf := range squashfsFiles {
 			fi, err2 := os.Stat(sf)
 			if err2 != nil {
 				return fmt.Errorf("stat %s: %v", sf, err2)
 			}
 			needed += fi.Size()
 		}
 		const headroom = 256 * 1024 * 1024
 		if free > 0 && needed+headroom > free {
 			return fmt.Errorf("insufficient RAM: need %s, available %s",
 				humanBytes(needed+headroom), humanBytes(free))
 		}
 	}
 	if state.CopyPresent {
 		log("Removing stale partial RAM copy before retry...")
 	}
@@ -199,6 +229,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
 		}
 	}
 bindMedium:
 	log("Copying remaining medium files...")
 	if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
 		log(fmt.Sprintf("Warning: partial copy: %v", err))
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -18,11 +18,19 @@ type LiveMetricSample struct {
 	Fans       []FanReading   `json:"fans"`
 	Temps      []TempReading  `json:"temps"`
 	PowerW     float64        `json:"power_w"`
 	PSUs       []PSUReading   `json:"psus,omitempty"`
 	CPULoadPct float64        `json:"cpu_load_pct"`
 	MemLoadPct float64        `json:"mem_load_pct"`
 	GPUs       []GPUMetricRow `json:"gpus"`
 }
 // PSUReading is a per-slot power supply input power reading.
 // Values are serialized as part of LiveMetricSample.PSUs.
 type PSUReading struct {
 	// Slot is the PSU slot number.
 	Slot   int     `json:"slot"`
 	// Name is the sensor name reported for this PSU.
 	Name   string  `json:"name"`
 	// PowerW is the reported power in watts.
 	PowerW float64 `json:"power_w"`
 }
 // TempReading is a named temperature sensor value.
 type TempReading struct {
 	Name    string  `json:"name"`
@@ -57,6 +65,9 @@ func SampleLiveMetrics() LiveMetricSample {
 	// System power — returns 0 if unavailable
 	s.PowerW = sampleSystemPower()
 	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
 	s.PSUs = samplePSUPower()
 	// CPU load — from /proc/stat
 	s.CPULoadPct = sampleCPULoadPct()
@@ -326,3 +337,65 @@ func compactAmbientTempName(chip, name string) string {
 	}
 	return chip + " / " + name
 }
 // samplePSUPower collects per-PSU input power readings from the IPMI SDR.
 // It runs `ipmitool sdr elist full` and keeps only rows whose entity ID is
 // "10.N" (Power Supply, slot N) and whose value column is expressed in Watts.
 // When several sensors map to the same slot, the one with the highest wattage
 // wins. The result is ordered by slot number; nil is returned when ipmitool
 // fails, prints nothing, or exposes no usable PSU Watt sensor.
 func samplePSUPower() []PSUReading {
 	raw, runErr := exec.Command("ipmitool", "sdr", "elist", "full").Output()
 	if runErr != nil || len(raw) == 0 {
 		return nil
 	}
 	// Best reading seen so far for each slot.
 	type slotReading struct {
 		name   string
 		powerW float64
 	}
 	best := map[int]slotReading{}
 	for _, row := range strings.Split(string(raw), "\n") {
 		cols := strings.Split(row, "|")
 		if len(cols) < 5 {
 			continue
 		}
 		// Column 3 holds the entity ID, e.g. "10.1" for PSU slot 1.
 		entity := strings.TrimSpace(cols[3])
 		if !strings.HasPrefix(entity, "10.") {
 			continue
 		}
 		slotNo, convErr := strconv.Atoi(strings.TrimPrefix(entity, "10."))
 		if convErr != nil {
 			continue
 		}
 		// Column 4 holds the reading, e.g. "740.00 Watts".
 		reading := strings.TrimSpace(cols[4])
 		if !strings.Contains(strings.ToLower(reading), "watts") {
 			continue
 		}
 		tokens := strings.Fields(reading)
 		if len(tokens) < 2 {
 			continue
 		}
 		watts, parseErr := strconv.ParseFloat(tokens[0], 64)
 		if parseErr != nil || watts <= 0 {
 			continue
 		}
 		label := strings.TrimSpace(cols[0])
 		if prev, seen := best[slotNo]; !seen || watts > prev.powerW {
 			best[slotNo] = slotReading{name: label, powerW: watts}
 		}
 	}
 	if len(best) == 0 {
 		return nil
 	}
 	// Map iteration order is random, so sort the slot numbers for stable output.
 	order := make([]int, 0, len(best))
 	for slotNo := range best {
 		order = append(order, slotNo)
 	}
 	sort.Ints(order)
 	result := make([]PSUReading, 0, len(order))
 	for _, slotNo := range order {
 		picked := best[slotNo]
 		result = append(result, PSUReading{Slot: slotNo, Name: picked.name, PowerW: picked.powerW})
 	}
 	return result
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -20,6 +20,54 @@ import (
 	"time"
 )
 // Estimated wall-clock durations for each SAT/validate test, derived from real
 // production logs in _benchmark/_v8/.
 //
 // Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
 // the corresponding Run*Pack function change, re-measure the wall-clock duration
 // from actual task logs and update the matching constant here.
 //
 // Sources:
 //   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
 //   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
 //   - SATEstimatedNvidiaGPUValidatePerGPUSec:      xFusion v8.6/v8.22 — 77–87 s/GPU
 //   - SATEstimatedNvidiaGPUStressPerGPUSec:        xFusion v8.6/v8.22 — 444–448 s/GPU
 //   - SATEstimatedNvidiaTargetedStressPerGPUSec:   xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
 //   - SATEstimatedNvidiaTargetedPowerPerGPUSec:    MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
 //   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
 const (
 	// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
 	SATEstimatedCPUValidateSec = 65
 	// CPU stress: stress-ng 1800 s (stress mode default).
 	SATEstimatedCPUStressSec = 1800
 	// RAM: memtester 256 MB / 1 pass.
 	SATEstimatedMemoryValidateSec = 70
 	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
 	SATEstimatedMemoryStressSec = 140
 	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
 	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
 	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
 	SATEstimatedNvidiaGPUStressPerGPUSec = 450
 	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
 	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
 	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
 	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
 	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
 	SATEstimatedNvidiaPulseTestSec = 5000
 	// NCCL all_reduce_perf, all GPUs simultaneously.
 	SATEstimatedNvidiaInterconnectSec = 300
 	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
 	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
 	SATEstimatedNvidiaBandwidthSec = 2700
 )
 var (
 	satExecCommand  = exec.Command
 	satLookPath     = exec.LookPath
@@ -366,12 +414,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	return string(raw), err
 }
-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
 // Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
-	// detect GPU count
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
-	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
+	if err != nil {
-	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+		return "", err
 	}
 	gpuCount := len(selected)
 	if gpuCount < 1 {
 		gpuCount = 1
 	}
@@ -380,7 +430,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 		satJob{name: "02-all-reduce-perf.log", cmd: []string{
 			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
 			"-g", strconv.Itoa(gpuCount), "--iters", "20",
-		}},
+		}, env: nvidiaVisibleDevicesEnv(selected)},
 	), logFunc)
 }
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"math"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -56,13 +57,37 @@ type cachedPowerReading struct {
 	UpdatedAt time.Time
 }
 // fanObservationState is the persisted record of the highest fan speeds
 // observed so far. It is serialized as JSON to fanObservationStatePath.
 type fanObservationState struct {
 	// MaxRPM maps a trimmed fan sensor name to its accepted maximum RPM.
 	MaxRPM map[string]float64 `json:"max_rpm"`
 }
 // fanPeakCandidate tracks a reading above the current maximum that has not
 // yet been held long enough to be accepted as the new maximum.
 type fanPeakCandidate struct {
 	// FirstSeen is the sample time at which the candidate was created.
 	FirstSeen time.Time
 	// RPM is the highest reading seen since FirstSeen.
 	RPM       float64
 }
 var (
 	// NOTE(review): systemPowerCacheMu presumably guards systemPowerCache;
 	// their use is not visible in this part of the file — confirm.
 	systemPowerCacheMu sync.Mutex
 	systemPowerCache   cachedPowerReading
 	// fanObservationMu is held while the fan-observation helpers read or
 	// modify fanObservation, fanObservationInit and fanPeakCandidates.
 	fanObservationMu   sync.Mutex
 	fanObservation     fanObservationState
 	// fanObservationInit is set once the persisted state load has been attempted.
 	fanObservationInit bool
 	// fanPeakCandidates holds pending peak candidates keyed by fan name.
 	fanPeakCandidates  = make(map[string]fanPeakCandidate)
 )
 const systemPowerHoldTTL = 15 * time.Second
 // fanObservationStatePath is the JSON file used to persist fanObservation.
 // It is a variable so tests can redirect it to a temporary directory.
 var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
 // fanObservationMinPeakHold is how long a reading above the current maximum
 // must persist before it is accepted as the new observed maximum.
 const fanObservationMinPeakHold = time.Second
 // normalizeObservedFanMaxRPM rounds an observed fan speed up to the next
 // multiple of 1000 RPM. Zero and negative inputs yield 0.
 func normalizeObservedFanMaxRPM(rpm float64) float64 {
 	const bucket = 1000.0
 	switch {
 	case rpm <= 0:
 		return 0
 	default:
 		buckets := math.Ceil(rpm / bucket)
 		return buckets * bucket
 	}
 }
 // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
 // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
 // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
@@ -310,11 +335,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
 	if err == nil {
 		if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
 			updateFanObservation(fans, time.Now())
 			return fans, nil
 		}
 	}
 	fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
 	if len(fans) > 0 {
 		updateFanObservation(fans, time.Now())
 		return fans, nil
 	}
 	if err != nil {
@@ -323,6 +350,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
 	return nil, sensorsErr
 }
 // loadFanObservationLocked lazily initializes fanObservation from the JSON
 // file at fanObservationStatePath. It runs at most once per process (guarded
 // by fanObservationInit); a missing, empty, or malformed file leaves the
 // in-memory map empty. Entries with a blank name or a non-positive RPM are
 // dropped. The "Locked" suffix signals that the caller must already hold
 // fanObservationMu.
 func loadFanObservationLocked() {
 	if fanObservationInit {
 		return
 	}
 	// Mark as initialized up front so a failed load is not retried on every sample.
 	fanObservationInit = true
 	fanObservation.MaxRPM = make(map[string]float64)
 	data, readErr := os.ReadFile(fanObservationStatePath)
 	if readErr != nil || len(data) == 0 {
 		return
 	}
 	var onDisk fanObservationState
 	if decodeErr := json.Unmarshal(data, &onDisk); decodeErr != nil {
 		return
 	}
 	for rawName, maxRPM := range onDisk.MaxRPM {
 		key := strings.TrimSpace(rawName)
 		if key == "" || maxRPM <= 0 {
 			continue
 		}
 		fanObservation.MaxRPM[key] = maxRPM
 	}
 }
 // saveFanObservationLocked writes the current fanObservation state as
 // indented JSON to fanObservationStatePath, creating the parent directory
 // when needed. Persistence is best-effort: an empty state is not written and
 // any directory, encoding, or write failure is silently ignored. The caller
 // must already hold fanObservationMu.
 func saveFanObservationLocked() {
 	if len(fanObservation.MaxRPM) == 0 {
 		return
 	}
 	targetDir := filepath.Dir(fanObservationStatePath)
 	switch targetDir {
 	case "", ".":
 		targetDir = "/var/log/bee-sat"
 	}
 	if mkErr := os.MkdirAll(targetDir, 0755); mkErr != nil {
 		return
 	}
 	encoded, encErr := json.MarshalIndent(fanObservation, "", "  ")
 	if encErr != nil {
 		return
 	}
 	// Deliberately ignored: losing the persisted maximum only delays estimates.
 	_ = os.WriteFile(fanObservationStatePath, encoded, 0644)
 }
 // updateFanObservation feeds one sample of fan readings, taken at time now,
 // into the observed-maximum tracker.
 //
 // A reading above the stored maximum for a fan does not raise the maximum
 // immediately. It first becomes a "peak candidate"; the maximum is raised
 // only when a later sample, at least fanObservationMinPeakHold after the
 // candidate was first seen, is still above the stored maximum. Any sample at
 // or below the stored maximum discards the pending candidate. Accepted
 // maxima are passed through normalizeObservedFanMaxRPM, and the state is
 // saved when at least one maximum changed.
 //
 // Readings with a blank name or non-positive RPM are ignored.
 func updateFanObservation(fans []FanReading, now time.Time) {
 	if len(fans) == 0 {
 		return
 	}
 	fanObservationMu.Lock()
 	defer fanObservationMu.Unlock()
 	loadFanObservationLocked()
 	changed := false
 	for _, fan := range fans {
 		name := strings.TrimSpace(fan.Name)
 		if name == "" || fan.RPM <= 0 {
 			continue
 		}
 		currentMax := fanObservation.MaxRPM[name]
 		if fan.RPM <= currentMax {
 			// Back at or below the known maximum: drop any pending candidate.
 			delete(fanPeakCandidates, name)
 			continue
 		}
 		if cand, ok := fanPeakCandidates[name]; ok {
 			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
 				// Candidate has been held long enough: accept the highest
 				// reading seen during the hold window.
 				newMax := math.Max(cand.RPM, fan.RPM)
 				if newMax > currentMax {
 					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
 					changed = true
 				}
 				delete(fanPeakCandidates, name)
 				continue
 			}
 			// Still inside the hold window: remember the higher reading but
 			// keep the original FirstSeen so the window is not restarted.
 			if fan.RPM > cand.RPM {
 				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
 			}
 			continue
 		}
 		// First reading above the maximum: start a new candidate.
 		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
 	}
 	if changed {
 		saveFanObservationLocked()
 	}
 }
 // estimateFanDutyCyclePctFromObservation approximates the fan duty cycle as
 // the mean of current RPM / observed maximum RPM, in percent, over all fans
 // that have a recorded maximum. Each per-fan ratio is clamped to [0, 100].
 // The boolean is false when no fan could contribute (empty input, blank
 // names, non-positive RPM, or no observed maximum yet).
 func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
 	if len(fans) == 0 {
 		return 0, false
 	}
 	fanObservationMu.Lock()
 	defer fanObservationMu.Unlock()
 	loadFanObservationLocked()
 	var ratios []float64
 	for _, reading := range fans {
 		key := strings.TrimSpace(reading.Name)
 		if key == "" || reading.RPM <= 0 {
 			continue
 		}
 		ceiling := fanObservation.MaxRPM[key]
 		if ceiling <= 0 {
 			continue
 		}
 		ratio := reading.RPM / ceiling * 100.0
 		switch {
 		case ratio > 100:
 			ratio = 100
 		case ratio < 0:
 			ratio = 0
 		}
 		ratios = append(ratios, ratio)
 	}
 	if len(ratios) == 0 {
 		return 0, false
 	}
 	return benchmarkMean(ratios), true
 }
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
 //
@@ -428,12 +568,27 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
 // sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
 // Returns the average duty cycle across all exposed PWM controls.
-func sampleFanDutyCyclePct() (float64, bool) {
+func sampleFanDutyCyclePct() (float64, bool, bool) {
 	out, err := exec.Command("sensors", "-j").Output()
 	if err != nil || len(out) == 0 {
-		return 0, false
+		fans, fanErr := sampleFanSpeeds()
 		if fanErr != nil {
 			return 0, false, false
 		}
 		return sampleFanDutyCyclePctFromFans(fans)
 	}
-	return parseFanDutyCyclePctSensorsJSON(out)
+	pct, ok := parseFanDutyCyclePctSensorsJSON(out)
 	return pct, ok, false
 }
 // sampleFanDutyCyclePctFromFans derives a duty-cycle percentage from fan RPM
 // readings using the observed per-fan maxima. It returns (pct, available,
 // estimated); on success both flags are true, otherwise all results are zero.
 func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
 	if len(fans) > 0 {
 		if estimate, found := estimateFanDutyCyclePctFromObservation(fans); found {
 			return estimate, true, true
 		}
 	}
 	return 0, false, false
 }
 func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -1,6 +1,7 @@
 package platform
 import (
 	"path/filepath"
 	"testing"
 	"time"
 )
@@ -50,6 +51,53 @@ func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
 	}
 }
 // TestEstimateFanDutyCyclePctFromObservation checks that a single-sample RPM
 // spike does not establish an observed maximum, that a peak held for at least
 // fanObservationMinPeakHold does (rounded up to the next 1000 RPM), and that
 // the accepted maximum is reloaded from disk after the in-memory state is reset.
 //
 // This test must not call t.Parallel: it swaps package-level state
 // (fanObservationStatePath, fanObservation, fanObservationInit,
 // fanPeakCandidates) without holding fanObservationMu, so running it
 // concurrently with other tests in the package would be a data race and could
 // leak the temporary state into them.
 func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
 	oldPath := fanObservationStatePath
 	oldState := fanObservation
 	oldInit := fanObservationInit
 	oldCandidates := fanPeakCandidates
 	fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
 	fanObservation = fanObservationState{}
 	fanObservationInit = false
 	fanPeakCandidates = make(map[string]fanPeakCandidate)
 	t.Cleanup(func() {
 		fanObservationStatePath = oldPath
 		fanObservation = oldState
 		fanObservationInit = oldInit
 		fanPeakCandidates = oldCandidates
 	})
 	start := time.Unix(100, 0)
 	// First reading above the (empty) maximum only creates a candidate.
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
 	if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
 		t.Fatalf("single-sample spike should not establish observed max")
 	}
 	// Peak rises to 5200 inside the hold window, then is confirmed after 1.5 s;
 	// the accepted maximum is 5200 rounded up to 6000.
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
 	updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
 	got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
 	if !ok {
 		t.Fatalf("expected estimated duty cycle from persisted observed max")
 	}
 	if got < 43 || got > 44 {
 		t.Fatalf("got=%v want ~43.3", got)
 	}
 	// Drop the in-memory state to force a reload from the persisted file.
 	fanObservation = fanObservationState{}
 	fanObservationInit = false
 	fanPeakCandidates = make(map[string]fanPeakCandidate)
 	got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
 	if !ok {
 		t.Fatalf("expected persisted observed max to be reloaded from disk")
 	}
 	if got < 43 || got > 44 {
 		t.Fatalf("reloaded got=%v want ~43.3", got)
 	}
 }
 func TestParseDCMIPowerReading(t *testing.T) {
 	raw := `
 Instantaneous power reading:                   512 Watts
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
 	}
 }
 // TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth verifies that the
 // nvbandwidth diag command built for a zero duration consists only of the
 // test name and the GPU selection, with the GPU order preserved.
 func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
 	want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
 	cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
 	if gotLen, wantLen := len(cmd), len(want); gotLen != wantLen {
 		t.Fatalf("cmd len=%d want %d (%v)", gotLen, wantLen, cmd)
 	}
 	for idx, expected := range want {
 		if actual := cmd[idx]; actual != expected {
 			t.Fatalf("cmd[%d]=%q want %q", idx, actual, expected)
 		}
 	}
 }
 func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
 	env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
 	if len(env) != 2 {
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -462,6 +462,127 @@ func synthesizeChartTimes(times []time.Time, count int) []time.Time {
 	return out
 }
 // renderStackedMetricChartSVG renders a stacked area chart where each dataset
 // is visually "stacked" on top of the previous one. Intended for multi-PSU
 // power charts where the filled area of each PSU shows its individual
 // contribution and the total height equals the combined draw.
 //
 // labels and times describe the X axis; the longer of the two determines the
 // point count and the shorter one is padded. datasets holds one series per
 // layer (bottom first) and names the matching legend entries. A nil yMax
 // selects an automatic upper bound derived from the stacked total; the lower
 // bound is always 0. The caller's datasets slice is never modified.
 //
 // Series that are shorter than the time axis are treated as 0 for the missing
 // points, extra samples beyond the time axis are ignored, and a dataset
 // without a matching entry in names gets an empty legend name but still
 // receives a palette color. Extra names without a dataset are ignored.
 func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
 	pointCount := len(labels)
 	if len(times) > pointCount {
 		pointCount = len(times)
 	}
 	if pointCount == 0 {
 		pointCount = 1
 		labels = []string{""}
 		times = []time.Time{{}}
 	}
 	if len(labels) < pointCount {
 		padded := make([]string, pointCount)
 		copy(padded, labels)
 		labels = padded
 	}
 	if len(times) < pointCount {
 		times = synthesizeChartTimes(times, pointCount)
 	}
 	// Copy the outer slice so padding empty series below does not write into
 	// the caller's backing array.
 	datasets = append([][]float64(nil), datasets...)
 	for i := range datasets {
 		if len(datasets[i]) == 0 {
 			datasets[i] = make([]float64, pointCount)
 		}
 	}
 	times, datasets = downsampleTimeSeries(times, datasets, 1400)
 	pointCount = len(times)
 	// Build cumulative sums per time point. Row 0 is the zero baseline and
 	// row i+1 is the top edge of dataset i.
 	cumulative := make([][]float64, len(datasets)+1)
 	for i := range cumulative {
 		cumulative[i] = make([]float64, pointCount)
 	}
 	for i, ds := range datasets {
 		for j := 0; j < pointCount; j++ {
 			// A missing sample contributes 0 so the layer collapses onto its
 			// baseline instead of dropping to the X axis.
 			v := 0.0
 			if j < len(ds) {
 				v = ds[j]
 			}
 			cumulative[i+1][j] = cumulative[i][j] + v
 		}
 	}
 	// Scale is based on the total (top cumulative row).
 	total := cumulative[len(cumulative)-1]
 	yMin := floatPtr(0)
 	if yMax == nil {
 		yMax = autoMax120(total)
 	}
 	scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
 	// One legend entry per dataset; names may be shorter or longer than datasets.
 	legendItems := make([]metricChartSeries, len(datasets))
 	for i := range datasets {
 		name := ""
 		if i < len(names) {
 			name = names[i]
 		}
 		color := metricChartPalette[i%len(metricChartPalette)]
 		legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
 	}
 	// Stats label from totals.
 	statsLabel := chartStatsLabel([][]float64{total})
 	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
 	start, end := chartTimeBounds(times)
 	var b strings.Builder
 	writeSVGOpen(&b, layout.Width, layout.Height)
 	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
 	writeTimelineIdleSpans(&b, layout, start, end, timeline)
 	writeVerticalGrid(&b, layout, times, pointCount, 8)
 	writeHorizontalGrid(&b, layout, scale)
 	writeTimelineBoundaries(&b, layout, start, end, timeline)
 	writePlotBorder(&b, layout)
 	writeSingleAxisY(&b, layout, scale)
 	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
 	// Draw stacked areas from top to bottom so lower layers are visible.
 	for i := len(datasets) - 1; i >= 0; i-- {
 		writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
 	}
 	// Draw border polylines on top.
 	for i := len(datasets) - 1; i >= 0; i-- {
 		writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
 	}
 	writeLegend(&b, layout, legendItems)
 	writeSVGClose(&b)
 	return []byte(b.String()), nil
 }
 // writeStackedArea emits one SVG <polygon> covering the band between two
 // cumulative series: the outline runs left-to-right along top and then
 // right-to-left along baseline. The polygon is filled with color at 55%
 // opacity and has no stroke. Nothing is written when top is empty; a baseline
 // shorter than top is replaced by an all-zero baseline.
 func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
 	count := len(top)
 	if count == 0 {
 		return
 	}
 	if len(baseline) < count {
 		baseline = make([]float64, count)
 	}
 	var outline strings.Builder
 	// addVertex appends "x,y" for sample idx of series, space-separated from
 	// any vertex already written.
 	addVertex := func(series []float64, idx int) {
 		px := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
 		py := chartYForValue(valueClamp(series[idx], scale), scale, layout.PlotTop, layout.PlotBottom)
 		if outline.Len() > 0 {
 			outline.WriteByte(' ')
 		}
 		outline.WriteString(strconv.FormatFloat(px, 'f', 1, 64))
 		outline.WriteByte(',')
 		outline.WriteString(strconv.FormatFloat(py, 'f', 1, 64))
 	}
 	// Upper edge, left to right.
 	for idx := 0; idx < count; idx++ {
 		addVertex(top, idx)
 	}
 	// Lower edge, right to left, closing the shape.
 	for idx := count - 1; idx >= 0; idx-- {
 		addVertex(baseline, idx)
 	}
 	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", outline.String(), color)
 }
 // writeSVGOpen appends the opening <svg> tag, followed by a newline, to b.
 // width and height are used both as the element size and as the viewBox
 // extent (origin 0,0).
 func writeSVGOpen(b *strings.Builder, width, height int) {
 	const openTag = `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">` + "\n"
 	b.WriteString(fmt.Sprintf(openTag, width, height, width, height))
 }
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1378,15 +1378,64 @@ setInterval(loadMetricsLayout, 5000);
 // ── Validate (Acceptance Tests) ───────────────────────────────────────────────
 type validateInventory struct {
-	CPU     string
+	CPU           string
-	Memory  string
+	Memory        string
-	Storage string
+	Storage       string
-	NVIDIA  string
+	NVIDIA        string
-	AMD     string
+	AMD           string
 	NvidiaGPUCount int
 	AMDGPUCount    int
 }
 // validateFmtDur renders an estimated duration for display on the Validate
 // page. Values below two minutes are shown in seconds ("~N s"); anything
 // longer is shown in whole minutes ("~N min"), computed as (secs+29)/60.
 func validateFmtDur(secs int) string {
 	switch {
 	case secs >= 120:
 		return fmt.Sprintf("~%d min", (secs+29)/60)
 	default:
 		return fmt.Sprintf("~%d s", secs)
 	}
 }
 // validateTotalValidateSec returns the estimated wall-clock duration of
 // "Validate one by one" in Validate mode for n NVIDIA GPUs.
 //
 // CPU and memory estimates are always included. The NVIDIA stages (per-GPU
 // diagnostics plus the all-GPU interconnect and bandwidth tests) are added
 // only when n > 0: without NVIDIA GPUs those tests cannot run, and counting
 // them would inflate the estimate by roughly 50 minutes. A negative n is
 // treated as 0.
 func validateTotalValidateSec(n int) int {
 	total := platform.SATEstimatedCPUValidateSec +
 		platform.SATEstimatedMemoryValidateSec
 	if n <= 0 {
 		return total
 	}
 	total += n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
 	return total
 }
 // validateTotalStressSec returns the estimated wall-clock duration of
 // "Validate one by one" in Stress mode for n NVIDIA GPUs.
 //
 // CPU and memory estimates are always included. The NVIDIA stages (per-GPU
 // diagnostics, targeted stress and targeted power, plus the all-GPU pulse,
 // interconnect and bandwidth tests) are added only when n > 0: without
 // NVIDIA GPUs those tests cannot run, and counting them would inflate the
 // estimate by more than two hours. A negative n is treated as 0.
 func validateTotalStressSec(n int) int {
 	total := platform.SATEstimatedCPUStressSec +
 		platform.SATEstimatedMemoryStressSec
 	if n <= 0 {
 		return total
 	}
 	total += n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
 		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
 		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
 		platform.SATEstimatedNvidiaPulseTestSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
 	return total
 }
 func renderValidate(opts HandlerOptions) string {
 	inv := loadValidateInventory(opts)
 	n := inv.NvidiaGPUCount
 	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
 	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
 	gpuNote := ""
 	if n > 0 {
 		gpuNote = fmt.Sprintf(" (%d GPU)", n)
 	}
 	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
@@ -1396,10 +1445,10 @@ func renderValidate(opts HandlerOptions) string {
 	    <div class="validate-profile-col">
 	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
 	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
-	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
 	    </div>
 	    <div class="validate-profile-col validate-profile-action">
-	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
 	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
 	      <div style="margin-top:12px">
 	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
@@ -1413,19 +1462,19 @@ func renderValidate(opts HandlerOptions) string {
 		inv.CPU,
 		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
 		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
-		`60s in Validate, 30 min in Stress.`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
 	)) +
 		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
 			inv.Memory,
 			`Runs a RAM validation pass and records memory state around the test.`,
 			`<code>free</code>, <code>memtester</code>`,
-			`256 MB / 1 pass in Validate, 512 MB / 1 pass in Stress.`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
 		)) +
 		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
 			inv.Storage,
 			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
 			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
-			`Short self-test in Validate, extended self-test in Stress.`,
+			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
 		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
@@ -1450,14 +1499,33 @@ func renderValidate(opts HandlerOptions) string {
 		inv.NVIDIA,
 		`Runs NVIDIA diagnostics and board inventory checks.`,
 		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-		`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
+		func() string {
 			perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
 			perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
 			if n > 0 {
 				return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
 					validateFmtDur(perV), n, validateFmtDur(perV*n),
 					validateFmtDur(perS), n, validateFmtDur(perS*n))
 			}
 			return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
 				validateFmtDur(perV), validateFmtDur(perS))
 		}(),
 	)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			func() string {
 				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
 				s := "Skipped in Validate. "
 				if n > 0 {
 					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
 				} else {
 					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
 				}
 				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
 			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -1465,7 +1533,16 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			func() string {
 				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
 				s := "Skipped in Validate. "
 				if n > 0 {
 					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
 				} else {
 					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
 				}
 				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
 			}(),
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -1473,7 +1550,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
 			`<code>dcgmi diag pulse_test</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-interconnect">` +
@@ -1481,7 +1558,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
 			`<code>all_reduce_perf</code> (NCCL tests)`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1566,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
 			`<code>nvbandwidth</code>`,
-			`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
 		)) +
 		`</div>` +
 		`</div>
@@ -1527,8 +1604,6 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1776,7 +1851,7 @@ function runAllSAT() {
  const cycles = 1;
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -1924,6 +1999,8 @@ func loadValidateInventory(opts HandlerOptions) validateInventory {
 	out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
 	out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
 	out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
 	out.NvidiaGPUCount = nvidiaTotal
 	out.AMDGPUCount = amdTotal
 	return out
 }
@@ -2016,9 +2093,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
 // ── Benchmark ─────────────────────────────────────────────────────────────────
 type benchmarkHistoryRun struct {
-	generatedAt time.Time
+	generatedAt  time.Time
-	displayTime string
+	displayTime  string
-	gpuScores   map[int]float64 // GPU index → composite score
+	gpuScores    map[int]float64 // GPU index → composite score
 	gpuStatuses  map[int]string  // GPU index → status ("OK", "WARNING", "FAILED", …)
 	overallStatus string
 }
 func renderBenchmark(opts HandlerOptions) string {
@@ -2082,7 +2161,7 @@ func renderBenchmark(opts HandlerOptions) string {
  </div>
 </div>
-`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
 <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2326,7 +2405,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
 	}
 	b.WriteString(`<div style="overflow-x:auto">`)
-	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
+	b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
 	for i := 0; i <= maxGPUIndex; i++ {
 		b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
 	}
@@ -2335,13 +2414,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
 		b.WriteString(`<tr>`)
 		b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
 		b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
 		overallColor := "var(--ok)"
 		overallLabel := run.overallStatus
 		if overallLabel == "" {
 			overallLabel = "OK"
 		}
 		if overallLabel == "FAILED" {
 			overallColor = "var(--crit-fg,#9f3a38)"
 		} else if overallLabel != "OK" {
 			overallColor = "var(--warn)"
 		}
 		b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
 		for idx := 0; idx <= maxGPUIndex; idx++ {
 			score, ok := run.gpuScores[idx]
 			if !ok {
 				b.WriteString(`<td style="color:var(--muted)">-</td>`)
 				continue
 			}
-			b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
+			gpuStatus := run.gpuStatuses[idx]
 			scoreColor := ""
 			switch gpuStatus {
 			case "FAILED":
 				scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
 			case "WARNING", "PARTIAL":
 				scoreColor = ` style="color:var(--warn);font-weight:600"`
 			case "", "OK":
 				// no override
 			default:
 				scoreColor = ` style="color:var(--warn);font-weight:600"`
 			}
 			b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
 		}
 		b.WriteString(`</tr>`)
 	}
@@ -2375,12 +2477,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
 			continue
 		}
 		run := benchmarkHistoryRun{
-			generatedAt: result.GeneratedAt,
+			generatedAt:   result.GeneratedAt,
-			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			displayTime:   result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
-			gpuScores:   make(map[int]float64),
+			gpuScores:     make(map[int]float64),
 			gpuStatuses:   make(map[int]string),
 			overallStatus: result.OverallStatus,
 		}
 		for _, gpu := range result.GPUs {
 			run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
 			run.gpuStatuses[gpu.Index] = gpu.Status
 			if gpu.Index > maxGPUIndex {
 				maxGPUIndex = gpu.Index
 			}
@@ -2449,31 +2554,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 	if len(latest.GPUs) > 0 {
 		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
-		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
 		b.WriteString(`</tr></thead><tbody>`)
 		for _, gpu := range latest.GPUs {
-			derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
+			// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
 			// falling back to single-card applied limit if the ramp hasn't run.
 			finalLimitW := gpu.StablePowerLimitW
 			if finalLimitW <= 0 {
 				finalLimitW = gpu.AppliedPowerLimitW
 			}
 			// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
 			derated := gpu.Derated ||
 				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
 			rowStyle := ""
-			achievedStyle := ""
+			finalStyle := ""
 			if derated {
 				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
-				achievedStyle = ` style="color:#e6a000;font-weight:600"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
 			}
 			statusLabel := gpu.Status
 			if statusLabel == "" {
 				statusLabel = "OK"
 			}
 			statusColor := "var(--ok)"
-			if statusLabel != "OK" {
+			if statusLabel == "FAILED" {
 				statusColor = "var(--crit-fg,#9f3a38)"
 			} else if statusLabel != "OK" {
 				statusColor = "var(--warn)"
 			}
 			nominalStr := "-"
 			if gpu.DefaultPowerLimitW > 0 {
 				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
 			}
-			achievedStr := "-"
+			singleStr := "-"
 			if gpu.AppliedPowerLimitW > 0 {
-				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
+				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
 			}
 			multiStr := "-"
 			if gpu.StablePowerLimitW > 0 {
 				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
 			}
 			p95Str := "-"
 			if gpu.MaxObservedPowerW > 0 {
@@ -2483,7 +2602,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
 			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
 			b.WriteString(`<td>` + nominalStr + `</td>`)
-			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
+			b.WriteString(`<td>` + singleStr + `</td>`)
 			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
 			b.WriteString(`<td>` + p95Str + `</td>`)
 			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
 			b.WriteString(`</tr>`)
@@ -2517,7 +2637,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
 <div class="card" style="margin-bottom:16px">
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -575,12 +575,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	}
 	timeline := metricsTimelineSegments(samples, time.Now())
 	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
-		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		var overviewOk bool
 		var buf []byte
 		buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
 		if err != nil {
 			http.Error(w, err.Error(), http.StatusInternalServerError)
 			return
 		}
-		if !ok {
+		if !overviewOk {
 			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 			return
 		}
@@ -589,23 +591,37 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		_, _ = w.Write(buf)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}
-	buf, err := renderMetricChartSVG(
+	var buf []byte
-		title,
+	if stacked {
-		labels,
+		buf, err = renderStackedMetricChartSVG(
-		sampleTimes(samples),
+			title,
-		datasets,
+			labels,
-		names,
+			sampleTimes(samples),
-		yMin,
+			datasets,
-		yMax,
+			names,
-		chartCanvasHeightForPath(path, len(names)),
+			yMax,
-		timeline,
+			chartCanvasHeightForPath(path, len(names)),
-	)
+			timeline,
 		)
 	} else {
 		buf, err = renderMetricChartSVG(
 			title,
 			labels,
 			sampleTimes(samples),
 			datasets,
 			names,
 			yMin,
 			yMax,
 			chartCanvasHeightForPath(path, len(names)),
 			timeline,
 		)
 	}
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -615,12 +631,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }
-func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
+func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
-	var datasets [][]float64
+	labels = sampleTimeLabels(samples)
 	var names []string
 	var title string
 	var yMin, yMax *float64
 	labels := sampleTimeLabels(samples)
 	switch {
 	case path == "server-load":
@@ -656,15 +668,41 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case path == "server-power":
 		title = "System Power"
-		power := make([]float64, len(samples))
+		// Use per-PSU stacked chart when PSU SDR data is available.
-		for i, s := range samples {
+		// Collect the union of PSU slots seen across all samples.
-			power[i] = s.PowerW
+		psuSlots := psuSlotsFromSamples(samples)
 		if len(psuSlots) > 1 {
 			// Build one dataset per PSU slot.
 			psuDatasets := make([][]float64, len(psuSlots))
 			psuNames := make([]string, len(psuSlots))
 			for si, slot := range psuSlots {
 				ds := make([]float64, len(samples))
 				for i, s := range samples {
 					for _, psu := range s.PSUs {
 						if psu.Slot == slot {
 							ds[i] = psu.PowerW
 							break
 						}
 					}
 				}
 				psuDatasets[si] = normalizePowerSeries(ds)
 				psuNames[si] = fmt.Sprintf("PSU %d", slot)
 			}
 			datasets = psuDatasets
 			names = psuNames
 			stacked = true
 			yMax = autoMax120(psuStackedTotal(psuDatasets))
 		} else {
 			power := make([]float64, len(samples))
 			for i, s := range samples {
 				power[i] = s.PowerW
 			}
 			power = normalizePowerSeries(power)
 			datasets = [][]float64{power}
 			names = []string{"Power W"}
 			yMin = floatPtr(0)
 			yMax = autoMax120(power)
 		}
 		power = normalizePowerSeries(power)
 		datasets = [][]float64{power}
 		names = []string{"Power W"}
 		yMin = floatPtr(0)
 		yMax = autoMax120(power)
 	case path == "server-fans":
 		title = "Fan RPM"
@@ -707,7 +745,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	case strings.HasPrefix(path, "gpu/"):
 		idx, sub, ok := parseGPUChartPath(path)
 		if !ok {
-			return nil, nil, nil, "", nil, nil, false
+			return nil, nil, nil, "", nil, nil, false, false
 		}
 		switch sub {
 		case "load":
@@ -715,7 +753,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
 			names = []string{"Load %", "Mem %"}
@@ -725,7 +763,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{temp}
 			names = []string{"Temp °C"}
@@ -735,7 +773,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Core Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Core Clock MHz"}
@@ -744,7 +782,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Memory Clock"
 			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
 			if clock == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{clock}
 			names = []string{"Memory Clock MHz"}
@@ -753,7 +791,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
-				return nil, nil, nil, "", nil, nil, false
+				return nil, nil, nil, "", nil, nil, false, false
 			}
 			datasets = [][]float64{power}
 			names = []string{"Power W"}
@@ -761,10 +799,10 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		}
 	default:
-		return nil, nil, nil, "", nil, nil, false
+		return nil, nil, nil, "", nil, nil, false, false
 	}
-	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
+	return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
 }
 func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
@@ -930,6 +968,37 @@ func normalizePowerSeries(ds []float64) []float64 {
 	return out
 }
 // psuSlotsFromSamples collects every distinct PSU slot number that appears in
 // any of the given samples and returns them in ascending order. The result is
 // an empty (non-nil) slice when no sample carries PSU entries.
 func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
 	// Set of slot numbers; the empty struct value carries no data.
 	present := make(map[int]struct{})
 	for i := range samples {
 		for _, psu := range samples[i].PSUs {
 			present[psu.Slot] = struct{}{}
 		}
 	}

 	// Map iteration order is random, so sort for a stable series order.
 	ordered := make([]int, 0, len(present))
 	for slot := range present {
 		ordered = append(ordered, slot)
 	}
 	sort.Ints(ordered)
 	return ordered
 }
 // psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
 //
 // The result is as long as the longest dataset; a shorter dataset simply
 // contributes nothing past its own end. It returns nil when datasets is empty.
 func psuStackedTotal(datasets [][]float64) []float64 {
 	if len(datasets) == 0 {
 		return nil
 	}
 	// Size the accumulator by the longest series, not by datasets[0]: indexing
 	// a buffer sized from the first series would panic as soon as any later
 	// series is longer.
 	n := 0
 	for _, ds := range datasets {
 		if len(ds) > n {
 			n = len(ds)
 		}
 	}
 	total := make([]float64, n)
 	for _, ds := range datasets {
 		for i, v := range ds {
 			total[i] += v
 		}
 	}
 	return total
 }
 func normalizeFanSeries(ds []float64) []float64 {
 	if len(ds) == 0 {
 		return nil
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -120,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 		},
 	}
-	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -164,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 		},
 	}
-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
 	if !ok {
 		t.Fatal("chartDataFromSamples returned ok=false")
 	}
@@ -209,7 +209,7 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
 		},
 	}
-	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
 	if !ok {
 		t.Fatal("gpu-all-clock returned ok=false")
 	}
@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
 	}
 }
 // TestValidatePageRendersNvidiaFabricCardsInValidateMode fetches /validate
 // from a handler built with default options and checks that the page contains
 // the NCCL interconnect and NVBandwidth card titles together with their
 // "Validate and Stress" duration text.
 func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
 	h := NewHandler(HandlerOptions{})
 	resp := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/validate", nil)
 	h.ServeHTTP(resp, req)

 	if got := resp.Code; got != http.StatusOK {
 		t.Fatalf("status=%d", got)
 	}

 	page := resp.Body.String()
 	wanted := []string{
 		`NVIDIA Interconnect (NCCL)`,
 		`Validate and Stress:`,
 		`NVIDIA Bandwidth (NVBandwidth)`,
 		`nvbandwidth runs all built-in tests without a time limit`,
 	}
 	for i := range wanted {
 		if strings.Contains(page, wanted[i]) {
 			continue
 		}
 		// Fail on the first missing fragment and dump the page for diagnosis.
 		t.Fatalf("validate page missing %q: %s", wanted[i], page)
 	}
 }
 func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -171,21 +171,17 @@ func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeli
 		}
 		return gpuDisplayLabel(idx) + " Overview", buf, true
 	}
-	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		return "", nil, false
 	}
-	buf, err := renderMetricChartSVG(
+	var buf []byte
-		title,
+	var err error
-		labels,
+	if stacked {
-		sampleTimes(samples),
+		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
-		datasets,
+	} else {
-		names,
+		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
-		yMin,
+	}
 		yMax,
 		chartCanvasHeightForPath(path, len(names)),
 		timeline,
 	)
 	if err != nil {
 		return "", nil, false
 	}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -613,8 +613,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	}
 	a := q.opts.App
 	recovered := len(j.lines) > 0
 	j.append(fmt.Sprintf("Starting %s...", t.Name))
-	if len(j.lines) > 0 {
+	if recovered {
 		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
 	}
@@ -736,15 +737,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
+		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec: dur,
 			Loader:      platform.NvidiaStressLoaderNCCL,
 			GPUIndices:  t.params.GPUIndices,
 		}, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/bible-local/docs/iso-build-rules.md
+++ b/bible-local/docs/iso-build-rules.md
@@ -15,6 +15,41 @@ This applies to:
 - `iso/builder/config/package-lists/*.list.chroot`
 - Any package referenced in bootloader configs, hooks, or overlay scripts
 ## Bootloader sync rule
 The ISO has two independent bootloader configs that must be kept in sync manually:
 | File | Used by |
 |------|---------|
 | `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
 | `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
 live-build does NOT derive one from the other. Any new boot entry, kernel parameter
 change, or new mode added to one file must be manually mirrored in the other.
 **Canonical entry list** (both files must have all of these):
 | Label | Key params |
 |-------|-----------|
 | normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
 | load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
 | GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
 | KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
 | KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
 | fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0` |
 **Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
 ```
 net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
 numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 nowatchdog nosoftlockup
 ```
 (fail-safe is the exception — it deliberately uses minimal params.)
 **Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
 live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
 configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
 ## Memtest rule
 Do not assume live-build's built-in memtest integration is sufficient for `bee`.
--- a/iso/builder/config/bootloaders/grub-efi/config.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/config.cfg
--- a/iso/builder/config/bootloaders/grub-efi/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/grub.cfg
@@ -16,6 +16,11 @@ menuentry "EASY-BEE" {
 }
 submenu "EASY-BEE (advanced options) -->" {
    menuentry "EASY-BEE — load to RAM (toram)" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }
    menuentry "EASY-BEE — GSP=off" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
@@ -26,6 +31,11 @@ submenu "EASY-BEE (advanced options) -->" {
        initrd  @INITRD_LIVE@
    }
    menuentry "EASY-BEE — KMS + GSP=off" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
        initrd  @INITRD_LIVE@
    }
    menuentry "EASY-BEE — fail-safe" {
        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
        initrd  @INITRD_LIVE@
--- a/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
+++ b/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
--- a/iso/builder/config/bootloaders/grub-efi/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/theme.cfg
--- a/iso/builder/config/bootloaders/isolinux/live.cfg.in
+++ b/iso/builder/config/bootloaders/isolinux/live.cfg.in
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
    menu default
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 label live-@FLAVOUR@-kms
    menu label EASY-BEE (^graphics/KMS)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
 label live-@FLAVOUR@-toram
    menu label EASY-BEE (^load to RAM)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 label live-@FLAVOUR@-gsp-off
    menu label EASY-BEE (^NVIDIA GSP=off)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
-label live-@FLAVOUR@-kms-gsp-off
+label live-@FLAVOUR@-kms
-    menu label EASY-BEE (g^raphics/KMS, GSP=off)
+    menu label EASY-BEE (^KMS, no nomodeset)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
+    append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 label live-@FLAVOUR@-kms-gsp-off
    menu label EASY-BEE (KMS, ^GSP=off)
    linux @LINUX@
    initrd @INITRD@
    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
 label live-@FLAVOUR@-failsafe
    menu label EASY-BEE (^fail-safe)
    linux @LINUX@
    initrd @INITRD@
-    append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
+    append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
 label memtest
    menu label ^Memory Test (memtest86+)
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -63,8 +63,10 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
-chmod +x /usr/local/bin/bee-selfheal      2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal        2>/dev/null || true
-chmod +x /usr/local/bin/bee-boot-status  2>/dev/null || true
+chmod +x /usr/local/bin/bee-boot-status    2>/dev/null || true
 chmod +x /usr/local/bin/bee-install        2>/dev/null || true
 chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
+++ b/iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
@@ -1,117 +0,0 @@
 #!/bin/sh
 # 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
 set -e
 echo "=== generating bee wallpaper ==="
 mkdir -p /usr/share/bee
 python3 - <<'PYEOF'
 from PIL import Image, ImageDraw, ImageFont, ImageFilter
 import os
 # Output canvas size in pixels.
 W, H = 1920, 1080
 # Logo rows. Each non-space character is drawn into its own fixed-size grid
 # cell by render_ascii_mask, so column alignment here is significant.
 ASCII_ART = [
    "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗",
    "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝",
    "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗",
    "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝",
    "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗",
    "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝",
 ]
 # Caption drawn horizontally centered below the logo.
 SUBTITLE = "  Hardware Audit LiveCD"
 # RGB palette:
 #   FG      top (unshifted) logo layer
 #   FG_DIM  middle logo layer, shifted by one shadow offset
 #   SHADOW  blurred bottom logo layer, shifted by two shadow offsets
 #   SUB     subtitle text
 #   BG      canvas background
 FG = (0xF6, 0xD0, 0x47)
 FG_DIM = (0xD4, 0xA9, 0x1C)
 SHADOW = (0x5E, 0x47, 0x05)
 SUB = (0x96, 0x7A, 0x17)
 BG = (0x05, 0x05, 0x05)
 # Font files for the logo, tried in list order by load_font.
 MONO_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
 ]
 # Font files for the subtitle, tried in list order by load_font.
 SUB_FONT_CANDIDATES = [
    '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
    '/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
 ]
 def load_font(candidates, size):
    """Load the first font file in `candidates` that exists on disk.

    Paths are checked in order; the first existing one is opened with
    ImageFont.truetype at `size`. When none exists, the result of
    ImageFont.load_default() is returned instead.
    """
    first_hit = next((p for p in candidates if os.path.exists(p)), None)
    if first_hit is None:
        return ImageFont.load_default()
    return ImageFont.truetype(first_hit, size)
 def mono_metrics(font):
    """Return (char_w, char_h) for `font`.

    char_w is the text length of "M" rounded to an integer; char_h is the
    height of the bounding box of "Mg" drawn at the origin. Measurement is
    done on a scratch W x H greyscale image that is discarded afterwards.
    """
    scratch = ImageDraw.Draw(Image.new('L', (W, H), 0))
    glyph_w = int(round(scratch.textlength("M", font=font)))
    _, top, _, bottom = scratch.textbbox((0, 0), "Mg", font=font)
    return glyph_w, bottom - top
 def render_ascii_mask(font, lines, char_w, char_h, line_gap):
    """Draw `lines` on a fixed character grid and return the 'L' mode mask.

    Every non-space character is drawn with fill 255 at
    (column * char_w, row * (char_h + line_gap)); the background stays 0.
    The mask is as wide as the longest line and tall enough for all rows
    plus the gaps between them.
    """
    row_count = len(lines)
    mask_w = max(map(len, lines)) * char_w
    mask_h = row_count * char_h + line_gap * (row_count - 1)
    canvas = Image.new('L', (mask_w, mask_h), 0)
    pen = ImageDraw.Draw(canvas)
    row_pitch = char_h + line_gap
    for row_idx, text in enumerate(lines):
        top = row_idx * row_pitch
        for col_idx, glyph in enumerate(text):
            if glyph != ' ':
                pen.text((col_idx * char_w, top), glyph, font=font, fill=255)
    return canvas
 # --- Compose the wallpaper: background, glow, layered logo, subtitle. ---
 # Start from a solid BG-coloured canvas.
 img = Image.new('RGB', (W, H), BG)
 draw = ImageDraw.Draw(img)
 # Soft amber glow under the logo without depending on font rendering.
 # Two translucent ellipses are drawn on a fully transparent layer, blurred,
 # and then alpha-composited over the background.
 glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
 glow_draw = ImageDraw.Draw(glow)
 glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
 glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
 glow = glow.filter(ImageFilter.GaussianBlur(60))
 img = Image.alpha_composite(img.convert('RGBA'), glow)
 # Choose the logo font size: measure the character width at size 64, then
 # scale the size so the widest ASCII_ART row comes out at roughly
 # TARGET_LOGO_W pixels (never below size 6).
 TARGET_LOGO_W = 400
 max_chars = max(len(line) for line in ASCII_ART)
 _probe_font = load_font(MONO_FONT_CANDIDATES, 64)
 _probe_cw, _ = mono_metrics(_probe_font)
 font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
 font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
 char_w, char_h = mono_metrics(font_logo)
 # Render the logo once as a mask (2 px between rows); it is reused for all
 # three colour layers below.
 logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
 logo_w, logo_h = logo_mask.size
 # Horizontally centered; vertical position is fixed.
 logo_x = (W - logo_w) // 2
 logo_y = 380
 # Layer offset derived from the font size (at least 1 px).
 sh_off = max(1, font_size_logo // 6)
 shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
 # Paste back to front: blurred shadow at 2x offset, dim layer at 1x offset,
 # bright layer at the logo origin.
 img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
 img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
 img.paste(FG, (logo_x, logo_y), logo_mask)
 font_sub = load_font(SUB_FONT_CANDIDATES, 30)
 # NOTE(review): `draw` here is still the context created before `img` was
 # rebound by alpha_composite; it is used only to measure the subtitle.
 sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
 # Center the subtitle by its measured width and place it 48 px below the logo.
 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
 sub_y = logo_y + logo_h + 48
 # Bind a drawing context to the current image before drawing text on it.
 draw = ImageDraw.Draw(img)
 # Dark copy shifted by (2, 2) first, then the subtitle itself on top.
 draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
 draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
 # Convert back to RGB before writing the PNG.
 img = img.convert('RGB')
 img.save('/usr/share/bee/wallpaper.png', optimize=True)
 print('wallpaper written: /usr/share/bee/wallpaper.png')
 PYEOF
 echo "=== wallpaper done ==="
--- a/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
+++ b/iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
@@ -0,0 +1,46 @@
 #!/bin/sh
 # 9011-toram-rsync.hook.chroot
 #
 # Adds rsync to the initramfs so that live-boot's toram code takes the
 # rsync --progress path instead of the silent "cp -a" fallback.
 #
 # live-boot's 9990-toram-todisk.sh already contains:
 #   if [ -x /bin/rsync ]; then
 #       rsync -a --progress ... 1>/dev/console
 #   else
 #       cp -a ...   # no output
 #   fi
 #
 # We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
 # which copies the binary + all shared-library dependencies into the initrd.
 # Abort this hook on the first failing command.
 set -e
 HOOK_DIR="/etc/initramfs-tools/hooks"
 HOOK="${HOOK_DIR}/bee-rsync"
 mkdir -p "${HOOK_DIR}"
 # The quoted 'EOF' delimiter writes the hook body verbatim: $1 and $PREREQ are
 # expanded when the generated hook runs, not while this script runs.
 cat > "${HOOK}" << 'EOF'
 #!/bin/sh
 # initramfs hook: include rsync for live-boot toram progress output
 PREREQ=""
 prereqs() { echo "$PREREQ"; }
 case "$1" in prereqs) prereqs; exit 0 ;; esac
 . /usr/share/initramfs-tools/hook-functions
 if [ -x /usr/bin/rsync ]; then
    copy_exec /usr/bin/rsync /bin
 fi
 EOF
 chmod +x "${HOOK}"
 echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
 # Rebuild initramfs so the hook takes effect in the ISO's initrd.img
 # The kernel version is the last entry of /lib/modules in version-sort order.
 # NOTE(review): assumes at least one kernel is installed in the chroot when
 # this hook runs; KVER is empty otherwise — confirm.
 KVER=$(ls /lib/modules | sort -V | tail -1)
 echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
 update-initramfs -u -k "${KVER}"
 echo "9011-toram-rsync: done"
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -3,6 +3,7 @@ dmidecode
 smartmontools
 nvme-cli
 pciutils
 rsync
 ipmitool
 util-linux
 e2fsprogs
--- a/iso/overlay/usr/local/bin/bee-install
+++ b/iso/overlay/usr/local/bin/bee-install
@@ -65,6 +65,9 @@ done
 SQUASHFS="/run/live/medium/live/filesystem.squashfs"
 if [ ! -f "$SQUASHFS" ]; then
    echo "ERROR: squashfs not found at $SQUASHFS" >&2
    echo "  The live medium may have been disconnected." >&2
    echo "  Reconnect the disc and run:  bee-remount-medium --wait" >&2
    echo "  Then re-run bee-install." >&2
    exit 1
 fi
@@ -162,10 +165,59 @@ log "  Mounted."
 log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
 log "  Source: $SQUASHFS"
 log "  Target: $MOUNT_ROOT"
-unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
+
-    grep -E '^\[|^inod|^created|^extract' | \
+# unsquashfs does not support resume, so retry the entire unpack step if the
-    while read -r line; do log "  $line"; done || true
+# source medium disappears mid-copy (e.g. CD physically disconnected).
-log "  Unpack complete."
+UNPACK_ATTEMPTS=0
 UNPACK_MAX=5
 # Unpack retry loop. Each pass either waits for the source medium to come back
 # or runs a full unsquashfs into $MOUNT_ROOT; the loop ends on the first pass
 # that leaves an /etc directory in the target, or dies after UNPACK_MAX passes.
 # Every pass counts against the limit, including passes that only waited for
 # the medium and then hit `continue`.
 while true; do
    UNPACK_ATTEMPTS=$(( UNPACK_ATTEMPTS + 1 ))
    if [ "$UNPACK_ATTEMPTS" -gt "$UNPACK_MAX" ]; then
        die "Unpack failed $UNPACK_MAX times — giving up. Check the disc and logs."
    fi
    [ "$UNPACK_ATTEMPTS" -gt 1 ] && log "  Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
    # Re-check squashfs is reachable before each attempt
    if [ ! -f "$SQUASHFS" ]; then
        log "  SOURCE LOST: $SQUASHFS not found."
        log "  Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
        log "  then press Enter here to retry."
        # Block until the operator presses Enter; the input itself is discarded.
        read -r _
        continue
    fi
    # wipe partial unpack so unsquashfs starts clean
    if [ "$UNPACK_ATTEMPTS" -gt 1 ]; then
        log "  Cleaning partial unpack from $MOUNT_ROOT ..."
        # keep the mount point itself but remove its contents
        find "$MOUNT_ROOT" -mindepth 1 -maxdepth 1 -exec rm -rf {} + 2>/dev/null || true
    fi
    # Only progress, summary and error lines from unsquashfs are forwarded to
    # the log. The trailing `|| UNPACK_OK=$?` records a non-zero pipeline status
    # instead of letting it propagate.
    # NOTE(review): UNPACK_OK is not read again inside this loop; success is
    # decided by the /etc check below — confirm that is intended.
    UNPACK_OK=0
    unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
        grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
        while IFS= read -r line; do log "  $line"; done || UNPACK_OK=$?
    # Check squashfs is still reachable (gone = disc pulled during copy)
    if [ ! -f "$SQUASHFS" ]; then
        log "  WARNING: source medium lost during unpack — will retry after remount."
        log "  Run 'bee-remount-medium --wait' in another terminal, then press Enter."
        read -r _
        continue
    fi
    # Verify the unpack produced a usable root (presence of /etc is a basic check)
    if [ -d "${MOUNT_ROOT}/etc" ]; then
        log "  Unpack complete."
        break
    else
        log "  WARNING: unpack produced no /etc — squashfs may be corrupt or incomplete."
        # Pause before the next pass, but not after the last allowed one: the
        # next pass would only hit the limit check at the top and die.
        if [ "$UNPACK_ATTEMPTS" -lt "$UNPACK_MAX" ]; then
            log "  Retrying in 5 s ..."
            sleep 5
        fi
    fi
 done
 # ------------------------------------------------------------------
 log "--- Step 6/7: Configuring installed system ---"
--- a/iso/overlay/usr/local/bin/bee-remount-medium
+++ b/iso/overlay/usr/local/bin/bee-remount-medium
@@ -0,0 +1,100 @@
 #!/bin/bash
 # bee-remount-medium — find and remount the live ISO medium to /run/live/medium
 #
 # Run this after reconnecting the ISO source disc (USB/CD) if the live medium
 # was lost and /run/live/medium/live/filesystem.squashfs is missing.
 #
 # Usage: bee-remount-medium [--wait]
 #   --wait  keep retrying every 5 seconds until the medium is found (useful
 #           while physically reconnecting the device)
 # Strict mode: stop on errors, on unset variables and on failures inside pipelines.
 set -euo pipefail
 # Mount point the medium is remounted on, and the file (relative to the
 # medium root) that identifies a valid live medium.
 MEDIUM_DIR="/run/live/medium"
 SQUASHFS_REL="live/filesystem.squashfs"
 # 1 = keep polling until a medium is found, 0 = scan once.
 WAIT_MODE=0
 # Walk all arguments; anything that is not one of the flags below is ignored.
 for arg in "$@"; do
    if [ "$arg" = "--wait" ] || [ "$arg" = "-w" ]; then
        WAIT_MODE=1
    elif [ "$arg" = "--help" ] || [ "$arg" = "-h" ]; then
        printf '%s\n' \
            "Usage: bee-remount-medium [--wait]" \
            "  Finds and remounts the live ISO medium to $MEDIUM_DIR" \
            "  --wait  retry every 5 s until a medium with squashfs is found"
        exit 0
    fi
 done
 # Print a message to stdout prefixed with the current wall-clock time as [HH:MM:SS].
 log() {
    printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
 }
 # Print a timestamped "ERROR: ..." line to stderr and exit the script with status 1.
 die() {
    log "ERROR: $*" >&2
    exit 1
 }
 # Print candidate block devices that might hold the live medium, one path per
 # line on stdout:
 #   - every block device matching /dev/sr* or /dev/scd* (optical drives);
 #   - every block device matching /dev/sd* or /dev/vd* (whole disk or
 #     partition) whose parent disk reports removable=1 in sysfs.
 #
 # The device the system is currently running from is NOT excluded; the caller
 # is expected to probe each candidate and discard the ones that do not work.
 #
 # Always returns 0: an empty or filtered-out result is not an error, so the
 # function is safe to call directly under `set -e`.
 find_candidates() {
    local dev base removable
    # CD/DVD drives
    for dev in /dev/sr* /dev/scd*; do
        # An unmatched glob stays literal and fails the -b test, so it is skipped.
        if [ -b "$dev" ]; then
            echo "$dev"
        fi
    done
    # USB/removable disks and partitions
    for dev in /dev/sd* /dev/vd*; do
        [ -b "$dev" ] || continue
        base=$(basename "$dev")
        # Strip everything from the first digit on (sda1 -> sda) to reach the
        # parent disk's sysfs entry; a missing or unreadable attribute counts
        # as "not removable".
        removable=$(cat "/sys/block/${base%%[0-9]*}/removable" 2>/dev/null || echo 0)
        if [ "$removable" = "1" ]; then
            echo "$dev"
        fi
    done
    return 0
 }
 # Probe device $1 for the live squashfs and, if present, remount the device
 # read-only on $MEDIUM_DIR.
 #
 # The device is first mounted read-only on a temporary directory so that a
 # medium without ${SQUASHFS_REL} never touches $MEDIUM_DIR.
 #
 # Returns 0 once the device is mounted on $MEDIUM_DIR; returns 1 when the
 # device cannot be mounted, has no squashfs, or the final mount fails.
 try_mount() {
    local dev="$1"
    local probe_dir
    local has_squashfs=0
    probe_dir=$(mktemp -d /tmp/bee-probe-XXXXXX)
    # Not mountable at all: remove the probe directory and give up.
    if ! mount -o ro "$dev" "$probe_dir" 2>/dev/null; then
        rmdir "$probe_dir" 2>/dev/null || true
        return 1
    fi
    if [ -f "${probe_dir}/${SQUASHFS_REL}" ]; then
        has_squashfs=1
    fi
    # The probe mount is released whether or not the squashfs was found.
    umount "$probe_dir" 2>/dev/null || true
    rmdir "$probe_dir"  2>/dev/null || true
    if [ "$has_squashfs" != "1" ]; then
        return 1
    fi
    # Unmount whatever is currently on MEDIUM_DIR (may be empty/stale)
    umount "$MEDIUM_DIR" 2>/dev/null || true
    mkdir -p "$MEDIUM_DIR"
    if ! mount -o ro "$dev" "$MEDIUM_DIR"; then
        log "Mount of $dev on $MEDIUM_DIR failed"
        return 1
    fi
    log "Mounted $dev on $MEDIUM_DIR"
    return 0
 }
 # Run one scan: try every device printed by find_candidates and stop at the
 # first one that try_mount mounts on $MEDIUM_DIR.
 # Returns 0 after logging the squashfs path and its size; returns 1 when no
 # candidate could be mounted.
 attempt() {
    local squashfs_path
    log "Scanning for ISO medium..."
    for dev in $(find_candidates); do
        log "  Trying $dev ..."
        try_mount "$dev" || continue
        squashfs_path="${MEDIUM_DIR}/${SQUASHFS_REL}"
        log "SUCCESS: squashfs available at $squashfs_path ($(du -sh "$squashfs_path" | cut -f1))"
        return 0
    done
    return 1
 }
 # Entry point: a single scan by default, or polling every 5 seconds until a
 # medium is found when --wait was given.
 if [ "$WAIT_MODE" != "1" ]; then
    attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
 else
    log "Waiting for live medium (press Ctrl+C to abort)..."
    until attempt; do
        log "  Not found — retrying in 5 s (reconnect the disc now)"
        sleep 5
    done
    exit 0
 fi
Author	SHA1	Message	Date
Michael Chus	bac89bb6e5	Add real-data duration estimates to validate tab profiles - Add SATEstimated* constants to sat.go derived from _v8 production logs, with a rule to recalculate them whenever the script changes - Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware - Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU, targeted stress/power, pulse test, NCCL, nvbandwidth - Fix nvbandwidth description ("intended to stay short" → actual ~45 min) - Top-level profile labels show computed total including GPU count Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:51:15 +03:00
Michael Chus	7a618da1f9	Redesign system power chart as stacked per-PSU area chart - Add PSUReading struct and PSUs []PSUReading to LiveMetricSample - Sample per-PSU input watts from IPMI SDR entity 10.x (Power Supply) - Render stacked filled-area SVG chart (one layer per PSU, cumulative total) - Fall back to single-line chart on systems with ≤1 PSU in SDR Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:42:00 +03:00
Michael Chus	64ae1c0ff0	Sync GRUB and isolinux boot entries; document sync rule grub-efi/grub.cfg: add KMS+GSP=off entry (was in isolinux, missing in GRUB) isolinux/live.cfg.in: add full standard param set to all entries (net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup) to match grub-efi bible-local/docs/iso-build-rules.md: add bootloader sync rule documenting that grub-efi and isolinux must be kept in sync manually, listing canonical entries and standard param set, and noting the grub-pc/grub-efi history. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:32:16 +03:00
Michael Chus	49050ca717	Fix GRUB bootloader config dir: grub-pc → grub-efi Build uses --bootloaders "grub-efi,syslinux" so live-build reads config/bootloaders/grub-efi/ for the UEFI GRUB config. The directory was incorrectly named grub-pc, causing live-build to ignore our custom grub.cfg and generate a default one (missing toram, GSP-off entries). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:30:11 +03:00
Michael Chus	5ba72ab315	Add rsync to initramfs for toram progress output live-boot already uses rsync --progress when /bin/rsync exists; without it the copy falls back to silent cp -a. Add rsync to the ISO package list and install an initramfs-tools hook (bee-rsync) that copies the rsync binary + shared libs into the initrd via copy_exec. The hook then rebuilds the initramfs so the change takes effect in the ISO's initrd.img. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 23:52:47 +03:00
Michael Chus	63363e9629	Add toram boot entry and Install to RAM resume support - grub.cfg: add "load to RAM (toram)" entry to advanced submenu - install_to_ram.go: resume from existing /dev/shm/bee-live copy if source medium is unavailable after bee-web restart - tasks.go: fix "Recovered after bee-web restart" shown on every run (check j.lines before first append, not after) - bee-install: retry unsquashfs up to 5x with wait-for-remount on source loss; clear error message with bee-remount-medium hint - bee-remount-medium: new script to find and remount live ISO source after USB/CD reconnect; supports --wait polling mode - 9000-bee-setup: chmod +x for bee-install and bee-remount-medium Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 23:48:56 +03:00
Mikhail Chusavitin	5285c0d101	Capture per-run IPMI power and GPU telemetry in power benchmark - Sample IPMI loaded_w per single-card calibration and per ramp step instead of averaging over the entire Phase 2; top-level ServerPower uses the final (all-GPU) ramp step value - Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and NvidiaPowerBenchStep so external tooling can compare wall power per phase without re-parsing logs - Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/ subdir; aggregate all phases into a top-level gpu-metrics.csv/.html - Write 00-nvidia-smi-q.log at the start of every power run - Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in result.json from the converged calibration attempt - Power benchmark page: split "Achieved W" into Single-card W and Multi-GPU W (StablePowerLimitW); derate highlight and status color now reflect the final multi-GPU limit vs nominal - Performance benchmark page: add Status column and per-GPU score color coding (green/yellow/red) based on gpu.Status and OverallStatus Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-17 17:59:58 +03:00
Mikhail Chusavitin	dca4afb8d0	Seed power ramp with single-card TDP limits	2026-04-16 11:43:01 +03:00
Mikhail Chusavitin	b4280941f5	Move NCCL and NVBandwidth into validate mode	2026-04-16 11:02:30 +03:00
Mikhail Chusavitin	f74976ec4c	Use static overlay wallpaper in ISO build	2026-04-16 10:54:03 +03:00
Mikhail Chusavitin	18e24a9aa5	Estimate fan duty from observed RPM maxima	2026-04-16 10:10:18 +03:00