Run NVIDIA DCGM diag tests on all selected GPUs simultaneously

targeted_stress, targeted_power, and the Level 2/3 diag were dispatched one GPU at a time from the UI, so on an 8-GPU system a single dcgmi command became 8 sequential ~350–450 s runs. DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel. Move nvidia, nvidia-targeted-stress, and nvidia-targeted-power into nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update the sat.go constants and page_validate.go estimates to reflect simultaneous all-GPU execution (remove the n× multiplier from total time estimates). Stress test on an 8-GPU system: ~5.3 h → ~2.5 h.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Fix post-sync live-build ISO rebuild
2026-04-20 11:53:25 +03:00 · 2026-04-20 11:01:15 +03:00 · 2026-04-20 10:55:42 +03:00 · 2026-04-20 10:53:53 +03:00 · 2026-04-20 09:46:00 +03:00 · 2026-04-20 09:43:22 +03:00
9 changed files with 846 additions and 205 deletions
--- a/audit/bee
+++ b/audit/bee
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
 	VBIOS                string
 	PowerLimitW          float64
 	DefaultPowerLimitW   float64
+	MinPowerLimitW       float64
+	MaxPowerLimitW       float64
 	MaxGraphicsClockMHz  float64
 	MaxMemoryClockMHz    float64
 	BaseGraphicsClockMHz float64
@@ -65,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
 	MetricRows []GPUMetricRow
 }

+type benchmarkPowerCalibrationRunSummary struct {
+	LoadedSDR          benchmarkSDRSeriesSummary
+	AvgFanRPM          float64
+	AvgFanDutyCyclePct float64
+	FanSamples         int
+}
+
 type benchmarkBurnProfile struct {
 	name       string
 	category   string
@@ -95,6 +104,8 @@ var (
 	benchmarkReadyPattern      = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
 	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
 	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
+	benchmarkGeteuid           = os.Geteuid
+	benchmarkSleep             = time.Sleep
 )

 // benchmarkPrecisionPhases lists the precision categories run as individual
@@ -220,8 +231,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
 		return "hw_thermal"
 	case diff.SWThermalSlowdownUS > 0:
 		return "sw_thermal"
-	case diff.HWPowerBrakeSlowdownUS > 0:
-		return "hw_power_brake"
 	default:
 		return ""
 	}
@@ -240,6 +249,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
 	return nil
 }

+func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
+	if len(gpuIndices) == 0 {
+		return nil
+	}
+	if benchmarkGeteuid() != 0 {
+		if logFunc != nil {
+			logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
+		}
+		return append([]int(nil), gpuIndices...)
+	}
+	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
+		for _, p := range killed {
+			logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
+		}
+	}
+	var failed []int
+	for _, idx := range gpuIndices {
+		name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
+		if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
+			failed = append(failed, idx)
+			if logFunc != nil {
+				logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
+			}
+			continue
+		}
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
+		}
+		benchmarkSleep(time.Second)
+	}
+	return failed
+}
+
 func benchmarkPowerEngine() string {
 	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
 	case BenchmarkPowerEngineTargetedPower:
@@ -351,9 +393,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-	// Enrich with max clocks from verbose output — covers GPUs where
-	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
+	// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
+	// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)

 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -737,8 +779,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
 // The minimal fallback omits clock fields entirely — clocks.max.* returns
-// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
-// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
+// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
+// then recovered from nvidia-smi -q.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
@@ -758,12 +800,9 @@ var benchmarkGPUInfoQueries = []struct {
 	},
 }

-// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
-// any GPU in infoByIndex where those values are still zero.  It parses the
-// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
-// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
-// return exit status 2 but the verbose query works fine.
-func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
+// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
+// for fields that may be missing from --query-gpu on some driver versions.
+func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
 		return
 	}
@@ -784,6 +823,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
 	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
 	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
+	minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
+	maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
 	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
 	shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
 	slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -843,6 +884,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 				}
 			}
 		}
+		if info.MinPowerLimitW == 0 {
+			if m := minPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MinPowerLimitW = v
+				}
+			}
+		}
+		if info.MaxPowerLimitW == 0 {
+			if m := maxPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MaxPowerLimitW = v
+				}
+			}
+		}
 		if info.MultiprocessorCount == 0 {
 			if m := smCountRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -2365,6 +2420,16 @@ type sdrPowerSnapshot struct {
 	SkippedSensors []string // sensors rejected during self-healing
 }

+type benchmarkSDRSeriesSummary struct {
+	PSUInW   float64
+	PSUOutW  float64
+	GPUSlotW float64
+	PSUSlots map[string]BenchmarkPSUSlotPower
+	Samples  int
+
+	SkippedSensors []string
+}
+
 // sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
 type sdrSensor struct {
 	name  string
@@ -2494,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
 	return snap
 }

+func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan []sdrPowerSnapshot, 1)
+	go func() {
+		defer close(ch)
+		var samples []sdrPowerSnapshot
+		record := func() {
+			snap := sampleIPMISDRPowerSensors()
+			if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
+				return
+			}
+			samples = append(samples, snap)
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
+func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
+	var summary benchmarkSDRSeriesSummary
+	if len(samples) == 0 {
+		return summary
+	}
+
+	type slotAggregate struct {
+		inputs  []float64
+		outputs []float64
+		status  string
+	}
+
+	slotAgg := make(map[string]*slotAggregate)
+	skippedSet := make(map[string]struct{})
+	var inputTotals []float64
+	var outputTotals []float64
+	var gpuSlotTotals []float64
+
+	for _, sample := range samples {
+		if sample.PSUInW > 0 {
+			inputTotals = append(inputTotals, sample.PSUInW)
+		}
+		if sample.PSUOutW > 0 {
+			outputTotals = append(outputTotals, sample.PSUOutW)
+		}
+		if sample.GPUSlotW > 0 {
+			gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
+		}
+		for _, skipped := range sample.SkippedSensors {
+			if skipped != "" {
+				skippedSet[skipped] = struct{}{}
+			}
+		}
+		for slot, reading := range sample.PSUSlots {
+			agg := slotAgg[slot]
+			if agg == nil {
+				agg = &slotAggregate{}
+				slotAgg[slot] = agg
+			}
+			if reading.InputW != nil && *reading.InputW > 0 {
+				agg.inputs = append(agg.inputs, *reading.InputW)
+			}
+			if reading.OutputW != nil && *reading.OutputW > 0 {
+				agg.outputs = append(agg.outputs, *reading.OutputW)
+			}
+			switch {
+			case reading.Status == "":
+			case agg.status == "":
+				agg.status = reading.Status
+			case agg.status == "OK" && reading.Status != "OK":
+				agg.status = reading.Status
+			}
+		}
+	}
+
+	summary.PSUInW = benchmarkMean(inputTotals)
+	summary.PSUOutW = benchmarkMean(outputTotals)
+	summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
+	summary.Samples = len(samples)
+
+	if len(slotAgg) > 0 {
+		summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
+		for slot, agg := range slotAgg {
+			reading := BenchmarkPSUSlotPower{Status: agg.status}
+			if mean := benchmarkMean(agg.inputs); mean > 0 {
+				v := mean
+				reading.InputW = &v
+			}
+			if mean := benchmarkMean(agg.outputs); mean > 0 {
+				v := mean
+				reading.OutputW = &v
+			}
+			summary.PSUSlots[slot] = reading
+		}
+	}
+	if len(skippedSet) > 0 {
+		summary.SkippedSensors = make([]string, 0, len(skippedSet))
+		for skipped := range skippedSet {
+			summary.SkippedSensors = append(summary.SkippedSensors, skipped)
+		}
+		sort.Strings(summary.SkippedSensors)
+	}
+
+	return summary
+}
+
+func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
+	if durationSec <= 0 {
+		return benchmarkSDRSeriesSummary{}
+	}
+	stopCh := make(chan struct{})
+	doneCh := startIPMISDRSampler(stopCh, intervalSec)
+	select {
+	case <-ctx.Done():
+	case <-time.After(time.Duration(durationSec) * time.Second):
+	}
+	close(stopCh)
+	return summarizeSDRPowerSeries(<-doneCh)
+}
+
 // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
 // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
 func queryIPMIServerPowerW() (float64, error) {
@@ -3038,12 +3234,12 @@ func runBenchmarkPowerCalibration(
 	logFunc func(string),
 	seedLimits map[int]int,
 	durationSec int,
-) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
 	calibDurationSec := durationSec
+	var runSummary benchmarkPowerCalibrationRunSummary
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
-	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
 	const calibSearchTolerance = 10
@@ -3058,12 +3254,12 @@ func runBenchmarkPowerCalibration(
 	if engine == BenchmarkPowerEngineTargetedPower {
 		if _, err := exec.LookPath("dcgmi"); err != nil {
 			logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	} else {
 		if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
 			logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
-			return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+			return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
 		}
 	}
 	if killed := KillTestWorkers(); len(killed) > 0 {
@@ -3090,8 +3286,9 @@ func runBenchmarkPowerCalibration(
 		originalLimitW int
 		appliedLimitW  int
 		minLimitW      int
-		lo             int // highest verified-stable limit (assumed: minLimitW)
+		lo             int // highest verified-stable limit
 		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
+		loVerified     bool
 		calib          benchmarkPowerCalibrationResult
 		converged      bool
 	}
@@ -3113,23 +3310,17 @@ func runBenchmarkPowerCalibration(
 		if defaultLimitW <= 0 {
 			defaultLimitW = originalLimitW
 		}
-		appliedLimitW := originalLimitW
+		appliedLimitW := initialBenchmarkCalibrationLimitW(info)
 		if appliedLimitW <= 0 {
 			appliedLimitW = defaultLimitW
 		}
-		minLimitW := appliedLimitW
-		switch {
-		case defaultLimitW > 0:
-			minLimitW = defaultLimitW - maxDerateW
-			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
-			if minLimitW < floorByRatio {
-				minLimitW = floorByRatio
+		minLimitW := int(math.Round(info.MinPowerLimitW))
+		if minLimitW <= 0 {
+			minLimitW = appliedLimitW
 		}
-		case appliedLimitW > 0:
-			minLimitW = appliedLimitW - maxDerateW
-		}
-		if minLimitW < calibSearchTolerance {
-			minLimitW = calibSearchTolerance
+		maxLimitW := int(math.Round(info.MaxPowerLimitW))
+		if maxLimitW > 0 && appliedLimitW > maxLimitW {
+			appliedLimitW = maxLimitW
 		}
 		s := &gpuCalibState{
 			idx:            idx,
@@ -3141,11 +3332,24 @@ func runBenchmarkPowerCalibration(
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
+		if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
+			s.appliedLimitW = minLimitW
+			s.hi = minLimitW + 1
+		}
+		if info.MinPowerLimitW <= 0 {
+			s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
+		}
 		if seedLimits != nil {
 			if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
 				// A previously validated limit is only a starting point. Re-run
 				// targeted_power under the current multi-GPU thermal load and derate
 				// again if this step shows new throttling.
+				if seedW < s.minLimitW {
+					seedW = s.minLimitW
+				}
+				if maxLimitW > 0 && seedW > maxLimitW {
+					seedW = maxLimitW
+				}
 				if canDerate {
 					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
 				}
@@ -3220,6 +3424,10 @@ calibDone:
 		}
 		attemptCtx, cancelAttempt := context.WithCancel(ctx)
 		doneCh := make(chan sharedAttemptResult, 1)
+		sdrStopCh := make(chan struct{})
+		sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
+		fanStopCh := make(chan struct{})
+		fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
 		go func() {
 			out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
 			doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
@@ -3259,6 +3467,10 @@ calibDone:
 		}
 		ticker.Stop()
 		cancelAttempt()
+		close(sdrStopCh)
+		close(fanStopCh)
+		attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
+		attemptFanSummary := <-fanDoneCh
 		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
 		// Accumulate telemetry rows with attempt stage label.
 		appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
@@ -3296,10 +3508,14 @@ calibDone:
 		busyDelaySec = 1

 		// Per-GPU analysis and binary search update.
+		attemptStable := ar.err == nil
 		for _, s := range active {
 			perGPU := filterRowsByGPU(ar.rows, s.idx)
 			summary := summarizeBenchmarkTelemetry(perGPU)
 			throttle := throttleReasons[s.idx]
+			if throttle != "" || summary.P95PowerW <= 0 {
+				attemptStable = false
+			}

 			// Cooling warning: thermal throttle with fans not at maximum.
 			if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
@@ -3333,6 +3549,7 @@ calibDone:
 				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				s.lo = s.appliedLimitW
+				s.loVerified = true
 				if canDerate && s.hi-s.lo > calibSearchTolerance {
 					next := roundTo5W((s.lo + s.hi) / 2)
 					if next > s.lo && next < s.hi {
@@ -3371,7 +3588,23 @@ calibDone:
 			s.hi = s.appliedLimitW

 			if s.hi-s.lo <= calibSearchTolerance {
-				if s.lo > s.minLimitW {
+				if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
+					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
+						s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
+						logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
+						s.converged = true
+						continue
+					}
+					s.appliedLimitW = s.minLimitW
+					s.calib.AppliedPowerLimitW = float64(s.minLimitW)
+					s.calib.Derated = s.minLimitW < s.originalLimitW
+					s.info.PowerLimitW = float64(s.minLimitW)
+					infoByIndex[s.idx] = s.info
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
+					continue
+				}
+				if s.loVerified {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
 						s.appliedLimitW = s.lo
@@ -3383,7 +3616,8 @@ calibDone:
 						s.calib.Completed = true
 					}
 				} else {
-					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
@@ -3398,9 +3632,7 @@ calibDone:
 				next = (s.lo + s.hi) / 2
 			}
 			if next < s.minLimitW {
-				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
-				s.converged = true
-				continue
+				next = s.minLimitW
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
 				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
@@ -3416,6 +3648,16 @@ calibDone:
 			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
 			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
 		}
+		if attemptStable {
+			if attemptSDRSummary.Samples > 0 {
+				runSummary.LoadedSDR = attemptSDRSummary
+			}
+			if attemptFanSummary.FanSamples > 0 {
+				runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
+				runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
+				runSummary.FanSamples = attemptFanSummary.FanSamples
+			}
+		}
 	}

 	for _, s := range states {
@@ -3424,7 +3666,7 @@ calibDone:
 		}
 	}
 	writeBenchmarkMetricsFiles(runDir, allCalibRows)
-	return results, restore, allCalibRows
+	return results, restore, allCalibRows, runSummary
 }

 // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3439,6 +3681,24 @@ func roundTo5W(w int) int {
 	return ((w + 2) / 5) * 5
 }

+func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
+	defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
+	currentLimitW := int(math.Round(info.PowerLimitW))
+	maxLimitW := int(math.Round(info.MaxPowerLimitW))
+
+	startW := defaultLimitW
+	if startW <= 0 {
+		startW = currentLimitW
+	}
+	if startW <= 0 {
+		startW = maxLimitW
+	}
+	if maxLimitW > 0 && startW > maxLimitW {
+		startW = maxLimitW
+	}
+	return startW
+}
+
 // meanFanRPM returns the average RPM across a set of fan readings.
 func meanFanRPM(fans []FanReading) float64 {
 	if len(fans) == 0 {
@@ -3451,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
 	return sum / float64(len(fans))
 }

+func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
+	go func() {
+		defer close(ch)
+		var rpmSamples []float64
+		var dutySamples []float64
+		record := func() {
+			fans, err := sampleFanSpeeds()
+			if err != nil || len(fans) == 0 {
+				return
+			}
+			if rpm := meanFanRPM(fans); rpm > 0 {
+				rpmSamples = append(rpmSamples, rpm)
+			}
+			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
+				dutySamples = append(dutySamples, duty)
+			}
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- benchmarkPowerCalibrationRunSummary{
+					AvgFanRPM:          benchmarkMean(rpmSamples),
+					AvgFanDutyCyclePct: benchmarkMean(dutySamples),
+					FanSamples:         len(rpmSamples),
+				}
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
@@ -3479,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	fmt.Fprintf(&b, "**Overall status:** %s  \n", result.OverallStatus)
 	fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W  \n", result.PlatformMaxTDPW)
 	if sp := result.ServerPower; sp != nil && sp.Available {
-		fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W  \n", sp.DeltaW)
-		if sp.PSUInputLoadedW > 0 {
-			psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
-			fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W  \n", psuDelta)
+		sourceLabel := "autotuned source"
+		switch normalizeBenchmarkPowerSource(sp.Source) {
+		case BenchmarkPowerSourceSDRPSUInput:
+			sourceLabel = "autotuned source (SDR PSU AC input)"
+		case BenchmarkPowerSourceDCMI:
+			sourceLabel = "autotuned source (DCMI)"
 		}
-		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f  \n", sp.ReportingRatio)
+		fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W  \n", sourceLabel, sp.DeltaW)
+		fmt.Fprintf(&b, "**Reporting ratio:** %.2f  \n", sp.ReportingRatio)
 	}
 	b.WriteString("\n")
 	// Server power comparison table.
 	if sp := result.ServerPower; sp != nil {
 		b.WriteString("## Server vs GPU Power Comparison\n\n")
+		selectedSource := normalizeBenchmarkPowerSource(sp.Source)
+		selectedSourceLabel := "Selected source"
+		if selectedSource == BenchmarkPowerSourceSDRPSUInput {
+			selectedSourceLabel = "Selected source (SDR PSU AC input)"
+		} else if selectedSource == BenchmarkPowerSourceDCMI {
+			selectedSourceLabel = "Selected source (DCMI)"
+		}
 		var spRows [][]string
-		spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
-		spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
-		if sp.GPUSlotTotalW > 0 {
-			spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
-		}
+		spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
 		if sp.Available {
-			spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
-			spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
-			spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
+			spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
+			spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
+			spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
 		}
-		if sp.PSUInputLoadedW > 0 {
-			spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
-			spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
+		if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
+			spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
+			spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
 			psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
-			spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
-		}
-		if sp.PSUOutputLoadedW > 0 {
-			spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
-			spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
-			if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
-				psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
-				spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
-			}
+			spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)})
 		}
 		if sp.Available {
 			ratio := sp.ReportingRatio
@@ -3530,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
-			spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
-			if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
+			spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
+			if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
 				psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
 				sdrRatio := psuDelta / sp.GPUReportedSumW
 				sdrNote := ""
@@ -3543,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				default:
 					sdrNote = "✗ significant discrepancy"
 				}
-				spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
+				spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
 			}
 		} else {
-			spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
+			spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
 		}
-		b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
+		b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
@@ -3600,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				psuDistRows = append(psuDistRows, []string{
 					slot,
 					fmtW(idle.InputW), fmtW(loaded.InputW),
-					fmtW(idle.OutputW), fmtW(loaded.OutputW),
 					deltaStr, status,
 				})
 			}
-			b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
+			b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
 			b.WriteString("\n")
 		}
 	}
@@ -3652,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				fan,
 			})
 		}
-		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
+		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
 		b.WriteString("\n")
 	}
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -3761,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			for _, slot := range psuSlots {
 				psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
 			}
-			psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
+			psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")

 			var psuRows [][]string
 			for _, step := range result.RampSteps {
@@ -3842,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			}
 			pdRows = append(pdRows, []string{
 				fmt.Sprintf("GPU %d", gpu.Index),
-				fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
 				fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
 				fmt.Sprintf("%.0f W", stable),
 				realization,
@@ -3855,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		}
 		pdRows = append(pdRows, []string{
 			"**Platform**",
-			fmt.Sprintf("**%.0f W**", totalDefault),
 			"—",
 			fmt.Sprintf("**%.0f W**", totalStable),
 			fmt.Sprintf("**%s**", platformReal),
 			"",
 		})
-		b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
+		b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
 		b.WriteString("\n")

 		// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4011,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			{"Avg Temp °C", singleTemp},
 			{"Power W", singlePwr},
 			{"Per GPU wall W", singleWall},
-			{"Fan RPM (duty%)", singleFan},
+			{"Avg Fan RPM (duty%)", singleFan},
 		}
 		if lastStep != nil {
 			compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
@@ -4096,14 +4392,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
 	}
 	verboseLog := filepath.Join(runDir, "verbose.log")
-	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
-	if infoErr != nil {
-		return "", infoErr
-	}
-	// Capture full nvidia-smi -q snapshot at the start of the run.
-	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
-		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
-	}
 	hostname, _ := os.Hostname()
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
@@ -4114,23 +4402,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		SelectedGPUIndices: append([]int(nil), selected...),
 		OverallStatus:      "OK",
 	}
+	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
+	if infoErr != nil {
+		return "", infoErr
+	}
+	// Capture full nvidia-smi -q snapshot at the start of the run.
+	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
+		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
+	}
 	durationSec := powerBenchDurationSec(opts.Profile)

 	// Sample server idle power before any GPU load.
 	var serverIdleW float64
 	var serverIdleOK bool
+	idleSDRStopCh := make(chan struct{})
+	idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
 	if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
 		serverIdleW = w
 		serverIdleOK = true
 		logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
 	}
-	sdrIdle := sampleIPMISDRPowerSensors()
+	close(idleSDRStopCh)
+	sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
 	psuBefore := psuStatusSnapshot()

 	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
 	// establish a true single-card power baseline unaffected by neighbour heat.
 	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
 	singleIPMILoadedW := make(map[int]float64, len(selected))
+	singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
 	var allRestoreActions []benchmarkRestoreAction
 	// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
 	var allPowerRows []GPUMetricRow
@@ -4139,24 +4439,28 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
+		if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
+			result.Findings = append(result.Findings,
+				fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
+		}
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		singlePowerStopCh := make(chan struct{})
 		singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
-		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
+		c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
 		close(singlePowerStopCh)
-		sdrSingle := sampleIPMISDRPowerSensors()
 		if samples := <-singlePowerCh; len(samples) > 0 {
 			singleIPMILoadedW[idx] = benchmarkMean(samples)
 			logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
-		} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
-			singleIPMILoadedW[idx] = sdrSingle.PSUInW
-			logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
+		} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
+			singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
+			logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
 		}
 		allRestoreActions = append(allRestoreActions, restore...)
 		if r, ok := c[idx]; ok {
 			calibByIndex[idx] = r
 		}
+		singleRunSummaryByIndex[idx] = singleRun
 	}
 	defer func() {
 		for i := len(allRestoreActions) - 1; i >= 0; i-- {
@@ -4199,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			t := summarizeBenchmarkTelemetry(calib.MetricRows)
 			gpu.Telemetry = &t
 		}
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			gpu.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				gpu.AvgFanDutyCyclePct = duty
-			}
+		if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
+			gpu.AvgFanRPM = singleRun.AvgFanRPM
+			gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
 		}
 		gpus = append(gpus, gpu)
 	}
@@ -4259,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// per-step in NvidiaPowerBenchStep.ServerLoadedW.
 	var serverLoadedW float64
 	var serverLoadedOK bool
-	// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
-	// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
-	// after the test when GPUs have already returned to idle.
-	var sdrLastStep sdrPowerSnapshot
+	// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
+	// while GPUs are loaded. Used in the summary instead of re-sampling after the
+	// test when GPUs have already returned to idle.
+	var sdrLastStep benchmarkSDRSeriesSummary

 	// Step 1: reuse single-card calibration result directly.
 	if len(result.RecommendedSlotOrder) > 0 {
@@ -4283,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			ramp.ServerLoadedW = w
 			ramp.ServerDeltaW = w - serverIdleW
 		}
+		if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = singleRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
+		}
 		if !firstCalib.Completed {
 			ramp.Status = "FAILED"
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -4333,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		stepPowerStopCh := make(chan struct{})
 		stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
-		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
+		stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
 		close(stepPowerStopCh)
 		var stepIPMILoadedW float64
@@ -4404,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
 		}

-		// Per-step PSU slot snapshot — also used as the authoritative loaded power
-		// source when SDR PSU sensors are available (more accurate than DCMI on
-		// servers where DCMI covers only a subset of installed PSUs).
-		sdrStep := sampleIPMISDRPowerSensors()
+		// Per-step PSU slot readings are averaged over the whole load phase rather
+		// than captured as a single end-of-phase snapshot.
+		sdrStep := stepRun.LoadedSDR
 		if len(sdrStep.PSUSlots) > 0 {
 			ramp.PSUSlotReadings = sdrStep.PSUSlots
 		}
@@ -4425,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
 			ramp.ServerLoadedW = sdrStep.PSUInW
 			ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
-			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
+			logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
 			if step == len(result.RecommendedSlotOrder) {
 				serverLoadedW = sdrStep.PSUInW
 				serverLoadedOK = true
@@ -4433,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			}
 		}

-		// Fan state at end of ramp step.
-		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
-			ramp.AvgFanRPM = meanFanRPM(fans)
-			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
-				ramp.AvgFanDutyCyclePct = duty
-			}
+		// Fan values are phase averages over the same load window.
+		if stepRun.AvgFanRPM > 0 {
+			ramp.AvgFanRPM = stepRun.AvgFanRPM
+			ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
 		}

 		// Per-GPU telemetry from this ramp step's calibration.
@@ -4491,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 	// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
 	// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
 	if result.ServerPower != nil {
-		// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
-		// than re-sampling here, which would capture post-test idle state.
+		// Use the SDR phase average from the last ramp step (GPUs still loaded)
+		// rather than re-sampling here, which would capture post-test idle state.
 		sdrLoaded := sdrLastStep
 		result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
 		result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
@@ -4512,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			result.ServerPower.Notes = append(result.ServerPower.Notes,
 				"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
 		}
+		if sdrLoaded.Samples > 0 {
+			result.ServerPower.Notes = append(result.ServerPower.Notes,
+				fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
+		}
 		// Detect DCMI partial coverage: direct SDR comparison first,
 		// ramp heuristic as fallback when SDR PSU sensors are absent.
 		dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -1,8 +1,13 @@
 package platform

 import (
+	"context"
+	"os"
+	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestResolveBenchmarkProfile(t *testing.T) {
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
 	}
 }

+// TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons checks that
+// benchmarkCalibrationThrottleReason returns "" when only a power-related
+// counter differs from the baseline, and names the thermal reason when a
+// thermal counter differs.
+func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
+	t.Parallel()
+
+	var baseline BenchmarkThrottleCounters
+	checks := []struct {
+		after   BenchmarkThrottleCounters
+		want    string
+		failFmt string
+	}{
+		{BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}, "", "sw_power_cap should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}, "", "hw_power_brake should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}, "hw_thermal", "hw_thermal mismatch: got %q"},
+		{BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}, "sw_thermal", "sw_thermal mismatch: got %q"},
+	}
+	// Checks run in declaration order; the first mismatch aborts the test.
+	for _, check := range checks {
+		if got := benchmarkCalibrationThrottleReason(baseline, check.after); got != check.want {
+			t.Fatalf(check.failFmt, got)
+		}
+	}
+}
+
+// TestResetBenchmarkGPUsSkipsWithoutRoot checks that resetBenchmarkGPUs, when
+// the effective-UID hook reports a non-root user, runs no command, logs that
+// the reset was skipped, and returns every requested index as failed.
+//
+// Deliberately not parallel: the test swaps the package-level benchmarkGeteuid
+// and satExecCommand hooks. Running it concurrently with other tests that read
+// or replace those hooks is a data race, and the cleanup could restore another
+// test's stub instead of the real implementation.
+func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
+	oldGeteuid := benchmarkGeteuid
+	oldExec := satExecCommand
+	benchmarkGeteuid = func() int { return 1000 }
+	// Any attempt to execute a command is a test failure.
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		t.Fatalf("unexpected command: %s %v", name, args)
+		return nil
+	}
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		satExecCommand = oldExec
+	})
+
+	var logs []string
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
+		logs = append(logs, line)
+	})
+	if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
+		t.Fatalf("logs=%q want substring %q", got, want)
+	}
+	if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
+		t.Fatalf("failed=%v want [0 2]", failed)
+	}
+}
+
+// TestResetBenchmarkGPUsResetsEachGPU checks that resetBenchmarkGPUs, with the
+// effective-UID hook reporting root, invokes nvidia-smi once per requested GPU
+// with "-i <index> -r" and reports no failures. nvidia-smi is replaced by a
+// shell stub that appends its arguments to a log file.
+//
+// Deliberately not parallel: the test swaps the package-level benchmarkGeteuid,
+// benchmarkSleep and satLookPath hooks. Running it concurrently with other
+// tests that read or replace those hooks is a data race, and the cleanup could
+// restore another test's stub instead of the real implementation.
+func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
+	dir := t.TempDir()
+	script := filepath.Join(dir, "nvidia-smi")
+	argsLog := filepath.Join(dir, "args.log")
+	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	oldGeteuid := benchmarkGeteuid
+	oldSleep := benchmarkSleep
+	oldLookPath := satLookPath
+	benchmarkGeteuid = func() int { return 0 }
+	// No-op sleep keeps the test fast.
+	benchmarkSleep = func(time.Duration) {}
+	// Redirect only nvidia-smi to the stub; every other lookup is unchanged.
+	satLookPath = func(file string) (string, error) {
+		if file == "nvidia-smi" {
+			return script, nil
+		}
+		return exec.LookPath(file)
+	}
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		benchmarkSleep = oldSleep
+		satLookPath = oldLookPath
+	})
+
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
+	if len(failed) != 0 {
+		t.Fatalf("failed=%v want no failures", failed)
+	}
+	raw, err := os.ReadFile(argsLog)
+	if err != nil {
+		t.Fatalf("read args log: %v", err)
+	}
+	got := strings.Fields(string(raw))
+	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
+	if strings.Join(got, " ") != strings.Join(want, " ") {
+		t.Fatalf("args=%v want %v", got, want)
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()

@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	}
 }

+// TestInitialBenchmarkCalibrationLimitW table-tests
+// initialBenchmarkCalibrationLimitW against combinations of current, default
+// and maximum power limits, one subtest per scenario.
+func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
+	t.Parallel()
+
+	type scenario struct {
+		name string
+		info benchmarkGPUInfo
+		want int
+	}
+	scenarios := []scenario{
+		{
+			name: "prefers default tdp over current derated limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 600, MaxPowerLimitW: 600},
+			want: 600,
+		},
+		{
+			name: "caps default tdp to reported max limit",
+			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 700, MaxPowerLimitW: 650},
+			want: 650,
+		},
+		{
+			name: "falls back to current limit when default missing",
+			info: benchmarkGPUInfo{PowerLimitW: 525, MaxPowerLimitW: 600},
+			want: 525,
+		},
+		{
+			name: "falls back to max limit when only that is known",
+			info: benchmarkGPUInfo{MaxPowerLimitW: 575},
+			want: 575,
+		},
+	}
+
+	for i := range scenarios {
+		sc := scenarios[i]
+		t.Run(sc.name, func(t *testing.T) {
+			got := initialBenchmarkCalibrationLimitW(sc.info)
+			if got != sc.want {
+				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", sc.info, got, sc.want)
+			}
+		})
+	}
+}
+
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()

@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	}
 }

-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Min Power Limit                       : 200.00 W
+    Max Power Limit                       : 600.00 W
+    Default Power Limit                   : 575.00 W
+    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
+	}
+	if infoByIndex[0].MaxPowerLimitW != 600 {
+		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
+	}
+	if infoByIndex[0].DefaultPowerLimitW != 575 {
+		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
+	}
+	if infoByIndex[0].PowerLimitW != 560 {
+		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
+	}
 }

-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
+    Min Power Limit                       : 100.00 W
+    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
+	}
 }
--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -0,0 +1,30 @@
+package platform
+
+import (
+	"fmt"
+	"os/exec"
+	"time"
+)
+
+const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
+
+// runNvidiaRecover invokes the bee-nvidia-recover helper through sudo with the
+// given arguments and returns its combined stdout/stderr as a string together
+// with the command error.
+//
+// When systemd-run is found on PATH the helper is launched via
+// "sudo systemd-run ..." as a oneshot unit whose name is suffixed with the
+// current UnixNano timestamp; otherwise the helper is run directly under sudo.
+func runNvidiaRecover(args ...string) (string, error) {
+	var argv []string
+	if _, lookErr := exec.LookPath("systemd-run"); lookErr == nil {
+		argv = []string{
+			"systemd-run",
+			"--quiet",
+			"--pipe",
+			"--wait",
+			"--collect",
+			"--service-type=oneshot",
+			"--unit", fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano()),
+		}
+	}
+	// The helper path and its arguments always come last, after any wrapper.
+	argv = append(argv, nvidiaRecoverHelper)
+	argv = append(argv, args...)
+	out, err := exec.Command("sudo", argv...).CombinedOutput()
+	return string(out), err
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -30,10 +30,10 @@ import (
 // Sources:
 //   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
 //   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
-//   - SATEstimatedNvidiaGPUValidatePerGPUSec:      xFusion v8.6/v8.22 — 77–87 s/GPU
-//   - SATEstimatedNvidiaGPUStressPerGPUSec:        xFusion v8.6/v8.22 — 444–448 s/GPU
-//   - SATEstimatedNvidiaTargetedStressPerGPUSec:   xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
-//   - SATEstimatedNvidiaTargetedPowerPerGPUSec:    MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
+//   - SATEstimatedNvidiaGPUValidateSec:            xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaGPUStressSec:              xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedStressSec:         xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedPowerSec:          MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
 //   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
 //   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
@@ -48,15 +48,15 @@ const (
 	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
 	SATEstimatedMemoryStressSec = 140

-	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
-	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
-	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
-	SATEstimatedNvidiaGPUStressPerGPUSec = 450
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450

-	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
-	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
-	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350

 	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
 	SATEstimatedNvidiaPulseTestSec = 5000
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	if index < 0 {
 		return "", fmt.Errorf("gpu index must be >= 0")
 	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-	if len(raw) == 0 && err == nil {
-		raw = []byte("GPU reset completed.\n")
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
 	}
-	return string(raw), err
+	return out, err
 }

 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
+	if name == "bee-nvidia" && action == ServiceRestart {
+		return runNvidiaRecover("restart-drivers")
+	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
--- a/audit/internal/webui/page_validate.go
+++ b/audit/internal/webui/page_validate.go
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUValidateSec +
 		platform.SATEstimatedMemoryValidateSec +
-		n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUValidateSec
+	}
 	return total
 }

@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
 	}
 	total := platform.SATEstimatedCPUStressSec +
 		platform.SATEstimatedMemoryStressSec +
-		n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
-		n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
 		platform.SATEstimatedNvidiaPulseTestSec +
 		platform.SATEstimatedNvidiaInterconnectSec +
 		platform.SATEstimatedNvidiaBandwidthSec
+	if n > 0 {
+		total += platform.SATEstimatedNvidiaGPUStressSec +
+			platform.SATEstimatedNvidiaTargetedStressSec +
+			platform.SATEstimatedNvidiaTargetedPowerSec
+	}
 	return total
 }

@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
 		inv.NVIDIA,
 		`Runs NVIDIA diagnostics and board inventory checks.`,
 		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
-		func() string {
-			perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
-			perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
-			if n > 0 {
-				return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
-					validateFmtDur(perV), n, validateFmtDur(perV*n),
-					validateFmtDur(perS), n, validateFmtDur(perS*n))
-			}
-			return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
-				validateFmtDur(perV), validateFmtDur(perS))
-		}(),
+		fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+			validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+			validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
 	)) +
 		`<div id="sat-card-nvidia-targeted-stress">` +
 		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
 			inv.NVIDIA,
 			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
 			`<code>dcgmi diag targeted_stress</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-targeted-power">` +
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
 			inv.NVIDIA,
 			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
 			`<code>dcgmi diag targeted_power</code>`,
-			func() string {
-				per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
-				s := "Skipped in Validate. "
-				if n > 0 {
-					s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
-				} else {
-					s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
-				}
-				return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
-			}(),
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
 		)) +
 		`</div>` +
 		`<div id="sat-card-nvidia-pulse">` +
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
  return enqueueSATTarget(target, overrides)
    .then(d => streamSATTask(d.task_id, title, false));
 }
-const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
-const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+const nvidiaPerGPUTargets = [];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
 function satAllGPUIndicesForMulti() {
  return Promise.resolve(satSelectedGPUIndices());
 }
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
  });
 }
 function runNvidiaValidateSet(target) {
-  return loadSatNvidiaGPUs().then(gpus => {
  const selected = satSelectedGPUIndices();
-    const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
-    if (!picked.length) {
-      throw new Error('Select at least one NVIDIA GPU.');
-    }
-    if (picked.length === 1) {
-      const gpu = picked[0];
-      return runSATWithOverrides(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
-      });
-    }
-    document.getElementById('sat-output').style.display='block';
-    document.getElementById('sat-title').textContent = '— ' + target;
-    const term = document.getElementById('sat-terminal');
-    term.textContent = 'Running ' + target + ' one GPU at a time...\n';
-    const labelBase = satLabels()[target] || ('Validate ' + target);
-    const runNext = (idx) => {
-      if (idx >= picked.length) return Promise.resolve();
-      const gpu = picked[idx];
-      const gpuLabel = satGPUDisplayName(gpu);
-      term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
-      return enqueueSATTarget(target, {
-        gpu_indices: [Number(gpu.index)],
-        display_name: labelBase + ' (' + gpuLabel + ')',
-      }).then(d => {
-        return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
-      }).then(function() {
-        return runNext(idx + 1);
-      });
-    };
-    return runNext(0);
-  });
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
 }
 function runAMDValidateSet() {
  const targets = selectedAMDValidateTargets();
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -1411,8 +1411,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
 echo "=== enforcing canonical bootloader assets ==="
 enforce_live_build_bootloader_assets "${LB_DIR}"
+reset_live_build_stage "${LB_DIR}" "binary_checksums"
+reset_live_build_stage "${LB_DIR}" "binary_iso"
+reset_live_build_stage "${LB_DIR}" "binary_zsync"
 run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
-run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
+run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
 run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"

 # --- persist deb package cache back to shared location ---
--- a/iso/overlay/usr/local/bin/bee-nvidia-recover
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -0,0 +1,178 @@
+#!/bin/sh
+# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
+
+set -u
+
+log() {
+    echo "[bee-nvidia-recover] $*"
+}
+
+# log_blocker TEXT... — report a service or process that was found holding
+# the GPU, as a "blocker:" line on stdout.
+# TEXT is frequently a raw process command line, so printf is used instead
+# of echo: a shell whose echo expands backslash escapes could otherwise
+# mangle or truncate the line.
+log_blocker() {
+    printf '%s\n' "[bee-nvidia-recover] blocker: $*"
+}
+
+# usage — write the command-line synopsis to stdout, one line per argument.
+usage() {
+    printf '%s\n' \
+        'usage:' \
+        '  bee-nvidia-recover restart-drivers' \
+        '  bee-nvidia-recover reset-gpu <index>'
+}
+
+# unit_exists UNIT — succeed when `systemctl cat` can resolve UNIT.
+# All output is discarded; only the exit status is meaningful.
+unit_exists() {
+    systemctl cat "$1" >/dev/null 2>&1
+}
+
+# unit_is_active UNIT — succeed when `systemctl is-active` reports UNIT as
+# active. Runs quietly; only the exit status is meaningful.
+unit_is_active() {
+    systemctl is-active --quiet "$1" 2>/dev/null
+}
+
+stop_unit_if_active() {
+    unit="$1"
+    if unit_is_active "$unit"; then
+        log "stopping $unit"
+        systemctl stop "$unit"
+        return 0
+    fi
+    return 1
+}
+
+start_unit_if_marked() {
+    unit="$1"
+    marker="$2"
+    if [ "$marker" = "1" ] && unit_exists "$unit"; then
+        log "starting $unit"
+        systemctl start "$unit"
+    fi
+}
+
+# wait_for_process_exit NAME — poll once per second until no process named
+# exactly NAME remains.
+# Returns 0 as soon as the process is gone; logs a warning and returns 1
+# at the 15th consecutive check that still finds it.
+wait_for_process_exit() {
+    name="$1"
+    tries=0
+    while :; do
+        pgrep -x "$name" >/dev/null 2>&1 || return 0
+        tries=$((tries + 1))
+        if [ "$tries" -ge 15 ]; then
+            log "WARN: $name is still running after stop request"
+            return 1
+        fi
+        sleep 1
+    done
+}
+
+# kill_pattern PATTERN — terminate every process whose full command line
+# matches PATTERN.
+# Each match is first reported as a blocker, then sent SIGTERM and, one
+# second later, SIGKILL. Does nothing when no process matches.
+kill_pattern() {
+    pattern="$1"
+    pgrep -f "$pattern" >/dev/null 2>&1 || return 0
+
+    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
+        if [ -n "$line" ]; then
+            log_blocker "$line"
+        fi
+    done
+
+    log "killing processes matching: $pattern"
+    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
+    sleep 1
+    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
+}
+
+# drain_gpu_clients — stop everything that may be holding the NVIDIA devices.
+#
+# Sets two globals for the later restore step:
+#   display_was_active  1 if a display manager unit was stopped here
+#   fabric_was_active   1 if nvidia-fabricmanager.service was stopped here
+# Every stopped service and every matched process is reported via log_blocker.
+drain_gpu_clients() {
+    display_was_active=0
+    fabric_was_active=0
+
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+        fi
+    done
+
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+    fi
+
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        # List by exact process name (-x), the same test used for detection
+        # above, so the daemon is reported even when its command line
+        # carries a path or arguments.
+        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
+            [ -n "$line" ] || continue
+            log_blocker "$line"
+        done
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        # Escalate to SIGKILL only if the daemon outlives the grace period.
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+    fi
+
+    # Remaining GPU users are matched against their full command line.
+    for pattern in \
+        "nvidia-smi" \
+        "dcgmi" \
+        "nvvs" \
+        "dcgmproftester" \
+        "all_reduce_perf" \
+        "nvtop" \
+        "bee-gpu-burn" \
+        "bee-john-gpu-stress" \
+        "bee-nccl-gpu-stress" \
+        "Xorg" \
+        "Xwayland"; do
+        kill_pattern "$pattern"
+    done
+}
+
+restore_gpu_clients() {
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        if nvidia-smi -pm 1 >/dev/null 2>&1; then
+            log "enabled NVIDIA persistence mode"
+        else
+            log "WARN: failed to enable NVIDIA persistence mode"
+        fi
+    fi
+
+    if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
+        log "starting nv-hostengine"
+        nv-hostengine
+    fi
+
+    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
+    start_unit_if_marked display-manager.service "${display_was_active:-0}"
+    if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
+        start_unit_if_marked lightdm.service "1"
+    fi
+}
+
+# restart_drivers — drain GPU clients, unload the NVIDIA kernel modules,
+# reload them through bee-nvidia-load, then restore the clients.
+#
+# Returns 0 only when every loaded module was unloaded and
+# bee-nvidia-load succeeded. Clients are restored in either case, so a
+# failure is reported through the exit status without leaving services
+# stopped.
+restart_drivers() {
+    restart_rc=0
+    drain_gpu_clients
+    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
+            log "unloading module $mod"
+            if ! rmmod "$mod"; then
+                log "WARN: failed to unload module $mod"
+                restart_rc=1
+            fi
+        fi
+    done
+    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
+    log "reloading NVIDIA driver stack"
+    if ! /usr/local/bin/bee-nvidia-load; then
+        log "WARN: bee-nvidia-load failed"
+        restart_rc=1
+    fi
+    restore_gpu_clients
+    return "$restart_rc"
+}
+
+reset_gpu() {
+    index="$1"
+    drain_gpu_clients
+    log "resetting GPU $index"
+    nvidia-smi -r -i "$index"
+    restore_gpu_clients
+}
+
+cmd="${1:-}"
+case "$cmd" in
+    restart-drivers)
+        restart_drivers
+        ;;
+    reset-gpu)
+        if [ "$#" -ne 2 ]; then
+            usage >&2
+            exit 2
+        fi
+        reset_gpu "$2"
+        ;;
+    *)
+        usage >&2
+        exit 2
+        ;;
+esac
Author	SHA1	Message	Date
Mikhail Chusavitin	679aeb9947	Run NVIDIA DCGM diag tests on all selected GPUs simultaneously targeted_stress, targeted_power, and the Level 2/3 diag were dispatched one GPU at a time from the UI, turning a single dcgmi command into 8 sequential ~350–450 s runs. DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel. Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update sat.go constants and page_validate.go estimates to reflect all-GPU simultaneous execution (remove n× multiplier from total time estimates). Stress test on 8-GPU system: ~5.3 h → ~2.5 h. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-20 11:53:25 +03:00
Mikhail Chusavitin	647e99b697	Fix post-sync live-build ISO rebuild	2026-04-20 11:01:15 +03:00
Mikhail Chusavitin	4af997f436	Update audit bee binary	2026-04-20 10:55:42 +03:00
Mikhail Chusavitin	6caace0cc0	Make power benchmark report phase-averaged	2026-04-20 10:53:53 +03:00
Mikhail Chusavitin	5f0103635b	Update power benchmark GPU reset flow	2026-04-20 09:46:00 +03:00
Mikhail Chusavitin	84a2551dc0	Fix NVIDIA self-heal recovery flow	2026-04-20 09:43:22 +03:00
Mikhail Chusavitin	1cfabc9230	Reset GPUs before power benchmark	2026-04-20 09:42:19 +03:00
Mikhail Chusavitin	5dc711de23	Start power calibration from full GPU TDP	2026-04-20 09:28:58 +03:00
Mikhail Chusavitin	ab802719f8	Use real NVIDIA power-limit bounds in benchmark	2026-04-20 09:26:56 +03:00
Mikhail Chusavitin	a94e8007f8	Ignore power throttling in benchmark calibration	2026-04-20 09:26:29 +03:00