audit: switch power benchmark load to dcgmproftester
@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
     return nil
 }
 
+func benchmarkPowerEngine() string {
+    switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
+    case BenchmarkPowerEngineTargetedPower:
+        return BenchmarkPowerEngineTargetedPower
+    default:
+        return BenchmarkPowerEngineDCGMProfTester
+    }
+}
+
+func benchmarkPowerEngineLabel(engine string) string {
+    switch strings.TrimSpace(strings.ToLower(engine)) {
+    case BenchmarkPowerEngineTargetedPower:
+        return "dcgmi diag targeted_power"
+    default:
+        return "dcgmproftester"
+    }
+}
+
+func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
+    engine := benchmarkPowerEngine()
+    durationSec = normalizeNvidiaBurnDuration(durationSec)
+    switch engine {
+    case BenchmarkPowerEngineTargetedPower:
+        return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
+    default:
+        if len(gpuIndices) > 1 {
+            return []string{
+                "bee-dcgmproftester-staggered",
+                "--seconds", strconv.Itoa(durationSec),
+                "--stagger-seconds", "0",
+                "--devices", joinIndexList(gpuIndices),
+            }, nil, nil
+        }
+        cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
+        if err != nil {
+            return nil, nil, err
+        }
+        return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
+    }
+}
+
 func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
     if ctx == nil {
         ctx = context.Background()
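Aside: the helpers added above pick the load generator from the BEE_BENCH_POWER_ENGINE environment variable, with dcgmproftester as the default and targeted_power kept as an opt-in. Below is a minimal standalone sketch of that selection and of the command shape it resolves to; the constant string values, the dcgmi invocation shape, and the plain dcgmproftester binary name are illustrative assumptions (the real code goes through resolveDCGMProfTesterCommand, and multi-GPU runs use bee-dcgmproftester-staggered).

```go
// Standalone sketch of the engine switch above; constant values and binary
// names are assumptions for illustration, not taken from this commit.
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

const (
	engineTargetedPower  = "targeted_power" // assumed value of BenchmarkPowerEngineTargetedPower
	engineDCGMProfTester = "dcgmproftester" // assumed value of BenchmarkPowerEngineDCGMProfTester
)

func selectEngine() string {
	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
	case engineTargetedPower:
		return engineTargetedPower
	default:
		// dcgmproftester is the new default load generator.
		return engineDCGMProfTester
	}
}

func loadCommand(engine string, durationSec int) []string {
	if engine == engineTargetedPower {
		// Shape of a dcgmi diag targeted_power style invocation (illustrative only).
		return []string{"dcgmi", "diag", "-r", "targeted_power"}
	}
	// Single-GPU dcgmproftester invocation mirroring the flags used above.
	return []string{"dcgmproftester", "--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec)}
}

func main() {
	engine := selectEngine()
	fmt.Println(engine, loadCommand(engine, 60))
}
```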
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
     // Sample server idle power once (first GPU only — server state is global).
     if !serverIdleOK {
-        if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+        if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
             serverIdleW = w
             serverIdleOK = true
-            logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+            logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
         }
     }
 
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
         "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
     }
     logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
+    serverPowerStopCh := make(chan struct{})
+    serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
     _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
+    close(serverPowerStopCh)
+    if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
+        serverLoadedWSum += benchmarkMean(serverPowerSamples)
+        serverLoadedSamples++
+        serverLoadedOK = true
+        logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
+    }
     for _, phaseSpec := range planPhases {
         if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
             appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
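Aside: the sampler started here comes from startSelectedPowerSourceSampler in the new benchmark_power_autotune.go and replaces the inline IPMI goroutines removed further down. A self-contained sketch of the same stop-channel pattern, with readPowerW standing in for the repo's queryBenchmarkPowerSourceW:

```go
package main

import (
	"fmt"
	"time"
)

// startSampler collects one reading per interval until stop is closed, then
// delivers all collected samples on the returned channel. This mirrors the
// shape of startSelectedPowerSourceSampler; readPowerW is a stand-in.
func startSampler(stop <-chan struct{}, interval time.Duration, readPowerW func() float64) <-chan []float64 {
	out := make(chan []float64, 1)
	go func() {
		defer close(out)
		var samples []float64
		samples = append(samples, readPowerW()) // first sample immediately
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-stop:
				out <- samples
				return
			case <-ticker.C:
				samples = append(samples, readPowerW())
			}
		}
	}()
	return out
}

func main() {
	stop := make(chan struct{})
	ch := startSampler(stop, 100*time.Millisecond, func() float64 { return 420 }) // fake reading
	time.Sleep(350 * time.Millisecond)                                            // stand-in for the benchmark phase
	close(stop)
	samples := <-ch
	var sum float64
	for _, w := range samples {
		sum += w
	}
	fmt.Printf("avg %.0f W over %d samples\n", sum/float64(len(samples)), len(samples))
}
```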
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
     beforeThrottle, _ := queryThrottleCounters(idx)
     logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
-
-    // Sample server power via IPMI in parallel with the steady phase.
-    // We collect readings every 5s and average them.
-    ipmiStopCh := make(chan struct{})
-    ipmiResultCh := make(chan float64, 1)
-    go func() {
-        defer close(ipmiResultCh)
-        var samples []float64
-        ticker := time.NewTicker(5 * time.Second)
-        defer ticker.Stop()
-        // First sample after a short warmup delay.
-        select {
-        case <-ipmiStopCh:
-            return
-        case <-time.After(15 * time.Second):
-        }
-        for {
-            if w, err := queryIPMIServerPowerW(); err == nil {
-                samples = append(samples, w)
-            }
-            select {
-            case <-ipmiStopCh:
-                if len(samples) > 0 {
-                    var sum float64
-                    for _, w := range samples {
-                        sum += w
-                    }
-                    ipmiResultCh <- sum / float64(len(samples))
-                }
-                return
-            case <-ticker.C:
-            }
-        }
-    }()
-
-    close(ipmiStopCh)
-    if loadedW, ok := <-ipmiResultCh; ok {
-        serverLoadedWSum += loadedW
-        serverLoadedSamples++
-        serverLoadedOK = true
-        logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
-    }
     afterThrottle, _ := queryThrottleCounters(idx)
     if planErr != nil {
         gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
     if serverLoadedSamples > 0 {
         serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
     }
-    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
     result.Cooling = summarizeBenchmarkCooling(metricRows)
 
     // Apply server-power penalty when IPMI reports the server delta is much
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
     if opts.SizeMB < 0 {
         opts.SizeMB = 0
     }
+    opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
     opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
     opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
     return opts
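Aside: opts.ServerPowerSource is now folded through normalizeBenchmarkPowerSource (defined in the new benchmark_power_autotune.go). A small behavioral sketch; the literal values "dcmi" and "sdr_psu_input" are assumed from the fallback reasons elsewhere in this commit rather than shown as constant definitions:

```go
package main

import (
	"fmt"
	"strings"
)

// Assumed constant values; the diff only shows the identifiers.
const (
	sourceDCMI        = "dcmi"
	sourceSDRPSUInput = "sdr_psu_input"
)

// normalize mirrors normalizeBenchmarkPowerSource: anything other than the
// SDR PSU input source collapses to the DCMI default.
func normalize(source string) string {
	switch strings.TrimSpace(strings.ToLower(source)) {
	case sourceSDRPSUInput:
		return sourceSDRPSUInput
	default:
		return sourceDCMI
	}
}

func main() {
	fmt.Println(normalize("SDR_PSU_Input ")) // "sdr_psu_input"
	fmt.Println(normalize(""))               // "dcmi" (default)
	fmt.Println(normalize("bogus"))          // "dcmi"
}
```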
@@ -2535,10 +2544,14 @@ loop:
 }
 
 // characterizeServerPower computes BenchmarkServerPower from idle and loaded
-// IPMI samples plus the GPU-reported average power during steady state.
-func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
-    sp := &BenchmarkServerPower{Available: ipmiAvailable}
-    if !ipmiAvailable {
+// samples plus the GPU-reported average power during steady state.
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
+    sp := &BenchmarkServerPower{
+        Available:         available,
+        Source:            normalizeBenchmarkPowerSource(source),
+        SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+    }
+    if !available {
         sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
         return sp
     }
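Aside: characterizeServerPower now records the power source and sample interval alongside availability. A hedged sketch of the new prologue; the BenchmarkServerPower field set below is inferred from the lines above and the real struct likely carries more:

```go
package main

import "fmt"

// Field set inferred from the diff above; the real BenchmarkServerPower
// likely also carries idle/loaded watts and derived deltas.
type BenchmarkServerPower struct {
	Available         bool
	Source            string
	SampleIntervalSec int
	Notes             []string
}

// newServerPower mirrors the prologue of the rewritten characterizeServerPower:
// record the (already normalized) source and sample interval, and short-circuit
// with a note when no server-side readings were available.
func newServerPower(source string, available bool) *BenchmarkServerPower {
	sp := &BenchmarkServerPower{
		Available:         available,
		Source:            source,
		SampleIntervalSec: 3, // benchmarkPowerAutotuneSampleInterval in this commit
	}
	if !available {
		sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
	}
	return sp
}

func main() {
	fmt.Printf("%+v\n", newServerPower("dcmi", false))
}
```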
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
 
     // Sample server idle power once.
     if !*serverIdleOK {
-        if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+        if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
             *serverIdleW = w
             *serverIdleOK = true
-            logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+            logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
         }
     }
 
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
         "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
     }
     logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
+    serverPowerStopCh := make(chan struct{})
+    serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
     _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
+    close(serverPowerStopCh)
+    if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
+        *serverLoadedWSum += benchmarkMean(serverPowerSamples)
+        (*serverLoadedSamples)++
+        *serverLoadedOK = true
+        logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
+    }
     for _, phaseSpec := range planPhases {
         if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
             appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
     }
 
     logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
-
-    // Sample server power via IPMI in parallel with steady phase.
-    ipmiStopCh := make(chan struct{})
-    ipmiResultCh := make(chan float64, 1)
-    go func() {
-        defer close(ipmiResultCh)
-        var samples []float64
-        ticker := time.NewTicker(5 * time.Second)
-        defer ticker.Stop()
-        select {
-        case <-ipmiStopCh:
-            return
-        case <-time.After(15 * time.Second):
-        }
-        for {
-            if w, err := queryIPMIServerPowerW(); err == nil {
-                samples = append(samples, w)
-            }
-            select {
-            case <-ipmiStopCh:
-                if len(samples) > 0 {
-                    var sum float64
-                    for _, w := range samples {
-                        sum += w
-                    }
-                    ipmiResultCh <- sum / float64(len(samples))
-                }
-                return
-            case <-ticker.C:
-            }
-        }
-    }()
-
-    close(ipmiStopCh)
-    if loadedW, ok := <-ipmiResultCh; ok {
-        *serverLoadedWSum += loadedW
-        (*serverLoadedSamples)++
-        *serverLoadedOK = true
-        logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
-    }
     afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
     for _, idx := range selected {
         afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
     return cl
 }
 
-// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
-// actively watches throttle counters. seedLimits, when provided, are treated as
+// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
+// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
 // the starting point for this calibration pass rather than as immutable fixed
 // limits. This matters during cumulative ramp-up: once an additional GPU is
 // introduced, every already-active GPU must be revalidated under the new
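Aside: the calibration described by this comment narrows each GPU's power limit by bisection: a throttled attempt lowers the upper bound, a clean attempt becomes the current best and raises the lower bound, and the search gives up once the midpoint would fall below the derate floor (later hunks show next = (s.lo + s.hi) / 2 and the minLimitW check). A self-contained sketch of that loop follows; runLoad stands in for launching the power-fit load and reading throttle counters, and the attempt cap is arbitrary:

```go
package main

import "fmt"

// findStableLimitW bisects between minLimitW and defaultLimitW, treating a
// throttled run as "too high" and a clean run as "stable so far". The real
// calibration differs in details (shared attempts across GPUs, retry handling).
func findStableLimitW(defaultLimitW, minLimitW int, runLoad func(limitW int) (throttled bool)) (int, bool) {
	lo, hi := minLimitW, defaultLimitW
	limit := defaultLimitW
	stable := -1
	for attempts := 0; attempts < 8 && lo <= hi; attempts++ {
		if runLoad(limit) {
			hi = limit - 1 // throttled: try a lower limit
		} else {
			stable = limit // clean pass: remember it, then probe higher
			lo = limit + 1
		}
		limit = (lo + hi) / 2
		if limit < minLimitW {
			break // outside the allowed derate window
		}
	}
	return stable, stable > 0
}

func main() {
	// Pretend the card throttles above 320 W.
	limit, ok := findStableLimitW(350, 250, func(w int) bool { return w > 320 })
	fmt.Println(limit, ok)
}
```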
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
     // doubling each retry until it would exceed the cap, at which point the
     // next busy response fails the calibration immediately.
     const dcgmResourceBusyMaxDelaySec = 300
+    engine := benchmarkPowerEngine()
+    engineLabel := benchmarkPowerEngineLabel(engine)
 
-    if _, err := exec.LookPath("dcgmi"); err != nil {
-        logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-        return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+    if engine == BenchmarkPowerEngineTargetedPower {
+        if _, err := exec.LookPath("dcgmi"); err != nil {
+            logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
+            return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+        }
+    } else {
+        if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
+            logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
+            return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+        }
     }
     if killed := KillTestWorkers(); len(killed) > 0 {
         for _, p := range killed {
@@ -3206,7 +3197,7 @@ calibDone:
     sharedAttempt++
     for _, s := range active {
         s.calib.Attempts++
-        logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
+        logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
     }
 
     // Snapshot throttle counters for all active GPUs before the run.
@@ -3215,14 +3206,22 @@ calibDone:
         beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
     }
 
-    // Run targeted_power for ALL gpuIndices simultaneously so every card
+    // Run the selected power-fit load for ALL gpuIndices simultaneously so every card
     // is under load during calibration — this reflects real server thermals.
     logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
-    cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+    cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
+    if err != nil {
+        for _, s := range active {
+            s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
+            s.converged = true
+        }
+        logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
+        break calibDone
+    }
     attemptCtx, cancelAttempt := context.WithCancel(ctx)
     doneCh := make(chan sharedAttemptResult, 1)
     go func() {
-        out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
+        out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
         doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
     }()
 
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Record throttle but do NOT cancel — let dcgmi finish so
|
// Record throttle but do NOT cancel — let the load command finish so
|
||||||
// nv-hostengine releases the slot cleanly before the next attempt.
|
// runtime resources release cleanly before the next attempt.
|
||||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||||||
throttleReasons[s.idx] = reason
|
throttleReasons[s.idx] = reason
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||||||
@@ -3359,9 +3358,9 @@ calibDone:
|
|||||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||||||
case ar.err != nil:
|
case ar.err != nil:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
|
||||||
default:
|
default:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3384,7 +3383,7 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3399,7 +3398,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample IPMI idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
var serverIdleOK bool
|
var serverIdleOK bool
|
||||||
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
sdrIdle := sampleIPMISDRPowerSensors()
|
sdrIdle := sampleIPMISDRPowerSensors()
|
||||||
psuBefore := psuStatusSnapshot()
|
psuBefore := psuStatusSnapshot()
|
||||||
@@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
singlePowerStopCh := make(chan struct{})
|
||||||
ipmiSingleDone := make(chan float64, 1)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiSingleDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
|
||||||
ipmiSingleDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
ipmiSingleCancel()
|
close(singlePowerStopCh)
|
||||||
sdrSingle := sampleIPMISDRPowerSensors()
|
sdrSingle := sampleIPMISDRPowerSensors()
|
||||||
if sdrSingle.PSUInW > 0 {
|
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||||
|
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||||
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
||||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
||||||
} else if w, ok := <-ipmiSingleDone; ok {
|
|
||||||
singleIPMILoadedW[idx] = w
|
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w))
|
|
||||||
} else {
|
|
||||||
<-ipmiSingleDone // drain channel
|
|
||||||
}
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
@@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
|
||||||
}
|
}
|
||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
if gpu.Derated {
|
if gpu.Derated {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
}
|
}
|
||||||
if gpu.CoolingWarning != "" {
|
if gpu.CoolingWarning != "" {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf(
|
result.Findings = append(result.Findings, fmt.Sprintf(
|
||||||
@@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Phase 2: cumulative thermal ramp.
|
// Phase 2: cumulative thermal ramp.
|
||||||
// Each step introduces one new GPU into an environment where all previously
|
// Each step introduces one new GPU into an environment where all previously
|
||||||
// calibrated GPUs are already running at their fixed stable limits. The new
|
// calibrated GPUs are already running at their fixed stable limits. The new
|
||||||
// GPU's stable TDP is searched via binary search (targeted_power) under real
|
// GPU's stable TDP is searched via binary search under real
|
||||||
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
||||||
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
||||||
// power in the final full-system thermal state.
|
// power in the final full-system thermal state.
|
||||||
@@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
} else if firstCalib.Derated {
|
} else if firstCalib.Derated {
|
||||||
ramp.Status = "PARTIAL"
|
ramp.Status = "PARTIAL"
|
||||||
@@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
stepPowerStopCh := make(chan struct{})
|
||||||
ipmiStepDone := make(chan float64, 1)
|
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiStepDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
|
||||||
ipmiStepDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
ipmiStepCancel()
|
close(stepPowerStopCh)
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
var stepIPMIOK bool
|
var stepIPMIOK bool
|
||||||
if w, ok := <-ipmiStepDone; ok {
|
if samples := <-stepPowerCh; len(samples) > 0 {
|
||||||
stepIPMILoadedW = w
|
stepIPMILoadedW = benchmarkMean(samples)
|
||||||
stepIPMIOK = true
|
stepIPMIOK = true
|
||||||
}
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes,
|
ramp.Notes = append(ramp.Notes,
|
||||||
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
|
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
             ramp.PSUSlotReadings = sdrStep.PSUSlots
         }
 
-        if sdrStep.PSUInW > 0 {
-            // SDR PSU sum is available — use it for server power (includes all PSUs).
-            ramp.ServerLoadedW = sdrStep.PSUInW
-            ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
-            logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
-            if step == len(result.RecommendedSlotOrder) {
-                serverLoadedW = sdrStep.PSUInW
-                serverLoadedOK = true
-                sdrLastStep = sdrStep
-            }
-        } else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
+        if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
             ramp.ServerLoadedW = stepIPMILoadedW
             ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
-            logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
+            logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
             // The last step has all GPUs loaded — use it as the top-level loaded_w.
             if step == len(result.RecommendedSlotOrder) {
                 serverLoadedW = stepIPMILoadedW
                 serverLoadedOK = true
+                sdrLastStep = sdrStep
+            }
+        } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
+            ramp.ServerLoadedW = sdrStep.PSUInW
+            ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
+            logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
+            if step == len(result.RecommendedSlotOrder) {
+                serverLoadedW = sdrStep.PSUInW
+                serverLoadedOK = true
+                sdrLastStep = sdrStep
             }
         }
 
@@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
         gpuActualSumW = result.PlatformMaxTDPW
     }
     _ = serverIdleOK // used implicitly via characterizeServerPower
-    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
+    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
     // Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
     // Per-slot readings enable correlation with audit HardwarePowerSupply entries.
     if result.ServerPower != nil {
audit/internal/platform/benchmark_power_autotune.go (new file, 735 lines)
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
benchmarkPowerAutotuneVersion = 1
|
||||||
|
benchmarkPowerAutotuneIdleSec = 60
|
||||||
|
benchmarkPowerAutotuneLoadSec = 90
|
||||||
|
benchmarkPowerAutotuneSampleInterval = 3
|
||||||
|
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResetBenchmarkPowerAutotuneConfig(path string) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||||
|
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||||
|
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||||
|
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: true,
|
||||||
|
SelectedSource: selected,
|
||||||
|
EffectiveSource: selected,
|
||||||
|
Mode: "autotuned",
|
||||||
|
Reason: strings.TrimSpace(cfg.Reason),
|
||||||
|
ConfiguredAt: cfg.UpdatedAt,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sources := sampleBenchmarkPowerSources()
|
||||||
|
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return <-doneCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []float64
|
||||||
|
record := func() {
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
type benchmarkPowerAutotuneSample struct {
|
||||||
|
ElapsedSec float64
|
||||||
|
GPUAvgUsagePct float64
|
||||||
|
CPUUsagePct float64
|
||||||
|
GPUSumPowerW float64
|
||||||
|
Sources map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var out []benchmarkPowerAutotuneSample
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
start := time.Now()
|
||||||
|
for {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
row := benchmarkPowerAutotuneSample{
|
||||||
|
ElapsedSec: time.Since(start).Seconds(),
|
||||||
|
CPUUsagePct: sampleCPULoadPct(),
|
||||||
|
Sources: sampleBenchmarkPowerSources(),
|
||||||
|
}
|
||||||
|
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||||
|
var usageSum float64
|
||||||
|
for _, gpu := range gpuRows {
|
||||||
|
row.GPUSumPowerW += gpu.PowerW
|
||||||
|
usageSum += gpu.UsagePct
|
||||||
|
}
|
||||||
|
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||||
|
}
|
||||||
|
out = append(out, row)
|
||||||
|
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return out
|
||||||
|
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||||
|
result := &BenchmarkPowerAutotuneValidation{}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
result.Reason = "no idle telemetry samples collected"
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
if sample.CPUUsagePct > 0 {
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.GPUSamples = len(gpuUsage)
|
||||||
|
result.CPUSamples = len(cpuUsage)
|
||||||
|
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||||
|
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||||
|
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||||
|
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||||
|
switch {
|
||||||
|
case result.GPUAvgUsagePct > 5:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||||
|
case result.GPUP95UsagePct > 10:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||||
|
case result.CPUAvgUsagePct > 20:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||||
|
case result.CPUP95UsagePct > 35:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||||
|
default:
|
||||||
|
result.Valid = true
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||||
|
idleBySource := map[string][]float64{}
|
||||||
|
loadBySource := map[string][]float64{}
|
||||||
|
var idleGPU []float64
|
||||||
|
var loadGPU []float64
|
||||||
|
for _, sample := range idle {
|
||||||
|
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
idleBySource[source] = append(idleBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, sample := range load {
|
||||||
|
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
loadBySource[source] = append(loadBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
idleGPUAvg := benchmarkMean(idleGPU)
|
||||||
|
loadGPUAvg := benchmarkMean(loadGPU)
|
||||||
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||||
|
if gpuDelta <= 0 {
|
||||||
|
gpuDelta = loadGPUAvg
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||||
|
}
|
||||||
|
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if candidate.Available && candidate.DeltaW > 0 {
|
||||||
|
available = append(available, candidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(available) == 0 {
|
||||||
|
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||||
|
}
|
||||||
|
sort.Slice(available, func(i, j int) bool {
|
||||||
|
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||||
|
if available[i].Source != available[j].Source {
|
||||||
|
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if available[i].RelativeError != available[j].RelativeError {
|
||||||
|
return available[i].RelativeError < available[j].RelativeError
|
||||||
|
}
|
||||||
|
return available[i].Samples > available[j].Samples
|
||||||
|
})
|
||||||
|
selected := available[0]
|
||||||
|
for idx := range candidates {
|
||||||
|
if candidates[idx].Source == selected.Source {
|
||||||
|
candidates[idx].Selected = true
|
||||||
|
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}

func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "status=%s\n", result.Status)
	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
	}
	if result.IdleValidation != nil {
		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
		}
	}
	for _, candidate := range result.Candidates {
		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
		if candidate.Available {
			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
		}
	}
	return b.String()
}
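
// With illustrative values (all numbers and timestamps are placeholders), an
// abridged summary reads roughly:
//   generated_at=2024-06-01T12:00:00Z
//   status=OK
//   benchmark_kind=power-fit
//   selected_source=sdr_psu_input
//   candidate_dcmi_available=true
//   candidate_dcmi_delta_w=3150
//   candidate_sdr_psu_input_available=true
//   candidate_sdr_psu_input_delta_w=3250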

func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Source Autotune\n\n")
	fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
	fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
	fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
	fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
	fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
	}
	b.WriteString("\n")
	if result.IdleValidation != nil {
		b.WriteString("## Idle Validation\n\n")
		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
		}
		b.WriteString("\n")
	}
	if len(result.Candidates) > 0 {
		b.WriteString("## Candidates\n\n")
		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, candidate := range result.Candidates {
			if !candidate.Available {
				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
				continue
			}
			selected := "no"
			if candidate.Selected {
				selected = "yes"
			}
			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
		}
		b.WriteString("\n")
	}
	for _, note := range result.Notes {
		fmt.Fprintf(&b, "- %s\n", note)
	}
	return b.String()
}

func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
	allDevices := joinIndexList(gpuIndices)
	switch strings.TrimSpace(strings.ToLower(kind)) {
	case "power-fit", "power", "nvidia-bench-power":
		cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
		if err == nil {
			return cmd, "power-fit"
		}
		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
	default:
		cmd := []string{
			"bee-gpu-burn",
			"--seconds", fmt.Sprintf("%d", durationSec),
			"--devices", allDevices,
		}
		if sizeMB > 0 {
			cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
		}
		return cmd, "performance"
	}
}
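
// Illustrative only: for a non-power kind, 120 s on GPUs 0-3 with a 1024 MB
// working set, the default branch above yields
//   bee-gpu-burn --seconds 120 --devices 0,1,2,3 --size-mb 1024
// and the normalized kind "performance". (The exact device-list format comes
// from joinIndexList; "0,1,2,3" is an assumption here.)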

func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
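
// Hypothetical caller sketch (the sys value and log wiring are assumptions, not
// part of this change):
//   runDir, err := sys.RunNvidiaPowerSourceAutotune(ctx, "",
//       NvidiaBenchmarkOptions{Profile: NvidiaBenchmarkProfileOvernight},
//       "power-fit", func(line string) { log.Println(line) })
// An empty baseDir falls back to /var/log/bee-bench/autotune, and runDir points
// at the timestamped autotune-<ts> directory created above.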

func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return fmt.Errorf("marshal autotune result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return fmt.Errorf("write autotune result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
		return fmt.Errorf("write autotune summary.txt: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
		return fmt.Errorf("write autotune report.md: %w", err)
	}
	return nil
}
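
// A completed autotune run therefore leaves, under the run directory:
//   autotune-load.log  raw output of the load command
//   result.json        the full BenchmarkPowerAutotuneResult
//   summary.txt        key=value summary from renderBenchmarkPowerAutotuneSummary
//   report.md          markdown report from renderBenchmarkPowerAutotuneReport
// plus verbose.log, assuming runSATCommandCtx writes its transcript there.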

func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

var _ = exec.ErrNotFound
@@ -43,6 +43,11 @@ const (
	NvidiaBenchmarkProfileOvernight = "overnight"
)

const (
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	BenchmarkPowerEngineTargetedPower  = "targeted_power"
)

// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
// re-measure from actual task logs and update the constants here.
@@ -61,7 +66,7 @@ const (
	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
	BenchmarkEstimatedPerfOvernightSec = 8 * 3600

	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
@@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct {
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
	ServerPowerSource string
	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
	RampTotal         int    // total number of ramp-up steps in this run
	RampRunID         string // shared identifier across all steps of the same ramp-up run
}

const (
	BenchmarkPowerSourceDCMI        = "dcmi"
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)

type BenchmarkPowerAutotuneConfig struct {
	Version           int       `json:"version"`
	UpdatedAt         time.Time `json:"updated_at"`
	SelectedSource    string    `json:"selected_source"`
	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
	Profile           string    `json:"profile,omitempty"`
	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
	Confidence        float64   `json:"confidence,omitempty"`
	Reason            string    `json:"reason,omitempty"`
}
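
// Illustrative on-disk shape of the saved config (all values are placeholders;
// the real version and durations come from the benchmarkPowerAutotune* constants):
//   {
//     "version": 1,
//     "updated_at": "2024-06-01T12:00:00Z",
//     "selected_source": "sdr_psu_input",
//     "benchmark_kind": "power-fit",
//     "idle_duration_sec": 60,
//     "load_duration_sec": 120,
//     "sample_interval_sec": 2,
//     "confidence": 0.93,
//     "reason": "selected because delta 3200 W is closest to GPU delta 3000 W (relative error 0.067)"
//   }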

type SystemPowerSourceDecision struct {
	Configured      bool      `json:"configured"`
	SelectedSource  string    `json:"selected_source,omitempty"`
	EffectiveSource string    `json:"effective_source,omitempty"`
	Mode            string    `json:"mode,omitempty"` // autotuned, fallback, degraded
	Reason          string    `json:"reason,omitempty"`
	ConfiguredAt    time.Time `json:"configured_at,omitempty"`
}

type BenchmarkPowerAutotuneResult struct {
	GeneratedAt         time.Time                         `json:"generated_at"`
	Hostname            string                            `json:"hostname,omitempty"`
	ServerModel         string                            `json:"server_model,omitempty"`
	BenchmarkKind       string                            `json:"benchmark_kind,omitempty"`
	Profile             string                            `json:"profile,omitempty"`
	Status              string                            `json:"status"`
	IdleDurationSec     int                               `json:"idle_duration_sec"`
	LoadDurationSec     int                               `json:"load_duration_sec"`
	SampleIntervalSec   int                               `json:"sample_interval_sec"`
	SelectedSource      string                            `json:"selected_source,omitempty"`
	IdleValidationError string                            `json:"idle_validation_error,omitempty"`
	IdleValidation      *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
	GPUPowerIdleW       float64                           `json:"gpu_power_idle_w,omitempty"`
	GPUPowerLoadW       float64                           `json:"gpu_power_load_w,omitempty"`
	Candidates          []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
	Notes               []string                          `json:"notes,omitempty"`
	Config              *BenchmarkPowerAutotuneConfig     `json:"config,omitempty"`
}

type BenchmarkPowerAutotuneValidation struct {
	Valid          bool    `json:"valid"`
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
	GPUSamples     int     `json:"gpu_samples,omitempty"`
	CPUSamples     int     `json:"cpu_samples,omitempty"`
	Reason         string  `json:"reason,omitempty"`
}

type BenchmarkPowerAutotuneCandidate struct {
	Source         string  `json:"source"`
	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
	DeltaW         float64 `json:"delta_w,omitempty"`
	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}

type NvidiaBenchmarkResult struct {
	BenchmarkVersion string    `json:"benchmark_version"`
	GeneratedAt      time.Time `json:"generated_at"`
@@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct {
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
type BenchmarkServerPower struct {
	Available         bool    `json:"available"`
	Source            string  `json:"source,omitempty"`
	Mode              string  `json:"mode,omitempty"`
	Reason            string  `json:"reason,omitempty"`
	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
	IdleW             float64 `json:"idle_w,omitempty"`   // DCMI at idle
	LoadedW           float64 `json:"loaded_w,omitempty"` // DCMI at peak load
	DeltaW            float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
	GPUReportedSumW   float64 `json:"gpu_reported_sum_w,omitempty"`
	ReportingRatio    float64 `json:"reporting_ratio,omitempty"`

	// PSU AC input sum — sampled at idle and at peak load using collector's
	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).