audit: switch power benchmark load to dcgmproftester

2026-04-20 06:57:14 +03:00
parent 65bcc9ce81
commit 17118298bd
3 changed files with 962 additions and 161 deletions

View File

@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func benchmarkPowerEngine() string {
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
case BenchmarkPowerEngineTargetedPower:
return BenchmarkPowerEngineTargetedPower
default:
return BenchmarkPowerEngineDCGMProfTester
}
}
func benchmarkPowerEngineLabel(engine string) string {
switch strings.TrimSpace(strings.ToLower(engine)) {
case BenchmarkPowerEngineTargetedPower:
return "dcgmi diag targeted_power"
default:
return "dcgmproftester"
}
}
func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
engine := benchmarkPowerEngine()
durationSec = normalizeNvidiaBurnDuration(durationSec)
switch engine {
case BenchmarkPowerEngineTargetedPower:
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
default:
if len(gpuIndices) > 1 {
return []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(durationSec),
"--stagger-seconds", "0",
"--devices", joinIndexList(gpuIndices),
}, nil, nil
}
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
if err != nil {
return nil, nil, err
}
return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
}
}
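Engine selection above is driven entirely by the BEE_BENCH_POWER_ENGINE environment variable and falls back to dcgmproftester for any unrecognized value. A minimal standalone sketch of the same dispatch, with the two engine constants mirrored from the types file below:

package main

import (
	"fmt"
	"os"
	"strings"
)

const (
	engineDCGMProfTester = "dcgmproftester" // mirrors BenchmarkPowerEngineDCGMProfTester
	engineTargetedPower  = "targeted_power" // mirrors BenchmarkPowerEngineTargetedPower
)

// selectEngine mirrors benchmarkPowerEngine: anything other than
// "targeted_power" (after trimming and lowercasing) takes the default.
func selectEngine() string {
	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
	case engineTargetedPower:
		return engineTargetedPower
	default:
		return engineDCGMProfTester
	}
}

func main() {
	os.Setenv("BEE_BENCH_POWER_ENGINE", " Targeted_Power ")
	fmt.Println(selectEngine()) // targeted_power
	os.Unsetenv("BEE_BENCH_POWER_ENGINE")
	fmt.Println(selectEngine()) // dcgmproftester
}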
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
}
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
serverPowerStopCh := make(chan struct{})
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
close(serverPowerStopCh)
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
serverLoadedWSum += benchmarkMean(serverPowerSamples)
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
}
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
beforeThrottle, _ := queryThrottleCounters(idx)
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if planErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
if serverLoadedSamples > 0 {
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
result.Cooling = summarizeBenchmarkCooling(metricRows)
// Apply server-power penalty when IPMI reports the server delta is much
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
if opts.SizeMB < 0 {
opts.SizeMB = 0
}
opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
return opts
@@ -2535,10 +2544,14 @@ loop:
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{Available: ipmiAvailable}
if !ipmiAvailable {
// samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{
Available: available,
Source: normalizeBenchmarkPowerSource(source),
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
}
if !available {
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
return sp
}
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
// Sample server idle power once.
if !*serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
*serverIdleW = w
*serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
}
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
serverPowerStopCh := make(chan struct{})
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
close(serverPowerStopCh)
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
*serverLoadedWSum += benchmarkMean(serverPowerSamples)
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
}
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
}
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl
}
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
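The busy-retry policy described above is plain exponential backoff with a hard cap. A sketch of the resulting delay schedule; the 15 s seed is an assumption for illustration, since this hunk only shows the 300 s cap:

package main

import "fmt"

// busyBackoff doubles the retry delay while it stays within the cap;
// once the next doubling would exceed it, one more busy response fails
// the calibration pass. The 15 s seed is assumed, not from the diff.
func busyBackoff(seedSec, capSec int) []int {
	var schedule []int
	for d := seedSec; d <= capSec; d *= 2 {
		schedule = append(schedule, d)
	}
	return schedule
}

func main() {
	fmt.Println(busyBackoff(15, 300)) // [15 30 60 120 240]
}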
engine := benchmarkPowerEngine()
engineLabel := benchmarkPowerEngineLabel(engine)
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
if engine == BenchmarkPowerEngineTargetedPower {
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
}
} else {
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
}
}
if killed := KillTestWorkers(); len(killed) > 0 {
for _, p := range killed {
@@ -3206,7 +3197,7 @@ calibDone:
sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}
// Snapshot throttle counters for all active GPUs before the run.
@@ -3215,14 +3206,22 @@ calibDone:
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// Run the selected power-fit load for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
if err != nil {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
break calibDone
}
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()
@@ -3245,8 +3244,8 @@ calibDone:
if err != nil {
continue
}
// Record throttle but do NOT cancel — let dcgmi finish so
// nv-hostengine releases the slot cleanly before the next attempt.
// Record throttle but do NOT cancel — let the load command finish so
// runtime resources release cleanly before the next attempt.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
throttleReasons[s.idx] = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
@@ -3359,9 +3358,9 @@ calibDone:
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
default:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
}
@@ -3384,7 +3383,7 @@ calibDone:
s.calib.Completed = true
}
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
@@ -3399,7 +3398,7 @@ calibDone:
next = (s.lo + s.hi) / 2
}
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.converged = true
continue
}
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
// Sample IPMI idle power before any GPU load.
// Sample server idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
sdrIdle := sampleIPMISDRPowerSensors()
psuBefore := psuStatusSnapshot()
@@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
ipmiSingleDone := make(chan float64, 1)
go func() {
defer close(ipmiSingleDone)
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
ipmiSingleDone <- w
}
}()
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel()
close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if sdrSingle.PSUInW > 0 {
if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW))
} else if w, ok := <-ipmiSingleDone; ok {
singleIPMILoadedW[idx] = w
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w))
} else {
<-ipmiSingleDone // drain channel
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
}
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
@@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
}
if len(result.RecommendedSlotOrder) > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
}
for _, gpu := range gpus {
if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
}
if gpu.CoolingWarning != "" {
result.Findings = append(result.Findings, fmt.Sprintf(
@@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Phase 2: cumulative thermal ramp.
// Each step introduces one new GPU into an environment where all previously
// calibrated GPUs are already running at their fixed stable limits. The new
// GPU's stable TDP is searched via binary search (targeted_power) under real
// GPU's stable TDP is searched via binary search under real
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
// subsequent steps. This ensures each GPU's limit reflects actual sustained
// power in the final full-system thermal state.
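A sketch of how those cumulative subsets grow, assuming an illustrative recommended slot order from the single-card baselines:

package main

import "fmt"

// rampSubsets mirrors the cumulative ramp: step k runs the first k GPUs
// of the recommended slot order together, introducing one new card per
// step while earlier cards keep their already-fixed limits.
func rampSubsets(slotOrder []int) [][]int {
	var steps [][]int
	for k := 1; k <= len(slotOrder); k++ {
		steps = append(steps, slotOrder[:k])
	}
	return steps
}

func main() {
	for step, subset := range rampSubsets([]int{3, 1, 0, 2}) { // illustrative order
		fmt.Printf("ramp step %d: GPUs %v\n", step+1, subset)
	}
}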
@@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
result.OverallStatus = "PARTIAL"
} else if firstCalib.Derated {
ramp.Status = "PARTIAL"
@@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
ipmiStepDone := make(chan float64, 1)
go func() {
defer close(ipmiStepDone)
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
ipmiStepDone <- w
}
}()
stepPowerStopCh := make(chan struct{})
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel()
close(stepPowerStopCh)
var stepIPMILoadedW float64
var stepIPMIOK bool
if w, ok := <-ipmiStepDone; ok {
stepIPMILoadedW = w
if samples := <-stepPowerCh; len(samples) > 0 {
stepIPMILoadedW = benchmarkMean(samples)
stepIPMIOK = true
}
// Accumulate restore actions; they all run in the outer defer.
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
result.OverallStatus = "PARTIAL"
continue
}
@@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
if sdrStep.PSUInW > 0 {
// SDR PSU sum is available — use it for server power (includes all PSUs).
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
sdrLastStep = sdrStep
}
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
ramp.ServerLoadedW = stepIPMILoadedW
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
// The last step has all GPUs loaded — use it as the top-level loaded_w.
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = stepIPMILoadedW
serverLoadedOK = true
sdrLastStep = sdrStep
}
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
sdrLastStep = sdrStep
}
}
@@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
gpuActualSumW = result.PlatformMaxTDPW
}
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result.ServerPower != nil {

View File

@@ -0,0 +1,735 @@
package platform
import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"time"
)
const (
benchmarkPowerAutotuneVersion = 1
benchmarkPowerAutotuneIdleSec = 60
benchmarkPowerAutotuneLoadSec = 90
benchmarkPowerAutotuneSampleInterval = 3
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
func BenchmarkPowerSourceConfigPath(baseDir string) string {
baseDir = strings.TrimSpace(baseDir)
if baseDir == "" {
return defaultBenchmarkPowerSourceConfigPath
}
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
}
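So for any non-empty base directory, the config lands one level above it and is shared by every run directory under that parent. A small sketch of the mapping (helper name hypothetical; paths mirror the constants above):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// configPathFor mirrors BenchmarkPowerSourceConfigPath: empty input falls
// back to the packaged default, anything else resolves to the parent
// directory of baseDir.
func configPathFor(baseDir string) string {
	baseDir = strings.TrimSpace(baseDir)
	if baseDir == "" {
		return "/appdata/bee/export/bee-bench/power-source-autotune.json"
	}
	return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
}

func main() {
	fmt.Println(configPathFor("/var/log/bee-bench/autotune"))
	// -> /var/log/bee-bench/power-source-autotune.json
	fmt.Println(configPathFor(""))
	// -> /appdata/bee/export/bee-bench/power-source-autotune.json
}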
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
raw, err := os.ReadFile(path)
if err != nil {
return nil, err
}
var cfg BenchmarkPowerAutotuneConfig
if err := json.Unmarshal(raw, &cfg); err != nil {
return nil, err
}
if strings.TrimSpace(cfg.SelectedSource) == "" {
return nil, fmt.Errorf("autotune config missing selected_source")
}
return &cfg, nil
}
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
if strings.TrimSpace(path) == "" {
return fmt.Errorf("empty autotune config path")
}
if cfg.Version <= 0 {
cfg.Version = benchmarkPowerAutotuneVersion
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
data, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return err
}
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0644); err != nil {
return err
}
return os.Rename(tmp, path)
}
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
}
func ResetBenchmarkPowerAutotuneConfig(path string) error {
if strings.TrimSpace(path) == "" {
return fmt.Errorf("empty autotune config path")
}
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
func normalizeBenchmarkPowerSource(source string) string {
switch strings.TrimSpace(strings.ToLower(source)) {
case BenchmarkPowerSourceSDRPSUInput:
return BenchmarkPowerSourceSDRPSUInput
default:
return BenchmarkPowerSourceDCMI
}
}
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
cfg, err := LoadSystemPowerSourceConfig(exportDir)
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
return SystemPowerSourceDecision{
Configured: true,
SelectedSource: selected,
EffectiveSource: selected,
Mode: "autotuned",
Reason: strings.TrimSpace(cfg.Reason),
ConfiguredAt: cfg.UpdatedAt,
}
}
sources := sampleBenchmarkPowerSources()
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
return SystemPowerSourceDecision{
Configured: false,
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
Mode: "fallback",
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
}
}
return SystemPowerSourceDecision{
Configured: false,
EffectiveSource: BenchmarkPowerSourceDCMI,
Mode: "fallback",
Reason: "autotune config not found; using temporary fallback source dcmi",
}
}
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
decision := ResolveSystemPowerDecision(exportDir)
if decision.EffectiveSource != "" {
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
return value, decision, nil
} else if decision.Configured {
fallback := BenchmarkPowerSourceDCMI
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
fallback = BenchmarkPowerSourceSDRPSUInput
}
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
decision.Mode = "degraded"
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
decision.EffectiveSource = fallback
return fallbackValue, decision, nil
}
decision.Mode = "degraded"
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
return 0, decision, err
}
}
return 0, decision, fmt.Errorf("system power source unavailable")
}
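The resolution order, in short: an autotuned config wins outright; without one, sdr_psu_input is used when it reads above zero, otherwise dcmi; and when a configured source stops responding, SampleSystemPowerResolved degrades to the opposite source rather than failing. A compact sketch of that degraded-fallback path, with stubbed readers standing in for the IPMI queries (wattages illustrative):

package main

import (
	"errors"
	"fmt"
)

const (
	sourceDCMI = "dcmi"
	sourceSDR  = "sdr_psu_input"
)

// readers stand in for queryBenchmarkPowerSourceW; flip the stubs to see
// the other path taken.
var readers = map[string]func() (float64, error){
	sourceDCMI: func() (float64, error) { return 0, errors.New("dcmi unavailable") },
	sourceSDR:  func() (float64, error) { return 612, nil },
}

// sampleResolved mirrors the configured-source branch of
// SampleSystemPowerResolved: try the configured source first, then the
// opposite one as a degraded fallback.
func sampleResolved(configured string) (float64, string, error) {
	if w, err := readers[configured](); err == nil && w > 0 {
		return w, configured, nil
	}
	fallback := sourceDCMI
	if configured == sourceDCMI {
		fallback = sourceSDR
	}
	if w, err := readers[fallback](); err == nil && w > 0 {
		return w, fallback, nil
	}
	return 0, "", errors.New("system power source unavailable")
}

func main() {
	w, src, err := sampleResolved(sourceDCMI)
	fmt.Println(w, src, err) // 612 sdr_psu_input <nil>
}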
func queryBenchmarkPowerSourceW(source string) (float64, error) {
switch normalizeBenchmarkPowerSource(source) {
case BenchmarkPowerSourceSDRPSUInput:
sdr := sampleIPMISDRPowerSensors()
if sdr.PSUInW > 0 {
return sdr.PSUInW, nil
}
return 0, fmt.Errorf("sdr psu input unavailable")
default:
return queryIPMIServerPowerW()
}
}
func sampleBenchmarkPowerSources() map[string]float64 {
out := map[string]float64{}
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
out[BenchmarkPowerSourceDCMI] = w
}
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
out[BenchmarkPowerSourceSDRPSUInput] = w
}
return out
}
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
if durationSec <= 0 {
return 0, false
}
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
if len(samples) == 0 {
return 0, false
}
return benchmarkMean(samples), true
}
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
if durationSec <= 0 {
return nil
}
stopCh := make(chan struct{})
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
select {
case <-ctx.Done():
case <-time.After(time.Duration(durationSec) * time.Second):
}
close(stopCh)
return <-doneCh
}
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
if intervalSec <= 0 {
intervalSec = benchmarkPowerAutotuneSampleInterval
}
ch := make(chan []float64, 1)
go func() {
defer close(ch)
var samples []float64
record := func() {
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
samples = append(samples, w)
}
}
record()
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
ch <- samples
return
case <-ticker.C:
record()
}
}
}()
return ch
}
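The sampler follows the stop-channel/result-channel pattern the benchmark phases use around each load window: start it, run the load, close the stop channel, then receive the accumulated series. A generic sketch of the same pattern with a stubbed reading function (the error/positivity filter on each reading is dropped here for brevity):

package main

import (
	"fmt"
	"time"
)

// startSampler mirrors startSelectedPowerSourceSampler: one reading
// immediately, one per tick, and the whole slice delivered once stopCh
// closes. read stands in for queryBenchmarkPowerSourceW.
func startSampler(stopCh <-chan struct{}, interval time.Duration, read func() float64) <-chan []float64 {
	ch := make(chan []float64, 1) // buffered so the goroutine never blocks on send
	go func() {
		defer close(ch)
		samples := []float64{read()}
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				samples = append(samples, read())
			}
		}
	}()
	return ch
}

func main() {
	stop := make(chan struct{})
	results := startSampler(stop, 100*time.Millisecond, func() float64 { return 600 })
	time.Sleep(350 * time.Millisecond) // stands in for the load phase
	close(stop)
	fmt.Println(<-results) // e.g. [600 600 600 600]
}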
type benchmarkPowerAutotuneSample struct {
ElapsedSec float64
GPUAvgUsagePct float64
CPUUsagePct float64
GPUSumPowerW float64
Sources map[string]float64
}
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
if durationSec <= 0 {
return nil
}
var out []benchmarkPowerAutotuneSample
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
start := time.Now()
for {
if ctx.Err() != nil {
return out
}
row := benchmarkPowerAutotuneSample{
ElapsedSec: time.Since(start).Seconds(),
CPUUsagePct: sampleCPULoadPct(),
Sources: sampleBenchmarkPowerSources(),
}
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
var usageSum float64
for _, gpu := range gpuRows {
row.GPUSumPowerW += gpu.PowerW
usageSum += gpu.UsagePct
}
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
}
out = append(out, row)
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
if time.Now().After(deadline) {
return out
}
select {
case <-ctx.Done():
return out
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
}
}
}
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
if logFunc == nil {
return
}
var sourceParts []string
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
if value, ok := sample.Sources[source]; ok && value > 0 {
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
} else {
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
}
}
logFunc(fmt.Sprintf(
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
phase,
sample.ElapsedSec,
sample.GPUAvgUsagePct,
sample.GPUSumPowerW,
sample.CPUUsagePct,
strings.Join(sourceParts, " "),
))
}
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
if logFunc == nil || len(samples) == 0 {
return
}
var gpuUsage []float64
var cpuUsage []float64
var gpuPower []float64
sourceBuckets := map[string][]float64{}
for _, sample := range samples {
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
gpuPower = append(gpuPower, sample.GPUSumPowerW)
for source, value := range sample.Sources {
if value > 0 {
sourceBuckets[source] = append(sourceBuckets[source], value)
}
}
}
var sourceParts []string
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
values := sourceBuckets[source]
if len(values) == 0 {
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
continue
}
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
}
logFunc(fmt.Sprintf(
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
phase,
len(samples),
benchmarkMean(gpuUsage),
benchmarkPercentile(gpuUsage, 95),
benchmarkMean(gpuPower),
benchmarkMean(cpuUsage),
benchmarkPercentile(cpuUsage, 95),
strings.Join(sourceParts, " "),
))
}
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
if logFunc == nil {
return
}
for _, candidate := range candidates {
if !candidate.Available {
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
continue
}
logFunc(fmt.Sprintf(
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
candidate.Source,
candidate.IdleAvgW,
candidate.LoadAvgW,
candidate.DeltaW,
gpuDelta,
candidate.RelativeError,
candidate.Confidence*100,
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
))
if strings.TrimSpace(candidate.SelectionNotes) != "" {
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
}
}
}
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
result := &BenchmarkPowerAutotuneValidation{}
if len(samples) == 0 {
result.Reason = "no idle telemetry samples collected"
return result
}
var gpuUsage []float64
var cpuUsage []float64
for _, sample := range samples {
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
if sample.CPUUsagePct > 0 {
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
}
}
result.GPUSamples = len(gpuUsage)
result.CPUSamples = len(cpuUsage)
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
switch {
case result.GPUAvgUsagePct > 5:
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
case result.GPUP95UsagePct > 10:
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
case result.CPUAvgUsagePct > 20:
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
case result.CPUP95UsagePct > 35:
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
default:
result.Valid = true
}
return result
}
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
idleBySource := map[string][]float64{}
loadBySource := map[string][]float64{}
var idleGPU []float64
var loadGPU []float64
for _, sample := range idle {
idleGPU = append(idleGPU, sample.GPUSumPowerW)
for source, value := range sample.Sources {
if value > 0 {
idleBySource[source] = append(idleBySource[source], value)
}
}
}
for _, sample := range load {
loadGPU = append(loadGPU, sample.GPUSumPowerW)
for source, value := range sample.Sources {
if value > 0 {
loadBySource[source] = append(loadBySource[source], value)
}
}
}
idleGPUAvg := benchmarkMean(idleGPU)
loadGPUAvg := benchmarkMean(loadGPU)
gpuDelta := loadGPUAvg - idleGPUAvg
if gpuDelta <= 0 {
gpuDelta = loadGPUAvg
}
candidates := []BenchmarkPowerAutotuneCandidate{
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
}
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
for _, candidate := range candidates {
if candidate.Available && candidate.DeltaW > 0 {
available = append(available, candidate)
}
}
if len(available) == 0 {
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
}
sort.Slice(available, func(i, j int) bool {
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
if available[i].Source != available[j].Source {
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
}
}
if available[i].RelativeError != available[j].RelativeError {
return available[i].RelativeError < available[j].RelativeError
}
return available[i].Samples > available[j].Samples
})
selected := available[0]
for idx := range candidates {
if candidates[idx].Source == selected.Source {
candidates[idx].Selected = true
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
}
}
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
}
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
candidate := BenchmarkPowerAutotuneCandidate{
Source: source,
Available: len(idle) > 0 && len(load) > 0,
Samples: minInt(len(idle), len(load)),
}
if !candidate.Available {
return candidate
}
candidate.IdleAvgW = benchmarkMean(idle)
candidate.LoadAvgW = benchmarkMean(load)
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
if gpuDelta > 0 {
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
}
return candidate
}
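Scoring, concretely: relative error is |candidate delta - GPU delta| / GPU delta, confidence is one minus that (floored at zero), and chooseBenchmarkPowerAutotuneSource prefers sdr_psu_input whenever the two errors land within 0.10 of each other. A worked example with illustrative wattages:

package main

import (
	"fmt"
	"math"
)

// score repeats buildBenchmarkPowerAutotuneCandidate's arithmetic for a
// single candidate delta against the GPU-reported delta.
func score(deltaW, gpuDelta float64) (relErr, confidence float64) {
	relErr = math.Abs(deltaW-gpuDelta) / gpuDelta
	confidence = math.Max(0, 1-relErr)
	return
}

func main() {
	const gpuDelta = 2400.0 // illustrative GPU-reported load-minus-idle delta, W

	dcmiErr, dcmiConf := score(2550, gpuDelta) // DCMI saw +2550 W under load
	sdrErr, sdrConf := score(2470, gpuDelta)   // SDR PSU input saw +2470 W
	fmt.Printf("dcmi: err=%.3f conf=%.0f%%\n", dcmiErr, dcmiConf*100) // err=0.062 conf=94%
	fmt.Printf("sdr:  err=%.3f conf=%.0f%%\n", sdrErr, sdrConf*100)   // err=0.029 conf=97%

	// Errors differ by well under 0.10, so the sort prefers sdr_psu_input
	// even though both candidates are usable.
	fmt.Println("within tie-break band:", math.Abs(dcmiErr-sdrErr) <= 0.10) // true
}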
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
var b strings.Builder
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
fmt.Fprintf(&b, "status=%s\n", result.Status)
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
if result.SelectedSource != "" {
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
}
if result.IdleValidation != nil {
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
if result.IdleValidation.Reason != "" {
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
}
}
for _, candidate := range result.Candidates {
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
if candidate.Available {
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
}
}
return b.String()
}
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
var b strings.Builder
b.WriteString("# Bee Bench Power Source Autotune\n\n")
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
if result.SelectedSource != "" {
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
}
b.WriteString("\n")
if result.IdleValidation != nil {
b.WriteString("## Idle Validation\n\n")
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
if result.IdleValidation.Reason != "" {
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
}
b.WriteString("\n")
}
if len(result.Candidates) > 0 {
b.WriteString("## Candidates\n\n")
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
for _, candidate := range result.Candidates {
if !candidate.Available {
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
continue
}
selected := "no"
if candidate.Selected {
selected = "yes"
}
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
}
b.WriteString("\n")
}
for _, note := range result.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
return b.String()
}
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
allDevices := joinIndexList(gpuIndices)
switch strings.TrimSpace(strings.ToLower(kind)) {
case "power-fit", "power", "nvidia-bench-power":
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
if err == nil {
return cmd, "power-fit"
}
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
default:
cmd := []string{
"bee-gpu-burn",
"--seconds", fmt.Sprintf("%d", durationSec),
"--devices", allDevices,
}
if sizeMB > 0 {
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
}
return cmd, "performance"
}
}
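For the 90 s autotune load window, the dispatcher yields command lines of these shapes (device lists illustrative; the single-GPU dcgmproftester invocation comes from resolveBenchmarkPowerLoadCommand, which also returns the device-pinning env applied in the benchmark path):

power-fit, multi-GPU:   bee-dcgmproftester-staggered --seconds 90 --stagger-seconds 0 --devices 0,1,2,3
power-fit, single GPU:  <resolved dcgmproftester> --no-dcgm-validation -t 1004 -d 90
performance (default):  bee-gpu-burn --seconds 90 --devices 0,1,2,3 [--size-mb N]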
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if logFunc == nil {
logFunc = func(string) {}
}
if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-bench/autotune"
}
if err := os.MkdirAll(baseDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
}
selected, err := resolveNvidiaGPUSelection(nil, nil)
if err != nil {
return "", err
}
if len(selected) == 0 {
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "autotune-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
hostname, _ := os.Hostname()
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
result := BenchmarkPowerAutotuneResult{
GeneratedAt: time.Now().UTC(),
Hostname: hostname,
ServerModel: readServerModel(),
BenchmarkKind: normalizedKind,
Profile: opts.Profile,
Status: "FAILED",
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
}
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
if result.IdleValidation == nil || !result.IdleValidation.Valid {
if result.IdleValidation != nil {
result.IdleValidationError = result.IdleValidation.Reason
logFunc(result.IdleValidation.Reason)
}
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
return "", err
}
return runDir, fmt.Errorf("%s", result.IdleValidationError)
}
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
go func() {
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
}()
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
loadSamples := <-loadSamplesCh
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
if runErr != nil {
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
return "", err
}
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
}
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
result.Candidates = candidates
result.GPUPowerIdleW = idleGPUAvg
result.GPUPowerLoadW = loadGPUAvg
if chooseErr != nil {
result.Notes = append(result.Notes, chooseErr.Error())
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
return "", err
}
return runDir, chooseErr
}
gpuDelta := loadGPUAvg - idleGPUAvg
if gpuDelta <= 0 {
gpuDelta = loadGPUAvg
}
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
result.SelectedSource = selectedSource
result.Status = "OK"
var confidence float64
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
for _, candidate := range candidates {
if candidate.Selected {
confidence = candidate.Confidence
if strings.TrimSpace(candidate.SelectionNotes) != "" {
selectionReason = candidate.SelectionNotes
}
break
}
}
cfg := BenchmarkPowerAutotuneConfig{
Version: benchmarkPowerAutotuneVersion,
UpdatedAt: time.Now().UTC(),
SelectedSource: selectedSource,
BenchmarkKind: normalizedKind,
Profile: opts.Profile,
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
Confidence: confidence,
Reason: selectionReason,
}
result.Config = &cfg
configPath := BenchmarkPowerSourceConfigPath(baseDir)
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
result.Status = "FAILED"
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
return "", writeErr
}
return runDir, err
}
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
return "", err
}
return runDir, nil
}
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return fmt.Errorf("marshal autotune result: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
return fmt.Errorf("write autotune result.json: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
return fmt.Errorf("write autotune summary.txt: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
return fmt.Errorf("write autotune report.md: %w", err)
}
return nil
}
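Under the defaults, a completed run therefore leaves a layout like this (timestamp illustrative), plus the saved config one level above baseDir:

/var/log/bee-bench/autotune/autotune-20260101-000000/
    verbose.log         full command transcript
    autotune-load.log   output of the load command
    result.json         BenchmarkPowerAutotuneResult
    summary.txt         key=value summary
    report.md           markdown report
/var/log/bee-bench/power-source-autotune.json    saved BenchmarkPowerAutotuneConfig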
func minInt(a, b int) int {
if a < b {
return a
}
return b
}
var _ = exec.ErrNotFound // reference os/exec so the import stays used by this file

View File

@@ -43,6 +43,11 @@ const (
NvidiaBenchmarkProfileOvernight = "overnight"
)
const (
BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
BenchmarkPowerEngineTargetedPower = "targeted_power"
)
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
// re-measure from actual task logs and update the constants here.
@@ -61,7 +66,7 @@ const (
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
@@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct {
GPUIndices []int
ExcludeGPUIndices []int
RunNCCL bool
ServerPowerSource string
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
RampTotal int // total number of ramp-up steps in this run
RampRunID string // shared identifier across all steps of the same ramp-up run
}
const (
BenchmarkPowerSourceDCMI = "dcmi"
BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
type BenchmarkPowerAutotuneConfig struct {
Version int `json:"version"`
UpdatedAt time.Time `json:"updated_at"`
SelectedSource string `json:"selected_source"`
BenchmarkKind string `json:"benchmark_kind,omitempty"`
Profile string `json:"profile,omitempty"`
IdleDurationSec int `json:"idle_duration_sec,omitempty"`
LoadDurationSec int `json:"load_duration_sec,omitempty"`
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
Confidence float64 `json:"confidence,omitempty"`
Reason string `json:"reason,omitempty"`
}
type SystemPowerSourceDecision struct {
Configured bool `json:"configured"`
SelectedSource string `json:"selected_source,omitempty"`
EffectiveSource string `json:"effective_source,omitempty"`
Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded
Reason string `json:"reason,omitempty"`
ConfiguredAt time.Time `json:"configured_at,omitempty"`
}
type BenchmarkPowerAutotuneResult struct {
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkKind string `json:"benchmark_kind,omitempty"`
Profile string `json:"profile,omitempty"`
Status string `json:"status"`
IdleDurationSec int `json:"idle_duration_sec"`
LoadDurationSec int `json:"load_duration_sec"`
SampleIntervalSec int `json:"sample_interval_sec"`
SelectedSource string `json:"selected_source,omitempty"`
IdleValidationError string `json:"idle_validation_error,omitempty"`
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
Notes []string `json:"notes,omitempty"`
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
}
type BenchmarkPowerAutotuneValidation struct {
Valid bool `json:"valid"`
GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
GPUSamples int `json:"gpu_samples,omitempty"`
CPUSamples int `json:"cpu_samples,omitempty"`
Reason string `json:"reason,omitempty"`
}
type BenchmarkPowerAutotuneCandidate struct {
Source string `json:"source"`
IdleAvgW float64 `json:"idle_avg_w,omitempty"`
LoadAvgW float64 `json:"load_avg_w,omitempty"`
DeltaW float64 `json:"delta_w,omitempty"`
Samples int `json:"samples,omitempty"`
RelativeError float64 `json:"relative_error,omitempty"`
Confidence float64 `json:"confidence,omitempty"`
Selected bool `json:"selected,omitempty"`
Available bool `json:"available"`
SelectionNotes string `json:"selection_notes,omitempty"`
}
type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
@@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct {
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
type BenchmarkServerPower struct {
Available bool `json:"available"`
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded - idle
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
Available bool `json:"available"`
Source string `json:"source,omitempty"`
Mode string `json:"mode,omitempty"`
Reason string `json:"reason,omitempty"`
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded - idle
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
// PSU AC input sum — sampled at idle and at peak load using collector's
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).