package platform

import (
	"context"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// benchmarkVersion is embedded in result.json so downstream consumers can
// detect format changes between benchmark runs.
const benchmarkVersion = "2"

// benchmarkProfileSpec holds the per-profile stage durations (seconds)
// resolved from the requested profile name by resolveBenchmarkProfile.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int
	WarmupSec   int
	SteadySec   int
	NCCLSec     int
	CooldownSec int
}

// benchmarkGPUInfo is the static inventory record for one GPU, populated from
// nvidia-smi --query-gpu CSV output (queryBenchmarkGPUInfo) and optionally
// backfilled from nvidia-smi -q (enrichGPUInfoWithMaxClocks).
type benchmarkGPUInfo struct {
	Index                int
	UUID                 string
	Name                 string
	BusID                string
	VBIOS                string
	PowerLimitW          float64
	DefaultPowerLimitW   float64
	MaxGraphicsClockMHz  float64
	MaxMemoryClockMHz    float64
	BaseGraphicsClockMHz float64
	MultiprocessorCount  int
}

// benchmarkPowerCalibrationResult captures the outcome of a per-GPU power
// calibration pass: telemetry summary, applied power limit, attempt count,
// and derating/cooling findings.
type benchmarkPowerCalibrationResult struct {
	Summary            BenchmarkTelemetrySummary
	AppliedPowerLimitW float64
	Attempts           int
	Derated            bool
	Completed          bool
	Notes              []string
	// CoolingWarning is set when the GPU throttled thermally with a clock drop
	// ≥20% while server fans were below 100% duty cycle — a signal that the
	// cooling system may not be correctly configured for full GPU load.
	CoolingWarning string
}

// benchmarkBurnProfile describes one precision profile reported by the
// bee-gpu-burn tool (GEMM dimensions, lane count, iteration count).
// NOTE(review): the parser filling this type is not in view — field semantics
// inferred from names; confirm against parseBenchmarkBurnLog.
type benchmarkBurnProfile struct {
	name       string
	category   string
	supported  bool
	lanes      int
	m          uint64
	n          uint64
	k          uint64
	iterations uint64
	notes      string
}

// benchmarkBurnParseResult is the parsed form of a bee-gpu-burn log: device
// identity, backend, and per-precision throughput profiles. Fallback is true
// when the tool reported the driver PTX fallback path.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	Fallback          bool
}

// benchmarkRestoreAction is a named undo step (e.g. a clock unlock) recorded
// while normalizing GPU state; actions are replayed in reverse order on exit.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}

var (
	// READY lines emitted by bee-gpu-burn: "<profile>[<lane>]=READY dim=MxNxK".
	benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	// SKIPPED lines: "<profile>[lane]=SKIPPED <reason>" (lane is optional).
	benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	// Iteration counters: "<profile>_iterations=<n>".
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)

// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}

// computeCapabilityCode converts a "major.minor" CUDA compute-capability
// string into a single comparable integer (e.g. "9.0" -> 90, "8.6" -> 86).
// Empty or unparseable input yields 0.
func computeCapabilityCode(raw string) int {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return 0
	}
	parts := strings.SplitN(raw, ".", 2)
	major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
	minor := 0
	if len(parts) > 1 {
		minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
	}
	return major*10 + minor
}

// benchmarkSupportedPrecisions filters benchmarkPrecisionPhases for the given
// compute capability: fp4 is dropped for any known capability below 100.
// An unknown capability (code 0) keeps fp4 in the plan.
func benchmarkSupportedPrecisions(computeCapability string) []string {
	cc := computeCapabilityCode(computeCapability)
	out := make([]string, 0, len(benchmarkPrecisionPhases))
	for _, prec := range benchmarkPrecisionPhases {
		if prec == "fp4" && cc > 0 && cc < 100 {
			continue
		}
		out = append(out, prec)
	}
	return out
}

// buildBenchmarkSteadyPlan computes one steady window per precision plus the
// final combined ("mixed") window. Named profiles use fixed durations; any
// other profile derives them from spec.SteadySec with the mixed window
// weighted 5x a single precision window. metricStage maps a plan label
// ("fp16", "mixed", ...) to the metric stage name used for telemetry
// attribution.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	if len(precisions) == 0 {
		precisions = append([]string(nil), benchmarkPrecisionPhases...)
	}
	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec = 60
		mixedPhaseSec = 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec = 300
		mixedPhaseSec = 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec = 3600
		mixedPhaseSec = 14400
	default:
		// One share per precision window plus 5 shares for the mixed window.
		// NOTE(review): precisions is non-empty here, so totalWeight > 0 and
		// the guard below is effectively dead code.
		totalWeight := len(precisions) + 5
		if totalWeight <= 0 {
			return nil, nil, 0, 0
		}
		basePhaseSec = spec.SteadySec / totalWeight
		if basePhaseSec <= 0 {
			basePhaseSec = 1
		}
		mixedPhaseSec = basePhaseSec * 5
	}
	planLabels = make([]string, 0, len(precisions)+1)
	planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
	for _, prec := range precisions {
		planLabels = append(planLabels, prec)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   prec,
			MetricStage: metricStage(prec),
			DurationSec: basePhaseSec,
		})
	}
	planLabels = append(planLabels, "mixed")
	planPhases = append(planPhases, benchmarkPlannedPhase{
		PlanLabel:   "mixed",
		MetricStage: metricStage("mixed"),
		DurationSec: mixedPhaseSec,
	})
	return planLabels,
planPhases, basePhaseSec, mixedPhaseSec
}

// benchmarkPlanDurationsCSV renders the per-phase durations as a
// comma-separated list for the bee-gpu-burn --precision-plan-seconds flag.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	values := make([]string, 0, len(phases))
	for _, phase := range phases {
		values = append(values, strconv.Itoa(phase.DurationSec))
	}
	return strings.Join(values, ",")
}

// benchmarkPlannedPhaseStatus classifies one phase's log output into a
// (status, note) pair: "OK", "UNSUPPORTED" (phase cannot run on this
// GPU/driver path) or "FAILED". Matching is case-insensitive.
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	switch {
	case text == "":
		return "FAILED", "phase produced no output"
	case strings.Contains(text, "phase_error="):
		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "cublaslt_profiles=unsupported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	case strings.Contains(text, "status=failed"):
		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	default:
		return "OK", ""
	}
}

// benchmarkCalibrationThrottleReason reports which throttle counter advanced
// between two snapshots, checked in priority order: hardware thermal,
// software thermal, hardware power brake. Empty string means no throttling.
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
	diff := diffThrottleCounters(before, after)
	switch {
	case diff.HWThermalSlowdownUS > 0:
		return "hw_thermal"
	case diff.SWThermalSlowdownUS > 0:
		return "sw_thermal"
	case diff.HWPowerBrakeSlowdownUS > 0:
		return "hw_power_brake"
	default:
		return ""
	}
}

// setBenchmarkPowerLimit applies a power limit (watts) to one GPU via
// nvidia-smi -pl. The command output is folded into the returned error on
// failure.
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
	if powerLimitW <= 0 {
		return fmt.Errorf("invalid power limit %d", powerLimitW)
	}
	out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{
		"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW),
	}, nil, nil)
	if err != nil {
		return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
	}
	return nil
}

// RunNvidiaBenchmark executes the full NVIDIA GPU benchmark: inventory,
// state normalization, per-GPU baseline/warmup/steady phases (sequential or
// parallel), an optional multi-GPU scalability ramp and NCCL interconnect
// test, and finally writes result.json, report.md and summary.txt into a
// timestamped directory beneath baseDir. It returns that run directory path.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/perf"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "perf-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   spec.Name,
		ParallelGPUs:       opts.ParallelGPUs,
		RampStep:           opts.RampStep,
		RampTotal:          opts.RampTotal,
		RampRunID:          opts.RampRunID,
		SelectedGPUIndices: append([]int(nil), selected...),
		HostConfig:         readBenchmarkHostConfig(),
		Normalization: BenchmarkNormalization{
			Status: "full",
		},
	}
	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
	var metricRows []GPUMetricRow
	metricTimelineSec := 0.0
	gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
	// Server power characterization state — populated during per-GPU phases.
	var serverIdleW, serverLoadedWSum float64
	var serverIdleOK, serverLoadedOK bool
	var serverLoadedSamples int
	// Run nvidia-smi -q first: used both for the log file and as a fallback
	// source of max clock values when CSV clock fields are unsupported.
var nvsmiQOut []byte
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		nvsmiQOut = out
		// Best-effort: keep the raw -q dump alongside the run artifacts.
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}
	// Enrich with max clocks from verbose output — covers GPUs where
	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
	// Pre-existing GPU compute work would skew results; record it and mark the
	// run as only partially normalized.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	defer func() {
		// Undo clock locks etc. in reverse order of application.
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()
	// No power calibration before performance benchmark — GPUs run at their
	// default power limits. PowerSustainScore is derived from steady-state power
	// observed during the benchmark itself.
	calibByIndex := make(map[int]benchmarkPowerCalibrationResult)
	// Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{})
	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
	if opts.ParallelGPUs {
		// Parallel path: one combined run across all selected GPUs; results and
		// accumulated power/metric state are written through the pointers.
		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
	} else {
		// Sequential path: each GPU gets its own baseline/warmup/steady cycle.
		for _, idx := range selected {
			gpuResult := BenchmarkGPUResult{
				Index:  idx,
				Status: "FAILED",
			}
			if info, ok := infoByIndex[idx]; ok {
				gpuResult.UUID = info.UUID
				gpuResult.Name = info.Name
				gpuResult.BusID = info.BusID
				gpuResult.VBIOS = info.VBIOS
				gpuResult.PowerLimitW = info.PowerLimitW
				gpuResult.MultiprocessorCount = info.MultiprocessorCount
				gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
				gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
				gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
				gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
			}
			// NOTE(review): calibByIndex is created empty above and nothing on
			// the sequential path populates it, so this copy is currently a
			// no-op — confirm whether it is kept for parallel-path symmetry.
			if calib, ok := calibByIndex[idx]; ok {
				gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
				gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
				gpuResult.PowerCalibrationTries = calib.Attempts
				gpuResult.PowerLimitDerated = calib.Derated
				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
					gpuResult.CoolingWarning = calib.CoolingWarning
				}
			}
			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
				gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
			}
			// Idle baseline telemetry before any load is applied.
			baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
			if err != nil && err != context.Canceled {
				gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
			}
			gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
			// Sample server idle power once (first GPU only — server state is global).
			if !serverIdleOK {
				if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
					serverIdleW = w
					serverIdleOK = true
					logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
				}
			}
			warmupCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(spec.WarmupSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
			}
			logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
			warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
			if warmupErr != nil {
				// Warmup failure aborts this GPU; record it and move on.
				gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
				result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
				continue
			}
			warmupParse := parseBenchmarkBurnLog(string(warmupOut))
			if gpuResult.ComputeCapability == "" {
				gpuResult.ComputeCapability = warmupParse.ComputeCapability
			}
			// Run synthetic precision phases and the combined steady phase as one
			// uninterrupted command so the GPU stays hot between windows.
			eccBase, _ := queryECCCounters(idx)
			supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
				if label == "mixed" {
					return fmt.Sprintf("gpu-%d-steady", idx)
				}
				return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
			})
			planCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(basePhaseSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
				"--precision-plan", strings.Join(planLabels, ","),
				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
			}
			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
			// Attribute telemetry rows and tool logs to their plan phases.
			for _, phaseSpec := range planPhases {
				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
				}
				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
			}
			// Build one BenchmarkPrecisionSteadyPhase per precision window.
			for _, prec := range supportedPrecisions {
				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
				phaseRows := phaseRowsByStage[stageName]
				phase := BenchmarkPrecisionSteadyPhase{
					Precision: prec,
					Status:    "OK",
					Steady:    summarizeBenchmarkTelemetry(phaseRows),
				}
				if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
					phase.Status = status
					phase.Notes = note
					gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
				}
				// Sum throughput over all supported profiles in this phase.
				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
				gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
			}
			// NOTE(review): the precision-plan command above has already run to
			// completion at this point, so this "before" throttle snapshot does
			// not bracket the steady workload — the before/after diff below will
			// only cover the (near-instant) section between here and
			// afterThrottle. Confirm whether this block was meant to run
			// concurrently with the planned command.
			beforeThrottle, _ := queryThrottleCounters(idx)
			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
			// Sample server power via IPMI in parallel with the steady phase.
			// We collect readings every 5s and average them.
			ipmiStopCh := make(chan struct{})
			ipmiResultCh := make(chan float64, 1)
			go func() {
				defer close(ipmiResultCh)
				var samples []float64
				ticker := time.NewTicker(5 * time.Second)
				defer ticker.Stop()
				// First sample after a short warmup delay.
				select {
				case <-ipmiStopCh:
					return
				case <-time.After(15 * time.Second):
				}
				for {
					if w, err := queryIPMIServerPowerW(); err == nil {
						samples = append(samples, w)
					}
					select {
					case <-ipmiStopCh:
						if len(samples) > 0 {
							var sum float64
							for _, w := range samples {
								sum += w
							}
							ipmiResultCh <- sum / float64(len(samples))
						}
						return
					case <-ticker.C:
					}
				}
			}()
			// NOTE(review): ipmiStopCh is closed immediately after the sampler
			// goroutine starts; the goroutine's 15 s warmup select will take the
			// closed-channel branch and exit with zero samples, so serverLoaded*
			// will effectively never be populated here. Looks like a leftover
			// from a refactor where the steady command ran between these lines.
			close(ipmiStopCh)
			if loadedW, ok := <-ipmiResultCh; ok {
				serverLoadedWSum += loadedW
				serverLoadedSamples++
				serverLoadedOK = true
				logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
			}
			afterThrottle, _ := queryThrottleCounters(idx)
			if planErr != nil {
				gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
			}
			// The "mixed" phase carries the combined steady result.
			steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
			parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
			gpuResult.ComputeCapability = parseResult.ComputeCapability
			gpuResult.Backend = parseResult.Backend
			gpuResult.PrecisionResults = parseResult.Profiles
			if parseResult.Fallback {
				gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
			gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
			gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
			if eccFinal, err := queryECCCounters(idx); err == nil {
				gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
			}
if spec.CooldownSec > 0 {
				cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
				if err != nil && err != context.Canceled {
					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
				}
				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
			}
			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
			// Status ladder: plan error > precision failures > PTX fallback > OK.
			if planErr != nil {
				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
			} else if len(gpuResult.PrecisionFailures) > 0 {
				gpuResult.Status = "PARTIAL"
			} else if parseResult.Fallback {
				gpuResult.Status = "PARTIAL"
			} else {
				gpuResult.Status = "OK"
			}
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
		}
	} // end sequential path
	// Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs
	// and compute scalability relative to the best single-GPU result.
	// Only runs in sequential mode (each GPU was tested individually above) and
	// when there are at least 2 GPUs.
	if !opts.ParallelGPUs && len(selected) >= 2 {
		// Find the best single-card SyntheticScore as the 1-GPU baseline.
		var bestTOPS float64
		for _, g := range result.GPUs {
			if g.Scores.SyntheticScore > bestTOPS {
				bestTOPS = g.Scores.SyntheticScore
			}
		}
		if bestTOPS > 0 {
			var rampSteps []NvidiaPerformanceRampStep
			var scalabilityPcts []float64
			for k := 2; k <= len(selected); k++ {
				subset := append([]int(nil), selected[:k]...)
rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k))
				_ = os.MkdirAll(rampDir, 0755)
				logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k))
				// Each ramp step gets its own throwaway result/power/metric state
				// so it does not pollute the main run's accumulators.
				var rampResult NvidiaBenchmarkResult
				var rampIdleW, rampLoadedWSum float64
				var rampIdleOK, rampLoadedOK bool
				var rampLoadedSamples int
				var rampMetricRows []GPUMetricRow
				var rampTimelineSec float64
				emptyCalib := make(map[int]benchmarkPowerCalibrationResult)
				runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc, &rampResult, emptyCalib, &rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples, &rampMetricRows, &rampTimelineSec, "")
				var totalSynth, totalMixed float64
				for _, g := range rampResult.GPUs {
					totalSynth += g.Scores.SyntheticScore
					totalMixed += g.Scores.MixedScore
				}
				// Scalability = achieved aggregate TOPS vs. k * best single GPU.
				scalPct := totalSynth / (float64(k) * bestTOPS) * 100
				scalabilityPcts = append(scalabilityPcts, scalPct)
				stepStatus := "OK"
				if len(rampResult.GPUs) < k {
					stepStatus = "PARTIAL"
				}
				rampSteps = append(rampSteps, NvidiaPerformanceRampStep{
					StepIndex:          k,
					GPUIndices:         subset,
					TotalSyntheticTOPS: totalSynth,
					TotalMixedTOPS:     totalMixed,
					ScalabilityPct:     scalPct,
					Status:             stepStatus,
				})
			}
			result.PerformanceRampSteps = rampSteps
			// NOTE(review): the mean of scalability percentages is stored in a
			// field named PlatformPowerScore — name and content appear
			// mismatched; confirm this is intentional.
			result.PlatformPowerScore = benchmarkMean(scalabilityPcts)
			if len(scalabilityPcts) > 0 {
				// ScalabilityScore is the final (all-GPU) ramp step.
				result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1]
			}
		}
	}
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			// Fold the interconnect bandwidth into every GPU's composite score.
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}
	// Stop CPU load sampler and attach results.
close(cpuStopCh)
	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
		result.CPULoad = summarizeCPULoad(cpuSamples)
		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s", result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
		}
	}
	// Compute server power characterization from accumulated IPMI samples.
	var gpuReportedSumW float64
	for _, gpu := range result.GPUs {
		gpuReportedSumW += gpu.Steady.AvgPowerW
	}
	var serverLoadedW float64
	if serverLoadedSamples > 0 {
		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
	}
	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
	result.Cooling = summarizeBenchmarkCooling(metricRows)
	// Apply server-power penalty when IPMI reports the server delta is much
	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
	// CalibratedPeakPowerW and PowerSustainScore unreliable.
	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
		factor := sp.ReportingRatio / 0.75
		for i := range result.GPUs {
			result.GPUs[i].Scores.CompositeScore *= factor
			result.GPUs[i].Notes = append(result.GPUs[i].Notes, fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%", sp.ReportingRatio, factor*100))
		}
	}
	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)
	// Persist artifacts: metrics files, machine-readable JSON, human report.
	writeBenchmarkMetricsFiles(runDir, metricRows)
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	report := renderBenchmarkReportWithCharts(result)
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}

// normalizeNvidiaBenchmarkOptionsForBenchmark canonicalizes the requested
// options: unknown profiles fall back to the standard profile, a negative
// SizeMB is clamped to 0 (tool default), and GPU index lists are deduplicated
// and sorted.
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
	switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
	case NvidiaBenchmarkProfileStability:
		opts.Profile = NvidiaBenchmarkProfileStability
	case NvidiaBenchmarkProfileOvernight:
		opts.Profile = NvidiaBenchmarkProfileOvernight
	default:
		opts.Profile = NvidiaBenchmarkProfileStandard
	}
	if opts.SizeMB < 0 {
		opts.SizeMB = 0
	}
	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
	return opts
}

// resolveBenchmarkProfile maps a profile name (case-insensitive) to its stage
// durations; anything unrecognized resolves to the standard profile.
// NOTE(review): every profile sets CooldownSec: 0, so the cooldown stage in
// RunNvidiaBenchmark never runs with these specs — confirm intended.
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
	case NvidiaBenchmarkProfileOvernight:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
	default:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
	}
}

// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct {
	fields   string
	extended bool // whether this query includes optional extended fields
	minimal  bool // clock fields omitted; max clocks must be filled separately
}{
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
		extended: true,
	},
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
		extended: false,
	},
	{
		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
		minimal: true,
	},
}

// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
		return
	}
	// Build bus_id → index map for matching verbose sections to GPU indices.
	busToBenchIdx := make(map[string]int, len(infoByIndex))
	for idx, info := range infoByIndex {
		if info.BusID != "" {
			// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
			// while --query-gpu returns the same format; normalise to lower.
			busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
		}
	}
	// Split the verbose output into per-GPU sections on "^GPU " lines.
	gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
	maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
	sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
	for i, loc := range sectionStarts {
		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
		benchIdx, ok := busToBenchIdx[busID]
		if !ok {
			// Bus IDs from verbose output may have a different domain prefix;
			// try suffix match on the slot portion (XX:XX.X).
for k, v := range busToBenchIdx {
				if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
					benchIdx = v
					ok = true
					break
				}
			}
		}
		if !ok {
			continue
		}
		// Section spans from this "GPU ..." header to the next (or EOF).
		end := len(nvsmiQ)
		if i+1 < len(sectionStarts) {
			end = sectionStarts[i+1][0]
		}
		section := nvsmiQ[loc[0]:end]
		// Only backfill fields that are still zero from the CSV query.
		info := infoByIndex[benchIdx]
		if info.MaxGraphicsClockMHz == 0 {
			if m := maxGfxRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxGraphicsClockMHz = v
				}
			}
		}
		if info.MaxMemoryClockMHz == 0 {
			if m := maxMemRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxMemoryClockMHz = v
				}
			}
		}
		if info.DefaultPowerLimitW == 0 {
			if m := defaultPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.DefaultPowerLimitW = v
				}
			}
		}
		if info.PowerLimitW == 0 {
			if m := currentPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.PowerLimitW = v
				}
			}
		}
		if info.MultiprocessorCount == 0 {
			if m := smCountRe.FindSubmatch(section); m != nil {
				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
					info.MultiprocessorCount = v
				}
			}
		}
		infoByIndex[benchIdx] = info
	}
}

// queryBenchmarkGPUInfo queries nvidia-smi for the GPU inventory, trying each
// field set in benchmarkGPUInfoQueries until one succeeds (see the comment on
// that variable for why there are fallbacks). It returns a map keyed by GPU
// index, or the last error if every query fails.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	var lastErr error
	for _, q := range benchmarkGPUInfoQueries {
		args := []string{
			"--query-gpu=" + q.fields,
			"--format=csv,noheader,nounits",
		}
		if len(gpuIndices) > 0 {
			args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
		out, err := satExecCommand("nvidia-smi", args...).Output()
		if err != nil {
			// Keep the first 40 chars of the field list for a readable error.
			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
			continue
		}
		r := csv.NewReader(strings.NewReader(string(out)))
		r.TrimLeadingSpace = true
		r.FieldsPerRecord = -1
		rows, err := r.ReadAll()
		if err != nil {
			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
			continue
		}
		// Minimal query yields 6 fields; the clocked variants at least 9.
		minFields := 6
		if !q.minimal {
			minFields = 9
		}
		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
		for _, row := range rows {
			if len(row) < minFields {
				continue
			}
			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
			if err != nil {
				continue
			}
			info := benchmarkGPUInfo{
				Index:       idx,
				UUID:        strings.TrimSpace(row[1]),
				Name:        strings.TrimSpace(row[2]),
				BusID:       strings.TrimSpace(row[3]),
				VBIOS:       strings.TrimSpace(row[4]),
				PowerLimitW: parseBenchmarkFloat(row[5]),
			}
			if !q.minimal {
				info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
				info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
				if len(row) >= 9 {
					info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
				}
				if q.extended {
					if len(row) >= 10 {
						info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
					}
					if len(row) >= 11 {
						info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
					}
				}
			}
			infoByIndex[idx] = info
		}
		return infoByIndex, nil
	}
	return nil, lastErr
}

// applyBenchmarkNormalization puts the selected GPUs into a reproducible
// state (persistence mode on, graphics/memory clocks locked to their max) and
// returns the restore actions needed to undo the locks. Any step that fails
// downgrades result.Normalization.Status to "partial". Without root the whole
// normalization is skipped.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}
	var restore []benchmarkRestoreAction
	for _, idx :=
range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Persistence mode keeps the driver loaded between tool invocations.
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}
		// Lock the graphics clock to the inventory's max; register an -rgc
		// restore action on success.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				// Copy idx so the restore closure does not capture the loop
				// variable (pre-Go-1.22 loop-capture semantics).
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		} else {
			rec.GPUClockLockStatus = "skipped"
			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
			result.Normalization.Status = "partial"
		}
		// Same for the memory clock (-lmc / -rmc), with an extra "unsupported"
		// outcome for GPUs/drivers that reject memory clock locking.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}
		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}

// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds and returns the collected rows, each stamped with its
// elapsed time since sampling began. It returns early (with partial rows and
// ctx.Err()) if the context is cancelled; durationSec <= 0 yields nil, nil.
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var rows []GPUMetricRow
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return rows, ctx.Err()
		}
		// Sampling errors are tolerated: a failed poll just skips this tick.
		samples, err := sampleBenchmarkTelemetry(gpuIndices)
		if err == nil {
			elapsed := time.Since(start).Seconds()
			for i := range samples {
				samples[i].ElapsedSec = elapsed
			}
			rows = append(rows, samples...)
} if time.Now().After(deadline) { break } select { case <-ctx.Done(): return rows, ctx.Err() case <-time.After(time.Second): } } return rows, nil } func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow start := time.Now() go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: samples, err := sampleBenchmarkTelemetry(gpuIndices) if err != nil { continue } elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } metricRows = append(metricRows, samples...) } } }() out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) close(stopCh) <-doneCh return out, metricRows, err } type benchmarkPlannedPhase struct { PlanLabel string MetricStage string DurationSec int } func runBenchmarkPlannedCommandWithMetrics( ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, phases []benchmarkPlannedPhase, logFunc func(string), ) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) { out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc) return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err } func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow { out := make(map[string][]GPUMetricRow, len(phases)) if len(rows) == 0 || len(phases) == 0 { return out } for _, row := range rows { idx := len(phases) - 1 var elapsed float64 for i, phase := range phases { durationSec := phase.DurationSec if durationSec <= 0 { durationSec = 1 } elapsed += float64(durationSec) if row.ElapsedSec < elapsed { idx = i break } } out[phases[idx].MetricStage] 
= append(out[phases[idx].MetricStage], row) } return out } func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte { out := make(map[string][]byte) var current string for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") { trimmed := strings.TrimSpace(stripBenchmarkPrefix(line)) switch { case strings.HasPrefix(trimmed, "phase_begin="): current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin=")) case strings.HasPrefix(trimmed, "phase_end="): current = "" case current != "": out[current] = append(out[current], []byte(line+"\n")...) } } return out } type benchmarkCoolingSample struct { AvgFanRPM float64 AvgFanDutyCyclePct float64 FanDutyCycleAvailable bool } func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { samples, err := sampleGPUMetrics(gpuIndices) if err != nil { return nil, err } fanSample := sampleBenchmarkCoolingSample() for i := range samples { samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable } return samples, nil } func sampleBenchmarkCoolingSample() benchmarkCoolingSample { fans, _ := sampleFanSpeeds() avgRPM, _, _ := fanRPMStats(fans) dutyPct, dutyAvailable := sampleFanDutyCyclePct() return benchmarkCoolingSample{ AvgFanRPM: avgRPM, AvgFanDutyCyclePct: dutyPct, FanDutyCycleAvailable: dutyAvailable, } } func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow { if len(rows) == 0 { return nil } stageEnd := offset + durationSec if stageEnd <= offset { stageEnd = offset for _, row := range rows { if row.ElapsedSec+offset > stageEnd { stageEnd = row.ElapsedSec + offset } } } out := make([]GPUMetricRow, len(rows)) for i, row := range rows { row.Stage = stage row.ElapsedSec += offset row.StageStartSec = offset row.StageEndSec = stageEnd out[i] = row } return out } func appendBenchmarkMetrics(allRows 
*[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) { annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec) *allRows = append(*allRows, annotated...) *cursor += durationSec } func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { if len(rows) == 0 { return } _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows) } func appendBenchmarkStageLog(path, source, stage string, raw []byte) { if path == "" || len(raw) == 0 { return } f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) if err != nil { return } defer f.Close() header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage) _, _ = f.WriteString(header) if len(raw) > 0 { _, _ = f.Write(raw) if raw[len(raw)-1] != '\n' { _, _ = f.WriteString("\n") } } } func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { result := benchmarkBurnParseResult{} lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") profiles := make(map[string]*benchmarkBurnProfile) for _, line := range lines { line = stripBenchmarkPrefix(strings.TrimSpace(line)) if line == "" { continue } switch { case strings.HasPrefix(line, "device="): result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device=")) case strings.HasPrefix(line, "compute_capability="): result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability=")) case strings.HasPrefix(line, "backend="): result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend=")) result.Fallback = result.Backend == "driver-ptx" case strings.HasPrefix(line, "duration_s="): result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s="))) default: if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 { profile := ensureBenchmarkProfile(profiles, m[1]) profile.supported = true profile.lanes++ profile.m, 
_ = strconv.ParseUint(m[3], 10, 64) profile.n, _ = strconv.ParseUint(m[4], 10, 64) profile.k, _ = strconv.ParseUint(m[5], 10, 64) continue } if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 { profile := ensureBenchmarkProfile(profiles, m[1]) profile.supported = false profile.notes = strings.TrimSpace(m[2]) continue } if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 { profile := ensureBenchmarkProfile(profiles, m[1]) iters, _ := strconv.ParseUint(m[2], 10, 64) profile.iterations += iters } } } keys := make([]string, 0, len(profiles)) for key := range profiles { keys = append(keys, key) } sort.Strings(keys) for _, key := range keys { profile := profiles[key] precision := BenchmarkPrecisionResult{ Name: profile.name, Category: profile.category, Supported: profile.supported, Lanes: profile.lanes, M: profile.m, N: profile.n, K: profile.k, Iterations: profile.iterations, Notes: profile.notes, } w := precisionWeight(profile.category) precision.Weight = w if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 { precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12 precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w } result.Profiles = append(result.Profiles, precision) } return result } func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile { if profile, ok := profiles[name]; ok { return profile } category := "other" switch { case strings.HasPrefix(name, "fp64"): category = "fp64" case strings.HasPrefix(name, "fp32"): category = "fp32_tf32" case strings.HasPrefix(name, "fp16"): category = "fp16_bf16" case strings.HasPrefix(name, "int8"): category = "int8" case strings.HasPrefix(name, "fp8"): category = "fp8" case strings.HasPrefix(name, "fp4"): category = "fp4" } profile := 
&benchmarkBurnProfile{name: name, category: category, supported: true} profiles[name] = profile return profile } // precisionWeight returns the fp32-equivalence factor for a precision category. // Each factor represents how much "real" numeric work one operation of that // type performs relative to fp32 (single precision = 1.0 baseline): // // fp64 = 2.0 — double precision, 2× more bits per operand // fp32 = 1.0 — single precision baseline // fp16 = 0.5 — half precision // int8 = 0.25 — quarter precision // fp8 = 0.25 — quarter precision // fp4 = 0.125 — eighth precision // // Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling // cross-precision comparison on the same numeric scale. func precisionWeight(category string) float64 { switch category { case "fp64": return 2.0 case "fp32_tf32": return 1.0 case "fp16_bf16": return 0.5 case "int8": return 0.25 case "fp8": return 0.25 case "fp4": return 0.125 default: return 1.0 } } func stripBenchmarkPrefix(line string) string { if strings.HasPrefix(line, "[gpu ") { if idx := strings.Index(line, "] "); idx >= 0 { return line[idx+2:] } } return line } func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary { summary := BenchmarkTelemetrySummary{} if len(rows) == 0 { return summary } temps := make([]float64, 0, len(rows)) powers := make([]float64, 0, len(rows)) clocks := make([]float64, 0, len(rows)) memClocks := make([]float64, 0, len(rows)) usages := make([]float64, 0, len(rows)) memUsages := make([]float64, 0, len(rows)) summary.DurationSec = rows[len(rows)-1].ElapsedSec summary.Samples = len(rows) for _, row := range rows { temps = append(temps, row.TempC) powers = append(powers, row.PowerW) clocks = append(clocks, row.ClockMHz) memClocks = append(memClocks, row.MemClockMHz) usages = append(usages, row.UsagePct) memUsages = append(memUsages, row.MemUsagePct) } summary.AvgTempC = benchmarkMean(temps) summary.P95TempC = benchmarkPercentile(temps, 95) summary.AvgPowerW = 
benchmarkMean(powers) summary.P95PowerW = benchmarkPercentile(powers, 95) summary.AvgGraphicsClockMHz = benchmarkMean(clocks) summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95) summary.AvgMemoryClockMHz = benchmarkMean(memClocks) summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95) summary.AvgUsagePct = benchmarkMean(usages) summary.AvgMemUsagePct = benchmarkMean(memUsages) summary.ClockCVPct = benchmarkCV(clocks) summary.PowerCVPct = benchmarkCV(powers) summary.TempCVPct = benchmarkCV(temps) summary.ClockDriftPct = benchmarkClockDrift(clocks) return summary } func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary { if len(rows) == 0 { return nil } var rpmValues []float64 var dutyValues []float64 for _, row := range rows { if row.FanAvgRPM > 0 { rpmValues = append(rpmValues, row.FanAvgRPM) } if row.FanDutyCycleAvailable { dutyValues = append(dutyValues, row.FanDutyCyclePct) } } if len(rpmValues) == 0 && len(dutyValues) == 0 { return nil } summary := &BenchmarkCoolingSummary{ Available: true, AvgFanRPM: benchmarkMean(rpmValues), } if len(dutyValues) > 0 { summary.FanDutyCycleAvailable = true summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) } else { summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") } return summary } func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score := BenchmarkScorecard{} // SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases. // Each precision ran alone with full GPU dedicated — peak capability. for _, p := range gpu.PrecisionSteady { score.SyntheticScore += p.WeightedTeraOpsPerSec } // MixedScore: sum of fp32-equivalent TOPS from the combined phase. // All precisions compete simultaneously — closer to real inference workloads. 
for _, p := range gpu.PrecisionResults { if p.Supported { score.MixedScore += p.WeightedTeraOpsPerSec } } // MixedEfficiency = MixedScore / SyntheticScore. // Measures how well the GPU sustains throughput under concurrent mixed load. // A healthy GPU scores ~0.8–0.95; severe degradation suggests bandwidth // contention or scheduler inefficiency. if score.SyntheticScore > 0 && score.MixedScore > 0 { score.MixedEfficiency = score.MixedScore / score.SyntheticScore } // ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3). // SyntheticScore is the primary signal; MixedEfficiency adds up to +30% // bonus for GPUs that handle mixed-precision concurrency well. // Falls back to MixedScore alone when per-precision data is absent. switch { case score.SyntheticScore > 0: score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3) case score.MixedScore > 0: score.ComputeScore = score.MixedScore } // PowerSustainScore: how stable is GPU power draw during the benchmark? // High variance means the workload is bursting or the power delivery is // unstable. Score = max(0, 100 − PowerCVPct × 3). // At 10% CV → score 70; at 33%+ CV → score 0. // Uses per-precision windows when available (each runs a single kernel, // so CV reflects genuine power regulation, not workload switching). if len(gpu.PrecisionSteady) > 0 { var sum float64 for _, p := range gpu.PrecisionSteady { sum += clampScore(100 - p.Steady.PowerCVPct*3) } score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady)) } else if gpu.Steady.PowerCVPct > 0 { score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3) } // ThermalSustainScore: how stable is GPU temperature during the benchmark? // High variance means cooling is inconsistent (fan bursts, liquid flow // instability, or frequent transitions in and out of throttle). // Score = max(0, 100 − TempCVPct × 3). 
if gpu.Steady.TempCVPct > 0 { score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3) } else { // TempCV not recorded — fall back to 100 (no penalty). score.ThermalSustainScore = 100 } // StabilityScore: what fraction of the benchmark did the GPU spend throttling? // Counts both thermal (HW+SW) and power-cap throttle events. // Score = max(0, 100 − throttle_ratio × 100). // 1% throttle → score 99; 10% throttle → score 90; 100% → score 0. runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS) score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100) score.CompositeScore = compositeBenchmarkScore(score) if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0) } return score } func compositeBenchmarkScore(score BenchmarkScorecard) float64 { // quality_factor weights: // base 0.35 — floor so a GPU that fails all sustain checks still scores // StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up // PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues // ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues // cap 1.00 quality := 0.35 + 0.35*(score.StabilityScore/100.0) + 0.15*(score.PowerSustainScore/100.0) + 0.15*(score.ThermalSustainScore/100.0) if quality > 1.00 { quality = 1.00 } return score.ComputeScore * quality } func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { var reasons []string runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 { reasons = append(reasons, "power_capped") } if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 
0.01 { reasons = append(reasons, "thermal_limited") } if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 { reasons = append(reasons, "sync_boost_limited") } if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 { reasons = append(reasons, "low_sm_clock_vs_target") } if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 { reasons = append(reasons, "variance_too_high") } if normalizationStatus != "full" { reasons = append(reasons, "normalization_partial") } if gpu.PowerLimitDerated { reasons = append(reasons, "power_limit_derated") } if gpu.ECC.Uncorrected > 0 { reasons = append(reasons, "ecc_uncorrected_errors") } if gpu.ECC.Corrected > 0 { reasons = append(reasons, "ecc_corrected_errors") } return dedupeStrings(reasons) } func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult { result := &BenchmarkInterconnectResult{ Status: "UNSUPPORTED", Attempted: true, SelectedGPUIndices: append([]int(nil), gpuIndices...), } cmd := []string{ "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(len(gpuIndices)), "--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)), } env := []string{ "CUDA_DEVICE_ORDER=PCI_BUS_ID", "CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices), } logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices))) out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc) _ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644) if err != nil { result.Notes = append(result.Notes, strings.TrimSpace(string(out))) return result } avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out)) result.Status = "OK" result.Supported = true result.AvgAlgBWGBps = avgAlg result.MaxAlgBWGBps = maxAlg result.AvgBusBWGBps = avgBus result.MaxBusBWGBps = maxBus return result } func 
parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) { lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") var algs []float64 var buses []float64 for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "#") { continue } fields := strings.Fields(line) if len(fields) < 8 { continue } for i := 0; i+2 < len(fields); i++ { timeVal, err1 := strconv.ParseFloat(fields[i], 64) algVal, err2 := strconv.ParseFloat(fields[i+1], 64) busVal, err3 := strconv.ParseFloat(fields[i+2], 64) if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 { algs = append(algs, algVal) buses = append(buses, busVal) break } } } if len(algs) == 0 { return 0, 0, 0, 0 } return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses) } func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) { out, err := satExecCommand( "nvidia-smi", "--id="+strconv.Itoa(gpuIndex), "--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown", "--format=csv,noheader,nounits", ).Output() if err != nil { return BenchmarkThrottleCounters{}, err } fields := strings.Split(strings.TrimSpace(string(out)), ",") if len(fields) < 5 { return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out))) } return BenchmarkThrottleCounters{ SWPowerCapUS: parseBenchmarkUint64(fields[0]), SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]), SyncBoostUS: parseBenchmarkUint64(fields[2]), HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]), HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]), }, nil } func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters { return BenchmarkThrottleCounters{ SWPowerCapUS: 
saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS), SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS), SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS), HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS), HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS), } } func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) { out, err := satExecCommand( "nvidia-smi", "--id="+strconv.Itoa(gpuIndex), "--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total", "--format=csv,noheader,nounits", ).Output() if err != nil { return BenchmarkECCCounters{}, err } fields := strings.Split(strings.TrimSpace(string(out)), ",") if len(fields) < 2 { return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out))) } corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64) uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64) if err1 != nil || err2 != nil { // ECC may be disabled on this GPU — return zero counters silently. return BenchmarkECCCounters{}, nil } return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil } func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters { return BenchmarkECCCounters{ Corrected: saturatingSub(after.Corrected, before.Corrected), Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected), } } func queryActiveComputeApps(gpuIndices []int) ([]string, error) { args := []string{ "--query-compute-apps=gpu_uuid,pid,process_name", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
} out, err := satExecCommand("nvidia-smi", args...).Output() if err != nil { return nil, err } var lines []string for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue } lines = append(lines, line) } return lines, nil } func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { if gpu.Status == "" { gpu.Status = "OK" } if gpu.Scores.CompositeScore == 0 { gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores) } return gpu } func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { var findings []string passed := 0 for _, gpu := range result.GPUs { if gpu.Status == "OK" { passed++ } } total := len(result.GPUs) if total > 0 { if passed == total { findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total)) } else { findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total)) } } if result.Normalization.Status != "full" { findings = append(findings, "Environment normalization was partial; compare results with caution.") } for _, gpu := range result.GPUs { if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 { findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index)) continue } if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" { findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index)) continue } for _, reason := range gpu.DegradationReasons { switch reason { case "power_capped": findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) case "thermal_limited": msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index) if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 { msg += fmt.Sprintf( 
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.", result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct, ) } findings = append(findings, msg) case "sync_boost_limited": findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) case "low_sm_clock_vs_target": findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index)) case "variance_too_high": findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) case "normalization_partial": findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) case "power_limit_derated": findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW)) case "ecc_uncorrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) case "ecc_corrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) } } if gpu.CoolingWarning != "" { findings = append(findings, fmt.Sprintf( "GPU %d: %s. 
Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", gpu.Index, gpu.CoolingWarning, )) } if len(gpu.PrecisionFailures) > 0 { findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", "))) } if gpu.Backend == "driver-ptx" { findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) } if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 { findings = append(findings, fmt.Sprintf( "GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.", gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100, )) } // Flag significant TDP deviation (over or under) from calibration. if gpu.CalibratedPeakPowerW > 0 { ref := gpu.DefaultPowerLimitW if ref <= 0 { ref = gpu.PowerLimitW } if ref > 0 { deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100 switch { case deviationPct < -10: findings = append(findings, fmt.Sprintf( "GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.", gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref, )) case deviationPct > 5: findings = append(findings, fmt.Sprintf( "GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.", gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct, )) } } } } if result.Interconnect != nil && result.Interconnect.Supported { findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps)) } if cl := result.CPULoad; cl != nil { switch cl.Status { case "high": findings = append(findings, fmt.Sprintf( "Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). 
A competing CPU workload may skew GPU results.", cl.AvgPct, cl.MaxPct, )) case "unstable": findings = append(findings, fmt.Sprintf( "Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.", cl.AvgPct, cl.P95Pct, )) } } if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 { if sp.ReportingRatio < 0.75 { findings = append(findings, fmt.Sprintf( "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.", sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio, )) } else if sp.ReportingRatio > 1.25 { findings = append(findings, fmt.Sprintf( "Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.", sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100, )) } } return dedupeStrings(findings) } func benchmarkOverallStatus(result NvidiaBenchmarkResult) string { if len(result.GPUs) == 0 { return "FAILED" } hasOK := false hasPartial := result.Normalization.Status != "full" for _, gpu := range result.GPUs { switch gpu.Status { case "OK": hasOK = true case "PARTIAL", "UNSUPPORTED": hasPartial = true } } if !hasOK { return "FAILED" } if hasPartial { return "PARTIAL" } return "OK" } func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU { for i := range items { if items[i].Index == idx { return &items[i] } } return nil } func classifySATErrorStatus(out []byte, err error) string { status, _ := classifySATResult("benchmark", out, err) if status == "UNSUPPORTED" { return "UNSUPPORTED" } return "FAILED" } func parseBenchmarkFloat(raw string) float64 { raw = strings.TrimSpace(raw) if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { return 0 } value, _ := strconv.ParseFloat(raw, 
64) return value } func parseBenchmarkUint64(raw string) uint64 { raw = strings.TrimSpace(raw) if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { return 0 } value, _ := strconv.ParseUint(raw, 10, 64) return value } func benchmarkMean(values []float64) float64 { if len(values) == 0 { return 0 } var sum float64 for _, value := range values { sum += value } return sum / float64(len(values)) } func benchmarkPercentile(values []float64, p float64) float64 { if len(values) == 0 { return 0 } copyValues := append([]float64(nil), values...) sort.Float64s(copyValues) if len(copyValues) == 1 { return copyValues[0] } rank := (p / 100.0) * float64(len(copyValues)-1) lower := int(math.Floor(rank)) upper := int(math.Ceil(rank)) if lower == upper { return copyValues[lower] } frac := rank - float64(lower) return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac } func benchmarkCV(values []float64) float64 { if len(values) == 0 { return 0 } mean := benchmarkMean(values) if mean == 0 { return 0 } var variance float64 for _, value := range values { diff := value - mean variance += diff * diff } variance /= float64(len(values)) return math.Sqrt(variance) / mean * 100 } func benchmarkClockDrift(values []float64) float64 { if len(values) < 4 { return 0 } window := len(values) / 4 if window < 1 { window = 1 } head := benchmarkMean(values[:window]) tail := benchmarkMean(values[len(values)-window:]) if head <= 0 || tail >= head { return 0 } return ((head - tail) / head) * 100 } func benchmarkMax(values []float64) float64 { var max float64 for i, value := range values { if i == 0 || value > max { max = value } } return max } func clampScore(value float64) float64 { switch { case value < 0: return 0 case value > 100: return 100 default: return value } } func dedupeStrings(values []string) []string { if len(values) == 0 { return nil } seen := make(map[string]struct{}, len(values)) out := make([]string, 0, len(values)) for _, value := range 
values { value = strings.TrimSpace(value) if value == "" { continue } if _, ok := seen[value]; ok { continue } seen[value] = struct{}{} out = append(out, value) } return out } func saturatingSub(after, before uint64) uint64 { if after <= before { return 0 } return after - before } func maxInt(a, b int) int { if a > b { return a } return b } // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. func queryIPMIServerPowerW() (float64, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading") out, err := cmd.Output() if err != nil { return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) } if w := parseDCMIPowerReading(string(out)); w > 0 { return w, nil } return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") } // sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for // durationSec seconds. Returns the mean of all successful samples. // Returns 0, false if IPMI is unavailable. func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) { if durationSec <= 0 { return 0, false } deadline := time.Now().Add(time.Duration(durationSec) * time.Second) var samples []float64 loop: for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } if time.Now().After(deadline) { break } select { case <-ctx.Done(): break loop case <-time.After(2 * time.Second): } } if len(samples) == 0 { return 0, false } var sum float64 for _, w := range samples { sum += w } return sum / float64(len(samples)), true } // characterizeServerPower computes BenchmarkServerPower from idle and loaded // IPMI samples plus the GPU-reported average power during steady state. 
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower { sp := &BenchmarkServerPower{Available: ipmiAvailable} if !ipmiAvailable { sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped") return sp } sp.IdleW = idleW sp.LoadedW = loadedW sp.DeltaW = loadedW - idleW sp.GPUReportedSumW = gpuReportedSumW if gpuReportedSumW > 0 && sp.DeltaW > 0 { sp.ReportingRatio = sp.DeltaW / gpuReportedSumW } return sp } // readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT"). // Returns empty string if unavailable (non-Linux or missing DMI entry). func readServerModel() string { data, err := os.ReadFile("/sys/class/dmi/id/product_name") if err != nil { return "" } return strings.TrimSpace(string(data)) } // filterRowsByGPU returns only the metric rows for a specific GPU index. func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow { var out []GPUMetricRow for _, r := range rows { if r.GPUIndex == gpuIndex { out = append(out, r) } } return out } // parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix // and returns a per-GPU parse result map. func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult { gpuLines := make(map[int][]string) for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") { line = strings.TrimSpace(line) if !strings.HasPrefix(line, "[gpu ") { continue } end := strings.Index(line, "] ") if end < 0 { continue } gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end])) if err != nil { continue } gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:]) } results := make(map[int]benchmarkBurnParseResult, len(gpuLines)) for gpuIdx, lines := range gpuLines { // Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog // calls stripBenchmarkPrefix which is a no-op on already-stripped lines. 
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n")) } return results } // runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs // simultaneously using a single bee-gpu-burn invocation per phase. func runNvidiaBenchmarkParallel( ctx context.Context, verboseLog, runDir string, selected []int, infoByIndex map[int]benchmarkGPUInfo, opts NvidiaBenchmarkOptions, spec benchmarkProfileSpec, logFunc func(string), result *NvidiaBenchmarkResult, calibByIndex map[int]benchmarkPowerCalibrationResult, serverIdleW *float64, serverLoadedWSum *float64, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, allMetricRows *[]GPUMetricRow, metricTimelineSec *float64, gpuBurnLog string, ) { allDevices := joinIndexList(selected) // Build per-GPU result stubs. gpuResults := make(map[int]*BenchmarkGPUResult, len(selected)) for _, idx := range selected { r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"} if info, ok := infoByIndex[idx]; ok { r.UUID = info.UUID r.Name = info.Name r.BusID = info.BusID r.VBIOS = info.VBIOS r.PowerLimitW = info.PowerLimitW r.MultiprocessorCount = info.MultiprocessorCount r.DefaultPowerLimitW = info.DefaultPowerLimitW r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz } if calib, ok := calibByIndex[idx]; ok { r.CalibratedPeakPowerW = calib.Summary.P95PowerW r.CalibratedPeakTempC = calib.Summary.P95TempC r.PowerCalibrationTries = calib.Attempts r.PowerLimitDerated = calib.Derated r.Notes = append(r.Notes, calib.Notes...) if calib.CoolingWarning != "" { r.CoolingWarning = calib.CoolingWarning } } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { r.LockedGraphicsClockMHz = norm.GPUClockLockMHz r.LockedMemoryClockMHz = norm.MemoryClockLockMHz } gpuResults[idx] = r } // Baseline: sample all GPUs together. 
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected) if err != nil && err != context.Canceled { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error()) } } for _, idx := range selected { perGPU := filterRowsByGPU(baselineRows, idx) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) } appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec)) // Sample server idle power once. if !*serverIdleOK { if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { *serverIdleW = w *serverIdleOK = true logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) } } // Warmup: all GPUs simultaneously. warmupCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(spec.WarmupSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", allDevices, } logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec)) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) if warmupErr != nil { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) } } warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut)) supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...) 
for _, idx := range selected { if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" { if gpuResults[idx].ComputeCapability == "" { gpuResults[idx].ComputeCapability = pr.ComputeCapability } if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) { supportedPrecisions = ccPrecisions } } } // Run synthetic precision phases and the combined steady phase as one // uninterrupted command so the GPUs stay hot between windows. eccBase := make(map[int]BenchmarkECCCounters, len(selected)) for _, idx := range selected { eccBase[idx], _ = queryECCCounters(idx) } planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string { if label == "mixed" { return "steady" } return "gpu-all-steady-" + label }) planCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(basePhaseSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", allDevices, "--precision-plan", strings.Join(planLabels, ","), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec)) } appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) } for _, prec := range supportedPrecisions { phaseLogName := "gpu-all-steady-" + prec phaseRows := phaseRowsByStage[phaseLogName] parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) for _, idx := range 
selected { perGPU := filterRowsByGPU(phaseRows, idx) phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, Status: "OK", Steady: summarizeBenchmarkTelemetry(perGPU), } if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" { phase.Status = status phase.Notes = note gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status) } if pr, ok := parseByGPU[idx]; ok { for _, p := range pr.Profiles { if p.Supported { phase.TeraOpsPerSec += p.TeraOpsPerSec phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec } } } gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase) } } // Snapshot throttle counters before steady. beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { beforeThrottle[idx], _ = queryThrottleCounters(idx) } logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec)) // Sample server power via IPMI in parallel with steady phase. 
ipmiStopCh := make(chan struct{}) ipmiResultCh := make(chan float64, 1) go func() { defer close(ipmiResultCh) var samples []float64 ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() select { case <-ipmiStopCh: return case <-time.After(15 * time.Second): } for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } select { case <-ipmiStopCh: if len(samples) > 0 { var sum float64 for _, w := range samples { sum += w } ipmiResultCh <- sum / float64(len(samples)) } return case <-ticker.C: } } }() close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { *serverLoadedWSum += loadedW (*serverLoadedSamples)++ *serverLoadedOK = true logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) } afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { afterThrottle[idx], _ = queryThrottleCounters(idx) } steadyRows := phaseRowsByStage["steady"] parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"])) for _, idx := range selected { perGPU := filterRowsByGPU(steadyRows, idx) gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) if eccFinal, err := queryECCCounters(idx); err == nil { gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal) } if pr, ok := parseResults[idx]; ok { gpuResults[idx].ComputeCapability = pr.ComputeCapability gpuResults[idx].Backend = pr.Backend gpuResults[idx].PrecisionResults = pr.Profiles if pr.Fallback { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") } } if planErr != nil { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error()) } } // Cooldown: all GPUs together. 
if spec.CooldownSec > 0 { cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) if err != nil && err != context.Canceled { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) } } for _, idx := range selected { perGPU := filterRowsByGPU(cooldownRows, idx) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) } appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec)) } // Score and finalize each GPU. for _, idx := range selected { r := gpuResults[idx] r.Scores = scoreBenchmarkGPUResult(*r) r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) pr := parseResults[idx] switch { case planErr != nil: r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) case len(r.PrecisionFailures) > 0: r.Status = "PARTIAL" case pr.Fallback: r.Status = "PARTIAL" default: r.Status = "OK" } result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r)) } } // readBenchmarkHostConfig reads static CPU and memory configuration from // /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable. func readBenchmarkHostConfig() *BenchmarkHostConfig { cfg := &BenchmarkHostConfig{} populated := false // Parse /proc/cpuinfo for CPU model, sockets, cores, threads. 
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil { socketIDs := map[string]struct{}{} coresPerSocket := map[string]int{} var modelName string threads := 0 for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "processor": threads++ case "model name": if modelName == "" { modelName = val } case "physical id": socketIDs[val] = struct{}{} case "cpu cores": // Overwrite per-socket core count (last wins per socket, but all // entries for the same socket report the same value). if physLine := ""; physLine == "" { // We accumulate below by treating cpu cores as a per-thread // field; sum by socket requires a two-pass approach. Use the // simpler approximation: totalCores = threads / (threads per core). _ = val } } } // Second pass: per-socket core count. var curSocket string for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "physical id": curSocket = val case "cpu cores": if curSocket != "" { if _, seen := coresPerSocket[curSocket]; !seen { v, _ := strconv.Atoi(val) coresPerSocket[curSocket] = v } } } } totalCores := 0 for _, c := range coresPerSocket { totalCores += c } cfg.CPUModel = modelName cfg.CPUSockets = len(socketIDs) if cfg.CPUSockets == 0 && threads > 0 { cfg.CPUSockets = 1 } cfg.CPUCores = totalCores cfg.CPUThreads = threads if modelName != "" || threads > 0 { populated = true } } // Parse /proc/meminfo for total physical RAM. 
if data, err := os.ReadFile("/proc/meminfo"); err == nil { for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "MemTotal:") { fields := strings.Fields(line) if len(fields) >= 2 { kb, _ := strconv.ParseUint(fields[1], 10, 64) cfg.MemTotalGiB = float64(kb) / (1024 * 1024) populated = true } break } } } if !populated { return nil } return cfg } // startCPULoadSampler starts a goroutine that samples host CPU load every // intervalSec seconds until stopCh is closed, then sends the collected // samples on the returned channel. func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 { ch := make(chan []float64, 1) go func() { var samples []float64 ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) defer ticker.Stop() for { select { case <-stopCh: ch <- samples return case <-ticker.C: if pct := sampleCPULoadPct(); pct > 0 { samples = append(samples, pct) } } } }() return ch } // summarizeCPULoad computes stats over sampled CPU load values and assigns // a health status. func summarizeCPULoad(samples []float64) *BenchmarkCPULoad { if len(samples) == 0 { return nil } sorted := append([]float64(nil), samples...) sort.Float64s(sorted) var sum float64 for _, v := range sorted { sum += v } avg := sum / float64(len(sorted)) p95 := sorted[int(float64(len(sorted))*0.95)] max := sorted[len(sorted)-1] cl := &BenchmarkCPULoad{ AvgPct: math.Round(avg*10) / 10, MaxPct: math.Round(max*10) / 10, P95Pct: math.Round(p95*10) / 10, Samples: len(sorted), } // Compute standard deviation to detect instability. 
var variance float64 for _, v := range sorted { d := v - avg variance += d * d } stdDev := math.Sqrt(variance / float64(len(sorted))) switch { case avg > 20 || max > 40: cl.Status = "high" cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max) case stdDev > 12: cl.Status = "unstable" cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev) default: cl.Status = "ok" } return cl } // runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches // throttle counters. If a GPU starts throttling, the current targeted_power run // is canceled immediately, the power limit is reduced, and a fresh full cycle // is started again from the beginning. The selected reduced power limit stays // active for the main benchmark and is restored by the caller afterwards. func runBenchmarkPowerCalibration( ctx context.Context, verboseLog, runDir string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), fixedLimits map[int]int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 const maxDerateW = 150 // calibSearchTolerance is the binary-search convergence threshold in watts. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used. const calibSearchTolerance = 10 // dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM // returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, … // doubling each retry until it would exceed the cap, at which point the // next busy response fails the calibration immediately. 
	const dcgmResourceBusyMaxDelaySec = 300
	// Without dcgmi there is no way to run targeted_power; skip quietly.
	if _, err := exec.LookPath("dcgmi"); err != nil {
		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
		return map[int]benchmarkPowerCalibrationResult{}, nil
	}
	// Stale workers from a previous run would hold the GPUs; kill them first.
	if killed := KillTestWorkers(); len(killed) > 0 {
		for _, p := range killed {
			logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
		}
	}
	// Changing power limits requires root; without it we only observe.
	canDerate := os.Geteuid() == 0
	if !canDerate {
		logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
	}
	// NOTE(review): calibrationAttemptResult appears unused in this function
	// (sharedAttemptResult below is the one actually used) — candidate for removal.
	type calibrationAttemptResult struct {
		out  []byte
		rows []GPUMetricRow
		err  error
	}
	// gpuCalibState holds per-GPU binary search state during parallel calibration.
	type gpuCalibState struct {
		idx            int
		info           benchmarkGPUInfo
		originalLimitW int
		appliedLimitW  int
		minLimitW      int
		lo             int // highest verified-stable limit (assumed: minLimitW)
		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
		calib          benchmarkPowerCalibrationResult
		converged      bool
	}
	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
	var restore []benchmarkRestoreAction
	// Initialise per-GPU state.
	states := make([]*gpuCalibState, 0, len(gpuIndices))
	for _, idx := range gpuIndices {
		info := infoByIndex[idx]
		// Fall back to the default limit when the current limit is unreadable.
		originalLimitW := int(math.Round(info.PowerLimitW))
		if originalLimitW <= 0 {
			originalLimitW = int(math.Round(info.DefaultPowerLimitW))
		}
		defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
		if defaultLimitW <= 0 {
			defaultLimitW = originalLimitW
		}
		appliedLimitW := originalLimitW
		if appliedLimitW <= 0 {
			appliedLimitW = defaultLimitW
		}
		// The derate floor: at most maxDerateW below the default limit, and
		// never below 70% of the default.
		minLimitW := appliedLimitW
		switch {
		case defaultLimitW > 0:
			minLimitW = defaultLimitW - maxDerateW
			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
			if minLimitW < floorByRatio {
				minLimitW = floorByRatio
			}
		case appliedLimitW > 0:
			minLimitW = appliedLimitW - maxDerateW
		}
		if minLimitW < calibSearchTolerance {
			minLimitW = calibSearchTolerance
		}
		s := &gpuCalibState{
			idx:            idx,
			info:           info,
			originalLimitW: originalLimitW,
			appliedLimitW:  appliedLimitW,
			minLimitW:      minLimitW,
			lo:             minLimitW,
			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
		}
		if fixedLimits != nil {
			if fixedW, ok := fixedLimits[idx]; ok {
				// This GPU's limit was established in a prior ramp step and must
				// remain unchanged. Apply it immediately and skip the binary search.
				if canDerate && fixedW > 0 {
					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
				}
				s.appliedLimitW = fixedW
				s.calib.AppliedPowerLimitW = float64(fixedW)
				s.calib.Completed = true
				s.converged = true
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
			}
		}
		states = append(states, s)
		// Register a restore action so the caller can put the original limit
		// back after the whole benchmark finishes.
		if canDerate && originalLimitW > 0 {
			idxCopy := idx
			orig := originalLimitW
			restore = append(restore, benchmarkRestoreAction{
				name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
				fn: func() {
					_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
				},
			})
		}
	}
	// Shared DCGM resource-busy back-off state (single diagnostic session).
	busyRetries := 0
	busyDelaySec := 1
	sharedAttempt := 0
	type sharedAttemptResult struct {
		out  []byte
		rows []GPUMetricRow
		err  error
	}
calibDone:
	for {
		// Collect non-converged GPUs.
		var active []*gpuCalibState
		for _, s := range states {
			if !s.converged {
				active = append(active, s)
			}
		}
		if len(active) == 0 || ctx.Err() != nil {
			break
		}
		sharedAttempt++
		for _, s := range active {
			s.calib.Attempts++
			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
		}
		// Snapshot throttle counters for all active GPUs before the run.
		beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
		for _, s := range active {
			beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
		}
		// Run targeted_power for ALL gpuIndices simultaneously so every card
		// is under load during calibration — this reflects real server thermals.
		logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
		cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
		attemptCtx, cancelAttempt := context.WithCancel(ctx)
		doneCh := make(chan sharedAttemptResult, 1)
		go func() {
			out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
			doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
		}()
		// Watch throttle counters once per second while the diag runs.
		ticker := time.NewTicker(time.Second)
		throttleReasons := make(map[int]string, len(active))
		var ar sharedAttemptResult
	attemptLoop:
		for {
			select {
			case ar = <-doneCh:
				break attemptLoop
			case <-ticker.C:
				// Poll throttle counters for each active GPU independently.
				for _, s := range active {
					if throttleReasons[s.idx] != "" {
						continue // already detected for this GPU
					}
					after, err := queryThrottleCounters(s.idx)
					if err != nil {
						continue
					}
					// Record throttle but do NOT cancel — let dcgmi finish so
					// nv-hostengine releases the slot cleanly before the next attempt.
					if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
						throttleReasons[s.idx] = reason
						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
					}
				}
			case <-ctx.Done():
				// Outer cancellation: abort the diag and wait for the goroutine.
				cancelAttempt()
				ar = <-doneCh
				break attemptLoop
			}
		}
		ticker.Stop()
		cancelAttempt()
		// Best-effort persistence of the raw diag output for later inspection.
		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
		// Resource busy: retry with exponential back-off (shared — one DCGM session).
		if ar.err != nil && isDCGMResourceBusy(ar.err) {
			if busyDelaySec > dcgmResourceBusyMaxDelaySec {
				for _, s := range active {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
					s.converged = true
				}
				logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
				break calibDone
			}
			busyRetries++
			// Undo attempt counter: busy retries don't count as real attempts.
			for _, s := range active {
				s.calib.Attempts--
			}
			logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
			select {
			case <-ctx.Done():
				break calibDone
			case <-time.After(time.Duration(busyDelaySec) * time.Second):
			}
			// Double the delay; going past the cap arms the give-up branch above.
			next := busyDelaySec * 2
			if next > dcgmResourceBusyMaxDelaySec {
				next = dcgmResourceBusyMaxDelaySec + 1
			}
			busyDelaySec = next
			sharedAttempt-- // retry same logical attempt number
			continue
		}
		busyRetries = 0
		busyDelaySec = 1
		// Per-GPU analysis and binary search update.
		for _, s := range active {
			perGPU := filterRowsByGPU(ar.rows, s.idx)
			summary := summarizeBenchmarkTelemetry(perGPU)
			throttle := throttleReasons[s.idx]
			// Cooling warning: thermal throttle with fans not at maximum.
			if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
				clocks := make([]float64, 0, len(perGPU))
				var fanDutyValues []float64
				fanDutyAvail := false
				for _, r := range perGPU {
					if r.ClockMHz > 0 {
						clocks = append(clocks, r.ClockMHz)
					}
					if r.FanDutyCycleAvailable {
						fanDutyAvail = true
						fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
					}
				}
				dropPct := benchmarkClockDrift(clocks)
				p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
				// A ≥20% clock drop while fans stayed below ~full duty suggests
				// the chassis cooling is not configured for full GPU load.
				if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
					s.calib.CoolingWarning = fmt.Sprintf(
						"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
						throttle, dropPct, p95FanDuty,
					)
					logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
				}
			}
			if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
				// Stable at current limit — update lo and binary-search upward.
				s.calib.Summary = summary
				s.calib.Completed = true
				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
				s.lo = s.appliedLimitW
				if canDerate && s.hi-s.lo > calibSearchTolerance {
					next := roundTo5W((s.lo + s.hi) / 2)
					if next > s.lo && next < s.hi {
						if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
							s.appliedLimitW = next
							s.calib.AppliedPowerLimitW = float64(next)
							s.calib.Completed = false // keep searching
							s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
							continue // next GPU in active list
						}
					}
				}
				s.converged = true
				continue
			}
			// Failed or throttled — log and binary-search downward.
			switch {
			case throttle != "":
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
			case ar.err != nil:
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
			default:
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
			}
			if !canDerate || s.appliedLimitW <= 0 {
				s.converged = true
				continue
			}
			s.hi = s.appliedLimitW
			if s.hi-s.lo <= calibSearchTolerance {
				if s.lo > s.minLimitW {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
						s.appliedLimitW = s.lo
						s.calib.AppliedPowerLimitW = float64(s.lo)
						s.calib.Derated = s.lo < s.originalLimitW
						// Summary was captured when we last verified stability at s.lo,
						// so the result is valid — mark as completed even though we
						// converged from the failure path (tried higher, failed, fell back).
						s.calib.Completed = true
					}
				} else {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
				}
				s.converged = true
				continue
			}
			// Midpoint of the unresolved range, snapped to a 5 W boundary and
			// nudged back inside (lo, hi) if rounding pushed it out.
			next := roundTo5W((s.lo + s.hi) / 2)
			if next <= s.lo {
				next = s.lo + calibSearchTolerance
			}
			if next >= s.hi {
				next = (s.lo + s.hi) / 2
			}
			if next < s.minLimitW {
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
				s.converged = true
				continue
			}
			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
				s.converged = true
				continue
			}
			s.appliedLimitW = next
			s.calib.AppliedPowerLimitW = float64(next)
			s.calib.Derated = next < s.originalLimitW
			// Keep the shared info map in sync so later phases see the limit
			// actually in force.
			s.info.PowerLimitW = float64(next)
			infoByIndex[s.idx] = s.info
			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
		}
	}
	// Only report GPUs that actually did something.
	for _, s := range states {
		if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
			results[s.idx] = s.calib
		}
	}
	return results, restore
}

// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
	var exitErr *exec.ExitError
	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}

// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
	return ((w + 2) / 5) * 5
}

// powerBenchDurationSec maps a benchmark profile name to the targeted_power
// duration used by the power bench; unknown profiles get the 120 s default.
func powerBenchDurationSec(profile string) int {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return 300
	case NvidiaBenchmarkProfileOvernight:
		return 600
	default:
		return 120
	}
}

// cloneBenchmarkGPUInfoMap returns a shallow copy of src so callers can
// mutate per-run state without affecting the shared map.
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
	out := make(map[int]benchmarkGPUInfo, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

// renderPowerBenchReport renders a human-readable Markdown report for a
// completed power bench run.
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Report\n\n")
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
	fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
	if len(result.Findings) > 0 {
		b.WriteString("## Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString("## Recommended Slot Order\n\n")
		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
	}
	if len(result.RampSteps) > 0 {
		b.WriteString("## Ramp Sequence\n\n")
		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
		for _, step := range result.RampSteps {
			derated := "-"
			if step.Derated {
				derated = "⚠ yes"
			}
			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
		}
		b.WriteString("\n")
	}
	b.WriteString("## Per-Slot Results\n\n")
	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
	for _, gpu := range result.GPUs {
		stableLimit := "-"
		if gpu.StablePowerLimitW > 0 {
			if gpu.Derated {
				stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
			} else {
				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
			}
		}
		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
	}
	b.WriteString("\n")
	// Per-GPU notes section.
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
		for _, note := range gpu.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}
	return b.String()
}

// renderPowerBenchSummary renders a flat key=value summary of the power bench
// result, suitable for machine parsing.
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
	fmt.Fprintf(&b, "platform_max_tdp_w=%.0f\n", result.PlatformMaxTDPW)
	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
	if len(result.RecommendedSlotOrder) > 0 {
		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
	}
	for _, step := range result.RampSteps {
		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
		fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
		fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
	}
	for _, gpu := range result.GPUs {
		if gpu.StablePowerLimitW > 0 {
			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
		}
	}
	return b.String()
}

// RunNvidiaPowerBench runs the standalone power benchmark: it calibrates each
// selected GPU's stable power limit and writes the results under baseDir.
// Returns an error when GPU selection or run-directory setup fails.
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/power"
	}
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "power-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		return "", infoErr
	}
	hostname, _ := os.Hostname()
	result := NvidiaPowerBenchResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   opts.Profile,
		SelectedGPUIndices: append([]int(nil), selected...),
		OverallStatus:      "OK",
	}
	// NOTE(review): durationSec is computed but immediately discarded — either
	// wire it into the calibration below or drop the call.
	durationSec := powerBenchDurationSec(opts.Profile)
	_ = durationSec
	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
	// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) var allRestoreActions []benchmarkRestoreAction for _, idx := range selected { singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx)) _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r } } defer func() { for i := len(allRestoreActions) - 1; i >= 0; i-- { allRestoreActions[i].fn() } }() gpus := make([]NvidiaPowerBenchGPU, 0, len(selected)) for _, idx := range selected { info := infoByIndex[idx] calib := calibByIndex[idx] status := "OK" if !calib.Completed { status = "FAILED" result.OverallStatus = "PARTIAL" } else if calib.Derated { status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } } gpus = append(gpus, NvidiaPowerBenchGPU{ Index: idx, Name: info.Name, BusID: info.BusID, DefaultPowerLimitW: info.DefaultPowerLimitW, AppliedPowerLimitW: calib.AppliedPowerLimitW, MaxObservedPowerW: calib.Summary.P95PowerW, MaxObservedTempC: calib.Summary.P95TempC, CalibrationAttempts: calib.Attempts, Derated: calib.Derated, Status: status, Notes: append([]string(nil), calib.Notes...), CoolingWarning: calib.CoolingWarning, }) } sort.Slice(gpus, func(i, j int) bool { if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW { return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW } if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW { return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW } if gpus[i].Derated != gpus[j].Derated { return !gpus[i].Derated } return gpus[i].Index < gpus[j].Index }) result.GPUs = gpus result.RecommendedSlotOrder = make([]int, 0, len(gpus)) for _, gpu := range gpus { result.RecommendedSlotOrder = 
append(result.RecommendedSlotOrder, gpu.Index) } if len(result.RecommendedSlotOrder) > 0 { result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder))) } for _, gpu := range gpus { if gpu.Derated { result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) } if gpu.CoolingWarning != "" { result.Findings = append(result.Findings, fmt.Sprintf( "GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", gpu.Index, gpu.CoolingWarning, )) } } singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus)) for _, gpu := range gpus { singleByIndex[gpu.Index] = gpu } // Phase 2: cumulative thermal ramp. // Each step introduces one new GPU into an environment where all previously // calibrated GPUs are already running at their fixed stable limits. The new // GPU's stable TDP is searched via binary search (targeted_power) under real // multi-GPU thermal load. Once found, its limit is fixed permanently for all // subsequent steps. This ensures each GPU's limit reflects actual sustained // power in the final full-system thermal state. // // stableLimits accumulates GPU index → fixed stable limit (W) across steps. stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) // Step 1: reuse single-card calibration result directly. 
if len(result.RecommendedSlotOrder) > 0 { firstIdx := result.RecommendedSlotOrder[0] firstCalib := calibByIndex[firstIdx] stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW)) ramp := NvidiaPowerBenchStep{ StepIndex: 1, GPUIndices: []int{firstIdx}, NewGPUIndex: firstIdx, NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, TotalObservedPowerW: firstCalib.Summary.P95PowerW, AvgObservedPowerW: firstCalib.Summary.P95PowerW, Derated: firstCalib.Derated, Status: "OK", } if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) result.OverallStatus = "PARTIAL" } else if firstCalib.Derated { ramp.Status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW)) } result.RampSteps = append(result.RampSteps, ramp) logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W", len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW)) } // Steps 2..N: each step fixes previously calibrated GPUs and searches only // the new GPU's stable limit in the combined thermal environment. for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ { step := stepNum + 1 subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) newGPUIdx := result.RecommendedSlotOrder[stepNum] stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) // All previously calibrated GPUs are fixed at their stable limits. 
fixedForStep := make(map[int]int, len(stableLimits)) for k, v := range stableLimits { fixedForStep[k] = v } logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)", step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep))) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep) // Accumulate restore actions; they all run in the outer defer. allRestoreActions = append(allRestoreActions, stepRestore...) ramp := NvidiaPowerBenchStep{ StepIndex: step, GPUIndices: subset, NewGPUIndex: newGPUIdx, Status: "OK", } // Total observed power = sum of p95 across all GPUs in this step. for _, idx := range subset { if c, ok := stepCalib[idx]; ok { ramp.TotalObservedPowerW += c.Summary.P95PowerW } } if len(subset) > 0 { ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) } // Determine stable limit for the new GPU. if c, ok := stepCalib[newGPUIdx]; ok && c.Completed { stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW)) ramp.NewGPUStableLimitW = c.AppliedPowerLimitW ramp.Derated = c.Derated if c.Derated { ramp.Status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } } else { // Calibration failed — fall back to single-card limit. 
fb := calibByIndex[newGPUIdx] stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW)) ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW)) result.OverallStatus = "PARTIAL" } result.RampSteps = append(result.RampSteps, ramp) } // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. for i := range result.GPUs { if lim, ok := stableLimits[result.GPUs[i].Index]; ok { result.GPUs[i].StablePowerLimitW = float64(lim) } } // PlatformMaxTDPW = sum of all stable limits — the actual sustained power // budget of this server with all GPUs running simultaneously without throttling. for _, lim := range stableLimits { result.PlatformMaxTDPW += float64(lim) } resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { return "", fmt.Errorf("write result.json: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil { return "", fmt.Errorf("write report.md: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil { return "", fmt.Errorf("write summary.txt: %w", err) } return runDir, nil }