From 732bf4cbabd497748e337dd300c9c406c8538f16 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 00:30:50 +0300 Subject: [PATCH] Redesign power and performance benchmarks with new methodology Power/Thermal Fit: cumulative fixed-limit ramp where each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of stable limits across all GPUs. Remove PlatformPowerScore from power test. Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute k=2..N parallel ramp-up steps and compute PlatformPowerScore = mean compute scalability vs best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration absent. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 307 +++++++++++++++----- audit/internal/platform/benchmark_report.go | 16 + audit/internal/platform/benchmark_types.go | 56 +++- 3 files changed, 291 insertions(+), 88 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 6f1295d..055178e 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -304,18 +304,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } }() - // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. - // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. - calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) - restoreActions = append(restoreActions, powerRestoreActions...) - for _, idx := range selected { - if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 { - result.Warnings = append(result.Warnings, fmt.Sprintf( - "GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.", - idx, calib.AppliedPowerLimitW, - )) - } - } + // No power calibration before performance benchmark — GPUs run at their + // default power limits. PowerSustainScore is derived from steady-state power + // observed during the benchmark itself. + calibByIndex := make(map[int]benchmarkPowerCalibrationResult) // Start background CPU load sampler — samples every 10s during GPU phases. cpuStopCh := make(chan struct{}) @@ -531,6 +523,69 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } // end sequential path + // Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs + // and compute compute scalability relative to the best single-GPU result. + // Only runs in sequential mode (each GPU was tested individually above) and + // when there are at least 2 GPUs. + if !opts.ParallelGPUs && len(selected) >= 2 { + // Find the best single-card SyntheticScore as the 1-GPU baseline. + var bestTOPS float64 + for _, g := range result.GPUs { + if g.Scores.SyntheticScore > bestTOPS { + bestTOPS = g.Scores.SyntheticScore + } + } + if bestTOPS > 0 { + var rampSteps []NvidiaPerformanceRampStep + var scalabilityPcts []float64 + for k := 2; k <= len(selected); k++ { + subset := append([]int(nil), selected[:k]...) 
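+ // Worked example (hypothetical numbers): with bestTOPS = 100 and a
+ // k = 4 step whose GPUs total totalSynth = 360 TOPS, the step scores
+ // scalPct = 360 / (4 × 100) × 100 = 90%, i.e. the platform retains 90%
+ // of perfect linear scaling at four GPUs. PlatformPowerScore is the
+ // mean of these step percentages over k = 2..N.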
+ rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k)) + _ = os.MkdirAll(rampDir, 0755) + logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k)) + + var rampResult NvidiaBenchmarkResult + var rampIdleW, rampLoadedWSum float64 + var rampIdleOK, rampLoadedOK bool + var rampLoadedSamples int + var rampMetricRows []GPUMetricRow + var rampTimelineSec float64 + emptyCalib := make(map[int]benchmarkPowerCalibrationResult) + + runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc, + &rampResult, emptyCalib, + &rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples, + &rampMetricRows, &rampTimelineSec, "") + + var totalSynth, totalMixed float64 + for _, g := range rampResult.GPUs { + totalSynth += g.Scores.SyntheticScore + totalMixed += g.Scores.MixedScore + } + scalPct := totalSynth / (float64(k) * bestTOPS) * 100 + scalabilityPcts = append(scalabilityPcts, scalPct) + + stepStatus := "OK" + if len(rampResult.GPUs) < k { + stepStatus = "PARTIAL" + } + rampSteps = append(rampSteps, NvidiaPerformanceRampStep{ + StepIndex: k, + GPUIndices: subset, + TotalSyntheticTOPS: totalSynth, + TotalMixedTOPS: totalMixed, + ScalabilityPct: scalPct, + Status: stepStatus, + }) + } + result.PerformanceRampSteps = rampSteps + result.PlatformPowerScore = benchmarkMean(scalabilityPcts) + if len(scalabilityPcts) > 0 { + result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1] + } + } + } + if len(selected) > 1 && opts.RunNCCL { result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) if result.Interconnect != nil && result.Interconnect.Supported { @@ -1344,20 +1399,25 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { case score.MixedScore > 0: score.ComputeScore = score.MixedScore } - // PowerSustainScore: measures how close the GPU came to its rated TDP under - // a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP. + // PowerSustainScore: measures how close the GPU came to its rated TDP during + // steady-state benchmark load. 100 = exactly at rated TDP. // Penalty applied symmetrically for both under- and over-TDP deviations: // score = max(0, 100 − |measured − rated| / rated × 100) // Under-TDP → power delivery / cooling issue. // Over-TDP → power limit not properly enforced / power regulation fault. - // Falls back to 0 if calibration was not performed (dcgmi unavailable). + // Uses CalibratedPeakPowerW when available (from external power calibration), + // otherwise falls back to Steady.AvgPowerW observed during the benchmark. 
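+ // Worked example (hypothetical numbers): rated TDP 700 W, measured
+ // steady-state 650 W → deviation = |650 − 700| / 700 × 100 ≈ 7.1%,
+ // so PowerSustainScore ≈ 92.9. A GPU drawing 750 W against the same
+ // 700 W rating scores identically: over-TDP deviations are penalized
+ // symmetrically.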
{ ref := gpu.DefaultPowerLimitW if ref <= 0 { ref = gpu.PowerLimitW } - if gpu.CalibratedPeakPowerW > 0 && ref > 0 { - deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100 + measured := gpu.CalibratedPeakPowerW + if measured <= 0 { + measured = gpu.Steady.AvgPowerW + } + if measured > 0 && ref > 0 { + deviationPct := math.Abs(measured-ref) / ref * 100 score.PowerSustainScore = clampScore(100 - deviationPct) } } @@ -2470,6 +2530,7 @@ func runBenchmarkPowerCalibration( gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), + fixedLimits map[int]int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 const maxDerateW = 150 @@ -2555,6 +2616,21 @@ func runBenchmarkPowerCalibration( hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, } + if fixedLimits != nil { + if fixedW, ok := fixedLimits[idx]; ok { + // This GPU's limit was established in a prior ramp step and must + // remain unchanged. Apply it immediately and skip the binary search. + if canDerate && fixedW > 0 { + _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW) + } + s.appliedLimitW = fixedW + s.calib.AppliedPowerLimitW = float64(fixedW) + s.calib.Completed = true + s.converged = true + s.calib.Notes = append(s.calib.Notes, + fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW)) + } + } states = append(states, s) if canDerate && originalLimitW > 0 { idxCopy := idx @@ -2764,6 +2840,10 @@ calibDone: s.appliedLimitW = s.lo s.calib.AppliedPowerLimitW = float64(s.lo) s.calib.Derated = s.lo < s.originalLimitW + // Summary was captured when we last verified stability at s.lo, + // so the result is valid — mark as completed even though we + // converged from the failure path (tried higher, failed, fell back). 
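+ // Example (hypothetical): with a 700 W default, if 600 W is the
+ // highest limit that verified stable and every higher probe failed,
+ // lo ends at 600; the GPU is pinned to 600 W and reported Derated
+ // (600 < originalLimitW).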
+ s.calib.Completed = true } } else { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) @@ -2846,7 +2926,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) - fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus) + fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) + fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW) if len(result.Findings) > 0 { b.WriteString("## Summary\n\n") for _, finding := range result.Findings { @@ -2860,25 +2941,36 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } if len(result.RampSteps) > 0 { b.WriteString("## Ramp Sequence\n\n") - b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n") - b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n") + b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n") + b.WriteString("|------|---------|--------------|----------------|---------|--------|\n") for _, step := range result.RampSteps { - fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n", - step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount) + derated := "-" + if step.Derated { + derated = "⚠ yes" + } + fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", + step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status) } b.WriteString("\n") } b.WriteString("## Per-Slot Results\n\n") - b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n") - b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n") + b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n") + b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n") for _, gpu := range result.GPUs { - fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n", - gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts) + stableLimit := "-" + if gpu.StablePowerLimitW > 0 { + if gpu.Derated { + stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW) + } else { + stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW) + } + } + fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", + gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts) } b.WriteString("\n") for _, gpu := range result.GPUs { fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) - for _, note := range gpu.Notes { fmt.Fprintf(&b, "- %s\n", note) } @@ -2893,14 +2985,22 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) + fmt.Fprintf(&b, "platform_max_tdp_w=%.0f\n", result.PlatformMaxTDPW) 
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) if len(result.RecommendedSlotOrder) > 0 { fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder)) } for _, step := range result.RampSteps { fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices)) + fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex) + fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW) fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW) } + for _, gpu := range result.GPUs { + if gpu.StablePowerLimitW > 0 { + fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW) + } + } return b.String() } @@ -2953,7 +3053,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) - c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc) + c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r @@ -3029,72 +3129,125 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N singleByIndex[gpu.Index] = gpu } - // Phase 2: ramp — add one GPU per step and calibrate the growing subset - // simultaneously. Step 1 reuses single-card results; steps 2..N run fresh - // targeted_power with derating if degradation is detected. - for step := 1; step <= len(result.RecommendedSlotOrder); step++ { + // Phase 2: cumulative thermal ramp. + // Each step introduces one new GPU into an environment where all previously + // calibrated GPUs are already running at their fixed stable limits. The new + // GPU's stable TDP is searched via binary search (targeted_power) under real + // multi-GPU thermal load. Once found, its limit is fixed permanently for all + // subsequent steps. This ensures each GPU's limit reflects actual sustained + // power in the final full-system thermal state. + // + // stableLimits accumulates GPU index → fixed stable limit (W) across steps. + stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) + + // Step 1: reuse single-card calibration result directly. 
+ if len(result.RecommendedSlotOrder) > 0 { + firstIdx := result.RecommendedSlotOrder[0] + firstCalib := calibByIndex[firstIdx] + stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW)) + ramp := NvidiaPowerBenchStep{ + StepIndex: 1, + GPUIndices: []int{firstIdx}, + NewGPUIndex: firstIdx, + NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, + TotalObservedPowerW: firstCalib.Summary.P95PowerW, + AvgObservedPowerW: firstCalib.Summary.P95PowerW, + Derated: firstCalib.Derated, + Status: "OK", + } + if !firstCalib.Completed { + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) + result.OverallStatus = "PARTIAL" + } else if firstCalib.Derated { + ramp.Status = "PARTIAL" + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" + } + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW)) + } + result.RampSteps = append(result.RampSteps, ramp) + logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W", + len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW)) + } + + // Steps 2..N: each step fixes previously calibrated GPUs and searches only + // the new GPU's stable limit in the combined thermal environment. + for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ { + step := stepNum + 1 subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) + newGPUIdx := result.RecommendedSlotOrder[stepNum] stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) - var stepCalib map[int]benchmarkPowerCalibrationResult - if step == 1 { - // Single-GPU step — already measured in phase 1; reuse directly. - stepCalib = calibByIndex - logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0])) - } else { - stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - var stepRestore []benchmarkRestoreAction - stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) - for i := len(stepRestore) - 1; i >= 0; i-- { - stepRestore[i].fn() - } + + // All previously calibrated GPUs are fixed at their stable limits. + fixedForStep := make(map[int]int, len(stableLimits)) + for k, v := range stableLimits { + fixedForStep[k] = v } + + logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)", + step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep))) + + stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) + stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep) + // Accumulate restore actions; they all run in the outer defer. + allRestoreActions = append(allRestoreActions, stepRestore...) + ramp := NvidiaPowerBenchStep{ - StepIndex: step, - GPUIndices: subset, - Status: "OK", + StepIndex: step, + GPUIndices: subset, + NewGPUIndex: newGPUIdx, + Status: "OK", } - var realizationValues []float64 + + // Total observed power = sum of p95 across all GPUs in this step. 
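+ // E.g. (hypothetical) per-GPU p95 readings of 690, 685 and 702 W in a
+ // three-GPU step give TotalObservedPowerW ≈ 2077 W and
+ // AvgObservedPowerW ≈ 692 W.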
for _, idx := range subset { - calib := stepCalib[idx] - ramp.TotalObservedPowerW += calib.Summary.P95PowerW - if calib.Derated { - ramp.DeratedGPUCount++ - ramp.Status = "PARTIAL" - } - if !calib.Completed { - ramp.Status = "FAILED" - ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step)) - continue - } - if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 { - realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100 - realizationValues = append(realizationValues, realization) + if c, ok := stepCalib[idx]; ok { + ramp.TotalObservedPowerW += c.Summary.P95PowerW } } if len(subset) > 0 { ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) } - if len(realizationValues) > 0 { - ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues) - ramp.MinPowerRealizationPct = realizationValues[0] - for _, v := range realizationValues[1:] { - if v < ramp.MinPowerRealizationPct { - ramp.MinPowerRealizationPct = v + + // Determine stable limit for the new GPU. + if c, ok := stepCalib[newGPUIdx]; ok && c.Completed { + stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW)) + ramp.NewGPUStableLimitW = c.AppliedPowerLimitW + ramp.Derated = c.Derated + if c.Derated { + ramp.Status = "PARTIAL" + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" } + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } + } else { + // Calibration failed — fall back to single-card limit. + fb := calibByIndex[newGPUIdx] + stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW)) + ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW)) + result.OverallStatus = "PARTIAL" } - if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 { - ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step)) - if result.OverallStatus == "OK" { - result.OverallStatus = "PARTIAL" - } - } - if ramp.DeratedGPUCount > 0 { - result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount)) - } + result.RampSteps = append(result.RampSteps, ramp) } + + // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. + for i := range result.GPUs { + if lim, ok := stableLimits[result.GPUs[i].Index]; ok { + result.GPUs[i].StablePowerLimitW = float64(lim) + } + } + + // PlatformMaxTDPW = sum of all stable limits — the actual sustained power + // budget of this server with all GPUs running simultaneously without throttling. 
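+ // E.g. (hypothetical) stable limits {0:700, 1:680, 2:660, 3:650} give
+ // PlatformMaxTDPW = 2690 W. This sums GPU limits only; rack planning
+ // still needs CPU, fan, and PSU-efficiency headroom on top.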
+ for _, lim := range stableLimits { + result.PlatformMaxTDPW += float64(lim) + } resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index d482c03..54487f1 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if result.ScalabilityScore > 0 { fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore) } + if result.PlatformPowerScore > 0 { + fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore) + } fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) b.WriteString("\n") @@ -329,6 +332,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } } + // ── Platform Scalability ────────────────────────────────────────────────── + if len(result.PerformanceRampSteps) > 0 { + b.WriteString("## Platform Scalability (Performance Ramp)\n\n") + fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore) + b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n") + b.WriteString("|--------|-------------|----------------------|-------------|\n") + for _, step := range result.PerformanceRampSteps { + fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n", + step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct) + } + b.WriteString("\n") + } + // ── Raw files ───────────────────────────────────────────────────────────── b.WriteString("## Raw Files\n\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index f09dea7..3383619 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -65,6 +65,11 @@ type NvidiaBenchmarkResult struct { RampTotal int `json:"ramp_total,omitempty"` RampRunID string `json:"ramp_run_id,omitempty"` ScalabilityScore float64 `json:"scalability_score,omitempty"` + // PlatformPowerScore is the mean compute scalability across ramp steps 2..N. + // 100% = each added GPU contributes exactly its single-card throughput. + // < 100% = throughput loss due to thermal throttle, power limits, or contention. + PlatformPowerScore float64 `json:"platform_power_score,omitempty"` + PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` OverallStatus string `json:"overall_status"` SelectedGPUIndices []int `json:"selected_gpu_indices"` Findings []string `json:"findings,omitempty"` @@ -265,8 +270,12 @@ type NvidiaPowerBenchResult struct { RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"` RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"` OverallStatus string `json:"overall_status"` - Findings []string `json:"findings,omitempty"` - GPUs []NvidiaPowerBenchGPU `json:"gpus"` + // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the + // cumulative thermal ramp. Represents the actual sustained power budget of + // this server under full GPU load. Use for rack power planning. 
+ PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` + Findings []string `json:"findings,omitempty"` + GPUs []NvidiaPowerBenchGPU `json:"gpus"` } type NvidiaPowerBenchGPU struct { @@ -274,7 +283,14 @@ type NvidiaPowerBenchGPU struct { Name string `json:"name,omitempty"` BusID string `json:"bus_id,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + // AppliedPowerLimitW is the stable limit found during single-card calibration. AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` + // StablePowerLimitW is the final fixed limit for this GPU after the + // cumulative thermal ramp. This is the limit at which the GPU operated + // stably with all other GPUs running simultaneously at their own limits. + // May be lower than AppliedPowerLimitW if multi-GPU thermal load required + // additional derating. + StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"` MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"` MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"` CalibrationAttempts int `json:"calibration_attempts,omitempty"` @@ -286,13 +302,31 @@ type NvidiaPowerBenchGPU struct { } type NvidiaPowerBenchStep struct { - StepIndex int `json:"step_index"` - GPUIndices []int `json:"gpu_indices"` - TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` - AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` - MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"` - AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"` - DeratedGPUCount int `json:"derated_gpu_count,omitempty"` - Status string `json:"status"` - Notes []string `json:"notes,omitempty"` + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` + // NewGPUIndex is the GPU whose stable limit was searched in this step. + NewGPUIndex int `json:"new_gpu_index"` + // NewGPUStableLimitW is the stable power limit found for the new GPU. + NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"` + TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` + AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` + Derated bool `json:"derated,omitempty"` + Status string `json:"status"` + Notes []string `json:"notes,omitempty"` +} + +// NvidiaPerformanceRampStep holds per-step performance data for the +// scalability ramp-up phase of the performance benchmark. +type NvidiaPerformanceRampStep struct { + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` + // TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent + // TOPS from dedicated single-precision phases) across all GPUs in this step. + TotalSyntheticTOPS float64 `json:"total_synthetic_tops"` + TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"` + // ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100. + // 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss. + ScalabilityPct float64 `json:"scalability_pct"` + Status string `json:"status"` + Notes []string `json:"notes,omitempty"` }
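
A minimal sketch (illustrative, not part of the patch) of how the stored
PlatformPowerScore relates to the per-step data above: it is simply the mean
of ScalabilityPct over the recorded ramp steps (k = 2..N), computed in
benchmark.go via benchmarkMean. The helper name below is hypothetical.

    // platformPowerScoreFromSteps recomputes the mean compute-scalability
    // percentage from recorded performance ramp steps.
    func platformPowerScoreFromSteps(steps []NvidiaPerformanceRampStep) float64 {
        if len(steps) == 0 {
            return 0
        }
        var sum float64
        for _, s := range steps {
            sum += s.ScalabilityPct
        }
        return sum / float64(len(steps))
    }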