Add power calibration step to benchmark; fix PowerSustainScore reference

Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because it is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped with a log message and the benchmark proceeds without a calibrated reference.

Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 22:06:46 +03:00
parent 9e3dcf9b4d
commit f4a19c0a00
2 changed files with 92 additions and 5 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()

+	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
+	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
+	calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
+
 	// Start background CPU load sampler — samples every 10s during GPU phases.
 	cpuStopCh := make(chan struct{})
 	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)

 	if opts.ParallelGPUs {
-		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {

 	for _, idx := range selected {
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
+		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+			gpuResult.CalibratedPeakPowerW = w
+		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	// Use default power limit for sustain score so a manually reduced limit
-	// does not inflate the score. Fall back to enforced limit if default unknown.
-	referencePowerW := gpu.DefaultPowerLimitW
+	// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
+	// load) as the reference — it reflects what this GPU actually reaches under a
+	// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
+	// cannot reach. Fall back to default limit, then enforced limit.
+	referencePowerW := gpu.CalibratedPeakPowerW
+	if referencePowerW <= 0 {
+		referencePowerW = gpu.DefaultPowerLimitW
+	}
 	if referencePowerW <= 0 {
 		referencePowerW = gpu.PowerLimitW
 	}
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 }

 func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
-	quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
+	// Weights after introducing calibrated power reference:
+	//   base        0.35 — floor so a GPU that fails all sustain checks still scores
+	//   thermal     0.25 — heaviest: throttle counters are the most reliable signal
+	//   stability   0.25 — clock/power variance matters for reproducibility
+	//   power       0.15 — honest with calibrated reference; lower because
+	//                       bee-gpu-burn is compute-only (not mem+compute like TDP test)
+	//   NCCL bonus  0.10 — interconnect health
+	//   cap         1.10
+	quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
 	if score.InterconnectScore > 0 {
 		quality += 0.10
 	}
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
+	calibPowerByIndex map[int]float64,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 ) {
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
 			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
+		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+			r.CalibratedPeakPowerW = w
+		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
 	}
 	return cl
 }
+
+// runBenchmarkPowerCalibration runs dcgmi's targeted_power diagnostic for
+// calibDurationSec (45 s) via runBenchmarkCommandWithMetrics and returns a map
+// from GPU index to the p95 of the positive PowerW samples in the returned rows.
+// Callers use the value as the PowerSustainScore reference; GPUs with no rows or
+// no positive samples are omitted so they fall back to DefaultPowerLimitW.
+//
+// Calibration is best-effort: if dcgmi is not on PATH or the command returns an
+// error, the reason is logged via logFunc and an empty, non-nil map is returned.
+func runBenchmarkPowerCalibration(
+	ctx context.Context,
+	verboseLog, runDir string,
+	gpuIndices []int,
+	logFunc func(string),
+) map[int]float64 {
+	const calibDurationSec = 45
+
+	// Skip calibration when the dcgmi binary cannot be resolved on PATH.
+	if _, err := exec.LookPath("dcgmi"); err != nil {
+		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
+		return map[int]float64{}
+	}
+
+	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
+
+	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644) // write error deliberately ignored: the log copy is best-effort
+	if err != nil {
+		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
+		return map[int]float64{}
+	}
+
+	// Per GPU: keep the positive PowerW samples and record their p95, if any.
+	result := make(map[int]float64, len(gpuIndices))
+	for _, idx := range gpuIndices {
+		perGPU := filterRowsByGPU(rows, idx)
+		if len(perGPU) == 0 {
+			continue
+		}
+		powers := make([]float64, 0, len(perGPU))
+		for _, r := range perGPU {
+			if r.PowerW > 0 { // zero or negative readings are excluded from the percentile
+				powers = append(powers, r.PowerW)
+			}
+		}
+		if len(powers) == 0 {
+			continue
+		}
+		p95 := benchmarkPercentile(powers, 95)
+		if p95 > 0 {
+			result[idx] = p95
+			logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
+		}
+	}
+	return result
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
 	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
 	MultiprocessorCount    int                        `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW     float64                    `json:"default_power_limit_w,omitempty"`
+	// CalibratedPeakPowerW is the p95 power measured during a short
+	// dcgmi targeted_power calibration run before the main benchmark.
+	// Used as the reference denominator for PowerSustainScore instead of
+	// the hardware default limit, which bee-gpu-burn cannot reach.
+	CalibratedPeakPowerW   float64                    `json:"calibrated_peak_power_w,omitempty"`
 	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz   float64                    `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`