Add power calibration step to benchmark; fix PowerSustainScore reference
Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because bee-gpu-burn is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped without failing the benchmark (a log line records the fallback). Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
|
||||||
|
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
|
||||||
|
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
|
||||||
|
|
||||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||||
cpuStopCh := make(chan struct{})
|
cpuStopCh := make(chan struct{})
|
||||||
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||||
|
|
||||||
if opts.ParallelGPUs {
|
if opts.ParallelGPUs {
|
||||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
|
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||||
|
gpuResult.CalibratedPeakPowerW = w
|
||||||
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
score.ComputeScore += precision.TeraOpsPerSec
|
score.ComputeScore += precision.TeraOpsPerSec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Use default power limit for sustain score so a manually reduced limit
|
// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
|
||||||
// does not inflate the score. Fall back to enforced limit if default unknown.
|
// load) as the reference — it reflects what this GPU actually reaches under a
|
||||||
referencePowerW := gpu.DefaultPowerLimitW
|
// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
|
||||||
|
// cannot reach. Fall back to default limit, then enforced limit.
|
||||||
|
referencePowerW := gpu.CalibratedPeakPowerW
|
||||||
|
if referencePowerW <= 0 {
|
||||||
|
referencePowerW = gpu.DefaultPowerLimitW
|
||||||
|
}
|
||||||
if referencePowerW <= 0 {
|
if referencePowerW <= 0 {
|
||||||
referencePowerW = gpu.PowerLimitW
|
referencePowerW = gpu.PowerLimitW
|
||||||
}
|
}
|
||||||
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||||||
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
|
// Weights after introducing calibrated power reference:
|
||||||
|
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||||
|
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||||||
|
// stability 0.25 — clock/power variance matters for reproducibility
|
||||||
|
// power 0.15 — honest with calibrated reference; lower because
|
||||||
|
// bee-gpu-burn is compute-only (not mem+compute like TDP test)
|
||||||
|
// NCCL bonus 0.10 — interconnect health
|
||||||
|
// cap 1.10
|
||||||
|
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||||||
if score.InterconnectScore > 0 {
|
if score.InterconnectScore > 0 {
|
||||||
quality += 0.10
|
quality += 0.10
|
||||||
}
|
}
|
||||||
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
|
|||||||
spec benchmarkProfileSpec,
|
spec benchmarkProfileSpec,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
result *NvidiaBenchmarkResult,
|
result *NvidiaBenchmarkResult,
|
||||||
|
calibPowerByIndex map[int]float64,
|
||||||
serverIdleW *float64, serverLoadedWSum *float64,
|
serverIdleW *float64, serverLoadedWSum *float64,
|
||||||
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||||
) {
|
) {
|
||||||
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
|
|||||||
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
|
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||||
|
r.CalibratedPeakPowerW = w
|
||||||
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
|||||||
}
|
}
|
||||||
return cl
|
return cl
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
|
||||||
|
// collecting nvidia-smi power samples in parallel. It returns a map from GPU
|
||||||
|
// index to p95 observed power (watts), which is used as the reference for
|
||||||
|
// PowerSustainScore instead of the hardware default limit.
|
||||||
|
//
|
||||||
|
// If dcgmi is unavailable or the run fails the function returns an empty map
|
||||||
|
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
|
||||||
|
// gracefully — it must never block or fail the main benchmark.
|
||||||
|
func runBenchmarkPowerCalibration(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, runDir string,
|
||||||
|
gpuIndices []int,
|
||||||
|
logFunc func(string),
|
||||||
|
) map[int]float64 {
|
||||||
|
const calibDurationSec = 45
|
||||||
|
|
||||||
|
// dcgmi must be present.
|
||||||
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
|
return map[int]float64{}
|
||||||
|
}
|
||||||
|
|
||||||
|
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
|
||||||
|
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||||||
|
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
|
||||||
|
if err != nil {
|
||||||
|
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
|
||||||
|
return map[int]float64{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group rows by GPU index and compute p95 power for each.
|
||||||
|
result := make(map[int]float64, len(gpuIndices))
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
perGPU := filterRowsByGPU(rows, idx)
|
||||||
|
if len(perGPU) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
powers := make([]float64, 0, len(perGPU))
|
||||||
|
for _, r := range perGPU {
|
||||||
|
if r.PowerW > 0 {
|
||||||
|
powers = append(powers, r.PowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(powers) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
p95 := benchmarkPercentile(powers, 95)
|
||||||
|
if p95 > 0 {
|
||||||
|
result[idx] = p95
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|||||||
@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
|
|||||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
|
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||||
|
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
|
|||||||
Reference in New Issue
Block a user