Add power calibration step to benchmark; fix PowerSustainScore reference

Before the per-GPU compute phases, run `dcgmi diag -r targeted_power`
for 45 s while collecting nvidia-smi power metrics in parallel.
The p95 power per GPU is stored as calibrated_peak_power_w and used
as the denominator for PowerSustainScore instead of the hardware default
limit, which bee-gpu-burn cannot reach because it is compute-only.

Fallback chain: calibrated peak → default limit → enforced limit.
If dcgmi is absent or the run fails, calibration is skipped with a log message and the benchmark continues.

Adjust composite score weights to match the new honest power reference:
  base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10.
Power weight reduced (0.20→0.15) because even with a calibrated reference
bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 22:06:46 +03:00
parent 9e3dcf9b4d
commit f4a19c0a00
2 changed files with 92 additions and 5 deletions

View File

@@ -7,6 +7,7 @@ import (
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
}()
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
// Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{})
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
} else {
for _, idx := range selected {
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
gpuResult.CalibratedPeakPowerW = w
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score.ComputeScore += precision.TeraOpsPerSec
}
}
// Use default power limit for sustain score so a manually reduced limit
// does not inflate the score. Fall back to enforced limit if default unknown.
referencePowerW := gpu.DefaultPowerLimitW
// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
// load) as the reference — it reflects what this GPU actually reaches under a
// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
// cannot reach. Fall back to default limit, then enforced limit.
referencePowerW := gpu.CalibratedPeakPowerW
if referencePowerW <= 0 {
referencePowerW = gpu.DefaultPowerLimitW
}
if referencePowerW <= 0 {
referencePowerW = gpu.PowerLimitW
}
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
}
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
// Weights after introducing calibrated power reference:
// base 0.35 — floor so a GPU that fails all sustain checks still scores
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
// stability 0.25 — clock/power variance matters for reproducibility
// power 0.15 — honest with calibrated reference; lower because
// bee-gpu-burn is compute-only (not mem+compute like TDP test)
// NCCL bonus 0.10 — interconnect health
// cap 1.10
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
if score.InterconnectScore > 0 {
quality += 0.10
}
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
spec benchmarkProfileSpec,
logFunc func(string),
result *NvidiaBenchmarkResult,
calibPowerByIndex map[int]float64,
serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
) {
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
r.CalibratedPeakPowerW = w
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
}
return cl
}
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power diagnostic
// while power metrics are sampled in parallel, and returns a map from GPU
// index to the p95 of the positive power samples (watts) observed for that
// GPU. The value is intended as the reference for PowerSustainScore instead
// of the hardware default limit.
//
// Parameters:
//   - ctx: passed through to the command runner.
//   - verboseLog, runDir: forwarded to the command runner; runDir is also where
//     power-calibration.log is written.
//   - gpuIndices: GPUs to load and to compute a p95 for.
//   - logFunc: receives progress and skip messages.
//
// Calibration is best-effort and never fails the benchmark: if dcgmi is not
// on PATH or the run returns an error, an empty (non-nil) map is returned. A
// GPU with no positive power samples is left out of the map. In both cases
// the caller is expected to fall back to another power reference.
func runBenchmarkPowerCalibration(
	ctx context.Context,
	verboseLog, runDir string,
	gpuIndices []int,
	logFunc func(string),
) map[int]float64 {
	const (
		calibDurationSec = 45
		calibLogName     = "power-calibration.log"
	)

	// dcgmi must be present.
	if _, err := exec.LookPath("dcgmi"); err != nil {
		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
		return map[int]float64{}
	}

	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))

	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, calibLogName, cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)

	// Keep the raw output even when the run failed: it is the only diagnostic
	// for a broken calibration. A failed write is reported but not fatal.
	if writeErr := os.WriteFile(filepath.Join(runDir, calibLogName), out, 0644); writeErr != nil {
		logFunc(fmt.Sprintf("power calibration: could not save %s (%v)", calibLogName, writeErr))
	}
	if err != nil {
		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
		return map[int]float64{}
	}

	// Group rows by GPU index and compute p95 power for each.
	result := make(map[int]float64, len(gpuIndices))
	for _, idx := range gpuIndices {
		perGPU := filterRowsByGPU(rows, idx)
		powers := make([]float64, 0, len(perGPU))
		for _, r := range perGPU {
			// Non-positive readings are treated as missing samples.
			if r.PowerW > 0 {
				powers = append(powers, r.PowerW)
			}
		}
		if len(powers) == 0 {
			logFunc(fmt.Sprintf("power calibration: GPU %d has no usable power samples, no calibrated reference", idx))
			continue
		}
		p95 := benchmarkPercentile(powers, 95)
		if p95 <= 0 {
			continue
		}
		result[idx] = p95
		logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
	}
	return result
}

View File

@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
PowerLimitW float64 `json:"power_limit_w,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short
// dcgmi targeted_power calibration run before the main benchmark.
// Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`