Add power calibration step to benchmark; fix PowerSustainScore reference
Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because it is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped with a log message and never fails the benchmark. Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}()
|
||||
|
||||
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
|
||||
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
|
||||
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
|
||||
|
||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||
cpuStopCh := make(chan struct{})
|
||||
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||
|
||||
if opts.ParallelGPUs {
|
||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||
} else {
|
||||
|
||||
for _, idx := range selected {
|
||||
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
}
|
||||
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||
gpuResult.CalibratedPeakPowerW = w
|
||||
}
|
||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
score.ComputeScore += precision.TeraOpsPerSec
|
||||
}
|
||||
}
|
||||
// Use default power limit for sustain score so a manually reduced limit
|
||||
// does not inflate the score. Fall back to enforced limit if default unknown.
|
||||
referencePowerW := gpu.DefaultPowerLimitW
|
||||
// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
|
||||
// load) as the reference — it reflects what this GPU actually reaches under a
|
||||
// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
|
||||
// cannot reach. Fall back to default limit, then enforced limit.
|
||||
referencePowerW := gpu.CalibratedPeakPowerW
|
||||
if referencePowerW <= 0 {
|
||||
referencePowerW = gpu.DefaultPowerLimitW
|
||||
}
|
||||
if referencePowerW <= 0 {
|
||||
referencePowerW = gpu.PowerLimitW
|
||||
}
|
||||
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
}
|
||||
|
||||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||||
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
|
||||
// Weights after introducing calibrated power reference:
|
||||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||||
// stability 0.25 — clock/power variance matters for reproducibility
|
||||
// power 0.15 — honest with calibrated reference; lower because
|
||||
// bee-gpu-burn is compute-only (not mem+compute like TDP test)
|
||||
// NCCL bonus 0.10 — interconnect health
|
||||
// cap 1.10
|
||||
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||||
if score.InterconnectScore > 0 {
|
||||
quality += 0.10
|
||||
}
|
||||
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
|
||||
spec benchmarkProfileSpec,
|
||||
logFunc func(string),
|
||||
result *NvidiaBenchmarkResult,
|
||||
calibPowerByIndex map[int]float64,
|
||||
serverIdleW *float64, serverLoadedWSum *float64,
|
||||
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||
) {
|
||||
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
|
||||
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
}
|
||||
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||
r.CalibratedPeakPowerW = w
|
||||
}
|
||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||
}
|
||||
return cl
|
||||
}
|
||||
|
||||
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
|
||||
// collecting nvidia-smi power samples in parallel. It returns a map from GPU
|
||||
// index to p95 observed power (watts), which is used as the reference for
|
||||
// PowerSustainScore instead of the hardware default limit.
|
||||
//
|
||||
// If dcgmi is unavailable or the run fails the function returns an empty map
|
||||
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
|
||||
// gracefully — it must never block or fail the main benchmark.
|
||||
func runBenchmarkPowerCalibration(
|
||||
ctx context.Context,
|
||||
verboseLog, runDir string,
|
||||
gpuIndices []int,
|
||||
logFunc func(string),
|
||||
) map[int]float64 {
|
||||
const calibDurationSec = 45
|
||||
|
||||
// dcgmi must be present.
|
||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]float64{}
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
|
||||
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
|
||||
if err != nil {
|
||||
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
|
||||
return map[int]float64{}
|
||||
}
|
||||
|
||||
// Group rows by GPU index and compute p95 power for each.
|
||||
result := make(map[int]float64, len(gpuIndices))
|
||||
for _, idx := range gpuIndices {
|
||||
perGPU := filterRowsByGPU(rows, idx)
|
||||
if len(perGPU) == 0 {
|
||||
continue
|
||||
}
|
||||
powers := make([]float64, 0, len(perGPU))
|
||||
for _, r := range perGPU {
|
||||
if r.PowerW > 0 {
|
||||
powers = append(powers, r.PowerW)
|
||||
}
|
||||
}
|
||||
if len(powers) == 0 {
|
||||
continue
|
||||
}
|
||||
p95 := benchmarkPercentile(powers, 95)
|
||||
if p95 > 0 {
|
||||
result[idx] = p95
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user