Add power calibration step to benchmark; fix PowerSustainScore reference
Before the per-GPU compute phases, run `dcgmi diag -r targeted_power` for 45 s while collecting nvidia-smi power metrics in parallel. The p95 power per GPU is stored as calibrated_peak_power_w and used as the denominator for PowerSustainScore instead of the hardware default limit, which bee-gpu-burn cannot reach because bee-gpu-burn is compute-only. Fallback chain: calibrated peak → default limit → enforced limit. If dcgmi is absent or the run fails, calibration is skipped without failing the benchmark (a log line records the fallback). Adjust composite score weights to match the new honest power reference: base 0.35, thermal 0.25, stability 0.25, power 0.15, NCCL bonus 0.10. Power weight reduced (0.20→0.15) because even with a calibrated reference bee-gpu-burn reaches ~60-75% of TDP by design (no concurrent mem stress). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
|
||||||
|
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
|
||||||
|
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
|
||||||
|
|
||||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||||
cpuStopCh := make(chan struct{})
|
cpuStopCh := make(chan struct{})
|
||||||
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||||
|
|
||||||
if opts.ParallelGPUs {
|
if opts.ParallelGPUs {
|
||||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
|
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||||
|
gpuResult.CalibratedPeakPowerW = w
|
||||||
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
score.ComputeScore += precision.TeraOpsPerSec
|
score.ComputeScore += precision.TeraOpsPerSec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Use default power limit for sustain score so a manually reduced limit
|
// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
|
||||||
// does not inflate the score. Fall back to enforced limit if default unknown.
|
// load) as the reference — it reflects what this GPU actually reaches under a
|
||||||
referencePowerW := gpu.DefaultPowerLimitW
|
// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
|
||||||
|
// cannot reach. Fall back to default limit, then enforced limit.
|
||||||
|
referencePowerW := gpu.CalibratedPeakPowerW
|
||||||
|
if referencePowerW <= 0 {
|
||||||
|
referencePowerW = gpu.DefaultPowerLimitW
|
||||||
|
}
|
||||||
if referencePowerW <= 0 {
|
if referencePowerW <= 0 {
|
||||||
referencePowerW = gpu.PowerLimitW
|
referencePowerW = gpu.PowerLimitW
|
||||||
}
|
}
|
||||||
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||||||
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
|
// Weights after introducing calibrated power reference:
|
||||||
|
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||||
|
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||||||
|
// stability 0.25 — clock/power variance matters for reproducibility
|
||||||
|
// power 0.15 — honest with calibrated reference; lower because
|
||||||
|
// bee-gpu-burn is compute-only (not mem+compute like TDP test)
|
||||||
|
// NCCL bonus 0.10 — interconnect health
|
||||||
|
// cap 1.10
|
||||||
|
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||||||
if score.InterconnectScore > 0 {
|
if score.InterconnectScore > 0 {
|
||||||
quality += 0.10
|
quality += 0.10
|
||||||
}
|
}
|
||||||
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
|
|||||||
spec benchmarkProfileSpec,
|
spec benchmarkProfileSpec,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
result *NvidiaBenchmarkResult,
|
result *NvidiaBenchmarkResult,
|
||||||
|
calibPowerByIndex map[int]float64,
|
||||||
serverIdleW *float64, serverLoadedWSum *float64,
|
serverIdleW *float64, serverLoadedWSum *float64,
|
||||||
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||||
) {
|
) {
|
||||||
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
|
|||||||
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
|
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
|
||||||
|
r.CalibratedPeakPowerW = w
|
||||||
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
|||||||
}
|
}
|
||||||
return cl
|
return cl
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
|
||||||
|
// collecting nvidia-smi power samples in parallel. It returns a map from GPU
|
||||||
|
// index to p95 observed power (watts), which is used as the reference for
|
||||||
|
// PowerSustainScore instead of the hardware default limit.
|
||||||
|
//
|
||||||
|
// If dcgmi is unavailable or the run fails the function returns an empty map
|
||||||
|
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
|
||||||
|
// gracefully — it must never block or fail the main benchmark.
|
||||||
|
func runBenchmarkPowerCalibration(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, runDir string,
|
||||||
|
gpuIndices []int,
|
||||||
|
logFunc func(string),
|
||||||
|
) map[int]float64 {
|
||||||
|
const calibDurationSec = 45
|
||||||
|
|
||||||
|
// dcgmi must be present.
|
||||||
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
|
return map[int]float64{}
|
||||||
|
}
|
||||||
|
|
||||||
|
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
|
||||||
|
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||||||
|
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
|
||||||
|
if err != nil {
|
||||||
|
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
|
||||||
|
return map[int]float64{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Group rows by GPU index and compute p95 power for each.
|
||||||
|
result := make(map[int]float64, len(gpuIndices))
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
perGPU := filterRowsByGPU(rows, idx)
|
||||||
|
if len(perGPU) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
powers := make([]float64, 0, len(perGPU))
|
||||||
|
for _, r := range perGPU {
|
||||||
|
if r.PowerW > 0 {
|
||||||
|
powers = append(powers, r.PowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(powers) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
p95 := benchmarkPercentile(powers, 95)
|
||||||
|
if p95 > 0 {
|
||||||
|
result[idx] = p95
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|||||||
@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
|
|||||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
|
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||||
|
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
|
|||||||
Reference in New Issue
Block a user