diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index 8dfbadb..586a290 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"regexp"
 	"sort"
@@ -153,12 +154,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}()
 
+	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
+	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
+	calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
+
 	// Start background CPU load sampler — samples every 10s during GPU phases.
 	cpuStopCh := make(chan struct{})
 	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
 
 	if opts.ParallelGPUs {
-		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {
 		for _, idx := range selected {
@@ -178,6 +183,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
+		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+			gpuResult.CalibratedPeakPowerW = w
+		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -849,9 +857,14 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 			score.ComputeScore += precision.TeraOpsPerSec
 		}
 	}
-	// Use default power limit for sustain score so a manually reduced limit
-	// does not inflate the score. Fall back to enforced limit if default unknown.
-	referencePowerW := gpu.DefaultPowerLimitW
+	// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
+	// load) as the reference — it reflects what this GPU actually reaches under a
+	// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
+	// cannot reach. Fall back to default limit, then enforced limit.
+	referencePowerW := gpu.CalibratedPeakPowerW
+	if referencePowerW <= 0 {
+		referencePowerW = gpu.DefaultPowerLimitW
+	}
 	if referencePowerW <= 0 {
 		referencePowerW = gpu.PowerLimitW
 	}
@@ -870,7 +883,15 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 }
 
 func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
-	quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
+	// Weights after introducing the calibrated power reference:
+	//   base       0.35 — floor so a GPU that fails all sustain checks still scores something
+	//   thermal    0.25 — heaviest: throttle counters are the most reliable signal
+	//   stability  0.25 — clock/power variance matters for reproducibility
+	//   power      0.15 — honest with the calibrated reference; lower because
+	//                     bee-gpu-burn is compute-only (not mem+compute like the TDP test)
+	//   NCCL bonus 0.10 — interconnect health
+	//   cap 1.10
+	quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
 	if score.InterconnectScore > 0 {
 		quality += 0.10
 	}
@@ -1418,6 +1439,7 @@ func runNvidiaBenchmarkParallel(
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
+	calibPowerByIndex map[int]float64,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 ) {
@@ -1439,6 +1461,9 @@ func runNvidiaBenchmarkParallel(
 		r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 		r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 	}
+	if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
+		r.CalibratedPeakPowerW = w
+	}
 	if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 		r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
 		r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
@@ -1765,3 +1790,60 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
 	}
 	return cl
 }
+
+// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
+// collecting nvidia-smi power samples in parallel. It returns a map from GPU
+// index to p95 observed power (watts), which is used as the reference for
+// PowerSustainScore instead of the hardware default limit.
+//
+// If dcgmi is unavailable or the run fails, the function returns an empty map
+// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
+// gracefully — it must never block or fail the main benchmark.
+func runBenchmarkPowerCalibration(
+	ctx context.Context,
+	verboseLog, runDir string,
+	gpuIndices []int,
+	logFunc func(string),
+) map[int]float64 {
+	const calibDurationSec = 45
+
+	// dcgmi must be present.
+	if _, err := exec.LookPath("dcgmi"); err != nil {
+		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
+		return map[int]float64{}
+	}
+
+	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
+
+	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
+	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
+	if err != nil {
+		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
+		return map[int]float64{}
+	}
+
+	// Group rows by GPU index and compute p95 power for each.
+	result := make(map[int]float64, len(gpuIndices))
+	for _, idx := range gpuIndices {
+		perGPU := filterRowsByGPU(rows, idx)
+		if len(perGPU) == 0 {
+			continue
+		}
+		powers := make([]float64, 0, len(perGPU))
+		for _, r := range perGPU {
+			if r.PowerW > 0 {
+				powers = append(powers, r.PowerW)
+			}
+		}
+		if len(powers) == 0 {
+			continue
+		}
+		p95 := benchmarkPercentile(powers, 95)
+		if p95 > 0 {
+			result[idx] = p95
+			logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
+		}
+	}
+	return result
+}
diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go
index b309e0a..447a08c 100644
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -88,6 +88,11 @@ type BenchmarkGPUResult struct {
 	PowerLimitW          float64 `json:"power_limit_w,omitempty"`
 	MultiprocessorCount  int     `json:"multiprocessor_count,omitempty"`
 	DefaultPowerLimitW   float64 `json:"default_power_limit_w,omitempty"`
+	// CalibratedPeakPowerW is the p95 power measured during a short
+	// dcgmi targeted_power calibration run before the main benchmark.
+	// Used as the reference denominator for PowerSustainScore instead of
+	// the hardware default limit, which bee-gpu-burn cannot reach.
+	CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
 	MaxGraphicsClockMHz  float64 `json:"max_graphics_clock_mhz,omitempty"`
 	BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
 	MaxMemoryClockMHz    float64 `json:"max_memory_clock_mhz,omitempty"`
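
Note: the calibration's p95 comes from a percentile helper defined elsewhere in
benchmark.go and not shown in this diff. For review context, here is a minimal
sketch of what benchmarkPercentile is assumed to do, using the nearest-rank
method; the package name, signature, and body are assumptions for illustration,
not the actual implementation (which may interpolate between samples instead).

package platform

import (
	"math"
	"sort"
)

// benchmarkPercentile returns the p-th percentile of values using the
// nearest-rank method. The input slice is copied, not mutated.
// Assumed sketch; the real helper in benchmark.go may differ.
func benchmarkPercentile(values []float64, p float64) float64 {
	if len(values) == 0 {
		return 0
	}
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	// Nearest-rank: the smallest value with at least p percent of the
	// samples at or below it, e.g. p=95 over 40 samples -> 38th value.
	rank := int(math.Ceil(p / 100.0 * float64(len(sorted))))
	if rank < 1 {
		rank = 1
	}
	if rank > len(sorted) {
		rank = len(sorted)
	}
	return sorted[rank-1]
}

Under that assumption, a brief burst above the steady draw does not drag the
reference up: the p95 tracks what the GPU sustains under targeted_power, which
is exactly the quantity PowerSustainScore should be normalized against.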