Redesign power and performance benchmarks with new methodology
Power/Thermal Fit: cumulative fixed-limit ramp in which each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of the stable limits across all GPUs. Remove PlatformPowerScore from the power test.

Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute parallel ramp-up steps for k=2..N and compute PlatformPowerScore = mean compute scalability vs. the best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration is absent.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -304,18 +304,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
|
// No power calibration before performance benchmark — GPUs run at their
|
||||||
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
|
// default power limits. PowerSustainScore is derived from steady-state power
|
||||||
calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
|
// observed during the benchmark itself.
|
||||||
restoreActions = append(restoreActions, powerRestoreActions...)
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult)
|
||||||
for _, idx := range selected {
|
|
||||||
if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
|
|
||||||
result.Warnings = append(result.Warnings, fmt.Sprintf(
|
|
||||||
"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
|
|
||||||
idx, calib.AppliedPowerLimitW,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||||
cpuStopCh := make(chan struct{})
|
cpuStopCh := make(chan struct{})
|
||||||
@@ -531,6 +523,69 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
} // end sequential path
|
} // end sequential path
|
||||||
|
|
||||||
|
// Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs
|
||||||
|
// and measure compute scalability relative to the best single-GPU result.
|
||||||
|
// Only runs in sequential mode (each GPU was tested individually above) and
|
||||||
|
// when there are at least 2 GPUs.
|
||||||
|
if !opts.ParallelGPUs && len(selected) >= 2 {
|
||||||
|
// Find the best single-card SyntheticScore as the 1-GPU baseline.
|
||||||
|
var bestTOPS float64
|
||||||
|
for _, g := range result.GPUs {
|
||||||
|
if g.Scores.SyntheticScore > bestTOPS {
|
||||||
|
bestTOPS = g.Scores.SyntheticScore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if bestTOPS > 0 {
|
||||||
|
var rampSteps []NvidiaPerformanceRampStep
|
||||||
|
var scalabilityPcts []float64
|
||||||
|
for k := 2; k <= len(selected); k++ {
|
||||||
|
subset := append([]int(nil), selected[:k]...)
|
||||||
|
rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k))
|
||||||
|
_ = os.MkdirAll(rampDir, 0755)
|
||||||
|
logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k))
|
||||||
|
|
||||||
|
var rampResult NvidiaBenchmarkResult
|
||||||
|
var rampIdleW, rampLoadedWSum float64
|
||||||
|
var rampIdleOK, rampLoadedOK bool
|
||||||
|
var rampLoadedSamples int
|
||||||
|
var rampMetricRows []GPUMetricRow
|
||||||
|
var rampTimelineSec float64
|
||||||
|
emptyCalib := make(map[int]benchmarkPowerCalibrationResult)
|
||||||
|
|
||||||
|
runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc,
|
||||||
|
&rampResult, emptyCalib,
|
||||||
|
&rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples,
|
||||||
|
&rampMetricRows, &rampTimelineSec, "")
|
||||||
|
|
||||||
|
var totalSynth, totalMixed float64
|
||||||
|
for _, g := range rampResult.GPUs {
|
||||||
|
totalSynth += g.Scores.SyntheticScore
|
||||||
|
totalMixed += g.Scores.MixedScore
|
||||||
|
}
|
||||||
|
scalPct := totalSynth / (float64(k) * bestTOPS) * 100
|
||||||
|
scalabilityPcts = append(scalabilityPcts, scalPct)
|
||||||
|
|
||||||
|
stepStatus := "OK"
|
||||||
|
if len(rampResult.GPUs) < k {
|
||||||
|
stepStatus = "PARTIAL"
|
||||||
|
}
|
||||||
|
rampSteps = append(rampSteps, NvidiaPerformanceRampStep{
|
||||||
|
StepIndex: k,
|
||||||
|
GPUIndices: subset,
|
||||||
|
TotalSyntheticTOPS: totalSynth,
|
||||||
|
TotalMixedTOPS: totalMixed,
|
||||||
|
ScalabilityPct: scalPct,
|
||||||
|
Status: stepStatus,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
result.PerformanceRampSteps = rampSteps
|
||||||
|
result.PlatformPowerScore = benchmarkMean(scalabilityPcts)
|
||||||
|
if len(scalabilityPcts) > 0 {
|
||||||
|
result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if len(selected) > 1 && opts.RunNCCL {
|
if len(selected) > 1 && opts.RunNCCL {
|
||||||
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
@@ -1344,20 +1399,25 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
case score.MixedScore > 0:
|
case score.MixedScore > 0:
|
||||||
score.ComputeScore = score.MixedScore
|
score.ComputeScore = score.MixedScore
|
||||||
}
|
}
|
||||||
// PowerSustainScore: measures how close the GPU came to its rated TDP under
|
// PowerSustainScore: measures how close the GPU came to its rated TDP during
|
||||||
// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
|
// steady-state benchmark load. 100 = exactly at rated TDP.
|
||||||
// Penalty applied symmetrically for both under- and over-TDP deviations:
|
// Penalty applied symmetrically for both under- and over-TDP deviations:
|
||||||
// score = max(0, 100 − |measured − rated| / rated × 100)
|
// score = max(0, 100 − |measured − rated| / rated × 100)
|
||||||
// Under-TDP → power delivery / cooling issue.
|
// Under-TDP → power delivery / cooling issue.
|
||||||
// Over-TDP → power limit not properly enforced / power regulation fault.
|
// Over-TDP → power limit not properly enforced / power regulation fault.
|
||||||
// Falls back to 0 if calibration was not performed (dcgmi unavailable).
|
// Uses CalibratedPeakPowerW when available (from external power calibration),
|
||||||
|
// otherwise falls back to Steady.AvgPowerW observed during the benchmark.
|
||||||
{
|
{
|
||||||
ref := gpu.DefaultPowerLimitW
|
ref := gpu.DefaultPowerLimitW
|
||||||
if ref <= 0 {
|
if ref <= 0 {
|
||||||
ref = gpu.PowerLimitW
|
ref = gpu.PowerLimitW
|
||||||
}
|
}
|
||||||
if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
|
measured := gpu.CalibratedPeakPowerW
|
||||||
deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
|
if measured <= 0 {
|
||||||
|
measured = gpu.Steady.AvgPowerW
|
||||||
|
}
|
||||||
|
if measured > 0 && ref > 0 {
|
||||||
|
deviationPct := math.Abs(measured-ref) / ref * 100
|
||||||
score.PowerSustainScore = clampScore(100 - deviationPct)
|
score.PowerSustainScore = clampScore(100 - deviationPct)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2470,6 +2530,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
gpuIndices []int,
|
gpuIndices []int,
|
||||||
infoByIndex map[int]benchmarkGPUInfo,
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
|
fixedLimits map[int]int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
||||||
const calibDurationSec = 120
|
const calibDurationSec = 120
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
@@ -2555,6 +2616,21 @@ func runBenchmarkPowerCalibration(
|
|||||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||||
}
|
}
|
||||||
|
if fixedLimits != nil {
|
||||||
|
if fixedW, ok := fixedLimits[idx]; ok {
|
||||||
|
// This GPU's limit was established in a prior ramp step and must
|
||||||
|
// remain unchanged. Apply it immediately and skip the binary search.
|
||||||
|
if canDerate && fixedW > 0 {
|
||||||
|
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
|
||||||
|
}
|
||||||
|
s.appliedLimitW = fixedW
|
||||||
|
s.calib.AppliedPowerLimitW = float64(fixedW)
|
||||||
|
s.calib.Completed = true
|
||||||
|
s.converged = true
|
||||||
|
s.calib.Notes = append(s.calib.Notes,
|
||||||
|
fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
|
||||||
|
}
|
||||||
|
}
|
||||||
states = append(states, s)
|
states = append(states, s)
|
||||||
if canDerate && originalLimitW > 0 {
|
if canDerate && originalLimitW > 0 {
|
||||||
idxCopy := idx
|
idxCopy := idx
|
||||||
@@ -2764,6 +2840,10 @@ calibDone:
|
|||||||
s.appliedLimitW = s.lo
|
s.appliedLimitW = s.lo
|
||||||
s.calib.AppliedPowerLimitW = float64(s.lo)
|
s.calib.AppliedPowerLimitW = float64(s.lo)
|
||||||
s.calib.Derated = s.lo < s.originalLimitW
|
s.calib.Derated = s.lo < s.originalLimitW
|
||||||
|
// Summary was captured when we last verified stability at s.lo,
|
||||||
|
// so the result is valid — mark as completed even though we
|
||||||
|
// converged from the failure path (tried higher, failed, fell back).
|
||||||
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||||
@@ -2846,7 +2926,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
|
||||||
if len(result.Findings) > 0 {
|
if len(result.Findings) > 0 {
|
||||||
b.WriteString("## Summary\n\n")
|
b.WriteString("## Summary\n\n")
|
||||||
for _, finding := range result.Findings {
|
for _, finding := range result.Findings {
|
||||||
@@ -2860,25 +2941,36 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
if len(result.RampSteps) > 0 {
|
if len(result.RampSteps) > 0 {
|
||||||
b.WriteString("## Ramp Sequence\n\n")
|
b.WriteString("## Ramp Sequence\n\n")
|
||||||
b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
|
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
|
||||||
b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
|
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
|
derated := "-"
|
||||||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
|
if step.Derated {
|
||||||
|
derated = "⚠ yes"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
|
||||||
|
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
b.WriteString("## Per-Slot Results\n\n")
|
b.WriteString("## Per-Slot Results\n\n")
|
||||||
b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
|
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
|
||||||
b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
|
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
|
stableLimit := "-"
|
||||||
gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
if gpu.Derated {
|
||||||
|
stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
|
||||||
|
} else {
|
||||||
|
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
|
||||||
|
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||||
|
|
||||||
for _, note := range gpu.Notes {
|
for _, note := range gpu.Notes {
|
||||||
fmt.Fprintf(&b, "- %s\n", note)
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
}
|
}
|
||||||
@@ -2893,14 +2985,22 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "platform_max_tdp_w=%.0f\n", result.PlatformMaxTDPW)
|
||||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
|
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
|
||||||
}
|
}
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
|
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||||||
}
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2953,7 +3053,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
|
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
calibByIndex[idx] = r
|
calibByIndex[idx] = r
|
||||||
@@ -3029,72 +3129,125 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
singleByIndex[gpu.Index] = gpu
|
singleByIndex[gpu.Index] = gpu
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
|
// Phase 2: cumulative thermal ramp.
|
||||||
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
|
// Each step introduces one new GPU into an environment where all previously
|
||||||
// targeted_power with derating if degradation is detected.
|
// calibrated GPUs are already running at their fixed stable limits. The new
|
||||||
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
// GPU's stable TDP is searched via binary search (targeted_power) under real
|
||||||
|
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
||||||
|
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
||||||
|
// power in the final full-system thermal state.
|
||||||
|
//
|
||||||
|
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
||||||
|
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
||||||
|
|
||||||
|
// Step 1: reuse single-card calibration result directly.
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
firstIdx := result.RecommendedSlotOrder[0]
|
||||||
|
firstCalib := calibByIndex[firstIdx]
|
||||||
|
stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
|
||||||
|
ramp := NvidiaPowerBenchStep{
|
||||||
|
StepIndex: 1,
|
||||||
|
GPUIndices: []int{firstIdx},
|
||||||
|
NewGPUIndex: firstIdx,
|
||||||
|
NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
|
||||||
|
TotalObservedPowerW: firstCalib.Summary.P95PowerW,
|
||||||
|
AvgObservedPowerW: firstCalib.Summary.P95PowerW,
|
||||||
|
Derated: firstCalib.Derated,
|
||||||
|
Status: "OK",
|
||||||
|
}
|
||||||
|
if !firstCalib.Completed {
|
||||||
|
ramp.Status = "FAILED"
|
||||||
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
|
} else if firstCalib.Derated {
|
||||||
|
ramp.Status = "PARTIAL"
|
||||||
|
if result.OverallStatus == "OK" {
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
|
}
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW))
|
||||||
|
}
|
||||||
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W",
|
||||||
|
len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Steps 2..N: each step fixes previously calibrated GPUs and searches only
|
||||||
|
// the new GPU's stable limit in the combined thermal environment.
|
||||||
|
for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
|
||||||
|
step := stepNum + 1
|
||||||
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
||||||
|
newGPUIdx := result.RecommendedSlotOrder[stepNum]
|
||||||
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
||||||
_ = os.MkdirAll(stepDir, 0755)
|
_ = os.MkdirAll(stepDir, 0755)
|
||||||
var stepCalib map[int]benchmarkPowerCalibrationResult
|
|
||||||
if step == 1 {
|
// All previously calibrated GPUs are fixed at their stable limits.
|
||||||
// Single-GPU step — already measured in phase 1; reuse directly.
|
fixedForStep := make(map[int]int, len(stableLimits))
|
||||||
stepCalib = calibByIndex
|
for k, v := range stableLimits {
|
||||||
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
|
fixedForStep[k] = v
|
||||||
} else {
|
}
|
||||||
|
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)",
|
||||||
|
step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep)))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
var stepRestore []benchmarkRestoreAction
|
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep)
|
||||||
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
for i := len(stepRestore) - 1; i >= 0; i-- {
|
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||||
stepRestore[i].fn()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ramp := NvidiaPowerBenchStep{
|
ramp := NvidiaPowerBenchStep{
|
||||||
StepIndex: step,
|
StepIndex: step,
|
||||||
GPUIndices: subset,
|
GPUIndices: subset,
|
||||||
|
NewGPUIndex: newGPUIdx,
|
||||||
Status: "OK",
|
Status: "OK",
|
||||||
}
|
}
|
||||||
var realizationValues []float64
|
|
||||||
|
// Total observed power = sum of p95 across all GPUs in this step.
|
||||||
for _, idx := range subset {
|
for _, idx := range subset {
|
||||||
calib := stepCalib[idx]
|
if c, ok := stepCalib[idx]; ok {
|
||||||
ramp.TotalObservedPowerW += calib.Summary.P95PowerW
|
ramp.TotalObservedPowerW += c.Summary.P95PowerW
|
||||||
if calib.Derated {
|
|
||||||
ramp.DeratedGPUCount++
|
|
||||||
ramp.Status = "PARTIAL"
|
|
||||||
}
|
|
||||||
if !calib.Completed {
|
|
||||||
ramp.Status = "FAILED"
|
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
|
|
||||||
realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
|
|
||||||
realizationValues = append(realizationValues, realization)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(subset) > 0 {
|
if len(subset) > 0 {
|
||||||
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
|
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
|
||||||
}
|
}
|
||||||
if len(realizationValues) > 0 {
|
|
||||||
ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
|
// Determine stable limit for the new GPU.
|
||||||
ramp.MinPowerRealizationPct = realizationValues[0]
|
if c, ok := stepCalib[newGPUIdx]; ok && c.Completed {
|
||||||
for _, v := range realizationValues[1:] {
|
stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW))
|
||||||
if v < ramp.MinPowerRealizationPct {
|
ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
|
||||||
ramp.MinPowerRealizationPct = v
|
ramp.Derated = c.Derated
|
||||||
}
|
if c.Derated {
|
||||||
}
|
ramp.Status = "PARTIAL"
|
||||||
}
|
|
||||||
if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
|
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
|
|
||||||
if result.OverallStatus == "OK" {
|
if result.OverallStatus == "OK" {
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
}
|
}
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
if ramp.DeratedGPUCount > 0 {
|
} else {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
|
// Calibration failed — fall back to single-card limit.
|
||||||
|
fb := calibByIndex[newGPUIdx]
|
||||||
|
stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW))
|
||||||
|
ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW
|
||||||
|
ramp.Status = "FAILED"
|
||||||
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW))
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
}
|
}
|
||||||
|
|
||||||
result.RampSteps = append(result.RampSteps, ramp)
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
||||||
|
for i := range result.GPUs {
|
||||||
|
if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
|
||||||
|
result.GPUs[i].StablePowerLimitW = float64(lim)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
|
||||||
|
// budget of this server with all GPUs running simultaneously without throttling.
|
||||||
|
for _, lim := range stableLimits {
|
||||||
|
result.PlatformMaxTDPW += float64(lim)
|
||||||
|
}
|
||||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("marshal power result: %w", err)
|
return "", fmt.Errorf("marshal power result: %w", err)
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if result.ScalabilityScore > 0 {
|
if result.ScalabilityScore > 0 {
|
||||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||||
}
|
}
|
||||||
|
if result.PlatformPowerScore > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
@@ -329,6 +332,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||||
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
|
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||||
|
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
||||||
|
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
||||||
|
for _, step := range result.PerformanceRampSteps {
|
||||||
|
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
||||||
|
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Raw Files\n\n")
|
b.WriteString("## Raw Files\n\n")
|
||||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||||
|
|||||||
@@ -65,6 +65,11 @@ type NvidiaBenchmarkResult struct {
|
|||||||
RampTotal int `json:"ramp_total,omitempty"`
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
@@ -265,6 +270,10 @@ type NvidiaPowerBenchResult struct {
|
|||||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
|
// this server under full GPU load. Use for rack power planning.
|
||||||
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
}
|
}
|
||||||
@@ -274,7 +283,14 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
BusID string `json:"bus_id,omitempty"`
|
BusID string `json:"bus_id,omitempty"`
|
||||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
|
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||||
|
// additional derating.
|
||||||
|
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
@@ -288,11 +304,29 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
StepIndex int `json:"step_index"`
|
StepIndex int `json:"step_index"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
|
Status string `json:"status"`
|
||||||
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
|
type NvidiaPerformanceRampStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user