From 732bf4cbabd497748e337dd300c9c406c8538f16 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 00:30:50 +0300 Subject: [PATCH] Redesign power and performance benchmarks with new methodology Power/Thermal Fit: cumulative fixed-limit ramp where each GPU's stable TDP is found under real multi-GPU thermal load (all prior GPUs running at their fixed limits). PlatformMaxTDPW = sum of stable limits across all GPUs. Remove PlatformPowerScore from power test. Performance Benchmark: remove pre-benchmark power calibration entirely. After N single-card runs, execute k=2..N parallel ramp-up steps and compute PlatformPowerScore = mean compute scalability vs best single-card TOPS. PowerSustainScore falls back to Steady.AvgPowerW when calibration absent. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 307 +++++++++++++++----- audit/internal/platform/benchmark_report.go | 16 + audit/internal/platform/benchmark_types.go | 56 +++- 3 files changed, 291 insertions(+), 88 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 6f1295d..055178e 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -304,18 +304,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } }() - // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. - // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. - calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) - restoreActions = append(restoreActions, powerRestoreActions...) - for _, idx := range selected { - if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 { - result.Warnings = append(result.Warnings, fmt.Sprintf( - "GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.", - idx, calib.AppliedPowerLimitW, - )) - } - } + // No power calibration before performance benchmark — GPUs run at their + // default power limits. PowerSustainScore is derived from steady-state power + // observed during the benchmark itself. + calibByIndex := make(map[int]benchmarkPowerCalibrationResult) // Start background CPU load sampler — samples every 10s during GPU phases. cpuStopCh := make(chan struct{}) @@ -531,6 +523,69 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } // end sequential path + // Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs + // and compute compute scalability relative to the best single-GPU result. + // Only runs in sequential mode (each GPU was tested individually above) and + // when there are at least 2 GPUs. + if !opts.ParallelGPUs && len(selected) >= 2 { + // Find the best single-card SyntheticScore as the 1-GPU baseline. + var bestTOPS float64 + for _, g := range result.GPUs { + if g.Scores.SyntheticScore > bestTOPS { + bestTOPS = g.Scores.SyntheticScore + } + } + if bestTOPS > 0 { + var rampSteps []NvidiaPerformanceRampStep + var scalabilityPcts []float64 + for k := 2; k <= len(selected); k++ { + subset := append([]int(nil), selected[:k]...) 
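+ // Worked example (hypothetical numbers): with bestTOPS = 100 and a
+ // k = 4 step whose GPUs total totalSynth = 360 TOPS, the step scores
+ // scalPct = 360 / (4 × 100) × 100 = 90%, i.e. the platform retains 90%
+ // of perfect linear scaling at four GPUs. PlatformPowerScore is the
+ // mean of these step percentages over k = 2..N.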
+ rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k)) + _ = os.MkdirAll(rampDir, 0755) + logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k)) + + var rampResult NvidiaBenchmarkResult + var rampIdleW, rampLoadedWSum float64 + var rampIdleOK, rampLoadedOK bool + var rampLoadedSamples int + var rampMetricRows []GPUMetricRow + var rampTimelineSec float64 + emptyCalib := make(map[int]benchmarkPowerCalibrationResult) + + runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc, + &rampResult, emptyCalib, + &rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples, + &rampMetricRows, &rampTimelineSec, "") + + var totalSynth, totalMixed float64 + for _, g := range rampResult.GPUs { + totalSynth += g.Scores.SyntheticScore + totalMixed += g.Scores.MixedScore + } + scalPct := totalSynth / (float64(k) * bestTOPS) * 100 + scalabilityPcts = append(scalabilityPcts, scalPct) + + stepStatus := "OK" + if len(rampResult.GPUs) < k { + stepStatus = "PARTIAL" + } + rampSteps = append(rampSteps, NvidiaPerformanceRampStep{ + StepIndex: k, + GPUIndices: subset, + TotalSyntheticTOPS: totalSynth, + TotalMixedTOPS: totalMixed, + ScalabilityPct: scalPct, + Status: stepStatus, + }) + } + result.PerformanceRampSteps = rampSteps + result.PlatformPowerScore = benchmarkMean(scalabilityPcts) + if len(scalabilityPcts) > 0 { + result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1] + } + } + } + if len(selected) > 1 && opts.RunNCCL { result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) if result.Interconnect != nil && result.Interconnect.Supported { @@ -1344,20 +1399,25 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { case score.MixedScore > 0: score.ComputeScore = score.MixedScore } - // PowerSustainScore: measures how close the GPU came to its rated TDP under - // a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP. + // PowerSustainScore: measures how close the GPU came to its rated TDP during + // steady-state benchmark load. 100 = exactly at rated TDP. // Penalty applied symmetrically for both under- and over-TDP deviations: // score = max(0, 100 − |measured − rated| / rated × 100) // Under-TDP → power delivery / cooling issue. // Over-TDP → power limit not properly enforced / power regulation fault. - // Falls back to 0 if calibration was not performed (dcgmi unavailable). + // Uses CalibratedPeakPowerW when available (from external power calibration), + // otherwise falls back to Steady.AvgPowerW observed during the benchmark. 
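+ // Worked example (hypothetical numbers): rated TDP 700 W, measured
+ // steady-state 650 W → deviation = |650 − 700| / 700 × 100 ≈ 7.1%,
+ // so PowerSustainScore ≈ 92.9. A GPU drawing 750 W against the same
+ // 700 W rating scores identically: over-TDP deviations are penalized
+ // symmetrically.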
{ ref := gpu.DefaultPowerLimitW if ref <= 0 { ref = gpu.PowerLimitW } - if gpu.CalibratedPeakPowerW > 0 && ref > 0 { - deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100 + measured := gpu.CalibratedPeakPowerW + if measured <= 0 { + measured = gpu.Steady.AvgPowerW + } + if measured > 0 && ref > 0 { + deviationPct := math.Abs(measured-ref) / ref * 100 score.PowerSustainScore = clampScore(100 - deviationPct) } } @@ -2470,6 +2530,7 @@ func runBenchmarkPowerCalibration( gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), + fixedLimits map[int]int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 const maxDerateW = 150 @@ -2555,6 +2616,21 @@ func runBenchmarkPowerCalibration( hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, } + if fixedLimits != nil { + if fixedW, ok := fixedLimits[idx]; ok { + // This GPU's limit was established in a prior ramp step and must + // remain unchanged. Apply it immediately and skip the binary search. + if canDerate && fixedW > 0 { + _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW) + } + s.appliedLimitW = fixedW + s.calib.AppliedPowerLimitW = float64(fixedW) + s.calib.Completed = true + s.converged = true + s.calib.Notes = append(s.calib.Notes, + fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW)) + } + } states = append(states, s) if canDerate && originalLimitW > 0 { idxCopy := idx @@ -2764,6 +2840,10 @@ calibDone: s.appliedLimitW = s.lo s.calib.AppliedPowerLimitW = float64(s.lo) s.calib.Derated = s.lo < s.originalLimitW + // Summary was captured when we last verified stability at s.lo, + // so the result is valid — mark as completed even though we + // converged from the failure path (tried higher, failed, fell back). 
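+ // Example (hypothetical): with a 700 W default, if 600 W is the
+ // highest limit that verified stable and every higher probe failed,
+ // lo ends at 600; the GPU is pinned to 600 W and reported Derated
+ // (600 < originalLimitW).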
+ s.calib.Completed = true } } else { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) @@ -2846,7 +2926,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) - fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus) + fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) + fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW) if len(result.Findings) > 0 { b.WriteString("## Summary\n\n") for _, finding := range result.Findings { @@ -2860,25 +2941,36 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } if len(result.RampSteps) > 0 { b.WriteString("## Ramp Sequence\n\n") - b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n") - b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n") + b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n") + b.WriteString("|------|---------|--------------|----------------|---------|--------|\n") for _, step := range result.RampSteps { - fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n", - step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount) + derated := "-" + if step.Derated { + derated = "⚠ yes" + } + fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", + step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status) } b.WriteString("\n") } b.WriteString("## Per-Slot Results\n\n") - b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n") - b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n") + b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n") + b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n") for _, gpu := range result.GPUs { - fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n", - gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts) + stableLimit := "-" + if gpu.StablePowerLimitW > 0 { + if gpu.Derated { + stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW) + } else { + stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW) + } + } + fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", + gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts) } b.WriteString("\n") for _, gpu := range result.GPUs { fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) - for _, note := range gpu.Notes { fmt.Fprintf(&b, "- %s\n", note) } @@ -2893,14 +2985,22 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) + fmt.Fprintf(&b, "platform_max_tdp_w=%.0f\n", result.PlatformMaxTDPW) 
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) if len(result.RecommendedSlotOrder) > 0 { fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder)) } for _, step := range result.RampSteps { fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices)) + fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex) + fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW) fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW) } + for _, gpu := range result.GPUs { + if gpu.StablePowerLimitW > 0 { + fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW) + } + } return b.String() } @@ -2953,7 +3053,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) - c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc) + c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r @@ -3029,72 +3129,125 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N singleByIndex[gpu.Index] = gpu } - // Phase 2: ramp — add one GPU per step and calibrate the growing subset - // simultaneously. Step 1 reuses single-card results; steps 2..N run fresh - // targeted_power with derating if degradation is detected. - for step := 1; step <= len(result.RecommendedSlotOrder); step++ { + // Phase 2: cumulative thermal ramp. + // Each step introduces one new GPU into an environment where all previously + // calibrated GPUs are already running at their fixed stable limits. The new + // GPU's stable TDP is searched via binary search (targeted_power) under real + // multi-GPU thermal load. Once found, its limit is fixed permanently for all + // subsequent steps. This ensures each GPU's limit reflects actual sustained + // power in the final full-system thermal state. + // + // stableLimits accumulates GPU index → fixed stable limit (W) across steps. + stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) + + // Step 1: reuse single-card calibration result directly. 
+ if len(result.RecommendedSlotOrder) > 0 { + firstIdx := result.RecommendedSlotOrder[0] + firstCalib := calibByIndex[firstIdx] + stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW)) + ramp := NvidiaPowerBenchStep{ + StepIndex: 1, + GPUIndices: []int{firstIdx}, + NewGPUIndex: firstIdx, + NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, + TotalObservedPowerW: firstCalib.Summary.P95PowerW, + AvgObservedPowerW: firstCalib.Summary.P95PowerW, + Derated: firstCalib.Derated, + Status: "OK", + } + if !firstCalib.Completed { + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) + result.OverallStatus = "PARTIAL" + } else if firstCalib.Derated { + ramp.Status = "PARTIAL" + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" + } + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW)) + } + result.RampSteps = append(result.RampSteps, ramp) + logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W", + len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW)) + } + + // Steps 2..N: each step fixes previously calibrated GPUs and searches only + // the new GPU's stable limit in the combined thermal environment. + for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ { + step := stepNum + 1 subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) + newGPUIdx := result.RecommendedSlotOrder[stepNum] stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) - var stepCalib map[int]benchmarkPowerCalibrationResult - if step == 1 { - // Single-GPU step — already measured in phase 1; reuse directly. - stepCalib = calibByIndex - logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0])) - } else { - stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - var stepRestore []benchmarkRestoreAction - stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) - for i := len(stepRestore) - 1; i >= 0; i-- { - stepRestore[i].fn() - } + + // All previously calibrated GPUs are fixed at their stable limits. + fixedForStep := make(map[int]int, len(stableLimits)) + for k, v := range stableLimits { + fixedForStep[k] = v } + + logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)", + step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep))) + + stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) + stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep) + // Accumulate restore actions; they all run in the outer defer. + allRestoreActions = append(allRestoreActions, stepRestore...) + ramp := NvidiaPowerBenchStep{ - StepIndex: step, - GPUIndices: subset, - Status: "OK", + StepIndex: step, + GPUIndices: subset, + NewGPUIndex: newGPUIdx, + Status: "OK", } - var realizationValues []float64 + + // Total observed power = sum of p95 across all GPUs in this step. 
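+ // E.g. (hypothetical) per-GPU p95 readings of 690, 685 and 702 W in a
+ // three-GPU step give TotalObservedPowerW ≈ 2077 W and
+ // AvgObservedPowerW ≈ 692 W.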
for _, idx := range subset { - calib := stepCalib[idx] - ramp.TotalObservedPowerW += calib.Summary.P95PowerW - if calib.Derated { - ramp.DeratedGPUCount++ - ramp.Status = "PARTIAL" - } - if !calib.Completed { - ramp.Status = "FAILED" - ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step)) - continue - } - if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 { - realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100 - realizationValues = append(realizationValues, realization) + if c, ok := stepCalib[idx]; ok { + ramp.TotalObservedPowerW += c.Summary.P95PowerW } } if len(subset) > 0 { ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) } - if len(realizationValues) > 0 { - ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues) - ramp.MinPowerRealizationPct = realizationValues[0] - for _, v := range realizationValues[1:] { - if v < ramp.MinPowerRealizationPct { - ramp.MinPowerRealizationPct = v + + // Determine stable limit for the new GPU. + if c, ok := stepCalib[newGPUIdx]; ok && c.Completed { + stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW)) + ramp.NewGPUStableLimitW = c.AppliedPowerLimitW + ramp.Derated = c.Derated + if c.Derated { + ramp.Status = "PARTIAL" + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" } + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } + } else { + // Calibration failed — fall back to single-card limit. + fb := calibByIndex[newGPUIdx] + stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW)) + ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW)) + result.OverallStatus = "PARTIAL" } - if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 { - ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step)) - if result.OverallStatus == "OK" { - result.OverallStatus = "PARTIAL" - } - } - if ramp.DeratedGPUCount > 0 { - result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount)) - } + result.RampSteps = append(result.RampSteps, ramp) } + + // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. + for i := range result.GPUs { + if lim, ok := stableLimits[result.GPUs[i].Index]; ok { + result.GPUs[i].StablePowerLimitW = float64(lim) + } + } + + // PlatformMaxTDPW = sum of all stable limits — the actual sustained power + // budget of this server with all GPUs running simultaneously without throttling. 
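+ // E.g. (hypothetical) stable limits {0:700, 1:680, 2:660, 3:650} give
+ // PlatformMaxTDPW = 2690 W. This sums GPU limits only; rack planning
+ // still needs CPU, fan, and PSU-efficiency headroom on top.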
+ for _, lim := range stableLimits { + result.PlatformMaxTDPW += float64(lim) + } resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index d482c03..54487f1 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -61,6 +61,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if result.ScalabilityScore > 0 { fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore) } + if result.PlatformPowerScore > 0 { + fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore) + } fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) b.WriteString("\n") @@ -329,6 +332,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } } + // ── Platform Scalability ────────────────────────────────────────────────── + if len(result.PerformanceRampSteps) > 0 { + b.WriteString("## Platform Scalability (Performance Ramp)\n\n") + fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore) + b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n") + b.WriteString("|--------|-------------|----------------------|-------------|\n") + for _, step := range result.PerformanceRampSteps { + fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n", + step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct) + } + b.WriteString("\n") + } + // ── Raw files ───────────────────────────────────────────────────────────── b.WriteString("## Raw Files\n\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index f09dea7..3383619 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -65,6 +65,11 @@ type NvidiaBenchmarkResult struct { RampTotal int `json:"ramp_total,omitempty"` RampRunID string `json:"ramp_run_id,omitempty"` ScalabilityScore float64 `json:"scalability_score,omitempty"` + // PlatformPowerScore is the mean compute scalability across ramp steps 2..N. + // 100% = each added GPU contributes exactly its single-card throughput. + // < 100% = throughput loss due to thermal throttle, power limits, or contention. + PlatformPowerScore float64 `json:"platform_power_score,omitempty"` + PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"` OverallStatus string `json:"overall_status"` SelectedGPUIndices []int `json:"selected_gpu_indices"` Findings []string `json:"findings,omitempty"` @@ -265,8 +270,12 @@ type NvidiaPowerBenchResult struct { RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"` RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"` OverallStatus string `json:"overall_status"` - Findings []string `json:"findings,omitempty"` - GPUs []NvidiaPowerBenchGPU `json:"gpus"` + // PlatformMaxTDPW is the sum of per-GPU stable power limits found during the + // cumulative thermal ramp. Represents the actual sustained power budget of + // this server under full GPU load. Use for rack power planning. 
+ PlatformMaxTDPW float64 `json:"platform_max_tdp_w"` + Findings []string `json:"findings,omitempty"` + GPUs []NvidiaPowerBenchGPU `json:"gpus"` } type NvidiaPowerBenchGPU struct { @@ -274,7 +283,14 @@ type NvidiaPowerBenchGPU struct { Name string `json:"name,omitempty"` BusID string `json:"bus_id,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + // AppliedPowerLimitW is the stable limit found during single-card calibration. AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` + // StablePowerLimitW is the final fixed limit for this GPU after the + // cumulative thermal ramp. This is the limit at which the GPU operated + // stably with all other GPUs running simultaneously at their own limits. + // May be lower than AppliedPowerLimitW if multi-GPU thermal load required + // additional derating. + StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"` MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"` MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"` CalibrationAttempts int `json:"calibration_attempts,omitempty"` @@ -286,13 +302,31 @@ type NvidiaPowerBenchGPU struct { } type NvidiaPowerBenchStep struct { - StepIndex int `json:"step_index"` - GPUIndices []int `json:"gpu_indices"` - TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` - AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` - MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"` - AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"` - DeratedGPUCount int `json:"derated_gpu_count,omitempty"` - Status string `json:"status"` - Notes []string `json:"notes,omitempty"` + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` + // NewGPUIndex is the GPU whose stable limit was searched in this step. + NewGPUIndex int `json:"new_gpu_index"` + // NewGPUStableLimitW is the stable power limit found for the new GPU. + NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"` + TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` + AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` + Derated bool `json:"derated,omitempty"` + Status string `json:"status"` + Notes []string `json:"notes,omitempty"` +} + +// NvidiaPerformanceRampStep holds per-step performance data for the +// scalability ramp-up phase of the performance benchmark. +type NvidiaPerformanceRampStep struct { + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` + // TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent + // TOPS from dedicated single-precision phases) across all GPUs in this step. + TotalSyntheticTOPS float64 `json:"total_synthetic_tops"` + TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"` + // ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100. + // 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss. + ScalabilityPct float64 `json:"scalability_pct"` + Status string `json:"status"` + Notes []string `json:"notes,omitempty"` }
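
A minimal sketch (illustrative, not part of the patch) of how the stored
PlatformPowerScore relates to the per-step data above: it is simply the mean
of ScalabilityPct over the recorded ramp steps (k = 2..N), computed in
benchmark.go via benchmarkMean. The helper name below is hypothetical.

    // platformPowerScoreFromSteps recomputes the mean compute-scalability
    // percentage from recorded performance ramp steps.
    func platformPowerScoreFromSteps(steps []NvidiaPerformanceRampStep) float64 {
        if len(steps) == 0 {
            return 0
        }
        var sum float64
        for _, s := range steps {
            sum += s.ScalabilityPct
        }
        return sum / float64(len(steps))
    }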