From 813e2f86a9489a2e590eeeb49665f38b6565133e Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 12 Apr 2026 22:30:47 +0300 Subject: [PATCH] Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements - Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files - Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id) - Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time), tasks.go handler, and benchmark.go result population - Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per-GPU with a note explaining the reduction - Add finding when server power delta exceeds GPU-reported sum by >25% (non-GPU draw) - Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode; shows scalability_score when non-zero Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 24 ++++++++++++++++++++- audit/internal/platform/benchmark_report.go | 10 ++++++++- audit/internal/platform/benchmark_types.go | 9 +++++++- audit/internal/webui/api.go | 4 ++++ audit/internal/webui/tasks.go | 6 ++++++ 5 files changed, 50 insertions(+), 3 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 57cbb8a..b4fe56a 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -109,6 +109,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv ServerModel: readServerModel(), BenchmarkProfile: spec.Name, ParallelGPUs: opts.ParallelGPUs, + RampStep: opts.RampStep, + RampTotal: opts.RampTotal, + RampRunID: opts.RampRunID, SelectedGPUIndices: append([]int(nil), selected...), HostConfig: readBenchmarkHostConfig(), Normalization: BenchmarkNormalization{ @@ -344,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) + // Apply server-power penalty when IPMI reports the server delta is much + // lower than GPU-reported sum: GPU power telemetry is over-stated, making + // CalibratedPeakPowerW and PowerSustainScore unreliable. + // Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0. + if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 { + factor := sp.ReportingRatio / 0.75 + for i := range result.GPUs { + result.GPUs[i].Scores.CompositeScore *= factor + result.GPUs[i].Notes = append(result.GPUs[i].Notes, + fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%", + sp.ReportingRatio, factor*100)) + } + } + result.Findings = buildBenchmarkFindings(result) result.OverallStatus = benchmarkOverallStatus(result) @@ -1178,9 +1195,14 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 { if sp.ReportingRatio < 0.75 { findings = append(findings, fmt.Sprintf( - "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.", + "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.", sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio, )) + } else if sp.ReportingRatio > 1.25 { + findings = append(findings, fmt.Sprintf( + "Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.", + sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100, + )) } } return dedupeStrings(findings) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 9670e20..1c69463 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -60,9 +60,17 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) - if result.ParallelGPUs { + if result.RampStep > 0 && result.RampTotal > 0 { + fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal) + if result.RampRunID != "" { + fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID) + } + } else if result.ParallelGPUs { fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n") } + if result.ScalabilityScore > 0 { + fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore) + } fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus) b.WriteString("\n") diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 447a08c..f3ddb7a 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -37,7 +37,10 @@ type NvidiaBenchmarkOptions struct { GPUIndices []int ExcludeGPUIndices []int RunNCCL bool - ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially + ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially + RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up) + RampTotal int // total number of ramp-up steps in this run + RampRunID string // shared identifier across all steps of the same ramp-up run } @@ -48,6 +51,10 @@ type NvidiaBenchmarkResult struct { ServerModel string `json:"server_model,omitempty"` BenchmarkProfile string `json:"benchmark_profile"` ParallelGPUs bool `json:"parallel_gpus,omitempty"` + RampStep int `json:"ramp_step,omitempty"` + RampTotal int `json:"ramp_total,omitempty"` + RampRunID string `json:"ramp_run_id,omitempty"` + ScalabilityScore float64 `json:"scalability_score,omitempty"` OverallStatus string `json:"overall_status"` SelectedGPUIndices []int `json:"selected_gpu_indices"` Findings []string `json:"findings,omitempty"` diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 30cedb2..3cd94f5 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -594,6 +594,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req rampUp = false } else { now := time.Now() + rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) var allTasks []*Task for step := 1; step <= len(resolved); step++ { subset := resolved[:step] @@ -611,6 +612,9 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req BenchmarkProfile: body.Profile, RunNCCL: runNCCL && step == len(resolved), ParallelGPUs: true, + RampStep: step, + RampTotal: len(resolved), + RampRunID: rampRunID, DisplayName: stepName, }, } diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index 4afd96c..e7e449b 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -126,6 +126,9 @@ type taskParams struct { BenchmarkProfile string `json:"benchmark_profile,omitempty"` RunNCCL bool `json:"run_nccl,omitempty"` ParallelGPUs bool `json:"parallel_gpus,omitempty"` + RampStep int `json:"ramp_step,omitempty"` + RampTotal int `json:"ramp_total,omitempty"` + RampRunID string `json:"ramp_run_id,omitempty"` DisplayName string `json:"display_name,omitempty"` Device string `json:"device,omitempty"` // for install PlatformComponents []string `json:"platform_components,omitempty"` @@ -637,6 +640,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { ExcludeGPUIndices: t.params.ExcludeGPUIndices, RunNCCL: t.params.RunNCCL, ParallelGPUs: t.params.ParallelGPUs, + RampStep: t.params.RampStep, + RampTotal: t.params.RampTotal, + RampRunID: t.params.RampRunID, }, j.append) case "nvidia-compute": if a == nil {