Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements
- Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files
- Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id — see the sketch below)
- Propagate ramp fields through api.go (which generates the shared ramp_run_id at spawn time), the tasks.go handler, and benchmark.go result population
- Apply a ServerPower penalty to CompositeScore when the IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per GPU with a note explaining the reduction
- Add a finding when the server power delta exceeds the GPU-reported sum by >25% (non-GPU draw)
- Report header now shows ramp step N/M and the run ID instead of "parallel" when in ramp mode, and shows scalability_score when non-zero

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
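The ScalabilityScore is only a placeholder in this commit; the comparison across steps happens outside the benchmark run. A minimal sketch of what that external computation might look like, assuming step k of the ramp ran k GPUs and that each step's aggregate throughput has already been extracted from its result.json (the function name and all numbers are invented):

package main

import "fmt"

// Hypothetical post-processing sketch (not part of this commit): derive a
// scalability_score by comparing ramp-up steps that share one ramp_run_id.
// Assumes step k ran k GPUs and that each step's aggregate throughput has
// already been pulled out of its result.json.
func scalabilityScore(stepThroughput []float64) float64 {
	if len(stepThroughput) < 2 || stepThroughput[0] <= 0 {
		return 0
	}
	n := len(stepThroughput)
	ideal := stepThroughput[0] * float64(n) // linear scaling from the 1-GPU baseline
	return stepThroughput[n-1] / ideal * 100
}

func main() {
	// 1, 2, 3, 4 GPUs; perfect scaling would reach 4 x 100 = 400.
	fmt.Printf("%.1f%%\n", scalabilityScore([]float64{100, 195, 285, 350})) // 87.5%
}

Perfect linear scaling would score 100%; contention on shared links, power, or cooling pushes the number down.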
@@ -109,6 +109,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		ServerModel:        readServerModel(),
 		BenchmarkProfile:   spec.Name,
 		ParallelGPUs:       opts.ParallelGPUs,
+		RampStep:           opts.RampStep,
+		RampTotal:          opts.RampTotal,
+		RampRunID:          opts.RampRunID,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		HostConfig:         readBenchmarkHostConfig(),
 		Normalization: BenchmarkNormalization{
@@ -344,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	}
 	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+
+	// Apply server-power penalty when IPMI reports the server delta is much
+	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
+	// CalibratedPeakPowerW and PowerSustainScore unreliable.
+	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
+	if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
+		factor := sp.ReportingRatio / 0.75
+		for i := range result.GPUs {
+			result.GPUs[i].Scores.CompositeScore *= factor
+			result.GPUs[i].Notes = append(result.GPUs[i].Notes,
+				fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
+					sp.ReportingRatio, factor*100))
+		}
+	}

 	result.Findings = buildBenchmarkFindings(result)
 	result.OverallStatus = benchmarkOverallStatus(result)
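To make the penalty concrete, a worked example with invented readings:

package main

import "fmt"

func main() {
	// Invented readings: IPMI server delta 900 W vs GPU-reported sum 1500 W.
	ratio := 900.0 / 1500.0 // ReportingRatio = 0.60, below the 0.75 threshold
	factor := ratio / 0.75  // 0.80
	fmt.Printf("composite 92.0 -> %.1f\n", 92.0*factor) // composite 92.0 -> 73.6
}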
@@ -1178,9 +1195,14 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
 		if sp.ReportingRatio < 0.75 {
 			findings = append(findings, fmt.Sprintf(
-				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
+				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
 				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
 			))
-		}
+		} else if sp.ReportingRatio > 1.25 {
+			findings = append(findings, fmt.Sprintf(
+				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
+				sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
+			))
+		}
 	}
 	return dedupeStrings(findings)
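For the opposite branch, the same arithmetic with invented readings in the >1.25 band:

package main

import "fmt"

func main() {
	// Invented readings: server delta 1900 W vs GPU-reported sum 1500 W.
	ratio := 1900.0 / 1500.0 // ≈ 1.27, above the 1.25 threshold
	fmt.Printf("exceeds GPU-reported sum by %.0f%%\n", (ratio-1)*100) // 27%
}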
@@ -60,9 +60,17 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
 	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
-	if result.ParallelGPUs {
+	if result.RampStep > 0 && result.RampTotal > 0 {
+		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
+		if result.RampRunID != "" {
+			fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
+		}
+	} else if result.ParallelGPUs {
 		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
 	}
+	if result.ScalabilityScore > 0 {
+		fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
+	}
 	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
 	b.WriteString("\n")
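With invented values (step 3 of 8, a made-up run ID, and a scalability score filled in externally), the ramp branch above emits markdown lines of the form:

**Ramp-up step:** 3 of 8
**Ramp-up run ID:** ramp-1735689600
**Scalability score:** 87.5%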
@@ -37,7 +37,10 @@ type NvidiaBenchmarkOptions struct {
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
-	ParallelGPUs      bool // run all selected GPUs simultaneously instead of sequentially
+	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
+	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
+	RampTotal         int    // total number of ramp-up steps in this run
+	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }
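The spawn side is not part of this hunk; per the commit message, api.go creates one shared ramp_run_id up front and stamps it into every step's options. A sketch of that pattern, with a trimmed stand-in for the options struct and invented scheduling details:

package main

import (
	"fmt"
	"time"
)

// Trimmed stand-in for the NvidiaBenchmarkOptions fields used in this sketch;
// the real spawn code in api.go is not shown in this diff.
type rampOpts struct {
	GPUIndices []int
	RampStep   int
	RampTotal  int
	RampRunID  string
}

func main() {
	indices := []int{0, 1, 2, 3}
	runID := fmt.Sprintf("ramp-%d", time.Now().Unix()) // one ID for the whole run
	for step := 1; step <= len(indices); step++ {
		opts := rampOpts{
			GPUIndices: indices[:step], // e.g. grow the GPU set by one per step
			RampStep:   step,
			RampTotal:  len(indices),
			RampRunID:  runID, // shared across every step of this run
		}
		fmt.Printf("spawn step %d/%d on GPUs %v (run %s)\n",
			opts.RampStep, opts.RampTotal, opts.GPUIndices, opts.RampRunID)
	}
}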
@@ -48,6 +51,10 @@ type NvidiaBenchmarkResult struct {
 	ServerModel        string   `json:"server_model,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
+	RampStep           int      `json:"ramp_step,omitempty"`
+	RampTotal          int      `json:"ramp_total,omitempty"`
+	RampRunID          string   `json:"ramp_run_id,omitempty"`
+	ScalabilityScore   float64  `json:"scalability_score,omitempty"`
 	OverallStatus      string   `json:"overall_status"`
 	SelectedGPUIndices []int    `json:"selected_gpu_indices"`
 	Findings           []string `json:"findings,omitempty"`
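Since all four new fields carry omitempty, they appear in result.json only for ramp runs. An illustrative fragment with invented values (profile and status strings included only for context):

{
  "benchmark_profile": "default",
  "ramp_step": 3,
  "ramp_total": 8,
  "ramp_run_id": "ramp-1735689600",
  "scalability_score": 87.5,
  "overall_status": "pass",
  "selected_gpu_indices": [0, 1, 2]
}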