Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements

- Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and
  NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files
- Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally
  by comparing ramp-up step results sharing the same ramp_run_id)
- Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time),
  tasks.go handler, and benchmark.go result population
- Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio is positive and
  below 0.75: factor = reporting_ratio/0.75, applied to each GPU's score with a per-GPU
  note explaining the reduction
- Add finding when server power delta exceeds GPU-reported sum by >25% (indicates
  substantial non-GPU draw, e.g. CPU, NVMe, networking)
- Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode;
  shows scalability_score when non-zero

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 22:30:47 +03:00
parent 58a6da9b44
commit 813e2f86a9
5 changed files with 50 additions and 3 deletions

View File

@@ -109,6 +109,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
ServerModel: readServerModel(),
BenchmarkProfile: spec.Name,
ParallelGPUs: opts.ParallelGPUs,
RampStep: opts.RampStep,
RampTotal: opts.RampTotal,
RampRunID: opts.RampRunID,
SelectedGPUIndices: append([]int(nil), selected...),
HostConfig: readBenchmarkHostConfig(),
Normalization: BenchmarkNormalization{
@@ -344,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
// Apply server-power penalty when IPMI reports the server delta is much
// lower than GPU-reported sum: GPU power telemetry is over-stated, making
// CalibratedPeakPowerW and PowerSustainScore unreliable.
// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
factor := sp.ReportingRatio / 0.75
for i := range result.GPUs {
result.GPUs[i].Scores.CompositeScore *= factor
result.GPUs[i].Notes = append(result.GPUs[i].Notes,
fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
sp.ReportingRatio, factor*100))
}
}
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
@@ -1178,9 +1195,14 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
findings = append(findings, fmt.Sprintf(
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
))
} else if sp.ReportingRatio > 1.25 {
findings = append(findings, fmt.Sprintf(
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
))
}
}
return dedupeStrings(findings)