Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements
- Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files - Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id) - Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time), tasks.go handler, and benchmark.go result population - Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per-GPU with a note explaining the reduction - Add finding when server power delta exceeds GPU-reported sum by >25% (non-GPU draw) - Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode; shows scalability_score when non-zero Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
ServerModel: readServerModel(),
|
ServerModel: readServerModel(),
|
||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
ParallelGPUs: opts.ParallelGPUs,
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
|
RampStep: opts.RampStep,
|
||||||
|
RampTotal: opts.RampTotal,
|
||||||
|
RampRunID: opts.RampRunID,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
HostConfig: readBenchmarkHostConfig(),
|
HostConfig: readBenchmarkHostConfig(),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
@@ -344,6 +347,20 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||||
|
|
||||||
|
// Apply server-power penalty when IPMI reports the server delta is much
|
||||||
|
// lower than GPU-reported sum: GPU power telemetry is over-stated, making
|
||||||
|
// CalibratedPeakPowerW and PowerSustainScore unreliable.
|
||||||
|
// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
|
||||||
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
|
||||||
|
factor := sp.ReportingRatio / 0.75
|
||||||
|
for i := range result.GPUs {
|
||||||
|
result.GPUs[i].Scores.CompositeScore *= factor
|
||||||
|
result.GPUs[i].Notes = append(result.GPUs[i].Notes,
|
||||||
|
fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
|
||||||
|
sp.ReportingRatio, factor*100))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.Findings = buildBenchmarkFindings(result)
|
result.Findings = buildBenchmarkFindings(result)
|
||||||
result.OverallStatus = benchmarkOverallStatus(result)
|
result.OverallStatus = benchmarkOverallStatus(result)
|
||||||
|
|
||||||
@@ -1178,9 +1195,14 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||||
if sp.ReportingRatio < 0.75 {
|
if sp.ReportingRatio < 0.75 {
|
||||||
findings = append(findings, fmt.Sprintf(
|
findings = append(findings, fmt.Sprintf(
|
||||||
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
|
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
|
||||||
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||||||
))
|
))
|
||||||
|
} else if sp.ReportingRatio > 1.25 {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
|
||||||
|
sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
|
||||||
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return dedupeStrings(findings)
|
return dedupeStrings(findings)
|
||||||
|
|||||||
@@ -60,9 +60,17 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
if result.ParallelGPUs {
|
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||||
|
if result.RampRunID != "" {
|
||||||
|
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||||
|
}
|
||||||
|
} else if result.ParallelGPUs {
|
||||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||||
}
|
}
|
||||||
|
if result.ScalabilityScore > 0 {
|
||||||
|
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,10 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -48,6 +51,10 @@ type NvidiaBenchmarkResult struct {
|
|||||||
ServerModel string `json:"server_model,omitempty"`
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
|||||||
@@ -594,6 +594,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
rampUp = false
|
rampUp = false
|
||||||
} else {
|
} else {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
var allTasks []*Task
|
var allTasks []*Task
|
||||||
for step := 1; step <= len(resolved); step++ {
|
for step := 1; step <= len(resolved); step++ {
|
||||||
subset := resolved[:step]
|
subset := resolved[:step]
|
||||||
@@ -611,6 +612,9 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
BenchmarkProfile: body.Profile,
|
BenchmarkProfile: body.Profile,
|
||||||
RunNCCL: runNCCL && step == len(resolved),
|
RunNCCL: runNCCL && step == len(resolved),
|
||||||
ParallelGPUs: true,
|
ParallelGPUs: true,
|
||||||
|
RampStep: step,
|
||||||
|
RampTotal: len(resolved),
|
||||||
|
RampRunID: rampRunID,
|
||||||
DisplayName: stepName,
|
DisplayName: stepName,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -126,6 +126,9 @@ type taskParams struct {
|
|||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
@@ -637,6 +640,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
ParallelGPUs: t.params.ParallelGPUs,
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
|
RampStep: t.params.RampStep,
|
||||||
|
RampTotal: t.params.RampTotal,
|
||||||
|
RampRunID: t.params.RampRunID,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user