Add scalability/ramp-up labeling, ServerPower penalty in scoring, and report improvements
- Add RampStep/RampTotal/RampRunID to NvidiaBenchmarkOptions, taskParams, and NvidiaBenchmarkResult so ramp-up steps can be correlated across result.json files - Add ScalabilityScore field to NvidiaBenchmarkResult (placeholder; computed externally by comparing ramp-up step results sharing the same ramp_run_id) - Propagate ramp fields through api.go (generates shared ramp_run_id at spawn time), tasks.go handler, and benchmark.go result population - Apply ServerPower penalty to CompositeScore when IPMI reporting_ratio < 0.75: factor = ratio/0.75, applied per-GPU with a note explaining the reduction - Add finding when server power delta exceeds GPU-reported sum by >25% (non-GPU draw) - Report header now shows ramp step N/M and run ID instead of "parallel" when in ramp mode; shows scalability_score when non-zero Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -594,6 +594,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
rampUp = false
|
||||
} else {
|
||||
now := time.Now()
|
||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||
var allTasks []*Task
|
||||
for step := 1; step <= len(resolved); step++ {
|
||||
subset := resolved[:step]
|
||||
@@ -611,6 +612,9 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL && step == len(resolved),
|
||||
ParallelGPUs: true,
|
||||
RampStep: step,
|
||||
RampTotal: len(resolved),
|
||||
RampRunID: rampRunID,
|
||||
DisplayName: stepName,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -126,6 +126,9 @@ type taskParams struct {
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
@@ -637,6 +640,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
|
||||
Reference in New Issue
Block a user