Fix ramp-up power bench: one task instead of N redundant tasks

RunNvidiaPowerBench already performs a full internal ramp from 1 to N
GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant task K
repeated all steps 1..K-1 already done by tasks 1..K-1 — O(N²) work
instead of O(N). Replace with a single task using all selected GPUs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-04-15 12:29:11 +03:00
parent 0317dc58fd
commit cd9e2cbe13

View File

@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}
if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else {
now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: stepName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
taskName := fmt.Sprintf("%s · ramp 1%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
t := &Task{
ID: newJobID("bee-bench-nvidia"),
Name: taskName,
Target: target,
Priority: defaultTaskPriority(target, taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), resolved...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: true,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: taskName,
},
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
globalQueue.enqueue(t)
writeTaskRunResponse(w, []*Task{t})
return
}
}