Fix ramp-up power bench: one task instead of N redundant tasks

RunNvidiaPowerBench already performs a full internal ramp from 1 to N
GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant that
task K re-ran steps 1..K-1, which tasks 1..K-1 had already performed —
O(N²) work
instead of O(N). Replace with a single task using all selected GPUs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-04-15 12:29:11 +03:00
parent 0317dc58fd
commit cd9e2cbe13

View File

@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} }
if rampUp && len(body.GPUIndices) > 1 { if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix // Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. // in Phase 2 (one additional GPU per step). A single task with all
// selected GPUs is sufficient — spawning N tasks with growing subsets
// would repeat all earlier steps redundantly.
gpus, err := apiListNvidiaGPUs(h.opts.App) gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil { if err != nil {
writeError(w, http.StatusBadRequest, err.Error()) writeError(w, http.StatusBadRequest, err.Error())
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} else { } else {
now := time.Now() now := time.Now()
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
var allTasks []*Task taskName := fmt.Sprintf("%s · ramp 1→%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
for step := 1; step <= len(resolved); step++ { t := &Task{
subset := resolved[:step] ID: newJobID("bee-bench-nvidia"),
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) Name: taskName,
t := &Task{ Target: target,
ID: newJobID("bee-bench-nvidia"), Priority: defaultTaskPriority(target, taskParams{}),
Name: stepName, Status: TaskPending,
Target: target, CreatedAt: now,
Priority: defaultTaskPriority(target, taskParams{}), params: taskParams{
Status: TaskPending, GPUIndices: append([]int(nil), resolved...),
CreatedAt: now, SizeMB: body.SizeMB,
params: taskParams{ BenchmarkProfile: body.Profile,
GPUIndices: append([]int(nil), subset...), RunNCCL: runNCCL,
SizeMB: body.SizeMB, ParallelGPUs: true,
BenchmarkProfile: body.Profile, RampTotal: len(resolved),
RunNCCL: runNCCL && step == len(resolved), RampRunID: rampRunID,
ParallelGPUs: true, DisplayName: taskName,
RampStep: step, },
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
} }
for _, t := range allTasks { globalQueue.enqueue(t)
globalQueue.enqueue(t) writeTaskRunResponse(w, []*Task{t})
}
writeTaskRunResponse(w, allTasks)
return return
} }
} }