Fix ramp-up power bench: one task instead of N redundant tasks
RunNvidiaPowerBench already performs a full internal ramp from 1 to N GPUs in Phase 2. Spawning N tasks with growing GPU subsets meant that task K repeated steps 1..K-1, which tasks 1..K-1 had already run — O(N²) work instead of O(N). Replace them with a single task that uses all selected GPUs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -628,8 +628,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
|
|
||||||
if rampUp && len(body.GPUIndices) > 1 {
|
if rampUp && len(body.GPUIndices) > 1 {
|
||||||
// Ramp-up mode: resolve GPU list, then create one task per prefix
|
// Ramp-up mode: RunNvidiaPowerBench internally ramps from 1 to N GPUs
|
||||||
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
|
// in Phase 2 (one additional GPU per step). A single task with all
|
||||||
|
// selected GPUs is sufficient — spawning N tasks with growing subsets
|
||||||
|
// would repeat all earlier steps redundantly.
|
||||||
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
gpus, err := apiListNvidiaGPUs(h.opts.App)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
@@ -646,35 +648,27 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
} else {
|
} else {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
|
||||||
var allTasks []*Task
|
taskName := fmt.Sprintf("%s · ramp 1–%d · GPU %s", name, len(resolved), formatGPUIndexList(resolved))
|
||||||
for step := 1; step <= len(resolved); step++ {
|
t := &Task{
|
||||||
subset := resolved[:step]
|
ID: newJobID("bee-bench-nvidia"),
|
||||||
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
|
Name: taskName,
|
||||||
t := &Task{
|
Target: target,
|
||||||
ID: newJobID("bee-bench-nvidia"),
|
Priority: defaultTaskPriority(target, taskParams{}),
|
||||||
Name: stepName,
|
Status: TaskPending,
|
||||||
Target: target,
|
CreatedAt: now,
|
||||||
Priority: defaultTaskPriority(target, taskParams{}),
|
params: taskParams{
|
||||||
Status: TaskPending,
|
GPUIndices: append([]int(nil), resolved...),
|
||||||
CreatedAt: now,
|
SizeMB: body.SizeMB,
|
||||||
params: taskParams{
|
BenchmarkProfile: body.Profile,
|
||||||
GPUIndices: append([]int(nil), subset...),
|
RunNCCL: runNCCL,
|
||||||
SizeMB: body.SizeMB,
|
ParallelGPUs: true,
|
||||||
BenchmarkProfile: body.Profile,
|
RampTotal: len(resolved),
|
||||||
RunNCCL: runNCCL && step == len(resolved),
|
RampRunID: rampRunID,
|
||||||
ParallelGPUs: true,
|
DisplayName: taskName,
|
||||||
RampStep: step,
|
},
|
||||||
RampTotal: len(resolved),
|
|
||||||
RampRunID: rampRunID,
|
|
||||||
DisplayName: stepName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
allTasks = append(allTasks, t)
|
|
||||||
}
|
}
|
||||||
for _, t := range allTasks {
|
globalQueue.enqueue(t)
|
||||||
globalQueue.enqueue(t)
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
}
|
|
||||||
writeTaskRunResponse(w, allTasks)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user