From 098e19f76011f184567ccc528ab8edf8988bb6c7 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 12 Apr 2026 18:34:19 +0300 Subject: [PATCH] Add ramp-up mode to NVIDIA GPU benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new checkbox (enabled by default) in the benchmark section. In ramp-up mode N tasks are enqueued at once, one per step: 1 GPU, then 2, then 3, up to all selected GPUs — each step runs its GPUs in parallel. NCCL runs only on the final step. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/webui/api.go | 63 +++++++++++++++++++++++++++++++++++ audit/internal/webui/pages.go | 25 ++++++++++---- 2 files changed, 81 insertions(+), 7 deletions(-) diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 3264b0e..30cedb2 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -12,6 +12,7 @@ import ( "path/filepath" "regexp" "sort" + "strconv" "strings" "sync/atomic" "syscall" @@ -209,6 +210,14 @@ func joinTaskIndices(indices []int) string { return strings.Join(parts, ",") } +func formatGPUIndexList(indices []int) string { + parts := make([]string, len(indices)) + for i, idx := range indices { + parts[i] = strconv.Itoa(idx) + } + return strings.Join(parts, ",") +} + func formatSplitTaskName(baseName, selectionLabel string) string { baseName = strings.TrimSpace(baseName) selectionLabel = strings.TrimSpace(selectionLabel) @@ -540,6 +549,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req ExcludeGPUIndices []int `json:"exclude_gpu_indices"` RunNCCL *bool `json:"run_nccl"` ParallelGPUs *bool `json:"parallel_gpus"` + RampUp *bool `json:"ramp_up"` DisplayName string `json:"display_name"` } if r.Body != nil { @@ -557,10 +567,63 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req if body.ParallelGPUs != nil { parallelGPUs = *body.ParallelGPUs } + rampUp := false + if body.RampUp != nil { + rampUp = 
*body.RampUp + } name := taskDisplayName("nvidia-benchmark", "", "") if strings.TrimSpace(body.DisplayName) != "" { name = body.DisplayName } + + if rampUp && len(body.GPUIndices) > 1 { + // Ramp-up mode: resolve GPU list, then create one task per prefix + // [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. + gpus, err := apiListNvidiaGPUs(h.opts.App) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + if len(resolved) < 2 { + // Fall through to normal single-task path. + rampUp = false + } else { + now := time.Now() + var allTasks []*Task + for step := 1; step <= len(resolved); step++ { + subset := resolved[:step] + stepName := fmt.Sprintf("%s [ramp %d/%d: GPU %s]", name, step, len(resolved), formatGPUIndexList(subset)) + t := &Task{ + ID: newJobID("benchmark-nvidia"), + Name: stepName, + Target: "nvidia-benchmark", + Priority: 15, + Status: TaskPending, + CreatedAt: now, + params: taskParams{ + GPUIndices: append([]int(nil), subset...), + SizeMB: body.SizeMB, + BenchmarkProfile: body.Profile, + RunNCCL: runNCCL && step == len(resolved), + ParallelGPUs: true, + DisplayName: stepName, + }, + } + allTasks = append(allTasks, t) + } + for _, t := range allTasks { + globalQueue.enqueue(t) + } + writeTaskRunResponse(w, allTasks) + return + } + } + tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{ GPUIndices: body.GPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices, diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index e019d0e..b3b1497 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1966,9 +1966,13 @@ func renderBenchmark(opts HandlerOptions) string {

Loading NVIDIA GPUs...

+