Split bee-bench into perf and power workflows

2026-04-14 17:33:13 +03:00
parent 54338dbae5
commit 95124d228f
17 changed files with 718 additions and 259 deletions
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {

 func shouldSplitHomogeneousNvidiaTarget(target string) bool {
 	switch strings.TrimSpace(target) {
-	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
 		"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
 		"nvidia-bandwidth", "nvidia-stress":
 		return true
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstallToRAM
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-benchmark":
+	case "nvidia-bench-perf", "nvidia-bench-power":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -573,131 +573,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 	}
 }

-func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
-	if h.opts.App == nil {
-		writeError(w, http.StatusServiceUnavailable, "app not configured")
-		return
-	}
-
-	var body struct {
-		Profile           string `json:"profile"`
-		SizeMB            int    `json:"size_mb"`
-		GPUIndices        []int  `json:"gpu_indices"`
-		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
-		RunNCCL           *bool  `json:"run_nccl"`
-		ParallelGPUs      *bool  `json:"parallel_gpus"`
-		RampUp            *bool  `json:"ramp_up"`
-		DisplayName       string `json:"display_name"`
-	}
-	if r.Body != nil {
-		if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
-			writeError(w, http.StatusBadRequest, "invalid request body")
+func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
 			return
 		}
-	}

-	runNCCL := true
-	if body.RunNCCL != nil {
-		runNCCL = *body.RunNCCL
-	}
-	parallelGPUs := false
-	if body.ParallelGPUs != nil {
-		parallelGPUs = *body.ParallelGPUs
-	}
-	rampUp := false
-	if body.RampUp != nil {
-		rampUp = *body.RampUp
-	}
-	// Build a descriptive base name that includes profile and mode so the task
-	// list is self-explanatory without opening individual task detail pages.
-	profile := strings.TrimSpace(body.Profile)
-	if profile == "" {
-		profile = "standard"
-	}
-	name := taskDisplayName("nvidia-benchmark", "", "")
-	if strings.TrimSpace(body.DisplayName) != "" {
-		name = body.DisplayName
-	}
-	// Append profile tag.
-	name = fmt.Sprintf("%s · %s", name, profile)
+		var body struct {
+			Profile           string `json:"profile"`
+			SizeMB            int    `json:"size_mb"`
+			GPUIndices        []int  `json:"gpu_indices"`
+			ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
+			RunNCCL           *bool  `json:"run_nccl"`
+			ParallelGPUs      *bool  `json:"parallel_gpus"`
+			RampUp            *bool  `json:"ramp_up"`
+			DisplayName       string `json:"display_name"`
+		}
+		if r.Body != nil {
+			if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}

-	if rampUp && len(body.GPUIndices) > 1 {
-		// Ramp-up mode: resolve GPU list, then create one task per prefix
-		// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
-		gpus, err := apiListNvidiaGPUs(h.opts.App)
-		if err != nil {
-			writeError(w, http.StatusBadRequest, err.Error())
+		runNCCL := true
+		if body.RunNCCL != nil {
+			runNCCL = *body.RunNCCL
+		}
+		parallelGPUs := false
+		if body.ParallelGPUs != nil {
+			parallelGPUs = *body.ParallelGPUs
+		}
+		rampUp := false
+		if body.RampUp != nil {
+			rampUp = *body.RampUp
+		}
+		// Build a descriptive base name that includes profile and mode so the task
+		// list is self-explanatory without opening individual task detail pages.
+		profile := strings.TrimSpace(body.Profile)
+		if profile == "" {
+			profile = "standard"
+		}
+		name := taskDisplayName(target, "", "")
+		if strings.TrimSpace(body.DisplayName) != "" {
+			name = body.DisplayName
+		}
+		// Append profile tag.
+		name = fmt.Sprintf("%s · %s", name, profile)
+
+		if target == "nvidia-bench-power" && parallelGPUs {
+			writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
 			return
 		}
-		resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
-		if err != nil {
-			writeError(w, http.StatusBadRequest, err.Error())
-			return
-		}
-		if len(resolved) < 2 {
-			// Fall through to normal single-task path.
-			rampUp = false
-		} else {
-			now := time.Now()
-			rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
-			var allTasks []*Task
-			for step := 1; step <= len(resolved); step++ {
-				subset := resolved[:step]
-				stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
-				t := &Task{
-					ID:        newJobID("benchmark-nvidia"),
-					Name:      stepName,
-					Target:    "nvidia-benchmark",
-					Priority:  defaultTaskPriority("nvidia-benchmark", taskParams{}),
-					Status:    TaskPending,
-					CreatedAt: now,
-					params: taskParams{
-						GPUIndices:       append([]int(nil), subset...),
-						SizeMB:           body.SizeMB,
-						BenchmarkProfile: body.Profile,
-						RunNCCL:          runNCCL && step == len(resolved),
-						ParallelGPUs:     true,
-						RampStep:         step,
-						RampTotal:        len(resolved),
-						RampRunID:        rampRunID,
-						DisplayName:      stepName,
-					},
+
+		if rampUp && len(body.GPUIndices) > 1 {
+			// Ramp-up mode: resolve GPU list, then create one task per prefix
+			// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1]; each task runs its own GPU subset in parallel (ParallelGPUs=true).
+			gpus, err := apiListNvidiaGPUs(h.opts.App)
+			if err != nil {
+				writeError(w, http.StatusBadRequest, err.Error())
+				return
+			}
+			resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
+			if err != nil {
+				writeError(w, http.StatusBadRequest, err.Error())
+				return
+			}
+			if len(resolved) < 2 {
+				// Fall through to normal single-task path.
+				rampUp = false
+			} else {
+				now := time.Now()
+				rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
+				var allTasks []*Task
+				for step := 1; step <= len(resolved); step++ {
+					subset := resolved[:step]
+					stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
+					t := &Task{
+						ID:        newJobID("bee-bench-nvidia"),
+						Name:      stepName,
+						Target:    target,
+						Priority:  defaultTaskPriority(target, taskParams{}),
+						Status:    TaskPending,
+						CreatedAt: now,
+						params: taskParams{
+							GPUIndices:       append([]int(nil), subset...),
+							SizeMB:           body.SizeMB,
+							BenchmarkProfile: body.Profile,
+							RunNCCL:          runNCCL && step == len(resolved),
+							ParallelGPUs:     true,
+							RampStep:         step,
+							RampTotal:        len(resolved),
+							RampRunID:        rampRunID,
+							DisplayName:      stepName,
+						},
+					}
+					allTasks = append(allTasks, t)
 				}
-				allTasks = append(allTasks, t)
+				for _, t := range allTasks {
+					globalQueue.enqueue(t)
+				}
+				writeTaskRunResponse(w, allTasks)
+				return
 			}
-			for _, t := range allTasks {
-				globalQueue.enqueue(t)
-			}
-			writeTaskRunResponse(w, allTasks)
+		}
+
+		// For non-ramp tasks append mode tag.
+		if parallelGPUs {
+			name = fmt.Sprintf("%s · parallel", name)
+		} else {
+			name = fmt.Sprintf("%s · sequential", name)
+		}
+
+		params := taskParams{
+			GPUIndices:        body.GPUIndices,
+			ExcludeGPUIndices: body.ExcludeGPUIndices,
+			SizeMB:            body.SizeMB,
+			BenchmarkProfile:  body.Profile,
+			RunNCCL:           runNCCL,
+			ParallelGPUs:      parallelGPUs,
+			DisplayName:       body.DisplayName,
+		}
+		tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
+		if err != nil {
+			writeError(w, http.StatusBadRequest, err.Error())
 			return
 		}
+		for _, t := range tasks {
+			globalQueue.enqueue(t)
+		}
+		writeTaskRunResponse(w, tasks)
 	}
+}

-	// For non-ramp tasks append mode tag.
-	if parallelGPUs {
-		name = fmt.Sprintf("%s · parallel", name)
-	} else {
-		name = fmt.Sprintf("%s · sequential", name)
-	}
-
-	params := taskParams{
-		GPUIndices:        body.GPUIndices,
-		ExcludeGPUIndices: body.ExcludeGPUIndices,
-		SizeMB:            body.SizeMB,
-		BenchmarkProfile:  body.Profile,
-		RunNCCL:           runNCCL,
-		ParallelGPUs:      parallelGPUs,
-		DisplayName:       body.DisplayName,
-	}
-	tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
-	if err != nil {
-		writeError(w, http.StatusBadRequest, err.Error())
-		return
-	}
-	for _, t := range tasks {
-		globalQueue.enqueue(t)
-	}
-	writeTaskRunResponse(w, tasks)
+func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
+	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }

 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {