diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 4144f70..ca20c94 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -11,6 +11,7 @@ import ( "os/exec" "path/filepath" "regexp" + "sort" "strings" "sync/atomic" "syscall" @@ -21,6 +22,12 @@ import ( ) var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`) +var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) { + if a == nil { + return nil, fmt.Errorf("app not configured") + } + return a.ListNvidiaGPUs() +} // ── Job ID counter ──────────────────────────────────────────────────────────── @@ -30,6 +37,206 @@ func newJobID(prefix string) string { return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1)) } +type taskRunResponse struct { + TaskID string `json:"task_id,omitempty"` + JobID string `json:"job_id,omitempty"` + TaskIDs []string `json:"task_ids,omitempty"` + JobIDs []string `json:"job_ids,omitempty"` + TaskCount int `json:"task_count,omitempty"` +} + +type nvidiaTaskSelection struct { + GPUIndices []int + Label string +} + +func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) { + if len(tasks) == 0 { + writeJSON(w, taskRunResponse{}) + return + } + ids := make([]string, 0, len(tasks)) + for _, t := range tasks { + if t == nil || strings.TrimSpace(t.ID) == "" { + continue + } + ids = append(ids, t.ID) + } + resp := taskRunResponse{TaskCount: len(ids)} + if len(ids) > 0 { + resp.TaskID = ids[0] + resp.JobID = ids[0] + resp.TaskIDs = ids + resp.JobIDs = ids + } + writeJSON(w, resp) +} + +func shouldSplitHomogeneousNvidiaTarget(target string) bool { + switch strings.TrimSpace(target) { + case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", + "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect", + "nvidia-bandwidth", "nvidia-stress": + return true + default: + return false + } +} + +func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) { + if len(gpus) == 0 { + return nil, fmt.Errorf("no NVIDIA GPUs detected") + } + indexed := make(map[int]platform.NvidiaGPU, len(gpus)) + allIndices := make([]int, 0, len(gpus)) + for _, gpu := range gpus { + indexed[gpu.Index] = gpu + allIndices = append(allIndices, gpu.Index) + } + sort.Ints(allIndices) + + selected := allIndices + if len(include) > 0 { + selected = make([]int, 0, len(include)) + seen := make(map[int]struct{}, len(include)) + for _, idx := range include { + if _, ok := indexed[idx]; !ok { + continue + } + if _, dup := seen[idx]; dup { + continue + } + seen[idx] = struct{}{} + selected = append(selected, idx) + } + sort.Ints(selected) + } + if len(exclude) > 0 { + skip := make(map[int]struct{}, len(exclude)) + for _, idx := range exclude { + skip[idx] = struct{}{} + } + filtered := selected[:0] + for _, idx := range selected { + if _, ok := skip[idx]; ok { + continue + } + filtered = append(filtered, idx) + } + selected = filtered + } + if len(selected) == 0 { + return nil, fmt.Errorf("no NVIDIA GPUs selected") + } + + modelGroups := make(map[string][]platform.NvidiaGPU) + modelOrder := make([]string, 0) + for _, idx := range selected { + gpu := indexed[idx] + model := strings.TrimSpace(gpu.Name) + if model == "" { + model = fmt.Sprintf("GPU %d", gpu.Index) + } + if _, ok := modelGroups[model]; !ok { + modelOrder = append(modelOrder, model) + } + modelGroups[model] = append(modelGroups[model], gpu) + } + sort.Slice(modelOrder, func(i, j int) bool { + left := modelGroups[modelOrder[i]] + right := modelGroups[modelOrder[j]] + if len(left) == 0 || len(right) == 0 { + return modelOrder[i] < modelOrder[j] + } + return left[0].Index < right[0].Index + }) + + var groups []nvidiaTaskSelection + var singles []nvidiaTaskSelection + for _, model := range modelOrder { + group := modelGroups[model] + sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index }) + indices := make([]int, 0, len(group)) + for _, gpu := range group { + indices = append(indices, gpu.Index) + } + if len(indices) >= 2 { + groups = append(groups, nvidiaTaskSelection{ + GPUIndices: indices, + Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)), + }) + continue + } + gpu := group[0] + singles = append(singles, nvidiaTaskSelection{ + GPUIndices: []int{gpu.Index}, + Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model), + }) + } + return append(groups, singles...), nil +} + +func joinTaskIndices(indices []int) string { + parts := make([]string, 0, len(indices)) + for _, idx := range indices { + parts = append(parts, fmt.Sprintf("%d", idx)) + } + return strings.Join(parts, ",") +} + +func formatSplitTaskName(baseName, selectionLabel string) string { + baseName = strings.TrimSpace(baseName) + selectionLabel = strings.TrimSpace(selectionLabel) + if baseName == "" { + return selectionLabel + } + if selectionLabel == "" { + return baseName + } + return baseName + " (" + selectionLabel + ")" +} + +func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) { + if !shouldSplitHomogeneousNvidiaTarget(target) { + t := &Task{ + ID: newJobID(idPrefix), + Name: baseName, + Target: target, + Priority: priority, + Status: TaskPending, + CreatedAt: createdAt, + params: params, + } + return []*Task{t}, nil + } + gpus, err := apiListNvidiaGPUs(appRef) + if err != nil { + return nil, err + } + selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices) + if err != nil { + return nil, err + } + tasks := make([]*Task, 0, len(selections)) + for _, selection := range selections { + taskParamsCopy := params + taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...) + taskParamsCopy.ExcludeGPUIndices = nil + displayName := formatSplitTaskName(baseName, selection.Label) + taskParamsCopy.DisplayName = displayName + tasks = append(tasks, &Task{ + ID: newJobID(idPrefix), + Name: displayName, + Target: target, + Priority: priority, + Status: TaskPending, + CreatedAt: createdAt, + params: taskParamsCopy, + }) + } + return tasks, nil +} + // ── SSE helpers ─────────────────────────────────────────────────────────────── func sseWrite(w http.ResponseWriter, event, data string) bool { @@ -207,28 +414,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } name := taskDisplayName(target, body.Profile, body.Loader) - t := &Task{ - ID: newJobID("sat-" + target), - Name: name, - Target: target, - Status: TaskPending, - CreatedAt: time.Now(), - params: taskParams{ - Duration: body.Duration, - DiagLevel: body.DiagLevel, - GPUIndices: body.GPUIndices, - ExcludeGPUIndices: body.ExcludeGPUIndices, - Loader: body.Loader, - BurnProfile: body.Profile, - DisplayName: body.DisplayName, - PlatformComponents: body.PlatformComponents, - }, - } if strings.TrimSpace(body.DisplayName) != "" { - t.Name = body.DisplayName + name = body.DisplayName } - globalQueue.enqueue(t) - writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID}) + params := taskParams{ + Duration: body.Duration, + DiagLevel: body.DiagLevel, + GPUIndices: body.GPUIndices, + ExcludeGPUIndices: body.ExcludeGPUIndices, + Loader: body.Loader, + BurnProfile: body.Profile, + DisplayName: body.DisplayName, + PlatformComponents: body.PlatformComponents, + } + tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target) + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + for _, t := range tasks { + globalQueue.enqueue(t) + } + writeTaskRunResponse(w, tasks) } } @@ -257,27 +464,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req if body.RunNCCL != nil { runNCCL = *body.RunNCCL } - t := &Task{ - ID: newJobID("benchmark-nvidia"), - Name: taskDisplayName("nvidia-benchmark", "", ""), - Target: "nvidia-benchmark", - Priority: 15, - Status: TaskPending, - CreatedAt: time.Now(), - params: taskParams{ - GPUIndices: body.GPUIndices, - ExcludeGPUIndices: body.ExcludeGPUIndices, - SizeMB: body.SizeMB, - BenchmarkProfile: body.Profile, - RunNCCL: runNCCL, - DisplayName: body.DisplayName, - }, - } + name := taskDisplayName("nvidia-benchmark", "", "") if strings.TrimSpace(body.DisplayName) != "" { - t.Name = body.DisplayName + name = body.DisplayName } - globalQueue.enqueue(t) - writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID}) + tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{ + GPUIndices: body.GPUIndices, + ExcludeGPUIndices: body.ExcludeGPUIndices, + SizeMB: body.SizeMB, + BenchmarkProfile: body.Profile, + RunNCCL: runNCCL, + DisplayName: body.DisplayName, + }, name, h.opts.App, "benchmark-nvidia") + if err != nil { + writeError(w, http.StatusBadRequest, err.Error()) + return + } + for _, t := range tasks { + globalQueue.enqueue(t) + } + writeTaskRunResponse(w, tasks) } func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { diff --git a/audit/internal/webui/api_test.go b/audit/internal/webui/api_test.go index 6a51d77..a781dc0 100644 --- a/audit/internal/webui/api_test.go +++ b/audit/internal/webui/api_test.go @@ -1,6 +1,7 @@ package webui import ( + "encoding/json" "net/http/httptest" "strings" "testing" @@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) { globalQueue.tasks = originalTasks globalQueue.mu.Unlock() }) + prevList := apiListNvidiaGPUs + apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) { + return []platform.NvidiaGPU{ + {Index: 1, Name: "NVIDIA H100 PCIe"}, + {Index: 3, Name: "NVIDIA H100 PCIe"}, + }, nil + } + t.Cleanup(func() { apiListNvidiaGPUs = prevList }) h := &handler{opts: HandlerOptions{App: &app.App{}}} req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`)) @@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) { } } +func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) { + globalQueue.mu.Lock() + originalTasks := globalQueue.tasks + globalQueue.tasks = nil + globalQueue.mu.Unlock() + t.Cleanup(func() { + globalQueue.mu.Lock() + globalQueue.tasks = originalTasks + globalQueue.mu.Unlock() + }) + prevList := apiListNvidiaGPUs + apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) { + return []platform.NvidiaGPU{ + {Index: 0, Name: "NVIDIA H100 PCIe"}, + {Index: 1, Name: "NVIDIA H100 PCIe"}, + {Index: 2, Name: "NVIDIA H200 NVL"}, + }, nil + } + t.Cleanup(func() { apiListNvidiaGPUs = prevList }) + + h := &handler{opts: HandlerOptions{App: &app.App{}}} + req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`)) + rec := httptest.NewRecorder() + + h.handleAPIBenchmarkNvidiaRun(rec, req) + + if rec.Code != 200 { + t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String()) + } + var resp taskRunResponse + if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil { + t.Fatalf("decode response: %v", err) + } + if len(resp.TaskIDs) != 2 { + t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs) + } + globalQueue.mu.Lock() + defer globalQueue.mu.Unlock() + if len(globalQueue.tasks) != 2 { + t.Fatalf("tasks=%d want 2", len(globalQueue.tasks)) + } + if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 { + t.Fatalf("task[0] gpu indices=%v want [0 1]", got) + } + if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 { + t.Fatalf("task[1] gpu indices=%v want [2]", got) + } +} + +func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { + globalQueue.mu.Lock() + originalTasks := globalQueue.tasks + globalQueue.tasks = nil + globalQueue.mu.Unlock() + t.Cleanup(func() { + globalQueue.mu.Lock() + globalQueue.tasks = originalTasks + globalQueue.mu.Unlock() + }) + prevList := apiListNvidiaGPUs + apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) { + return []platform.NvidiaGPU{ + {Index: 0, Name: "NVIDIA H100 PCIe"}, + {Index: 1, Name: "NVIDIA H100 PCIe"}, + {Index: 2, Name: "NVIDIA H200 NVL"}, + }, nil + } + t.Cleanup(func() { apiListNvidiaGPUs = prevList }) + + h := &handler{opts: HandlerOptions{App: &app.App{}}} + req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`)) + rec := httptest.NewRecorder() + + h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req) + + if rec.Code != 200 { + t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String()) + } + globalQueue.mu.Lock() + defer globalQueue.mu.Unlock() + if len(globalQueue.tasks) != 2 { + t.Fatalf("tasks=%d want 2", len(globalQueue.tasks)) + } + if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 { + t.Fatalf("task[0] gpu indices=%v want [0 1]", got) + } + if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 { + t.Fatalf("task[1] gpu indices=%v want [2]", got) + } +} + func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) { h := &handler{} h.pushFanRings([]platform.FanReading{ diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index a0117f9..526994e 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1656,6 +1656,12 @@ func renderBenchmark(opts HandlerOptions) string {