Split NVIDIA tasks by homogeneous GPU groups

This commit is contained in:
Mikhail Chusavitin
2026-04-06 11:58:13 +03:00
parent fc5c100a29
commit 981315e6fd
5 changed files with 569 additions and 75 deletions

View File

@@ -1,6 +1,7 @@
package webui
import (
"encoding/json"
"net/http/httptest"
"strings"
"testing"
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
prevList := apiListNvidiaGPUs
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
return []platform.NvidiaGPU{
{Index: 1, Name: "NVIDIA H100 PCIe"},
{Index: 3, Name: "NVIDIA H100 PCIe"},
}, nil
}
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
}
}
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
prevList := apiListNvidiaGPUs
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
return []platform.NvidiaGPU{
{Index: 0, Name: "NVIDIA H100 PCIe"},
{Index: 1, Name: "NVIDIA H100 PCIe"},
{Index: 2, Name: "NVIDIA H200 NVL"},
}, nil
}
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
var resp taskRunResponse
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
t.Fatalf("decode response: %v", err)
}
if len(resp.TaskIDs) != 2 {
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 2 {
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
}
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
}
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
t.Fatalf("task[1] gpu indices=%v want [2]", got)
}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
prevList := apiListNvidiaGPUs
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
return []platform.NvidiaGPU{
{Index: 0, Name: "NVIDIA H100 PCIe"},
{Index: 1, Name: "NVIDIA H100 PCIe"},
{Index: 2, Name: "NVIDIA H200 NVL"},
}, nil
}
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
rec := httptest.NewRecorder()
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 2 {
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
}
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
}
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
t.Fatalf("task[1] gpu indices=%v want [2]", got)
}
}
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
h := &handler{}
h.pushFanRings([]platform.FanReading{