Split NVIDIA tasks by homogeneous GPU groups
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
@@ -21,6 +22,12 @@ import (
|
||||
)
|
||||
|
||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||
if a == nil {
|
||||
return nil, fmt.Errorf("app not configured")
|
||||
}
|
||||
return a.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -30,6 +37,206 @@ func newJobID(prefix string) string {
|
||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
||||
}
|
||||
|
||||
type taskRunResponse struct {
|
||||
TaskID string `json:"task_id,omitempty"`
|
||||
JobID string `json:"job_id,omitempty"`
|
||||
TaskIDs []string `json:"task_ids,omitempty"`
|
||||
JobIDs []string `json:"job_ids,omitempty"`
|
||||
TaskCount int `json:"task_count,omitempty"`
|
||||
}
|
||||
|
||||
type nvidiaTaskSelection struct {
|
||||
GPUIndices []int
|
||||
Label string
|
||||
}
|
||||
|
||||
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||
if len(tasks) == 0 {
|
||||
writeJSON(w, taskRunResponse{})
|
||||
return
|
||||
}
|
||||
ids := make([]string, 0, len(tasks))
|
||||
for _, t := range tasks {
|
||||
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||
continue
|
||||
}
|
||||
ids = append(ids, t.ID)
|
||||
}
|
||||
resp := taskRunResponse{TaskCount: len(ids)}
|
||||
if len(ids) > 0 {
|
||||
resp.TaskID = ids[0]
|
||||
resp.JobID = ids[0]
|
||||
resp.TaskIDs = ids
|
||||
resp.JobIDs = ids
|
||||
}
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
|
||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||
"nvidia-bandwidth", "nvidia-stress":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||
if len(gpus) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||
}
|
||||
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||
allIndices := make([]int, 0, len(gpus))
|
||||
for _, gpu := range gpus {
|
||||
indexed[gpu.Index] = gpu
|
||||
allIndices = append(allIndices, gpu.Index)
|
||||
}
|
||||
sort.Ints(allIndices)
|
||||
|
||||
selected := allIndices
|
||||
if len(include) > 0 {
|
||||
selected = make([]int, 0, len(include))
|
||||
seen := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
if _, ok := indexed[idx]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, dup := seen[idx]; dup {
|
||||
continue
|
||||
}
|
||||
seen[idx] = struct{}{}
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
sort.Ints(selected)
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||
}
|
||||
|
||||
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||
modelOrder := make([]string, 0)
|
||||
for _, idx := range selected {
|
||||
gpu := indexed[idx]
|
||||
model := strings.TrimSpace(gpu.Name)
|
||||
if model == "" {
|
||||
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||
}
|
||||
if _, ok := modelGroups[model]; !ok {
|
||||
modelOrder = append(modelOrder, model)
|
||||
}
|
||||
modelGroups[model] = append(modelGroups[model], gpu)
|
||||
}
|
||||
sort.Slice(modelOrder, func(i, j int) bool {
|
||||
left := modelGroups[modelOrder[i]]
|
||||
right := modelGroups[modelOrder[j]]
|
||||
if len(left) == 0 || len(right) == 0 {
|
||||
return modelOrder[i] < modelOrder[j]
|
||||
}
|
||||
return left[0].Index < right[0].Index
|
||||
})
|
||||
|
||||
var groups []nvidiaTaskSelection
|
||||
var singles []nvidiaTaskSelection
|
||||
for _, model := range modelOrder {
|
||||
group := modelGroups[model]
|
||||
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||
indices := make([]int, 0, len(group))
|
||||
for _, gpu := range group {
|
||||
indices = append(indices, gpu.Index)
|
||||
}
|
||||
if len(indices) >= 2 {
|
||||
groups = append(groups, nvidiaTaskSelection{
|
||||
GPUIndices: indices,
|
||||
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||
})
|
||||
continue
|
||||
}
|
||||
gpu := group[0]
|
||||
singles = append(singles, nvidiaTaskSelection{
|
||||
GPUIndices: []int{gpu.Index},
|
||||
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||
})
|
||||
}
|
||||
return append(groups, singles...), nil
|
||||
}
|
||||
|
||||
func joinTaskIndices(indices []int) string {
|
||||
parts := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
parts = append(parts, fmt.Sprintf("%d", idx))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||
baseName = strings.TrimSpace(baseName)
|
||||
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||
if baseName == "" {
|
||||
return selectionLabel
|
||||
}
|
||||
if selectionLabel == "" {
|
||||
return baseName
|
||||
}
|
||||
return baseName + " (" + selectionLabel + ")"
|
||||
}
|
||||
|
||||
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
t := &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: baseName,
|
||||
Target: target,
|
||||
Priority: priority,
|
||||
Status: TaskPending,
|
||||
CreatedAt: createdAt,
|
||||
params: params,
|
||||
}
|
||||
return []*Task{t}, nil
|
||||
}
|
||||
gpus, err := apiListNvidiaGPUs(appRef)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tasks := make([]*Task, 0, len(selections))
|
||||
for _, selection := range selections {
|
||||
taskParamsCopy := params
|
||||
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||
taskParamsCopy.ExcludeGPUIndices = nil
|
||||
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||
taskParamsCopy.DisplayName = displayName
|
||||
tasks = append(tasks, &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: displayName,
|
||||
Target: target,
|
||||
Priority: priority,
|
||||
Status: TaskPending,
|
||||
CreatedAt: createdAt,
|
||||
params: taskParamsCopy,
|
||||
})
|
||||
}
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||
@@ -207,28 +414,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
|
||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,27 +464,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("benchmark-nvidia"),
|
||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
||||
Target: "nvidia-benchmark",
|
||||
Priority: 15,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
DisplayName: body.DisplayName,
|
||||
},
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var resp taskRunResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if len(resp.TaskIDs) != 2 {
|
||||
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
|
||||
@@ -1656,6 +1656,12 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
|
||||
function benchmarkTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
@@ -1755,17 +1761,37 @@ function runNvidiaBenchmark() {
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
status.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
status.textContent = e.data ? 'Failed.' : 'Completed.';
|
||||
});
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||
return;
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures);
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures + 1);
|
||||
};
|
||||
};
|
||||
streamNext(0, 0);
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
@@ -1779,13 +1805,24 @@ benchmarkLoadGPUs();
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
columns, runs := loadBenchmarkHistory(exportDir)
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved benchmark runs yet.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body">`)
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">Composite score by saved benchmark run and GPU.</p>`)
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
|
||||
for _, col := range columns {
|
||||
@@ -1820,7 +1857,10 @@ func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchma
|
||||
return nil, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
return loadBenchmarkHistoryFromPaths(paths)
|
||||
}
|
||||
|
||||
func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
|
||||
columnByKey := make(map[string]benchmarkHistoryColumn)
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
for _, path := range paths {
|
||||
@@ -2005,6 +2045,12 @@ func renderBurn() string {
|
||||
<script>
|
||||
let biES = null;
|
||||
|
||||
function burnTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
|
||||
function burnProfile() {
|
||||
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
||||
return selected ? selected.value : 'smoke';
|
||||
@@ -2106,6 +2152,9 @@ function streamTask(taskId, label) {
|
||||
});
|
||||
}
|
||||
function streamBurnTask(taskId, label, resetTerminal) {
|
||||
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||
}
|
||||
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||
@@ -2113,27 +2162,40 @@ function streamBurnTask(taskId, label, resetTerminal) {
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
|
||||
return new Promise(function(resolve) {
|
||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', function(e) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
biES.onerror = function() {
|
||||
if (biES) {
|
||||
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||
term.textContent += 'ERROR: no tasks queued.\n';
|
||||
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||
}
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||
return new Promise(function(resolve) {
|
||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', function(e) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve(failures + (e.data ? 1 : 0));
|
||||
});
|
||||
biES.onerror = function() {
|
||||
if (biES) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve(failures + 1);
|
||||
};
|
||||
}).then(function(nextFailures) {
|
||||
return streamNext(idx + 1, nextFailures);
|
||||
});
|
||||
};
|
||||
return streamNext(0, 0);
|
||||
}
|
||||
|
||||
function runBurnTaskSet(tasks, statusElId) {
|
||||
@@ -2161,7 +2223,7 @@ function runBurnTaskSet(tasks, statusElId) {
|
||||
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||
.then(function(d) {
|
||||
return streamBurnTask(d.task_id, t.label, false);
|
||||
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||
})
|
||||
.then(function() {
|
||||
return runNext(idx + 1);
|
||||
|
||||
@@ -230,6 +230,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||
b.WriteString(`</div></div></div>`)
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -247,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||
if len(runs) == 0 {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
func taskArchivePathFromLog(logText string) string {
|
||||
lines := strings.Split(logText, "\n")
|
||||
for i := len(lines) - 1; i >= 0; i-- {
|
||||
line := strings.TrimSpace(lines[i])
|
||||
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||
if strings.HasPrefix(path, "Archive written to ") {
|
||||
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||
}
|
||||
if strings.HasSuffix(path, ".tar.gz") {
|
||||
return path
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func renderTaskStatusBadge(status string) string {
|
||||
className := map[string]string{
|
||||
TaskRunning: "badge-ok",
|
||||
|
||||
@@ -325,6 +325,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Benchmark",
|
||||
Target: "nvidia-benchmark",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||
}
|
||||
|
||||
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.html): %v", err)
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
t.Fatalf("report missing %q: %s", needle, html)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||
var lines []string
|
||||
prev := taskSerialWriteLine
|
||||
|
||||
Reference in New Issue
Block a user