Compare commits

...

2 Commits
v7.10 ... v7.12

Author SHA1 Message Date
098e19f760 Add ramp-up mode to NVIDIA GPU benchmark
Adds a new checkbox (enabled by default) in the benchmark section.
In ramp-up mode N tasks are enqueued at once, one per step: the first uses 1 GPU, the next 2,
then 3, up to all selected GPUs — each step runs its GPUs in parallel.
NCCL runs only on the final step.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 18:34:19 +03:00
e16d0f34b5 Adjust burn GPU ramp timing by profile 2026-04-12 15:58:30 +03:00
4 changed files with 237 additions and 31 deletions

View File

@@ -12,6 +12,7 @@ import (
"path/filepath" "path/filepath"
"regexp" "regexp"
"sort" "sort"
"strconv"
"strings" "strings"
"sync/atomic" "sync/atomic"
"syscall" "syscall"
@@ -209,6 +210,14 @@ func joinTaskIndices(indices []int) string {
return strings.Join(parts, ",") return strings.Join(parts, ",")
} }
// formatGPUIndexList renders GPU indices as a comma-separated list with no
// spaces (for example "0,1,2"). A nil or empty slice yields the empty string.
func formatGPUIndexList(indices []int) string {
	var out strings.Builder
	for pos, gpu := range indices {
		// Separator goes before every element except the first.
		if pos > 0 {
			out.WriteByte(',')
		}
		out.WriteString(strconv.Itoa(gpu))
	}
	return out.String()
}
func formatSplitTaskName(baseName, selectionLabel string) string { func formatSplitTaskName(baseName, selectionLabel string) string {
baseName = strings.TrimSpace(baseName) baseName = strings.TrimSpace(baseName)
selectionLabel = strings.TrimSpace(selectionLabel) selectionLabel = strings.TrimSpace(selectionLabel)
@@ -540,6 +549,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
ExcludeGPUIndices []int `json:"exclude_gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"` RunNCCL *bool `json:"run_nccl"`
ParallelGPUs *bool `json:"parallel_gpus"` ParallelGPUs *bool `json:"parallel_gpus"`
RampUp *bool `json:"ramp_up"`
DisplayName string `json:"display_name"` DisplayName string `json:"display_name"`
} }
if r.Body != nil { if r.Body != nil {
@@ -557,10 +567,63 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
if body.ParallelGPUs != nil { if body.ParallelGPUs != nil {
parallelGPUs = *body.ParallelGPUs parallelGPUs = *body.ParallelGPUs
} }
rampUp := false
if body.RampUp != nil {
rampUp = *body.RampUp
}
name := taskDisplayName("nvidia-benchmark", "", "") name := taskDisplayName("nvidia-benchmark", "", "")
if strings.TrimSpace(body.DisplayName) != "" { if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName name = body.DisplayName
} }
if rampUp && len(body.GPUIndices) > 1 {
// Ramp-up mode: resolve GPU list, then create one task per prefix
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
gpus, err := apiListNvidiaGPUs(h.opts.App)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
if len(resolved) < 2 {
// Fall through to normal single-task path.
rampUp = false
} else {
now := time.Now()
var allTasks []*Task
for step := 1; step <= len(resolved); step++ {
subset := resolved[:step]
stepName := fmt.Sprintf("%s [ramp %d/%d: GPU %s]", name, step, len(resolved), formatGPUIndexList(subset))
t := &Task{
ID: newJobID("benchmark-nvidia"),
Name: stepName,
Target: "nvidia-benchmark",
Priority: 15,
Status: TaskPending,
CreatedAt: now,
params: taskParams{
GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
}
for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
return
}
}
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{ tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
GPUIndices: body.GPUIndices, GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices,

View File

@@ -1342,11 +1342,11 @@ func renderValidate(opts HandlerOptions) string {
<div class="grid3"> <div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( ` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`, `Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`, `<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
)) + )) +
`<div id="sat-card-nvidia-targeted-stress">` + `<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
@@ -1966,9 +1966,13 @@ func renderBenchmark(opts HandlerOptions) string {
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p> <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div> </div>
</div> </div>
<label class="benchmark-cb-row">
<input type="checkbox" id="benchmark-ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
<span>Ramp-up mode: run 1 GPU, then 2, then 3… up to all selected GPUs (each step is a separate task)</span>
</label>
<label class="benchmark-cb-row"> <label class="benchmark-cb-row">
<input type="checkbox" id="benchmark-parallel-gpus"> <input type="checkbox" id="benchmark-parallel-gpus">
<span>Run all selected GPUs simultaneously (parallel mode)</span> <span>Run all selected GPUs simultaneously (parallel mode, ignored in ramp-up)</span>
</label> </label>
<label class="benchmark-cb-row"> <label class="benchmark-cb-row">
<input type="checkbox" id="benchmark-run-nccl" checked> <input type="checkbox" id="benchmark-run-nccl" checked>
@@ -2036,11 +2040,16 @@ function benchmarkUpdateSelectionNote() {
return; return;
} }
btn.disabled = false; btn.disabled = false;
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.'; const rampUp = selected.length > 1 && !!document.getElementById('benchmark-ramp-up').checked;
if (nccl && nccl.checked && selected.length < 2) { if (rampUp) {
note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.'; note.textContent = 'Ramp-up: will spawn ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL runs on the final step only.';
} else if (nccl && nccl.checked) { } else {
note.textContent += ' NCCL interconnect will use only these GPUs.'; note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
if (nccl && nccl.checked && selected.length < 2) {
note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
} else if (nccl && nccl.checked) {
note.textContent += ' NCCL interconnect will use only these GPUs.';
}
} }
} }
@@ -2095,12 +2104,14 @@ function runNvidiaBenchmark() {
return; return;
} }
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; } if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked; const rampUp = selected.length > 1 && !!document.getElementById('benchmark-ramp-up').checked;
const parallelGPUs = !rampUp && !!document.getElementById('benchmark-parallel-gpus').checked;
const body = { const body = {
profile: document.getElementById('benchmark-profile').value || 'standard', profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected, gpu_indices: selected,
run_nccl: !!document.getElementById('benchmark-run-nccl').checked, run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
parallel_gpus: parallelGPUs, parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: 'NVIDIA Benchmark' display_name: 'NVIDIA Benchmark'
}; };
document.getElementById('benchmark-output').style.display = 'block'; document.getElementById('benchmark-output').style.display = 'block';
@@ -2373,7 +2384,7 @@ func renderBurn() string {
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p> <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
<label class="cb-row" style="margin-top:10px"> <label class="cb-row" style="margin-top:10px">
<input type="checkbox" id="burn-stagger-nvidia"> <input type="checkbox" id="burn-stagger-nvidia">
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span> <span>Ramp selected NVIDIA GPUs one by one before the full-load hold. Smoke: +2 min per GPU, then 5 min with all selected GPUs under load. Acceptance: +10 min per GPU, then at least 1 hour with all selected GPUs under load. Overnight: +1 hour per GPU, then at least 1 hour with all selected GPUs under load, capped at 10 hours total.</span>
</label> </label>
</div> </div>
</div> </div>
@@ -3108,7 +3119,6 @@ usbRefresh();
</script>` </script>`
} }
func renderNvidiaSelfHealInline() string { func renderNvidiaSelfHealInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p> return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px"> <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">

View File

@@ -152,6 +152,12 @@ type burnPreset struct {
DurationSec int DurationSec int
} }
// nvidiaRampSpec is the timing plan produced by resolveNvidiaRampPlan for an
// NVIDIA burn run. All values are in seconds, and the three fields satisfy
// TotalDurationSec = DurationSec + StaggerSeconds * (selected GPUs - 1)
// whenever a staggered ramp applies.
type nvidiaRampSpec struct {
// DurationSec is the burn duration left once the ramp has finished; with no
// stagger it equals the profile's base preset duration.
DurationSec int
// StaggerSeconds is the ramp step added per additional GPU; zero when
// staggering is disabled or only one GPU is selected.
StaggerSeconds int
// TotalDurationSec is the whole runtime, ramp steps included.
TotalDurationSec int
}
func resolveBurnPreset(profile string) burnPreset { func resolveBurnPreset(profile string) burnPreset {
switch profile { switch profile {
case "overnight": case "overnight":
@@ -163,11 +169,43 @@ func resolveBurnPreset(profile string) burnPreset {
} }
} }
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int { func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
if enabled && len(selected) > 1 { base := resolveBurnPreset(profile).DurationSec
return 180 plan := nvidiaRampSpec{
DurationSec: base,
TotalDurationSec: base,
} }
return 0 if !enabled {
return plan, nil
}
count := len(selected)
if count == 0 {
return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
}
if count == 1 {
return plan, nil
}
switch profile {
case "acceptance":
plan.StaggerSeconds = 10 * 60
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
case "overnight":
plan.StaggerSeconds = 60 * 60
plan.TotalDurationSec = 8 * 60 * 60
minTotal := count * 60 * 60
if plan.TotalDurationSec < minTotal {
plan.TotalDurationSec = minTotal
}
if plan.TotalDurationSec > 10*60*60 {
return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
}
plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1)
default:
plan.StaggerSeconds = 2 * 60
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
}
return plan, nil
} }
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
@@ -600,7 +638,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RunNCCL: t.params.RunNCCL, RunNCCL: t.params.RunNCCL,
ParallelGPUs: t.params.ParallelGPUs, ParallelGPUs: t.params.ParallelGPUs,
}, j.append) }, j.append)
case "nvidia-compute": case "nvidia-compute":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
@@ -609,11 +647,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 { if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices) rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
if staggerSec > 0 { if planErr != nil {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec)) err = planErr
} break
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append) }
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
case "nvidia-targeted-power": case "nvidia-targeted-power":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -663,13 +708,24 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 { if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{ rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
DurationSec: dur, if planErr != nil {
Loader: t.params.Loader, err = planErr
GPUIndices: t.params.GPUIndices, break
ExcludeGPUIndices: t.params.ExcludeGPUIndices, }
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices), if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
}, j.append) dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: t.params.Loader,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
StaggerSeconds: rampPlan.StaggerSeconds,
}, j.append)
case "memory": case "memory":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
} }
} }
// TestResolveNvidiaRampPlan checks the ramp timing resolveNvidiaRampPlan
// produces for each burn profile, the error paths, and the boundaries where
// the ramp is skipped (disabled, single GPU) or just still allowed (ten GPUs
// on the overnight profile).
func TestResolveNvidiaRampPlan(t *testing.T) {
	const (
		minute = 60
		hour   = 60 * minute
	)
	tests := []struct {
		name     string
		profile  string
		enabled  bool
		selected []int
		want     nvidiaRampSpec
		wantErr  string
	}{
		{
			name:     "disabled uses base preset",
			profile:  "acceptance",
			selected: []int{0, 1},
			want:     nvidiaRampSpec{DurationSec: hour, TotalDurationSec: hour},
		},
		{
			// Selection is only validated once staggering is requested.
			name:    "disabled ignores empty selection",
			profile: "smoke",
			want:    nvidiaRampSpec{DurationSec: 5 * minute, TotalDurationSec: 5 * minute},
		},
		{
			name:     "single gpu skips ramp",
			profile:  "smoke",
			enabled:  true,
			selected: []int{0},
			want:     nvidiaRampSpec{DurationSec: 5 * minute, TotalDurationSec: 5 * minute},
		},
		{
			name:     "smoke ramp uses two minute steps",
			profile:  "smoke",
			enabled:  true,
			selected: []int{0, 1, 2},
			want:     nvidiaRampSpec{DurationSec: 5 * minute, StaggerSeconds: 2 * minute, TotalDurationSec: 9 * minute},
		},
		{
			name:     "acceptance ramp uses ten minute steps",
			profile:  "acceptance",
			enabled:  true,
			selected: []int{0, 1, 2},
			want:     nvidiaRampSpec{DurationSec: hour, StaggerSeconds: 10 * minute, TotalDurationSec: 80 * minute},
		},
		{
			name:     "overnight stays at eight hours when possible",
			profile:  "overnight",
			enabled:  true,
			selected: []int{0, 1, 2},
			want:     nvidiaRampSpec{DurationSec: 6 * hour, StaggerSeconds: hour, TotalDurationSec: 8 * hour},
		},
		{
			name:     "overnight extends to keep one hour after final gpu",
			profile:  "overnight",
			enabled:  true,
			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
			want:     nvidiaRampSpec{DurationSec: hour, StaggerSeconds: hour, TotalDurationSec: 9 * hour},
		},
		{
			// Exactly at the 10-hour cap: must still be accepted.
			name:     "overnight accepts ten gpus",
			profile:  "overnight",
			enabled:  true,
			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
			want:     nvidiaRampSpec{DurationSec: hour, StaggerSeconds: hour, TotalDurationSec: 10 * hour},
		},
		{
			name:     "overnight rejects impossible gpu count",
			profile:  "overnight",
			enabled:  true,
			selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
			wantErr:  "at most 10 GPUs",
		},
		{
			name:    "enabled requires explicit selection",
			profile: "smoke",
			enabled: true,
			wantErr: "requires explicit GPU selection",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
			if tc.wantErr != "" {
				if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
					t.Fatalf("err=%v want substring %q", err, tc.wantErr)
				}
				return
			}
			if err != nil {
				t.Fatalf("resolveNvidiaRampPlan error: %v", err)
			}
			if got != tc.want {
				t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
			}
		})
	}
}
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) { func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
tests := []struct { tests := []struct {
loader string loader string