diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index de6f82f..e019d0e 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1342,11 +1342,11 @@ func renderValidate(opts HandlerOptions) string {
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( - inv.NVIDIA, - `Runs NVIDIA diagnostics and board inventory checks.`, - `nvidia-smi, dmidecode, dcgmi diag`, - `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, - )) + + inv.NVIDIA, + `Runs NVIDIA diagnostics and board inventory checks.`, + `nvidia-smi, dmidecode, dcgmi diag`, + `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`, + )) + `
` + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, @@ -2373,7 +2373,7 @@ func renderBurn() string {

Select at least one NVIDIA GPU to enable NVIDIA burn recipes.

@@ -3108,7 +3108,6 @@ usbRefresh(); ` } - func renderNvidiaSelfHealInline() string { return `

Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.

diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index 283803a..4afd96c 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -152,6 +152,12 @@ type burnPreset struct { DurationSec int } +type nvidiaRampSpec struct { + DurationSec int + StaggerSeconds int + TotalDurationSec int +} + func resolveBurnPreset(profile string) burnPreset { switch profile { case "overnight": @@ -163,11 +169,43 @@ func resolveBurnPreset(profile string) burnPreset { } } -func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int { - if enabled && len(selected) > 1 { - return 180 +func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) { + base := resolveBurnPreset(profile).DurationSec + plan := nvidiaRampSpec{ + DurationSec: base, + TotalDurationSec: base, } - return 0 + if !enabled { + return plan, nil + } + count := len(selected) + if count == 0 { + return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection") + } + if count == 1 { + return plan, nil + } + + switch profile { + case "acceptance": + plan.StaggerSeconds = 10 * 60 + plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1) + case "overnight": + plan.StaggerSeconds = 60 * 60 + plan.TotalDurationSec = 8 * 60 * 60 + minTotal := count * 60 * 60 + if plan.TotalDurationSec < minTotal { + plan.TotalDurationSec = minTotal + } + if plan.TotalDurationSec > 10*60*60 { + return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs") + } + plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1) + default: + plan.StaggerSeconds = 2 * 60 + plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1) + } + return plan, nil } func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { @@ -600,7 +638,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { RunNCCL: t.params.RunNCCL, ParallelGPUs: t.params.ParallelGPUs, }, j.append) - case "nvidia-compute": + case "nvidia-compute": if a == nil { err = fmt.Errorf("app not configured") break @@ -609,11 +647,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { if t.params.BurnProfile != "" && dur <= 0 { dur = resolveBurnPreset(t.params.BurnProfile).DurationSec } - staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices) - if staggerSec > 0 { - j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec)) - } - archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append) + rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices) + if planErr != nil { + err = planErr + break + } + if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 { + dur = rampPlan.DurationSec + } + if rampPlan.StaggerSeconds > 0 { + j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec)) + } + archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append) case "nvidia-targeted-power": if a == nil { err = fmt.Errorf("app not configured") @@ -663,13 +708,24 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { if t.params.BurnProfile != "" && dur <= 0 { dur = resolveBurnPreset(t.params.BurnProfile).DurationSec } - archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{ - DurationSec: dur, - Loader: t.params.Loader, - GPUIndices: t.params.GPUIndices, - ExcludeGPUIndices: t.params.ExcludeGPUIndices, - StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices), - }, j.append) + rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices) + if planErr != nil { + err = planErr + break + } + if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 { + dur = rampPlan.DurationSec + } + if rampPlan.StaggerSeconds > 0 { + j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec)) + } + archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{ + DurationSec: dur, + Loader: t.params.Loader, + GPUIndices: t.params.GPUIndices, + ExcludeGPUIndices: t.params.ExcludeGPUIndices, + StaggerSeconds: rampPlan.StaggerSeconds, + }, j.append) case "memory": if a == nil { err = fmt.Errorf("app not configured") diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go index fe37d96..1af7d82 100644 --- a/audit/internal/webui/tasks_test.go +++ b/audit/internal/webui/tasks_test.go @@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) { } } +func TestResolveNvidiaRampPlan(t *testing.T) { + tests := []struct { + name string + profile string + enabled bool + selected []int + want nvidiaRampSpec + wantErr string + }{ + { + name: "disabled uses base preset", + profile: "acceptance", + selected: []int{0, 1}, + want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60}, + }, + { + name: "smoke ramp uses two minute steps", + profile: "smoke", + enabled: true, + selected: []int{0, 1, 2}, + want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60}, + }, + { + name: "acceptance ramp uses ten minute steps", + profile: "acceptance", + enabled: true, + selected: []int{0, 1, 2}, + want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60}, + }, + { + name: "overnight stays at eight hours when possible", + profile: "overnight", + enabled: true, + selected: []int{0, 1, 2}, + want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60}, + }, + { + name: "overnight extends to keep one hour after final gpu", + profile: "overnight", + enabled: true, + selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8}, + want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60}, + }, + { + name: "overnight rejects impossible gpu count", + profile: "overnight", + enabled: true, + selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + wantErr: "at most 10 GPUs", + }, + { + name: "enabled requires explicit selection", + profile: "smoke", + enabled: true, + wantErr: "requires explicit GPU selection", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected) + if tc.wantErr != "" { + if err == nil || !strings.Contains(err.Error(), tc.wantErr) { + t.Fatalf("err=%v want substring %q", err, tc.wantErr) + } + return + } + if err != nil { + t.Fatalf("resolveNvidiaRampPlan error: %v", err) + } + if got != tc.want { + t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want) + } + }) + } +} + func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) { tests := []struct { loader string