diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go
index de6f82f..e019d0e 100644
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1342,11 +1342,11 @@ func renderValidate(opts HandlerOptions) string {
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
- inv.NVIDIA,
- `Runs NVIDIA diagnostics and board inventory checks.`,
- `
nvidia-smi,
dmidecode,
dcgmi diag`,
- `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
- )) +
+ inv.NVIDIA,
+ `Runs NVIDIA diagnostics and board inventory checks.`,
+ `
nvidia-smi,
dmidecode,
dcgmi diag`,
+ `Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
+ )) +
`
` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
@@ -2373,7 +2373,7 @@ func renderBurn() string {
Select at least one NVIDIA GPU to enable NVIDIA burn recipes.
- Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.
+ Ramp selected NVIDIA GPUs one by one before the full-load hold. Smoke: +2 min per GPU, then 5 min with all selected GPUs under load. Acceptance: +10 min per GPU, then at least 1 hour with all selected GPUs under load. Overnight: +1 hour per GPU, then at least 1 hour with all selected GPUs under load, capped at 10 hours total.
@@ -3108,7 +3108,6 @@ usbRefresh();
`
}
-
func renderNvidiaSelfHealInline() string {
return `Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.
diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go
index 283803a..4afd96c 100644
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -152,6 +152,12 @@ type burnPreset struct {
DurationSec int
}
+type nvidiaRampSpec struct {
+ DurationSec int
+ StaggerSeconds int
+ TotalDurationSec int
+}
+
func resolveBurnPreset(profile string) burnPreset {
switch profile {
case "overnight":
@@ -163,11 +169,43 @@ func resolveBurnPreset(profile string) burnPreset {
}
}
-func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
- if enabled && len(selected) > 1 {
- return 180
+func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
+ base := resolveBurnPreset(profile).DurationSec
+ plan := nvidiaRampSpec{
+ DurationSec: base,
+ TotalDurationSec: base,
}
- return 0
+ if !enabled {
+ return plan, nil
+ }
+ count := len(selected)
+ if count == 0 {
+ return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
+ }
+ if count == 1 {
+ return plan, nil
+ }
+
+ switch profile {
+ case "acceptance":
+ plan.StaggerSeconds = 10 * 60
+ plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
+ case "overnight":
+ plan.StaggerSeconds = 60 * 60
+ plan.TotalDurationSec = 8 * 60 * 60
+ minTotal := count * 60 * 60
+ if plan.TotalDurationSec < minTotal {
+ plan.TotalDurationSec = minTotal
+ }
+ if plan.TotalDurationSec > 10*60*60 {
+ return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
+ }
+ plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1)
+ default:
+ plan.StaggerSeconds = 2 * 60
+ plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
+ }
+ return plan, nil
}
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
@@ -600,7 +638,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RunNCCL: t.params.RunNCCL,
ParallelGPUs: t.params.ParallelGPUs,
}, j.append)
- case "nvidia-compute":
+ case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
@@ -609,11 +647,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
- staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
- if staggerSec > 0 {
- j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
- }
- archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
+ rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+ if planErr != nil {
+ err = planErr
+ break
+ }
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart {
+ dur = rampPlan.DurationSec
+ }
+ if rampPlan.StaggerSeconds > 0 {
+ j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+ }
+ archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
@@ -663,13 +708,24 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
- archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
- DurationSec: dur,
- Loader: t.params.Loader,
- GPUIndices: t.params.GPUIndices,
- ExcludeGPUIndices: t.params.ExcludeGPUIndices,
- StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
- }, j.append)
+ rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
+ if planErr != nil {
+ err = planErr
+ break
+ }
+		if t.params.BurnProfile != "" && t.params.StaggerGPUStart {
+ dur = rampPlan.DurationSec
+ }
+ if rampPlan.StaggerSeconds > 0 {
+ j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
+ }
+ archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+ DurationSec: dur,
+ Loader: t.params.Loader,
+ GPUIndices: t.params.GPUIndices,
+ ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+ StaggerSeconds: rampPlan.StaggerSeconds,
+ }, j.append)
case "memory":
if a == nil {
err = fmt.Errorf("app not configured")
diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go
index fe37d96..1af7d82 100644
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -491,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
}
}
+func TestResolveNvidiaRampPlan(t *testing.T) {
+ tests := []struct {
+ name string
+ profile string
+ enabled bool
+ selected []int
+ want nvidiaRampSpec
+ wantErr string
+ }{
+ {
+ name: "disabled uses base preset",
+ profile: "acceptance",
+ selected: []int{0, 1},
+ want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
+ },
+ {
+ name: "smoke ramp uses two minute steps",
+ profile: "smoke",
+ enabled: true,
+ selected: []int{0, 1, 2},
+ want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
+ },
+ {
+ name: "acceptance ramp uses ten minute steps",
+ profile: "acceptance",
+ enabled: true,
+ selected: []int{0, 1, 2},
+ want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
+ },
+ {
+ name: "overnight stays at eight hours when possible",
+ profile: "overnight",
+ enabled: true,
+ selected: []int{0, 1, 2},
+ want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
+ },
+ {
+ name: "overnight extends to keep one hour after final gpu",
+ profile: "overnight",
+ enabled: true,
+ selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
+ want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
+ },
+ {
+ name: "overnight rejects impossible gpu count",
+ profile: "overnight",
+ enabled: true,
+ selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+ wantErr: "at most 10 GPUs",
+ },
+ {
+ name: "enabled requires explicit selection",
+ profile: "smoke",
+ enabled: true,
+ wantErr: "requires explicit GPU selection",
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
+ if tc.wantErr != "" {
+ if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
+ t.Fatalf("err=%v want substring %q", err, tc.wantErr)
+ }
+ return
+ }
+ if err != nil {
+ t.Fatalf("resolveNvidiaRampPlan error: %v", err)
+ }
+ if got != tc.want {
+ t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
+ }
+ })
+ }
+}
+
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
tests := []struct {
loader string