Adjust burn GPU ramp timing by profile
This commit is contained in:
@@ -152,6 +152,12 @@ type burnPreset struct {
|
||||
DurationSec int
|
||||
}
|
||||
|
||||
// nvidiaRampSpec describes the timing of an NVIDIA burn run, optionally with
// a staggered GPU start. All values are in seconds.
type nvidiaRampSpec struct {
	// DurationSec is the burn duration handed to the stress runner; the task
	// log reports it as the "post-ramp hold".
	DurationSec int
	// StaggerSeconds is the per-GPU start delay. Zero means staggering is not
	// in effect.
	StaggerSeconds int
	// TotalDurationSec is the overall runtime: DurationSec plus one
	// StaggerSeconds interval for every GPU after the first.
	TotalDurationSec int
}
|
||||
|
||||
func resolveBurnPreset(profile string) burnPreset {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
@@ -163,11 +169,43 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
|
||||
if enabled && len(selected) > 1 {
|
||||
return 180
|
||||
func resolveNvidiaRampPlan(profile string, enabled bool, selected []int) (nvidiaRampSpec, error) {
|
||||
base := resolveBurnPreset(profile).DurationSec
|
||||
plan := nvidiaRampSpec{
|
||||
DurationSec: base,
|
||||
TotalDurationSec: base,
|
||||
}
|
||||
return 0
|
||||
if !enabled {
|
||||
return plan, nil
|
||||
}
|
||||
count := len(selected)
|
||||
if count == 0 {
|
||||
return nvidiaRampSpec{}, fmt.Errorf("staggered NVIDIA burn requires explicit GPU selection")
|
||||
}
|
||||
if count == 1 {
|
||||
return plan, nil
|
||||
}
|
||||
|
||||
switch profile {
|
||||
case "acceptance":
|
||||
plan.StaggerSeconds = 10 * 60
|
||||
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||
case "overnight":
|
||||
plan.StaggerSeconds = 60 * 60
|
||||
plan.TotalDurationSec = 8 * 60 * 60
|
||||
minTotal := count * 60 * 60
|
||||
if plan.TotalDurationSec < minTotal {
|
||||
plan.TotalDurationSec = minTotal
|
||||
}
|
||||
if plan.TotalDurationSec > 10*60*60 {
|
||||
return nvidiaRampSpec{}, fmt.Errorf("overnight staggered NVIDIA burn supports at most 10 GPUs")
|
||||
}
|
||||
plan.DurationSec = plan.TotalDurationSec - plan.StaggerSeconds*(count-1)
|
||||
default:
|
||||
plan.StaggerSeconds = 2 * 60
|
||||
plan.TotalDurationSec = plan.DurationSec + plan.StaggerSeconds*(count-1)
|
||||
}
|
||||
return plan, nil
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
@@ -600,7 +638,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -609,11 +647,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if staggerSec > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -663,13 +708,24 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||
}, j.append)
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
Reference in New Issue
Block a user