Normalize task queue priorities by workflow
This commit is contained in:
@@ -36,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
|
|||||||
return a.ListNvidiaGPUStatuses()
|
return a.ListNvidiaGPUStatuses()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
taskPriorityBenchmark = 10
|
||||||
|
taskPriorityBurn = 20
|
||||||
|
taskPriorityValidateStress = 30
|
||||||
|
taskPriorityValidate = 40
|
||||||
|
taskPriorityAudit = 50
|
||||||
|
taskPriorityInstallToRAM = 60
|
||||||
|
taskPriorityInstall = 70
|
||||||
|
)
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
@@ -109,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func defaultTaskPriority(target string, params taskParams) int {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "install":
|
||||||
|
return taskPriorityInstall
|
||||||
|
case "install-to-ram":
|
||||||
|
return taskPriorityInstallToRAM
|
||||||
|
case "audit":
|
||||||
|
return taskPriorityAudit
|
||||||
|
case "nvidia-benchmark":
|
||||||
|
return taskPriorityBenchmark
|
||||||
|
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||||
|
return taskPriorityBurn
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||||
|
"amd", "amd-mem", "amd-bandwidth":
|
||||||
|
if params.StressMode {
|
||||||
|
return taskPriorityValidateStress
|
||||||
|
}
|
||||||
|
return taskPriorityValidate
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
if len(gpus) == 0 {
|
if len(gpus) == 0 {
|
||||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
@@ -458,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
|||||||
ID: newJobID("audit"),
|
ID: newJobID("audit"),
|
||||||
Name: "Audit",
|
Name: "Audit",
|
||||||
Target: "audit",
|
Target: "audit",
|
||||||
|
Priority: defaultTaskPriority("audit", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
}
|
}
|
||||||
@@ -526,7 +561,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
PlatformComponents: body.PlatformComponents,
|
PlatformComponents: body.PlatformComponents,
|
||||||
}
|
}
|
||||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
return
|
return
|
||||||
@@ -613,7 +648,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
ID: newJobID("benchmark-nvidia"),
|
ID: newJobID("benchmark-nvidia"),
|
||||||
Name: stepName,
|
Name: stepName,
|
||||||
Target: "nvidia-benchmark",
|
Target: "nvidia-benchmark",
|
||||||
Priority: 15,
|
Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: now,
|
CreatedAt: now,
|
||||||
params: taskParams{
|
params: taskParams{
|
||||||
@@ -645,7 +680,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
name = fmt.Sprintf("%s · sequential", name)
|
name = fmt.Sprintf("%s · sequential", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
params := taskParams{
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
SizeMB: body.SizeMB,
|
SizeMB: body.SizeMB,
|
||||||
@@ -653,7 +688,8 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
RunNCCL: runNCCL,
|
RunNCCL: runNCCL,
|
||||||
ParallelGPUs: parallelGPUs,
|
ParallelGPUs: parallelGPUs,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
}, name, h.opts.App, "benchmark-nvidia")
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusBadRequest, err.Error())
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
return
|
return
|
||||||
@@ -1054,7 +1090,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
ID: newJobID("install-to-ram"),
|
ID: newJobID("install-to-ram"),
|
||||||
Name: "Install to RAM",
|
Name: "Install to RAM",
|
||||||
Target: "install-to-ram",
|
Target: "install-to-ram",
|
||||||
Priority: 10,
|
Priority: defaultTaskPriority("install-to-ram", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
}
|
}
|
||||||
@@ -1169,7 +1205,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
|||||||
ID: newJobID("install"),
|
ID: newJobID("install"),
|
||||||
Name: "Install to Disk",
|
Name: "Install to Disk",
|
||||||
Target: "install",
|
Target: "install",
|
||||||
Priority: 20,
|
Priority: defaultTaskPriority("install", taskParams{}),
|
||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
params: taskParams{
|
params: taskParams{
|
||||||
@@ -1461,4 +1497,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
t.Fatalf("burn profile=%q want smoke", got)
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
if task.params.RunNCCL {
|
if task.params.RunNCCL {
|
||||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||||
}
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
@@ -133,6 +139,12 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
@@ -175,6 +187,39 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||||
|
got := []int{
|
||||||
|
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||||
|
defaultTaskPriority("audit", taskParams{}),
|
||||||
|
defaultTaskPriority("cpu", taskParams{}),
|
||||||
|
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||||
|
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||||
|
defaultTaskPriority("nvidia-benchmark", taskParams{}),
|
||||||
|
}
|
||||||
|
want := []int{
|
||||||
|
taskPriorityInstallToRAM,
|
||||||
|
taskPriorityAudit,
|
||||||
|
taskPriorityValidate,
|
||||||
|
taskPriorityValidateStress,
|
||||||
|
taskPriorityBurn,
|
||||||
|
taskPriorityBenchmark,
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
|
||||||
|
t.Fatalf("priority order=%v", got)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
|||||||
Reference in New Issue
Block a user