fix(webui): prevent orphaned workers on restart, reduce metrics polling, add Kill Workers button
- tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web crashes mid-test (each restart was launching a new set of 8 workers on top of still-alive orphans from the previous crash)
- server: reduce metrics collector interval 1s→5s, grow ring buffer to 360 samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5×
- platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn, stress-ng, stressapptest, memtester without relying on pkill/killall
- webui: add "Kill Workers" button next to Cancel All; calls POST /api/tasks/kill-workers which cancels the task queue then kills orphaned OS-level processes; shows toast with killed count
- metricsdb: sort GPU indices and fan/temp names after map iteration to fix non-deterministic sample reconstruction order (flaky test)
- server: fix chartYAxisNumber to use one decimal place for 1000–9999 (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -716,6 +716,38 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Request) {
|
||||
// Cancel all queued/running tasks in the queue first.
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
cancelled := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
}
|
||||
}
|
||||
globalQueue.persistLocked()
|
||||
globalQueue.mu.Unlock()
|
||||
|
||||
// Kill orphaned test worker processes at the OS level.
|
||||
killed := platform.KillTestWorkers()
|
||||
writeJSON(w, map[string]any{
|
||||
"cancelled": cancelled,
|
||||
"killed": len(killed),
|
||||
"processes": killed,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
@@ -769,8 +801,17 @@ func (q *taskQueue) loadLocked() {
|
||||
params: pt.Params,
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
t.Status = TaskPending
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
||||
// process groups and cannot be cancelled retroactively. Mark the
|
||||
// task as failed so the user can decide whether to re-run it
|
||||
// rather than blindly re-launching duplicate workers.
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
|
||||
Reference in New Issue
Block a user