Add stability hardening and self-heal recovery
This commit is contained in:
@@ -4,10 +4,12 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime/debug"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -377,7 +379,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
go q.worker()
|
||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
q.mu.Unlock()
|
||||
@@ -392,75 +394,90 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
func (q *taskQueue) worker() {
|
||||
for {
|
||||
<-q.trigger
|
||||
setCPUGovernor("performance")
|
||||
func() {
|
||||
setCPUGovernor("performance")
|
||||
defer setCPUGovernor("powersave")
|
||||
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
break
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
break
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
goRecoverOnce("task "+t.Target, func() {
|
||||
defer wg.Done()
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
msg := fmt.Sprintf("task panic: %v", rec)
|
||||
slog.Error("task panic",
|
||||
"task_id", t.ID,
|
||||
"target", t.Target,
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
j.append("ERROR: " + msg)
|
||||
j.finish(msg)
|
||||
}
|
||||
}()
|
||||
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
|
||||
q.runTask(t, j, taskCtx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning {
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
}
|
||||
|
||||
q.runTask(t, j, taskCtx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning {
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
}
|
||||
}
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
})
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
}()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
|
||||
setCPUGovernor("powersave")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user