Files
bee/audit/internal/platform/kill_workers.go
Mikhail Chusavitin 1f750d3edd fix(webui): prevent orphaned workers on restart, reduce metrics polling, add Kill Workers button
- tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of
  re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web
  crashes mid-test (each restart was launching a new set of 8 workers on top
  of still-alive orphans from the previous crash)
- server: reduce metrics collector interval 1s→5s, grow ring buffer to 360
  samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5×
- platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn,
  stress-ng, stressapptest, memtester without relying on pkill/killall
- webui: add "Kill Workers" button next to Cancel All; calls
  POST /api/tasks/kill-workers which cancels the task queue then kills
  orphaned OS-level processes; shows toast with killed count
- metricsdb: sort GPU indices and fan/temp names after map iteration to fix
  non-deterministic sample reconstruction order (flaky test)
- server: fix chartYAxisNumber to use one decimal place for 1000–9999
  (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-02 10:13:43 +03:00

65 lines
1.6 KiB
Go

package platform
import (
"fmt"
"os"
"strconv"
"strings"
"syscall"
)
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
// bee test worker processes that should be killed by KillTestWorkers.
var workerPatterns = []string{
"bee-gpu-burn",
"stress-ng",
"stressapptest",
"memtester",
}
// KilledProcess describes a process that was sent SIGKILL.
type KilledProcess struct {
PID int `json:"pid"`
Name string `json:"name"`
}
// KillTestWorkers scans /proc for running test worker processes and sends
// SIGKILL to each one found. It returns a list of killed processes.
// Errors for individual processes (e.g. already exited) are silently ignored.
func KillTestWorkers() []KilledProcess {
entries, err := os.ReadDir("/proc")
if err != nil {
return nil
}
var killed []KilledProcess
for _, e := range entries {
if !e.IsDir() {
continue
}
pid, err := strconv.Atoi(e.Name())
if err != nil {
continue
}
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
if err != nil {
continue
}
// /proc/*/cmdline uses NUL bytes as argument separators.
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
exe := strings.TrimSpace(args[0])
base := exe
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
base = exe[idx+1:]
}
for _, pat := range workerPatterns {
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
_ = syscall.Kill(pid, syscall.SIGKILL)
killed = append(killed, KilledProcess{PID: pid, Name: base})
break
}
}
}
return killed
}