- tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web crashes mid-test (each restart was launching a new set of 8 workers on top of still-alive orphans from the previous crash)
- server: reduce metrics collector interval 1s→5s, grow ring buffer to 360 samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5×
- platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn, stress-ng, stressapptest, memtester without relying on pkill/killall
- webui: add "Kill Workers" button next to Cancel All; calls POST /api/tasks/kill-workers which cancels the task queue then kills orphaned OS-level processes; shows toast with killed count
- metricsdb: sort GPU indices and fan/temp names after map iteration to fix non-deterministic sample reconstruction order (flaky test)
- server: fix chartYAxisNumber to use one decimal place for 1000–9999 (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
65 lines
1.6 KiB
Go
65 lines
1.6 KiB
Go
package platform
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
)
|
|
|
|
// workerPatterns holds the substrings that identify a bee test worker.
// KillTestWorkers compares them against the executable (first) argument read
// from /proc/<pid>/cmdline and kills every process that contains one of them.
var workerPatterns = []string{"bee-gpu-burn", "stress-ng", "stressapptest", "memtester"}
|
|
|
|
// KilledProcess is one entry in the result of KillTestWorkers: a process that
// was targeted with SIGKILL.
type KilledProcess struct {
	// PID is the process ID, parsed from the process's /proc directory name.
	PID int `json:"pid"`
	// Name is the executable name with any leading directory stripped.
	Name string `json:"name"`
}
|
|
|
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
|
// SIGKILL to each one found. It returns a list of killed processes.
|
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
|
func KillTestWorkers() []KilledProcess {
|
|
entries, err := os.ReadDir("/proc")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
var killed []KilledProcess
|
|
for _, e := range entries {
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
pid, err := strconv.Atoi(e.Name())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// /proc/*/cmdline uses NUL bytes as argument separators.
|
|
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
|
exe := strings.TrimSpace(args[0])
|
|
base := exe
|
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
|
base = exe[idx+1:]
|
|
}
|
|
for _, pat := range workerPatterns {
|
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return killed
|
|
}
|