fix(webui): prevent orphaned workers on restart, reduce metrics polling, add Kill Workers button

- tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of
  re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web
  crashes mid-test (each restart was launching a new set of 8 workers on top
  of still-alive orphans from the previous crash)
- server: reduce metrics collector interval 1s→5s, grow ring buffer to 360
  samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5×
- platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn,
  stress-ng, stressapptest, memtester without relying on pkill/killall
- webui: add "Kill Workers" button next to Cancel All; calls
  POST /api/tasks/kill-workers which cancels the task queue then kills
  orphaned OS-level processes; shows toast with killed count
- metricsdb: sort GPU indices and fan/temp names after map iteration to fix
  non-deterministic sample reconstruction order (flaky test)
- server: fix chartYAxisNumber to use one decimal place for 1000–9999
  (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-04-02 10:13:43 +03:00
parent b2b0444131
commit 1f750d3edd
7 changed files with 216 additions and 32 deletions

View File

@@ -6,6 +6,7 @@ import (
"io"
"os"
"path/filepath"
"sort"
"strconv"
"time"
@@ -217,7 +218,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
}
}
// Collect unique GPU indices and fan names from loaded data (preserve order)
// Collect unique GPU indices and fan/temp names from loaded data.
// Sort each list so that sample reconstruction is deterministic regardless
// of Go's non-deterministic map iteration order.
seenGPU := map[int]bool{}
var gpuIndices []int
for k := range gpuData {
@@ -226,6 +229,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
gpuIndices = append(gpuIndices, k.idx)
}
}
sort.Ints(gpuIndices)
seenFan := map[string]bool{}
var fanNames []string
for k := range fanData {
@@ -234,6 +239,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
fanNames = append(fanNames, k.name)
}
}
sort.Strings(fanNames)
seenTemp := map[string]bool{}
var tempNames []string
for k := range tempData {
@@ -242,6 +249,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
tempNames = append(tempNames, k.name)
}
}
sort.Strings(tempNames)
samples := make([]platform.LiveMetricSample, len(sysRows))
for i, r := range sysRows {