fix(webui): prevent orphaned workers on restart, reduce metrics polling, add Kill Workers button
- tasks: mark TaskRunning tasks as TaskFailed on bee-web restart instead of re-queueing them — prevents duplicate gpu-burn-worker spawns when bee-web crashes mid-test (each restart was launching a new set of 8 workers on top of still-alive orphans from the previous crash) - server: reduce metrics collector interval 1s→5s, grow ring buffer to 360 samples (30 min); cuts nvidia-smi/ipmitool/sensors subprocess rate by 5× - platform: add KillTestWorkers() — scans /proc and SIGKILLs bee-gpu-burn, stress-ng, stressapptest, memtester without relying on pkill/killall - webui: add "Kill Workers" button next to Cancel All; calls POST /api/tasks/kill-workers which cancels the task queue then kills orphaned OS-level processes; shows toast with killed count - metricsdb: sort GPU indices and fan/temp names after map iteration to fix non-deterministic sample reconstruction order (flaky test) - server: fix chartYAxisNumber to use one decimal place for 1000–9999 (e.g. "1,7к" instead of "2к") so Y-axis ticks are distinguishable Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
@@ -217,7 +218,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
@@ -226,6 +229,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
@@ -234,6 +239,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
@@ -242,6 +249,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
|
||||
Reference in New Issue
Block a user