Stability hardening (webui/app): - readFileLimited(): защита от OOM при чтении audit JSON (100 MB), component-status DB (10 MB) и лога задачи (50 MB) - jobs.go: буферизованный лог задачи — один открытый fd на задачу вместо open/write/close на каждую строку (устраняет тысячи syscall/сек при GPU стресс-тестах) - stability.go: экспоненциальный backoff в goRecoverLoop (2s→4s→…→60s), сброс при успешном прогоне >30s, счётчик перезапусков в slog - kill_workers.go: таймаут 5s на скан /proc, warn при срабатывании - bee-web.service: MemoryMax=3G — OOM killer защищён Build script: - build.sh: удалён блок генерации grub-pc/grub.cfg + live.cfg.in — мёртвый код с v8.25; grub-pc игнорируется live-build, а генерируемый live.cfg.in перезаписывал правильный статический файл устаревшей версией без tuning-параметров ядра и пунктов gsp-off/kms+gsp-off - build.sh: dump_memtest_debug теперь логирует grub-efi/grub.cfg вместо grub-pc/grub.cfg (было всегда "missing") GRUB: - live-theme/bee-logo.png: логотип пчелы 400×400px на чёрном фоне - live-theme/theme.txt: + image компонент по центру в верхней трети экрана; меню сдвинуто с 62% до 65% Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
84 lines
2.1 KiB
Go
84 lines
2.1 KiB
Go
package platform
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
)
|
|
|
|
// workerPatterns lists the substrings that KillTestWorkers looks for in the
// executable taken from /proc/<pid>/cmdline. A process whose executable
// contains any of these substrings is treated as a bee test worker and killed.
var workerPatterns = []string{
	"bee-gpu-burn",
	"stress-ng",
	"stressapptest",
	"memtester",
	// DCGM diagnostics: dcgmi diag spawns nvvs, which outlives dcgmi when the
	// latter is killed mid-run and keeps the GPU occupied (DCGM_ST_IN_USE),
	// so both binaries are listed.
	"nvvs",
	"dcgmi",
}
|
|
|
|
// KilledProcess identifies a single process that was sent SIGKILL. It is
// serialised to JSON with the keys "pid" and "name".
type KilledProcess struct {
	// PID is the numeric process ID, i.e. the /proc/<pid> directory name.
	PID int `json:"pid"`
	// Name is the executable's base name (the text after the last '/').
	Name string `json:"name"`
}
|
|
|
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
|
// SIGKILL to each one found. It returns a list of killed processes.
|
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
|
// The scan runs under a 5-second deadline to avoid blocking if the process
|
|
// table is very large (e.g. after a stress test with thousands of children).
|
|
func KillTestWorkers() []KilledProcess {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
|
|
entries, err := os.ReadDir("/proc")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
var killed []KilledProcess
|
|
for _, e := range entries {
|
|
select {
|
|
case <-ctx.Done():
|
|
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
|
return killed
|
|
default:
|
|
}
|
|
|
|
if !e.IsDir() {
|
|
continue
|
|
}
|
|
pid, err := strconv.Atoi(e.Name())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// /proc/*/cmdline uses NUL bytes as argument separators.
|
|
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
|
exe := strings.TrimSpace(args[0])
|
|
base := exe
|
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
|
base = exe[idx+1:]
|
|
}
|
|
for _, pat := range workerPatterns {
|
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return killed
|
|
}
|