Stability hardening, build script fixes, GRUB bee logo

Stability hardening (webui/app):
- readFileLimited(): защита от OOM при чтении audit JSON (100 MB),
  component-status DB (10 MB) и лога задачи (50 MB)
- jobs.go: буферизованный лог задачи — один открытый fd на задачу
  вместо open/write/close на каждую строку (устраняет тысячи syscall/сек
  при GPU стресс-тестах)
- stability.go: экспоненциальный backoff в goRecoverLoop (2s→4s→…→60s),
  сброс при успешном прогоне >30s, счётчик перезапусков в slog
- kill_workers.go: таймаут 5s на скан /proc, warn при срабатывании
- bee-web.service: MemoryMax=3G — жёсткий лимит памяти сервиса; при его
  превышении OOM killer срабатывает внутри юнита, а не по всей системе

Build script:
- build.sh: удалён блок генерации grub-pc/grub.cfg + live.cfg.in —
  мёртвый код с v8.25; grub-pc игнорируется live-build, а генерируемый
  live.cfg.in перезаписывал правильный статический файл устаревшей
  версией без tuning-параметров ядра и пунктов gsp-off/kms+gsp-off
- build.sh: dump_memtest_debug теперь логирует grub-efi/grub.cfg
  вместо grub-pc/grub.cfg (было всегда "missing")

GRUB:
- live-theme/bee-logo.png: логотип пчелы 400×400px на чёрном фоне
- live-theme/theme.txt: + image компонент по центру в верхней трети
  экрана; меню сдвинуто с 62% до 65%

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 13:08:31 +03:00
parent 61c7abaa80
commit d52ec67f8f
12 changed files with 143 additions and 95 deletions

View File

@@ -304,7 +304,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
} }
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405")) filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
tmpPath := filepath.Join(os.TempDir(), filename) tmpPath := filepath.Join(os.TempDir(), filename)
data, err := os.ReadFile(DefaultAuditJSONPath) data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
if err != nil { if err != nil {
return "", err return "", err
} }

View File

@@ -2,10 +2,29 @@ package app
import ( import (
"fmt" "fmt"
"io"
"os" "os"
"path/filepath" "path/filepath"
) )
// readFileLimited reads the file at path into memory, refusing files larger
// than maxBytes. It guards against OOM on corrupted or unexpectedly large
// data files.
//
// The error from os.Open is returned unwrapped so callers can keep using
// os.IsNotExist on it. A file of exactly maxBytes bytes is accepted; anything
// larger yields a "too large" error and no data.
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Fail fast when the size is known up front: this avoids buffering up to
	// maxBytes+1 bytes only to throw them away. Non-regular files (pipes,
	// devices) report no meaningful size and fall through to the bounded read.
	if info, statErr := f.Stat(); statErr == nil && info.Mode().IsRegular() && info.Size() > maxBytes {
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}

	// Read one byte past the cap so an oversized file is detectable. Guard the
	// increment: maxBytes+1 would overflow to a negative limit for the largest
	// int64, which makes LimitReader return EOF immediately and silently
	// yields empty data.
	const maxInt64 = int64(1<<63 - 1)
	limit := maxBytes
	if limit < maxInt64 {
		limit++
	}

	data, err := io.ReadAll(io.LimitReader(f, limit))
	if err != nil {
		return nil, err
	}
	if int64(len(data)) > maxBytes {
		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
	}
	return data, nil
}
func atomicWriteFile(path string, data []byte, perm os.FileMode) error { func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err) return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)

View File

@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return nil, err return nil, err
} }
data, err := os.ReadFile(path) data, err := readFileLimited(path, 10<<20)
if err != nil && !os.IsNotExist(err) { if err != nil && !os.IsNotExist(err) {
return nil, err return nil, err
} }

View File

@@ -1,11 +1,14 @@
package platform package platform
import ( import (
"context"
"fmt" "fmt"
"log/slog"
"os" "os"
"strconv" "strconv"
"strings" "strings"
"syscall" "syscall"
"time"
) )
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
@@ -30,7 +33,12 @@ type KilledProcess struct {
// KillTestWorkers scans /proc for running test worker processes and sends // KillTestWorkers scans /proc for running test worker processes and sends
// SIGKILL to each one found. It returns a list of killed processes. // SIGKILL to each one found. It returns a list of killed processes.
// Errors for individual processes (e.g. already exited) are silently ignored. // Errors for individual processes (e.g. already exited) are silently ignored.
// The scan runs under a 5-second deadline to avoid blocking if the process
// table is very large (e.g. after a stress test with thousands of children).
func KillTestWorkers() []KilledProcess { func KillTestWorkers() []KilledProcess {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
entries, err := os.ReadDir("/proc") entries, err := os.ReadDir("/proc")
if err != nil { if err != nil {
return nil return nil
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {
var killed []KilledProcess var killed []KilledProcess
for _, e := range entries { for _, e := range entries {
select {
case <-ctx.Done():
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
return killed
default:
}
if !e.IsDir() { if !e.IsDir() {
continue continue
} }

View File

@@ -1,6 +1,9 @@
package webui package webui
import ( import (
"bufio"
"fmt"
"io"
"os" "os"
"strings" "strings"
"sync" "sync"
@@ -17,6 +20,25 @@ type jobState struct {
cancel func() // optional cancel function; nil if job is not cancellable cancel func() // optional cancel function; nil if job is not cancellable
logPath string logPath string
serialPrefix string serialPrefix string
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
logBuf *bufio.Writer
}
// readTaskLogFile reads the task log at path into memory, refusing files over
// 50 MB.
//
// The error from os.Open is returned unwrapped. A log of exactly 50 MB is
// accepted; anything larger yields a "too large" error and no data.
func readTaskLogFile(path string) ([]byte, error) {
	// Single source of truth for the cap (50 MB).
	const maxTaskLogBytes = 50 << 20

	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Fail fast when the size is known up front, instead of buffering 50 MB
	// of an oversized log before rejecting it. Non-regular files fall through
	// to the bounded read below.
	if info, statErr := f.Stat(); statErr == nil && info.Mode().IsRegular() && info.Size() > maxTaskLogBytes {
		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
	}

	// Read one byte past the cap so an oversized log is detectable.
	data, err := io.ReadAll(io.LimitReader(f, maxTaskLogBytes+1))
	if err != nil {
		return nil, err
	}
	if len(data) > maxTaskLogBytes {
		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
	}
	return data, nil
}
// abort cancels the job if it has a cancel function and is not yet done. // abort cancels the job if it has a cancel function and is not yet done.
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
defer j.mu.Unlock() defer j.mu.Unlock()
j.lines = append(j.lines, line) j.lines = append(j.lines, line)
if j.logPath != "" { if j.logPath != "" {
appendJobLog(j.logPath, line) j.writeLogLineLocked(line)
} }
if j.serialPrefix != "" { if j.serialPrefix != "" {
taskSerialWriteLine(j.serialPrefix + line) taskSerialWriteLine(j.serialPrefix + line)
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
} }
} }
// writeLogLineLocked appends line, followed by a newline, to the task's
// persistent log via a buffered writer. The caller must hold j.mu.
//
// The log file is opened on first use and then kept open, so a burst of log
// lines does not pay an open/close per line. If the file cannot be opened the
// line is dropped and the next call tries to open it again. Write errors are
// deliberately ignored: logging here is best effort.
func (j *jobState) writeLogLineLocked(line string) {
	if j.logFile == nil {
		file, openErr := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
		if openErr != nil {
			return
		}
		j.logFile, j.logBuf = file, bufio.NewWriterSize(file, 64*1024)
	}
	_, _ = j.logBuf.WriteString(line)
	_ = j.logBuf.WriteByte('\n')
}
// closeLog flushes any buffered log output and closes the log file, taking
// j.mu for the duration. Flush and close errors are ignored. After it returns
// both the file handle and the buffered writer are cleared.
func (j *jobState) closeLog() {
	j.mu.Lock()
	defer j.mu.Unlock()

	if w := j.logBuf; w != nil {
		_ = w.Flush()
	}
	if j.logFile == nil {
		return
	}
	_ = j.logFile.Close()
	j.logFile, j.logBuf = nil, nil
}
func (j *jobState) finish(errMsg string) { func (j *jobState) finish(errMsg string) {
j.mu.Lock() j.mu.Lock()
defer j.mu.Unlock() defer j.mu.Unlock()
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
if logPath == "" { if logPath == "" {
return j return j
} }
data, err := os.ReadFile(logPath) data, err := readTaskLogFile(logPath)
if err != nil || len(data) == 0 { if err != nil || len(data) == 0 {
return j return j
} }

View File

@@ -7,14 +7,43 @@ import (
"time" "time"
) )
// Backoff tuning for goRecoverLoop.
const (
	// recoverLoopMaxDelay is the ceiling for the restart delay; doubling
	// stops once the delay reaches this value.
	recoverLoopMaxDelay = time.Minute
	// recoverLoopResetAfter is how long a run must last before a panic in it
	// resets the delay back to the initial value.
	recoverLoopResetAfter = time.Second * 30
)
// goRecoverLoop starts fn in a goroutine, restarting after panics.
// restartDelay is the initial delay; successive panics double it up to
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
// successfully for recoverLoopResetAfter without panicking.
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) { func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
go func() { go func() {
delay := restartDelay
consecutive := 0
for { for {
if !runRecoverable(name, fn) { start := time.Now()
panicked := runRecoverable(name, fn)
if !panicked {
return return
} }
if restartDelay > 0 { consecutive++
time.Sleep(restartDelay) if time.Since(start) >= recoverLoopResetAfter {
delay = restartDelay
consecutive = 1
}
slog.Warn("goroutine restarting after panic",
"component", name,
"consecutive_panics", consecutive,
"next_delay", delay,
)
if delay > 0 {
time.Sleep(delay)
}
if delay < recoverLoopMaxDelay {
delay *= 2
if delay > recoverLoopMaxDelay {
delay = recoverLoopMaxDelay
}
} }
} }
}() }()

View File

@@ -585,6 +585,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
if err := writeTaskReportArtifacts(t); err != nil { if err := writeTaskReportArtifacts(t); err != nil {
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error()) appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
} }
j.closeLog()
if t.ErrMsg != "" { if t.ErrMsg != "" {
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg) taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
return return

View File

@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)
--- ---
## What Needs Fixing ## Fixed Issues
1. **NVIDIA PCIe Model**`enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name` All previously open items are resolved:
2. **Fallback consistency**`benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue) 1. **NVIDIA PCIe Model**`enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).

View File

@@ -203,7 +203,7 @@ dump_memtest_debug() {
echo "-- source bootloader templates --" echo "-- source bootloader templates --"
for cfg in \ for cfg in \
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \ "${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
if [ -f "$cfg" ]; then if [ -f "$cfg" ]; then
echo " file: $cfg" echo " file: $cfg"
@@ -954,87 +954,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/" rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
fi fi
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
mkdir -p "${BUILD_WORK_DIR}/config/bootloaders/grub-pc"
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
source /boot/grub/config.cfg
echo ""
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
echo " Hardware Audit LiveCD"
echo ""
menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — fail-safe" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@
}
}
if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi
}
else
menuentry "Memory Test (memtest86+)" {
linux16 /boot/memtest86+x64.bin
}
fi
if [ "${grub_platform}" = "efi" ]; then
menuentry "UEFI Firmware Settings" {
fwsetup
}
fi
EOF
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
label live-@FLAVOUR@-normal
menu label ^EASY-BEE
menu default
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@
label live-@FLAVOUR@-kms
menu label EASY-BEE (^graphics/KMS)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ bee.display=kms
label live-@FLAVOUR@-toram
menu label EASY-BEE (^load to RAM)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ toram
label live-@FLAVOUR@-failsafe
menu label EASY-BEE (^fail-safe)
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin
EOF
fi
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/" rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
rm -f \ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \ "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

View File

@@ -5,6 +5,15 @@ title-text: ""
message-font: "Unifont Regular 16" message-font: "Unifont Regular 16"
terminal-font: "Unifont Regular 16" terminal-font: "Unifont Regular 16"
#bee logo — 400x400 image, centered horizontally (left = 50% minus half the width), top edge at 4% of screen height
+ image {
top = 4%
left = 50%-200
width = 400
height = 400
file = "bee-logo.png"
}
#help bar at the bottom #help bar at the bottom
+ label { + label {
top = 100%-50 top = 100%-50
@@ -21,8 +30,8 @@ terminal-font: "Unifont Regular 16"
+ boot_menu { + boot_menu {
left = 20% left = 20%
width = 60% width = 60%
top = 62% top = 65%
height = 38%-80 height = 35%-80
item_color = "#c88000" item_color = "#c88000"
item_font = "Unifont Regular 16" item_font = "Unifont Regular 16"
selected_item_color= "#f5a800" selected_item_color= "#f5a800"

View File

@@ -10,6 +10,7 @@ RestartSec=3
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
LimitMEMLOCK=infinity LimitMEMLOCK=infinity
MemoryMax=3G
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10 # Keep the web server responsive during GPU/CPU stress (children inherit nice+10
# via Setpriority in runCmdJob, but the bee-web parent stays at 0). # via Setpriority in runCmdJob, but the bee-web parent stays at 0).
Nice=0 Nice=0