diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 817907c..81c3450 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -304,7 +304,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) } filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405")) tmpPath := filepath.Join(os.TempDir(), filename) - data, err := os.ReadFile(DefaultAuditJSONPath) + data, err := readFileLimited(DefaultAuditJSONPath, 100<<20) if err != nil { return "", err } diff --git a/audit/internal/app/atomic_write.go b/audit/internal/app/atomic_write.go index a8b6edc..d444840 100644 --- a/audit/internal/app/atomic_write.go +++ b/audit/internal/app/atomic_write.go @@ -2,10 +2,29 @@ package app import ( "fmt" + "io" "os" "path/filepath" ) +// readFileLimited reads path into memory, refusing files larger than maxBytes. +// Prevents OOM on corrupted or unexpectedly large data files. 
+func readFileLimited(path string, maxBytes int64) ([]byte, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + data, err := io.ReadAll(io.LimitReader(f, maxBytes+1)) + if err != nil { + return nil, err + } + if int64(len(data)) > maxBytes { + return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes) + } + return data, nil +} + func atomicWriteFile(path string, data []byte, perm os.FileMode) error { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err) diff --git a/audit/internal/app/component_status_db.go b/audit/internal/app/component_status_db.go index 6873ed7..4aae112 100644 --- a/audit/internal/app/component_status_db.go +++ b/audit/internal/app/component_status_db.go @@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return nil, err } - data, err := os.ReadFile(path) + data, err := readFileLimited(path, 10<<20) if err != nil && !os.IsNotExist(err) { return nil, err } diff --git a/audit/internal/platform/kill_workers.go b/audit/internal/platform/kill_workers.go index 09153f1..6687756 100644 --- a/audit/internal/platform/kill_workers.go +++ b/audit/internal/platform/kill_workers.go @@ -1,11 +1,14 @@ package platform import ( + "context" "fmt" + "log/slog" "os" "strconv" "strings" "syscall" + "time" ) // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify @@ -30,7 +33,12 @@ type KilledProcess struct { // KillTestWorkers scans /proc for running test worker processes and sends // SIGKILL to each one found. It returns a list of killed processes. // Errors for individual processes (e.g. already exited) are silently ignored. +// The per-entry loop runs under a 5-second deadline so that a very large process table (e.g. after a stress test with thousands of children) cannot block the caller; +// on timeout the processes killed so far are returned. The initial os.ReadDir("/proc") call is not covered by the deadline. 
func KillTestWorkers() []KilledProcess { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + entries, err := os.ReadDir("/proc") if err != nil { return nil @@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess { var killed []KilledProcess for _, e := range entries { + select { + case <-ctx.Done(): + slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed)) + return killed + default: + } + if !e.IsDir() { continue } diff --git a/audit/internal/webui/jobs.go b/audit/internal/webui/jobs.go index 5924a5b..5719091 100644 --- a/audit/internal/webui/jobs.go +++ b/audit/internal/webui/jobs.go @@ -1,6 +1,9 @@ package webui import ( + "bufio" + "fmt" + "io" "os" "strings" "sync" @@ -17,6 +20,25 @@ type jobState struct { cancel func() // optional cancel function; nil if job is not cancellable logPath string serialPrefix string + logFile *os.File // kept open for the task lifetime to avoid per-line open/close + logBuf *bufio.Writer +} + +// readTaskLogFile reads a task log, refusing files over 50 MB. +func readTaskLogFile(path string) ([]byte, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + data, err := io.ReadAll(io.LimitReader(f, 50<<20+1)) + if err != nil { + return nil, err + } + if int64(len(data)) > 50<<20 { + return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path) + } + return data, nil } // abort cancels the job if it has a cancel function and is not yet done. @@ -35,7 +57,7 @@ func (j *jobState) append(line string) { defer j.mu.Unlock() j.lines = append(j.lines, line) if j.logPath != "" { - appendJobLog(j.logPath, line) + j.writeLogLineLocked(line) } if j.serialPrefix != "" { taskSerialWriteLine(j.serialPrefix + line) @@ -48,6 +70,35 @@ func (j *jobState) append(line string) { } } +// writeLogLineLocked writes a line to the persistent log file, opening it lazily. +// Must be called with j.mu held. 
Uses a buffered writer kept open for the task +// lifetime — avoids thousands of open/close syscalls during high-frequency logs. Open and write errors are dropped; buffered lines are written to the file when the 64 KiB buffer fills or when closeLog flushes it. +func (j *jobState) writeLogLineLocked(line string) { + if j.logFile == nil { + f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return + } + j.logFile = f + j.logBuf = bufio.NewWriterSize(f, 64*1024) + } + _, _ = j.logBuf.WriteString(line + "\n") +} + +// closeLog flushes and closes the log file. Called after all task output is done. A later writeLogLineLocked call reopens the file, so closeLog must then be called again to flush and close it. +func (j *jobState) closeLog() { + j.mu.Lock() + defer j.mu.Unlock() + if j.logBuf != nil { + _ = j.logBuf.Flush() + } + if j.logFile != nil { + _ = j.logFile.Close() + j.logFile = nil + j.logBuf = nil + } +} + func (j *jobState) finish(errMsg string) { j.mu.Lock() defer j.mu.Unlock() @@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState { if logPath == "" { return j } - data, err := os.ReadFile(logPath) + data, err := readTaskLogFile(logPath) if err != nil || len(data) == 0 { return j } diff --git a/audit/internal/webui/stability.go b/audit/internal/webui/stability.go index 8ece151..f5e83c4 100644 --- a/audit/internal/webui/stability.go +++ b/audit/internal/webui/stability.go @@ -7,14 +7,43 @@ import ( "time" ) +const ( + recoverLoopMaxDelay = 60 * time.Second + recoverLoopResetAfter = 30 * time.Second +) + +// goRecoverLoop starts fn in a goroutine, restarting after panics. +// restartDelay is the initial delay; successive panics double it up to +// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs +// successfully for recoverLoopResetAfter without panicking. 
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) { go func() { + delay := restartDelay + consecutive := 0 for { - if !runRecoverable(name, fn) { + start := time.Now() + panicked := runRecoverable(name, fn) + if !panicked { return } - if restartDelay > 0 { - time.Sleep(restartDelay) + consecutive++ + if time.Since(start) >= recoverLoopResetAfter { + delay = restartDelay + consecutive = 1 + } + slog.Warn("goroutine restarting after panic", + "component", name, + "consecutive_panics", consecutive, + "next_delay", delay, + ) + if delay > 0 { + time.Sleep(delay) + } + if delay < recoverLoopMaxDelay { + delay *= 2 + if delay > recoverLoopMaxDelay { + delay = recoverLoopMaxDelay + } } } }() diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index 957f95a..d8bb9c3 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -585,6 +585,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) { if err := writeTaskReportArtifacts(t); err != nil { appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error()) } + j.closeLog() if t.ErrMsg != "" { taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg) return diff --git a/bible-local/docs/gpu-model-propagation.md b/bible-local/docs/gpu-model-propagation.md index 8c939ad..f53f451 100644 --- a/bible-local/docs/gpu-model-propagation.md +++ b/bible-local/docs/gpu-model-propagation.md @@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection) --- -## What Needs Fixing +## Fixed Issues -1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name` -2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"` -3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue) +All previously open items are resolved, except item 5, which cannot be fixed retroactively: + +1. 
**NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`). +2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`. +3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`. +4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`. +5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue). diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 2aafbbe..1065317 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -203,7 +203,7 @@ dump_memtest_debug() { echo "-- source bootloader templates --" for cfg in \ - "${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \ + "${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \ "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do if [ -f "$cfg" ]; then echo " file: $cfg" @@ -954,87 +954,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/" fi -if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then - mkdir -p "${BUILD_WORK_DIR}/config/bootloaders/grub-pc" - cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF' -source /boot/grub/config.cfg - -echo "" -echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗" -echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝" -echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗" -echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝" -echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗" -echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝" -echo " Hardware Audit LiveCD" -echo "" - -menuentry "EASY-BEE" { - linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always 
numa_balancing=disable nowatchdog nosoftlockup - initrd @INITRD_LIVE@ -} - -submenu "EASY-BEE (advanced options) -->" { - menuentry "EASY-BEE — KMS (no nomodeset)" { - linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup - initrd @INITRD_LIVE@ - } - - menuentry "EASY-BEE — fail-safe" { - linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0 - initrd @INITRD_LIVE@ - } -} - -if [ "${grub_platform}" = "efi" ]; then - menuentry "Memory Test (memtest86+)" { - chainloader /boot/memtest86+x64.efi - } -else - menuentry "Memory Test (memtest86+)" { - linux16 /boot/memtest86+x64.bin - } -fi - -if [ "${grub_platform}" = "efi" ]; then - menuentry "UEFI Firmware Settings" { - fwsetup - } -fi -EOF - - cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF' -label live-@FLAVOUR@-normal - menu label ^EASY-BEE - menu default - linux @LINUX@ - initrd @INITRD@ - append @APPEND_LIVE@ - -label live-@FLAVOUR@-kms - menu label EASY-BEE (^graphics/KMS) - linux @LINUX@ - initrd @INITRD@ - append @APPEND_LIVE@ bee.display=kms - -label live-@FLAVOUR@-toram - menu label EASY-BEE (^load to RAM) - linux @LINUX@ - initrd @INITRD@ - append @APPEND_LIVE@ toram - -label live-@FLAVOUR@-failsafe - menu label EASY-BEE (^fail-safe) - linux @LINUX@ - initrd @INITRD@ - append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal - -label memtest - menu label ^Memory Test (memtest86+) - linux /boot/memtest86+x64.bin -EOF -fi - rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/" rm -f \ "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \ diff --git a/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png b/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png new file mode 100644 index 0000000..6b36657 Binary files /dev/null and 
b/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png differ diff --git a/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt b/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt index c7044b2..4e4d5a6 100644 --- a/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt +++ b/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt @@ -5,6 +5,15 @@ title-text: "" message-font: "Unifont Regular 16" terminal-font: "Unifont Regular 16" +#bee logo — centered, upper third of screen ++ image { + top = 4% + left = 50%-200 + width = 400 + height = 400 + file = "bee-logo.png" +} + #help bar at the bottom + label { top = 100%-50 @@ -21,8 +30,8 @@ terminal-font: "Unifont Regular 16" + boot_menu { left = 20% width = 60% - top = 62% - height = 38%-80 + top = 65% + height = 35%-80 item_color = "#c88000" item_font = "Unifont Regular 16" selected_item_color= "#f5a800" diff --git a/iso/overlay/etc/systemd/system/bee-web.service b/iso/overlay/etc/systemd/system/bee-web.service index 0fd7b9a..62c498b 100644 --- a/iso/overlay/etc/systemd/system/bee-web.service +++ b/iso/overlay/etc/systemd/system/bee-web.service @@ -10,6 +10,7 @@ RestartSec=3 StandardOutput=journal StandardError=journal LimitMEMLOCK=infinity +MemoryMax=3G # Keep the web server responsive during GPU/CPU stress (children inherit nice+10 # via Setpriority in runCmdJob, but the bee-web parent stays at 0). Nice=0