Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd |
@@ -16,7 +16,7 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
@@ -24,6 +24,17 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
|
||||
@@ -10,9 +10,11 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -374,10 +376,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Start(); err != nil {
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
@@ -418,7 +427,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
@@ -433,10 +442,50 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
@@ -684,7 +684,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
case "rvs":
|
||||
return resolveRVSCommand(cmd[1:]...)
|
||||
}
|
||||
return cmd, nil
|
||||
path, err := satLookPath(cmd[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||
}
|
||||
return append([]string{path}, cmd[1:]...), nil
|
||||
}
|
||||
|
||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||
|
||||
@@ -162,6 +162,25 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -237,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
|
||||
@@ -4,9 +4,11 @@ import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@@ -179,19 +181,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.ContentLength > 0 {
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
}
|
||||
|
||||
name := taskNames[target]
|
||||
if body.Profile != "" {
|
||||
if n, ok := burnNames[target]; ok {
|
||||
name = n
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
|
||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
@@ -667,6 +664,22 @@ func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request)
|
||||
|
||||
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte("{}"))
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write(b)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
@@ -917,8 +930,31 @@ func parseXrandrOutput(out string) []displayInfo {
|
||||
return infos
|
||||
}
|
||||
|
||||
func xrandrCommand(args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("xrandr", args...)
|
||||
env := append([]string{}, os.Environ()...)
|
||||
hasDisplay := false
|
||||
hasXAuthority := false
|
||||
for _, kv := range env {
|
||||
if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
env = append(env, "DISPLAY=:0")
|
||||
}
|
||||
if !hasXAuthority {
|
||||
env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
|
||||
}
|
||||
cmd.Env = env
|
||||
return cmd
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := exec.Command("xrandr").Output()
|
||||
out, err := xrandrCommand().Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
@@ -945,7 +981,7 @@ func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
|
||||
64
audit/internal/webui/api_test.go
Normal file
64
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
}
|
||||
@@ -289,7 +289,7 @@ func renderAudit() string {
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>`
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">▶ Run Audit</button></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
@@ -532,16 +532,10 @@ function refreshCharts() {
|
||||
}
|
||||
setInterval(refreshCharts, 3000);
|
||||
|
||||
const es = new EventSource('/api/metrics/stream');
|
||||
es.addEventListener('metrics', e => {
|
||||
const d = JSON.parse(e.data);
|
||||
|
||||
// Show/hide Fan RPM card based on data availability
|
||||
fetch('/api/metrics/latest').then(r => r.json()).then(d => {
|
||||
const fanCard = document.getElementById('card-server-fans');
|
||||
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
|
||||
|
||||
});
|
||||
es.onerror = () => {};
|
||||
}).catch(() => {});
|
||||
</script>`
|
||||
}
|
||||
|
||||
|
||||
@@ -270,6 +270,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
|
||||
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
|
||||
|
||||
@@ -1230,13 +1231,6 @@ probe();
|
||||
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
page := strings.TrimPrefix(r.URL.Path, "/")
|
||||
if page == "" {
|
||||
// Serve loading page until audit snapshot exists
|
||||
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(loadingPageHTML))
|
||||
return
|
||||
}
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
|
||||
@@ -136,6 +136,33 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -51,6 +52,33 @@ var burnNames = map[string]string{
|
||||
"amd": "AMD GPU Burn-in",
|
||||
}
|
||||
|
||||
func nvidiaStressTaskName(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case platform.NvidiaStressLoaderJohn:
|
||||
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||
case platform.NvidiaStressLoaderNCCL:
|
||||
return "NVIDIA GPU Stress (NCCL)"
|
||||
default:
|
||||
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||
}
|
||||
}
|
||||
|
||||
func taskDisplayName(target, profile, loader string) string {
|
||||
name := taskNames[target]
|
||||
if profile != "" {
|
||||
if n, ok := burnNames[target]; ok {
|
||||
name = n
|
||||
}
|
||||
}
|
||||
if target == "nvidia-stress" {
|
||||
name = nvidiaStressTaskName(loader)
|
||||
}
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
@@ -440,6 +468,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
|
||||
@@ -95,6 +95,23 @@ func TestResolveBurnPreset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -154,3 +171,34 @@ func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Fatal("runTask did not return after cancel")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-burn-1",
|
||||
Name: "CPU Burn-in",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{BurnProfile: "smoke"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||
gotDuration = durationSec
|
||||
return "/tmp/cpu-burn.tar.gz", nil
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDuration != 5*60 {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...456c1f022c
@@ -13,9 +13,10 @@ Use one of:
|
||||
|
||||
This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`)
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Example of what goes wrong without this
|
||||
## Memtest rule
|
||||
|
||||
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`.
|
||||
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild.
|
||||
Prefer live-build's built-in memtest integration over custom hooks or hardcoded
|
||||
bootloader paths. If you ever need to reference memtest files manually, verify
|
||||
the exact package file list first for the target Debian release.
|
||||
|
||||
Submodule internal/chart updated: 05db6994d4...ac8120c8ab
@@ -29,7 +29,7 @@ lb config noauto \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
|
||||
@@ -36,6 +36,7 @@ typedef void *CUstream;
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define STRESS_LAUNCH_DEPTH 8
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
double deadline = start + (double)seconds;
|
||||
while (now_seconds() < deadline) {
|
||||
launches_per_wave = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
}
|
||||
launches_per_wave++;
|
||||
}
|
||||
if (launches_per_wave <= 0) {
|
||||
goto fail;
|
||||
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
report->iterations = iterations;
|
||||
snprintf(report->details,
|
||||
sizeof(report->details),
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
bytes_per_stream[0] / (1024u * 1024u),
|
||||
iterations);
|
||||
|
||||
@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
mp_count,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
while (now_seconds() < deadline) {
|
||||
wave_launches = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
}
|
||||
if (wave_launches <= 0) {
|
||||
break;
|
||||
|
||||
@@ -111,8 +111,231 @@ resolve_iso_version() {
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
iso_list_files() {
|
||||
iso_path="$1"
|
||||
|
||||
if command -v bsdtar >/dev/null 2>&1; then
|
||||
bsdtar -tf "$iso_path"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if command -v xorriso >/dev/null 2>&1; then
|
||||
xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null | sed 's#^/##'
|
||||
return $?
|
||||
fi
|
||||
|
||||
return 127
|
||||
}
|
||||
|
||||
iso_extract_file() {
|
||||
iso_path="$1"
|
||||
iso_member="$2"
|
||||
|
||||
if command -v bsdtar >/dev/null 2>&1; then
|
||||
bsdtar -xOf "$iso_path" "$iso_member"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if command -v xorriso >/dev/null 2>&1; then
|
||||
xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
|
||||
return $?
|
||||
fi
|
||||
|
||||
return 127
|
||||
}
|
||||
|
||||
require_iso_reader() {
|
||||
command -v bsdtar >/dev/null 2>&1 && return 0
|
||||
command -v xorriso >/dev/null 2>&1 && return 0
|
||||
memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
|
||||
}
|
||||
|
||||
dump_memtest_debug() {
|
||||
phase="$1"
|
||||
lb_dir="${2:-}"
|
||||
iso_path="${3:-}"
|
||||
phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
|
||||
memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"
|
||||
|
||||
(
|
||||
echo "=== memtest debug: ${phase} ==="
|
||||
|
||||
echo "-- auto/config --"
|
||||
if [ -f "${BUILDER_DIR}/auto/config" ]; then
|
||||
grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
|
||||
else
|
||||
echo " (missing ${BUILDER_DIR}/auto/config)"
|
||||
fi
|
||||
|
||||
echo "-- source bootloader templates --"
|
||||
for cfg in \
|
||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||
if [ -f "$cfg" ]; then
|
||||
echo " file: $cfg"
|
||||
grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
|
||||
echo "-- live-build workdir package lists --"
|
||||
for pkg in \
|
||||
"$lb_dir/config/package-lists/bee.list.chroot" \
|
||||
"$lb_dir/config/package-lists/bee-gpu.list.chroot" \
|
||||
"$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
|
||||
if [ -f "$pkg" ]; then
|
||||
echo " file: $pkg"
|
||||
grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "-- live-build chroot/boot --"
|
||||
if [ -d "$lb_dir/chroot/boot" ]; then
|
||||
find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/chroot/boot)"
|
||||
fi
|
||||
|
||||
echo "-- live-build binary/boot --"
|
||||
if [ -d "$lb_dir/binary/boot" ]; then
|
||||
find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/binary/boot)"
|
||||
fi
|
||||
|
||||
echo "-- live-build package cache --"
|
||||
if [ -d "$lb_dir/cache/packages.chroot" ]; then
|
||||
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/cache/packages.chroot)"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
|
||||
echo "-- ISO memtest files --"
|
||||
iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
||||
|
||||
echo "-- ISO GRUB memtest lines --"
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
|
||||
|
||||
echo "-- ISO isolinux memtest lines --"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
|
||||
fi
|
||||
|
||||
echo "=== end memtest debug: ${phase} ==="
|
||||
) | {
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
|
||||
tee "${memtest_log}"
|
||||
else
|
||||
cat
|
||||
fi
|
||||
}
|
||||
}
|
||||
|
||||
memtest_fail() {
|
||||
msg="$1"
|
||||
iso_path="${2:-}"
|
||||
echo "ERROR: ${msg}" >&2
|
||||
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
validate_iso_memtest() {
|
||||
iso_path="$1"
|
||||
echo "=== validating memtest in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
||||
require_iso_reader "$iso_path"
|
||||
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
|
||||
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
||||
}
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
|
||||
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
|
||||
|
||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
|
||||
}
|
||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux memtest path is missing" "$iso_path"
|
||||
}
|
||||
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
|
||||
LOG_OUT="${LOG_DIR}/build.log"
|
||||
|
||||
cleanup_build_log() {
|
||||
status="${1:-$?}"
|
||||
trap - EXIT INT TERM HUP
|
||||
|
||||
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
|
||||
BUILD_LOG_ACTIVE=0
|
||||
exec 1>&3 2>&4
|
||||
exec 3>&- 4>&-
|
||||
if [ -n "${BUILD_TEE_PID:-}" ]; then
|
||||
wait "${BUILD_TEE_PID}" 2>/dev/null || true
|
||||
fi
|
||||
rm -f "${BUILD_LOG_PIPE}"
|
||||
fi
|
||||
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
exit "${status}"
|
||||
}
|
||||
|
||||
start_build_log() {
|
||||
command -v tee >/dev/null 2>&1 || {
|
||||
echo "ERROR: tee is required for build logging" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
rm -rf "${LOG_DIR}"
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
mkdir -p "${LOG_DIR}"
|
||||
BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
|
||||
mkfifo "${BUILD_LOG_PIPE}"
|
||||
|
||||
exec 3>&1 4>&2
|
||||
tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
|
||||
BUILD_TEE_PID=$!
|
||||
exec > "${BUILD_LOG_PIPE}" 2>&1
|
||||
BUILD_LOG_ACTIVE=1
|
||||
|
||||
trap 'cleanup_build_log "$?"' EXIT INT TERM HUP
|
||||
|
||||
echo "=== build log dir: ${LOG_DIR} ==="
|
||||
echo "=== build log: ${LOG_OUT} ==="
|
||||
echo "=== build log archive: ${LOG_ARCHIVE} ==="
|
||||
}
|
||||
|
||||
start_build_log
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||
@@ -245,13 +468,13 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
rm -rf \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
|
||||
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
@@ -304,7 +527,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
|
||||
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
@@ -510,6 +732,7 @@ export BEE_GPU_VENDOR_UPPER
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
lb build 2>&1
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
@@ -521,8 +744,9 @@ fi
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
@@ -22,3 +22,7 @@ label live-@FLAVOUR@-failsafe
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
#!/bin/sh
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
# so GRUB can chainload them directly (they must be on the ISO filesystem,
# not inside the squashfs).
#
# Primary: copy from chroot/boot/ (populated by package postinst).
# Naming fallbacks:
#   Debian Bookworm: /boot/memtest86+     — EFI PE64 (no extension)
#                    /boot/memtest86+.bin — legacy binary
#   Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
# Last resort: extract directly from the cached .deb if postinst didn't
# place the files (happens in chroot environments without grub triggers).
set -e

MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"

# Ensure destination directory exists (absence caused silent copy failures).
mkdir -p binary/boot

echo "memtest: scanning chroot/boot/ for memtest files:"
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"

# Primary path: copy upstream-named files from chroot/boot/
for name in ${MEMTEST_FILES}; do
    src="chroot/boot/${name}"
    if [ -f "${src}" ]; then
        cp "${src}" "binary/boot/${name}"
        echo "memtest: copied ${name} from chroot/boot/"
    fi
done

# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the
# EFI binary; /boot/memtest86+.bin is the legacy BIOS binary.
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
    cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
    echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
fi
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
    cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
    echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
fi

# Last resort: if the EFI binary is still missing, pull it out of a cached
# .deb without running any maintainer scripts.
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
    echo "memtest: EFI binary missing — attempting extraction from .deb cache"
    deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
        -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
        | head -1)
    if [ -z "$deb" ]; then
        deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
    fi
    if [ -n "$deb" ]; then
        echo "memtest: extracting from ${deb}"
        EXTRACT_DIR="$(mktemp -d)"
        dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
        echo "memtest: files found in .deb:"
        find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
        for name in ${MEMTEST_FILES}; do
            src="${EXTRACT_DIR}/boot/${name}"
            if [ -f "${src}" ]; then
                cp "${src}" "binary/boot/${name}"
                echo "memtest: extracted ${name} from .deb"
            fi
        done
        # Debian naming fallback inside the extracted .deb as well.
        if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
            cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
            echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
        fi
        rm -rf "${EXTRACT_DIR}"
    else
        echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
    fi
fi

echo "memtest: binary/boot/ contents:"
ls binary/boot/memtest* 2>/dev/null || echo " (none)"
|
||||
@@ -21,14 +21,15 @@ openssh-server
|
||||
# Disk installer
|
||||
squashfs-tools
|
||||
parted
|
||||
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install).
|
||||
# The -bin variants only carry binary modules and do NOT include grub-install itself.
|
||||
grub-pc
|
||||
# Keep GRUB install tools without selecting a single active platform package.
|
||||
# grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
|
||||
# provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
|
||||
grub2-common
|
||||
grub-pc-bin
|
||||
grub-efi-amd64
|
||||
grub-efi-amd64-bin
|
||||
grub-efi-amd64-signed
|
||||
shim-signed
|
||||
efibootmgr
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
@@ -50,7 +51,6 @@ sudo
|
||||
zstd
|
||||
mstflint
|
||||
memtester
|
||||
memtest86+
|
||||
stress-ng
|
||||
stressapptest
|
||||
|
||||
|
||||
@@ -1,25 +1,9 @@
|
||||
[Unit]
|
||||
Description=Bee: schedule startup hardware audit via task queue
|
||||
# Start AFTER bee-web, not before — bee-web must not wait for audit.
|
||||
After=bee-web.service
|
||||
Wants=bee-web.service
|
||||
Description=Bee: on-demand hardware audit (not started automatically)
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=yes
|
||||
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for
|
||||
# the system to settle (GPU drivers, sensors), then enqueue the audit as
|
||||
# a background task so it appears in the task list and logs.
|
||||
ExecStart=/bin/sh -c '\
|
||||
i=0; \
|
||||
while [ $i -lt 90 ]; do \
|
||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
|
||||
sleep 1; i=$((i+1)); \
|
||||
done; \
|
||||
sleep 60; \
|
||||
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||
ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -12,17 +12,55 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat >&2 <<'EOF'
|
||||
Usage: bee-install <device> [logfile]
|
||||
|
||||
Installs the live system to a local disk (WIPES the target).
|
||||
|
||||
device Target block device, e.g. /dev/sda or /dev/nvme0n1
|
||||
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
|
||||
logfile Optional path for progress log (default: /tmp/bee-install.log)
|
||||
|
||||
Examples:
|
||||
bee-install /dev/sda
|
||||
bee-install /dev/nvme0n1
|
||||
bee-install /dev/sdb /tmp/my-install.log
|
||||
|
||||
WARNING: ALL DATA ON <device> WILL BE ERASED.
|
||||
|
||||
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
|
||||
Layout (BIOS): MBR — partition 1: root ext4
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
DEVICE="${1:-}"
|
||||
LOGFILE="${2:-/tmp/bee-install.log}"
|
||||
|
||||
if [ -z "$DEVICE" ]; then
|
||||
echo "Usage: bee-install <device> [logfile]" >&2
|
||||
exit 1
|
||||
if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
|
||||
usage
|
||||
fi
|
||||
if [ ! -b "$DEVICE" ]; then
|
||||
echo "ERROR: $DEVICE is not a block device" >&2
|
||||
echo "Run 'lsblk' to list available disks." >&2
|
||||
exit 1
|
||||
fi
|
||||
# Block CD-ROM devices
|
||||
case "$DEVICE" in
|
||||
/dev/sr*|/dev/scd*)
|
||||
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
|
||||
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
# Check required tools
|
||||
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
||||
if ! command -v "$tool" >/dev/null 2>&1; then
|
||||
echo "ERROR: required tool not found: $tool" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
|
||||
@@ -7,6 +7,8 @@ EXCLUDE=""
|
||||
FORMAT=""
|
||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||
JOHN_BIN="${JOHN_DIR}/john"
|
||||
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
@@ -23,6 +25,95 @@ contains_csv() {
|
||||
echo ",${haystack}," | grep -q ",${needle},"
|
||||
}
|
||||
|
||||
# Dump OpenCL/NVIDIA diagnostics to stderr to help explain why John
# cannot see any OpenCL devices. Every probe is best-effort.
show_opencl_diagnostics() {
    echo "-- OpenCL ICD vendors --" >&2
    if [ -d /etc/OpenCL/vendors ]; then
        ls -l /etc/OpenCL/vendors >&2 || true
        # Show each ICD file and its contents (a loader library path).
        for icd_file in /etc/OpenCL/vendors/*.icd; do
            [ -f "${icd_file}" ] || continue
            echo " file: ${icd_file}" >&2
            sed 's/^/ /' "${icd_file}" >&2 || true
        done
    else
        echo " /etc/OpenCL/vendors is missing" >&2
    fi

    echo "-- NVIDIA device nodes --" >&2
    ls -l /dev/nvidia* >&2 || true

    echo "-- ldconfig OpenCL/NVIDIA --" >&2
    ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' >&2 || true

    # clinfo is optional on the image; only run it when present.
    if command -v clinfo >/dev/null 2>&1; then
        echo "-- clinfo -l --" >&2
        clinfo -l >&2 || true
    fi

    echo "-- john --list=opencl-devices --" >&2
    ./john --list=opencl-devices >&2 || true
}
|
||||
|
||||
# Re-run the NVIDIA userspace setup (bee-nvidia-load, if installed) and
# refresh the linker cache. Returns 1 when not running as root; both
# refresh steps themselves are best-effort.
refresh_nvidia_runtime() {
    [ "$(id -u)" = "0" ] || return 1

    if command -v bee-nvidia-load >/dev/null 2>&1; then
        bee-nvidia-load >/dev/null 2>&1 || true
    fi
    ldconfig >/dev/null 2>&1 || true
    return 0
}
|
||||
|
||||
# Make sure the nvidia_uvm kernel module is loaded and its device nodes
# exist. Returns 0 if the module is (now) loaded, 1 when it cannot be
# loaded (not root, module file missing, or insmod failed).
ensure_nvidia_uvm() {
    # Already loaded: nothing to do.
    if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        return 0
    fi
    [ "$(id -u)" = "0" ] || return 1

    ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
    [ -f "${ko}" ] || return 1
    insmod "${ko}" >/dev/null 2>&1 || return 1

    # insmod does not create device nodes; mint them from the dynamically
    # assigned major in /proc/devices (minor 0 = uvm, minor 1 = uvm-tools).
    uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}')
    if [ -n "${uvm_major}" ]; then
        mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
        mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
    fi
    return 0
}
|
||||
|
||||
# Verify that John can enumerate at least one OpenCL device, escalating
# through recovery steps between probes: first as-is, then after
# refreshing the NVIDIA runtime, then after force-loading nvidia_uvm.
# Returns 0 on success; on failure prints diagnostics and returns 1.
ensure_opencl_ready() {
    for attempt in initial runtime uvm; do
        case "${attempt}" in
            runtime) refresh_nvidia_runtime || continue ;;
            uvm)     ensure_nvidia_uvm     || continue ;;
        esac
        out=$(./john --list=opencl-devices 2>&1 || true)
        if echo "${out}" | grep -q "Device #"; then
            return 0
        fi
    done

    echo "OpenCL devices are not available for John." >&2
    if ! lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        echo "nvidia_uvm is not loaded." >&2
    fi
    if [ ! -e /dev/nvidia-uvm ]; then
        echo "/dev/nvidia-uvm is missing." >&2
    fi
    show_opencl_diagnostics
    return 1
}
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
@@ -76,6 +167,8 @@ echo "john_devices=${JOHN_DEVICES}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
ensure_opencl_ready || exit 1
|
||||
|
||||
choose_format() {
|
||||
if [ -n "${FORMAT}" ]; then
|
||||
echo "${FORMAT}"
|
||||
|
||||
@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
|
||||
serial_sink() {
|
||||
local tty="$1"
|
||||
if [ -w "$tty" ]; then
|
||||
cat > "$tty"
|
||||
cat > "$tty" 2>/dev/null || true
|
||||
else
|
||||
cat > /dev/null
|
||||
fi
|
||||
|
||||
@@ -59,11 +59,24 @@ load_module() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# Load a module from the host kernel's module tree via modprobe.
# Logs and returns 0 on success, returns 1 (silently) on failure.
load_host_module() {
    mod="$1"
    modprobe "$mod" >/dev/null 2>&1 || return 1
    log "host module loaded: $mod"
    return 0
}
|
||||
|
||||
case "$nvidia_mode" in
|
||||
normal|full)
|
||||
if ! load_module nvidia; then
|
||||
exit 1
|
||||
fi
|
||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||
# exported by the generic "video" module. Best-effort only; compute paths
|
||||
# remain functional even if display-related modules stay absent.
|
||||
load_host_module video || true
|
||||
load_module nvidia-modeset || true
|
||||
load_module nvidia-uvm || true
|
||||
;;
|
||||
|
||||
Reference in New Issue
Block a user