Compare commits


30 Commits
v8.32 ... v9.5

Author SHA1 Message Date
Mikhail Chusavitin
c0dbbf96ad Add vendor RAID tools for livecd 2026-04-29 17:31:25 +03:00
Mikhail Chusavitin
76484b123c Fix fast-path: treat bootloader config changes as heavy
config/bootloaders was missing from the needs_full_build heavy-file
list, so changes to GRUB theme assets (e.g. bee-logo.png RGBA→RGB fix
in 333c44f) were silently skipped by the squashfs-surgery fast-path.
The old broken PNG stayed in boot/grub/live-theme/ inside the ISO.
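The heavy-file check this commit fixes can be sketched as a simple prefix match over changed paths. Names below (heavyPrefixes, needsFullBuild) are illustrative, not the build script's actual identifiers; the real list lives in the needs_full_build logic referenced above, and this commit's fix corresponds to adding the config/bootloaders entry.

```go
package main

import "strings"

// heavyPrefixes lists path prefixes whose changes require a full lb build.
// config/bootloaders was the missing entry this commit adds: without it,
// GRUB theme asset changes were silently skipped by the fast path.
var heavyPrefixes = []string{
	"VERSIONS",
	"config/package-lists",
	"config/hooks",
	"config/bootloaders",
	"Dockerfile",
	"auto/config",
}

// needsFullBuild reports whether any changed path matches a heavy prefix,
// either exactly or as a directory prefix.
func needsFullBuild(changed []string) bool {
	for _, p := range changed {
		for _, prefix := range heavyPrefixes {
			if p == prefix || strings.HasPrefix(p, prefix+"/") {
				return true
			}
		}
	}
	return false
}
```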

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 15:36:29 +03:00
Mikhail Chusavitin
8901596152 Add server diagnostic tools to ISO, drop btop
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 13:18:50 +03:00
Mikhail Chusavitin
7c504e5056 Collect IOMMU group per PCIe device from sysfs
Reads the iommu_group symlink for each BDF and exposes the group number
as iommu_group in the hardware snapshot JSON.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 12:34:54 +03:00
Mikhail Chusavitin
333c44f3ba Fix GRUB splash: convert bee-logo.png from RGBA to RGB
GRUB does not support RGBA PNG (color_type=6) — loading it returns a
null bitmap, triggering "null src bitmap in grub_video_bitmap_create_scaled".
Alpha channel composited onto black background (#000000 matches desktop-color).
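The offending color type can be checked directly from the PNG header: the 8-byte signature is followed by the IHDR chunk (4-byte length, 4-byte type, width, height, bit depth), which puts the color type at byte offset 25. This is a standalone sketch, not code from this repo; pngColorType is a hypothetical helper name.

```go
package main

import (
	"bytes"
	"fmt"
)

// pngColorType returns the IHDR color type byte of a PNG image.
// color_type=6 (truecolor with alpha, i.e. RGBA) is what GRUB's PNG
// loader rejects; color_type=2 (truecolor RGB) is what the fix converts to.
func pngColorType(data []byte) (byte, error) {
	sig := []byte{0x89, 'P', 'N', 'G', '\r', '\n', 0x1a, '\n'}
	if len(data) < 26 || !bytes.Equal(data[:8], sig) {
		return 0, fmt.Errorf("not a PNG")
	}
	// Offset 25: signature (8) + IHDR length (4) + "IHDR" (4)
	// + width (4) + height (4) + bit depth (1) = 25.
	return data[25], nil
}
```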

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 11:15:16 +03:00
Mikhail Chusavitin
3bca821d3e Add auto fast-path ISO rebuild via squashfs surgery
When only light files have changed since the last full lb build (Go source,
overlay scripts/configs), the build now completes automatically in ~5-8 min
instead of 30+ min:

- unsquashfs existing squashfs from prior build
- rsync overlay-stage on top
- mksquashfs repack (zstd, same block size)
- xorriso ISO repack with -boot_image any replay (preserves EFI/MBR hybrid)

Heavy changes (VERSIONS, package-lists, hooks, archives, Dockerfile,
auto/config) still trigger a full lb build. Tracking is via a marker file
(.bee-full-build-marker) written after each successful full build.

No change to build-in-container.sh or the full build path.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 10:58:26 +03:00
Mikhail Chusavitin
3648e37a1e Update bible submodule to remote HEAD, preserve ascii-safe-text contract locally
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 10:30:27 +03:00
Mikhail Chusavitin
d109e08fab Drop redundant rebuild-image flag 2026-04-29 10:01:57 +03:00
Mikhail Chusavitin
11d00b9442 Document read-only submodules policy 2026-04-29 09:54:23 +03:00
Mikhail Chusavitin
6defa5ae15 Revert chart submodule update 2026-04-29 09:47:35 +03:00
Mikhail Chusavitin
c76658ed00 Update bible and chart submodules 2026-04-29 09:43:57 +03:00
Mikhail Chusavitin
2163017a98 Collect and report storage telemetry 2026-04-29 09:40:58 +03:00
29179917c3 Add USB blackbox log mirroring service 2026-04-24 10:20:12 +03:00
be4b439804 Commit remaining workspace changes 2026-04-23 20:32:26 +03:00
749fc8a94d Unify NVIDIA GPU recovery paths 2026-04-23 20:31:41 +03:00
6112094d45 fix(grub): fix bitmap error and menu rendering
- Convert bee-logo.png to RGBA (color type 6) and strip all metadata
  chunks (cHRM, bKGD, tIME, tEXt) that confuse GRUB's minimal PNG parser
- Move terminal_output gfxterm before insmod png / theme load so the
  theme initialises in an active gfxterm context
- Remove echo ASCII art banner from grub.cfg — with gfxterm active and
  no terminal_box in the theme, echo output renders over the menu area
- Fix icon_heigh typo → icon_height; increase item_height 16→20 with
  item_padding 0→2 for reliable text rendering in boot_menu

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 22:05:16 +03:00
e9a2bc9f9d update submodule 2026-04-22 20:39:27 +03:00
Mikhail Chusavitin
7a8f884664 fix(boot): remove advanced options submenu
Keep only EASY-BEE and toram entries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:01:50 +03:00
Mikhail Chusavitin
8bf8dfa45b fix(boot): default to KMS + pci=realloc, drop nomodeset from main entries
Default and toram entries now boot with bee.display=kms (ASPEED AST
loads via KMS, Xorg uses modesetting driver) and pci=realloc (Linux
reassigns GPU BARs when BIOS lacks Above 4G Decoding). nomodeset
removed from these entries; still present in GSP=off and fail-safe.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 19:00:04 +03:00
Mikhail Chusavitin
6a22199aff chore(bible): bump ascii-safe-text contract
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:10 +03:00
Mikhail Chusavitin
ddb2bb5d1c fix(grub): replace em-dash with ASCII -- in all menu entry titles
Em-dash (U+2014) renders as garbage on GRUB serial/SOL output
(IPMI BMC consoles). Replace with ASCII double-hyphen throughout
grub.cfg template, write_canonical_grub_cfg, and theme.txt comment.

Also align template grub.cfg structure with write_canonical_grub_cfg:
toram entry moved to top level (was inside submenu).

bible: add ascii-safe-text contract documenting the no-em-dash rule.
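The replacement itself is a one-line transform; a minimal sketch of the contract, with a hypothetical helper name:

```go
package main

import "strings"

// asciiSafeTitle applies the ascii-safe-text contract to a menu entry
// title: em-dash (U+2014) becomes ASCII "--" so GRUB serial/SOL consoles
// (IPMI BMC) render it correctly.
func asciiSafeTitle(s string) string {
	return strings.ReplaceAll(s, "\u2014", "--")
}
```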

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 18:52:04 +03:00
Mikhail Chusavitin
aa284ae754 fix(iso): avoid grub logo scaling error 2026-04-20 14:06:32 +03:00
Mikhail Chusavitin
8512098174 fix(iso): restore bootappend-live in canonical boot menu 2026-04-20 13:39:05 +03:00
Mikhail Chusavitin
6b5d22c194 chore(git): ignore local audit binary 2026-04-20 13:21:35 +03:00
Mikhail Chusavitin
a35e90a93e fix(iso): clear stale bootloader templates in workdir 2026-04-20 13:19:50 +03:00
Mikhail Chusavitin
1ced81707f fix(iso): validate live boot entries in final ISO 2026-04-20 13:12:24 +03:00
Mikhail Chusavitin
679aeb9947 Run NVIDIA DCGM diag tests on all selected GPUs simultaneously
targeted_stress, targeted_power, and the Level 2/3 diag were dispatched
one GPU at a time from the UI, turning a single dcgmi command into 8
sequential ~350–450 s runs. DCGM supports -i with a comma-separated list
of GPU indices and runs the diagnostic on all of them in parallel.

Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into
nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one
API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate.
Update sat.go constants and page_validate.go estimates to reflect all-GPU
simultaneous execution (remove n× multiplier from total time estimates).

Stress test on 8-GPU system: ~5.3 h → ~2.5 h.
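The core of the change is rendering the selected indices as one comma-separated value for dcgmi's -i flag instead of one invocation per GPU. A sketch with an illustrative helper name (the real change routes through nvidiaAllGPUTargets/expandSATTarget):

```go
package main

import (
	"strconv"
	"strings"
)

// dcgmiGPUListArg renders selected GPU indices as the value for dcgmi's
// -i flag, so one diag run covers all GPUs in parallel rather than n
// sequential runs.
func dcgmiGPUListArg(indices []int) string {
	parts := make([]string, 0, len(indices))
	for _, i := range indices {
		parts = append(parts, strconv.Itoa(i))
	}
	return strings.Join(parts, ",")
}
```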

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:53:25 +03:00
Mikhail Chusavitin
647e99b697 Fix post-sync live-build ISO rebuild 2026-04-20 11:01:15 +03:00
Mikhail Chusavitin
4af997f436 Update audit bee binary 2026-04-20 10:55:42 +03:00
Mikhail Chusavitin
6caace0cc0 Make power benchmark report phase-averaged 2026-04-20 10:53:53 +03:00
56 changed files with 3195 additions and 491 deletions

.gitignore vendored

@@ -3,3 +3,4 @@
dist/
iso/out/
build-cache/
audit/bee


@@ -2,6 +2,7 @@ package main
import (
"context"
"errors"
"flag"
"fmt"
"io"
@@ -67,10 +68,14 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
return runSupportBundle(args[1:], stdout, stderr)
case "web":
return runWeb(args[1:], stdout, stderr)
case "blackbox":
return runBlackbox(args[1:], stdout, stderr)
case "sat":
return runSAT(args[1:], stdout, stderr)
case "benchmark":
return runBenchmark(args[1:], stdout, stderr)
case "bee-worker":
return runBeeWorker(args[1:], stdout, stderr)
case "version", "--version", "-version":
fmt.Fprintln(stdout, Version)
return 0
@@ -88,8 +93,10 @@ func printRootUsage(w io.Writer) {
bee export --target <device>
bee support-bundle --output stdout|file:<path>
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight]
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
bee version
bee help [command]`)
}
@@ -106,10 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
return runSupportBundle([]string{"--help"}, stdout, stdout)
case "web":
return runWeb([]string{"--help"}, stdout, stdout)
case "blackbox":
return runBlackbox([]string{"--help"}, stdout, stdout)
case "sat":
return runSAT([]string{"--help"}, stdout, stderr)
case "benchmark":
return runBenchmark([]string{"--help"}, stdout, stderr)
case "bee-worker":
return runBeeWorker([]string{"--help"}, stdout, stderr)
case "version":
fmt.Fprintln(stdout, "usage: bee version")
return 0
@@ -335,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
return 0
}
func runBlackbox(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
fs.SetOutput(stderr)
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
fs.Usage = func() {
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
slog.Error("run blackbox", "err", err)
return 1
}
return 0
}
func runSAT(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
@@ -462,6 +500,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
return 0
}
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
fs.SetOutput(stderr)
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
fs.Usage = func() {
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
}
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
raw = strings.TrimSpace(raw)
if raw == "" {


@@ -0,0 +1,779 @@
package app
import (
"bytes"
"context"
"crypto/rand"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io/fs"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"bee/audit/internal/platform"
)
const (
blackboxMarkerName = ".bee-blackbox"
blackboxDiscoverInterval = 2 * time.Second
blackboxMinFlushPeriod = 1 * time.Second
blackboxMaxFlushPeriod = 30 * time.Second
blackboxRecoveryFastCount = 5
)
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
var (
blackboxExecCommand = exec.Command
blackboxNow = func() time.Time { return time.Now().UTC() }
)
type BlackboxMarker struct {
Version int `json:"version"`
EnrollmentID string `json:"enrollment_id"`
CreatedAtUTC string `json:"created_at_utc"`
Host string `json:"host,omitempty"`
}
type BlackboxTargetStatus struct {
EnrollmentID string `json:"enrollment_id"`
Device string `json:"device"`
FS platform.RemovableTarget `json:"fs"`
BootFolder string `json:"boot_folder"`
Status string `json:"status"`
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
FlushPeriod string `json:"flush_period"`
LastError string `json:"last_error,omitempty"`
Mountpoint string `json:"mountpoint,omitempty"`
}
type BlackboxState struct {
Status string `json:"status"`
BootStartedAtUTC string `json:"boot_started_at_utc"`
BootFolder string `json:"boot_folder"`
UpdatedAtUTC string `json:"updated_at_utc"`
Targets []BlackboxTargetStatus `json:"targets"`
}
type blackboxRuntime struct {
exportDir string
statePath string
system *platform.System
bootStarted time.Time
bootFolder string
mu sync.Mutex
workers map[string]*blackboxWorker
}
type discoveredBlackboxTarget struct {
marker BlackboxMarker
target platform.RemovableTarget
seenMount string
mountedByBee bool
}
type blackboxWorker struct {
runtime *blackboxRuntime
enrollmentID string
mu sync.Mutex
target platform.RemovableTarget
marker BlackboxMarker
mountpoint string
mountedByBee bool
status string
lastSyncAt time.Time
lastDuration time.Duration
flushPeriod time.Duration
lastError string
fastCycles int
stopCh chan struct{}
stoppedCh chan struct{}
}
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
exportDir = strings.TrimSpace(exportDir)
if exportDir == "" {
exportDir = DefaultExportDir
}
statePath = strings.TrimSpace(statePath)
if statePath == "" {
statePath = DefaultBlackboxStatePath
}
if system == nil {
system = platform.New()
}
bootStarted, err := bootStartedAtUTC()
if err != nil {
bootStarted = blackboxNow()
}
rt := &blackboxRuntime{
exportDir: exportDir,
statePath: statePath,
system: system,
bootStarted: bootStarted,
bootFolder: SupportBundleBaseName(bootStarted),
workers: make(map[string]*blackboxWorker),
}
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
rt.persistState()
ticker := time.NewTicker(blackboxDiscoverInterval)
defer ticker.Stop()
for {
rt.reconcile()
select {
case <-ctx.Done():
rt.stopAll()
return ctx.Err()
case <-ticker.C:
}
}
}
func ReadBlackboxState(path string) (BlackboxState, error) {
path = strings.TrimSpace(path)
if path == "" {
path = DefaultBlackboxStatePath
}
raw, err := os.ReadFile(path)
if err != nil {
return BlackboxState{}, err
}
var state BlackboxState
if err := json.Unmarshal(raw, &state); err != nil {
return BlackboxState{}, err
}
return state, nil
}
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
target = sanitizeRemovableTarget(target)
if target.Device == "" {
return BlackboxMarker{}, fmt.Errorf("device is required")
}
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
if err != nil {
return BlackboxMarker{}, err
}
defer func() {
if mountedByBee {
_ = unmountTarget(mountpoint)
}
}()
marker, _, err := readBlackboxMarker(mountpoint)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return BlackboxMarker{}, err
}
if marker.EnrollmentID == "" {
marker = BlackboxMarker{
Version: 1,
EnrollmentID: newBlackboxEnrollmentID(),
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
Host: hostnameOr("unknown"),
}
}
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
return BlackboxMarker{}, err
}
return marker, nil
}
func DisableBlackboxTarget(device, enrollmentID string) error {
device = strings.TrimSpace(device)
enrollmentID = strings.TrimSpace(enrollmentID)
if device == "" && enrollmentID == "" {
return fmt.Errorf("device or enrollment_id is required")
}
system := platform.New()
targets, err := system.ListRemovableTargets()
if err != nil {
return err
}
for _, target := range targets {
target = sanitizeRemovableTarget(target)
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
if mountErr != nil {
continue
}
remove := false
marker, _, err := readBlackboxMarker(mountpoint)
if err == nil {
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
remove = true
}
if device != "" && target.Device == device {
remove = true
}
}
if remove {
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
}
if mountedByBee {
_ = unmountTarget(mountpoint)
}
if remove {
return err
}
}
return os.ErrNotExist
}
func (rt *blackboxRuntime) reconcile() {
discovered, _ := rt.discoverMarkedTargets()
rt.mu.Lock()
defer rt.mu.Unlock()
seen := make(map[string]struct{}, len(discovered))
for _, found := range discovered {
seen[found.marker.EnrollmentID] = struct{}{}
worker, ok := rt.workers[found.marker.EnrollmentID]
if !ok {
worker = newBlackboxWorker(rt, found)
rt.workers[found.marker.EnrollmentID] = worker
go worker.run()
continue
}
worker.update(found)
}
for id, worker := range rt.workers {
if _, ok := seen[id]; ok {
continue
}
worker.stop()
delete(rt.workers, id)
}
rt.persistStateLocked()
}
func (rt *blackboxRuntime) stopAll() {
rt.mu.Lock()
workers := make([]*blackboxWorker, 0, len(rt.workers))
for _, worker := range rt.workers {
workers = append(workers, worker)
}
rt.workers = map[string]*blackboxWorker{}
rt.persistStateLocked()
rt.mu.Unlock()
for _, worker := range workers {
worker.stop()
}
}
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
targets, err := rt.system.ListRemovableTargets()
if err != nil {
return nil, err
}
var out []discoveredBlackboxTarget
for _, rawTarget := range targets {
target := sanitizeRemovableTarget(rawTarget)
if target.Device == "" {
continue
}
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
if err != nil {
continue
}
marker, ok, err := readBlackboxMarker(mountpoint)
if mountedByBee && !ok {
_ = unmountTarget(mountpoint)
}
if err != nil || !ok || marker.EnrollmentID == "" {
continue
}
if mountedByBee {
_ = unmountTarget(mountpoint)
}
out = append(out, discoveredBlackboxTarget{
marker: marker,
target: target,
seenMount: mountpoint,
mountedByBee: mountedByBee,
})
}
sort.Slice(out, func(i, j int) bool {
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
})
return out, nil
}
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
return &blackboxWorker{
runtime: rt,
enrollmentID: found.marker.EnrollmentID,
target: found.target,
marker: found.marker,
flushPeriod: blackboxMinFlushPeriod,
status: "running",
stopCh: make(chan struct{}),
stoppedCh: make(chan struct{}),
}
}
func (w *blackboxWorker) run() {
defer close(w.stoppedCh)
for {
start := time.Now()
err := w.syncCycle()
duration := time.Since(start)
w.finishCycle(duration, err)
wait := w.currentFlushPeriod()
timer := time.NewTimer(wait)
select {
case <-w.stopCh:
timer.Stop()
w.cleanup()
return
case <-timer.C:
}
}
}
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
w.mu.Lock()
defer w.mu.Unlock()
w.target = found.target
w.marker = found.marker
}
func (w *blackboxWorker) stop() {
select {
case <-w.stopCh:
default:
close(w.stopCh)
}
<-w.stoppedCh
}
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
w.mu.Lock()
defer w.mu.Unlock()
return w.flushPeriod
}
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
w.mu.Lock()
defer w.mu.Unlock()
w.lastDuration = duration
if err != nil {
w.status = "degraded"
w.lastError = err.Error()
w.fastCycles = 0
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
} else {
w.status = "running"
w.lastSyncAt = blackboxNow()
w.lastError = ""
if duration <= w.flushPeriod/2 {
w.fastCycles++
} else {
w.fastCycles = 0
}
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
}
w.runtime.persistState()
}
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
if current <= 0 {
current = blackboxMinFlushPeriod
}
if duration <= 0 {
duration = current
}
next := current
if duration > current {
growA := time.Duration(float64(current) * 1.25)
growB := time.Duration(float64(duration) * 1.25)
if growB > growA {
next = growB
} else {
next = growA
}
}
if success && fastCycles >= blackboxRecoveryFastCount {
next = time.Duration(float64(current) * 0.9)
}
if next < blackboxMinFlushPeriod {
next = blackboxMinFlushPeriod
}
if next > blackboxMaxFlushPeriod {
next = blackboxMaxFlushPeriod
}
return next
}
func (w *blackboxWorker) syncCycle() error {
target, marker := w.snapshotTarget()
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
if err != nil {
return err
}
w.recordMountpoint(mountpoint, mountedByBee)
root := filepath.Join(mountpoint, w.runtime.bootFolder)
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
return err
}
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
return err
}
if err := w.captureSnapshots(root); err != nil {
return err
}
return syncFilesystem(root)
}
func (w *blackboxWorker) cleanup() {
w.mu.Lock()
mountpoint := w.mountpoint
mountedByBee := w.mountedByBee
w.mu.Unlock()
if mountedByBee && mountpoint != "" {
_ = unmountTarget(mountpoint)
}
}
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
w.mu.Lock()
defer w.mu.Unlock()
return w.target, w.marker
}
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
w.mu.Lock()
defer w.mu.Unlock()
w.mountpoint = mountpoint
w.mountedByBee = mountedByBee
}
func (w *blackboxWorker) captureSnapshots(root string) error {
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
return err
}
for _, svc := range supportBundleServices {
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
return err
}
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
return err
}
}
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
return err
}
for _, item := range supportBundleOptionalFiles {
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
}
return nil
}
func (rt *blackboxRuntime) persistState() {
rt.mu.Lock()
defer rt.mu.Unlock()
rt.persistStateLocked()
}
func (rt *blackboxRuntime) persistStateLocked() {
state := BlackboxState{
Status: "disabled",
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
BootFolder: rt.bootFolder,
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
}
if len(rt.workers) > 0 {
state.Status = "running"
}
for _, worker := range rt.workers {
worker.mu.Lock()
targetState := BlackboxTargetStatus{
EnrollmentID: worker.enrollmentID,
Device: worker.target.Device,
FS: worker.target,
BootFolder: rt.bootFolder,
Status: worker.status,
FlushPeriod: worker.flushPeriod.String(),
LastError: worker.lastError,
Mountpoint: worker.mountpoint,
}
if !worker.lastSyncAt.IsZero() {
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
}
if worker.lastDuration > 0 {
targetState.LastCycleDuration = worker.lastDuration.String()
}
if worker.status == "degraded" {
state.Status = "degraded"
}
worker.mu.Unlock()
state.Targets = append(state.Targets, targetState)
}
sort.Slice(state.Targets, func(i, j int) bool {
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
})
_ = writeJSONAtomic(rt.statePath, state)
}
func bootStartedAtUTC() (time.Time, error) {
raw, err := os.ReadFile("/proc/stat")
if err != nil {
return time.Time{}, err
}
for _, line := range strings.Split(string(raw), "\n") {
line = strings.TrimSpace(line)
if !strings.HasPrefix(line, "btime ") {
continue
}
parts := strings.Fields(line)
if len(parts) != 2 {
break
}
sec, err := time.ParseDuration(parts[1] + "s")
if err != nil {
break
}
return time.Unix(int64(sec/time.Second), 0).UTC(), nil
}
return time.Time{}, fmt.Errorf("boot time not found")
}
func newBlackboxEnrollmentID() string {
var buf [8]byte
if _, err := rand.Read(buf[:]); err != nil {
return fmt.Sprintf("bb-%d", time.Now().UnixNano())
}
return "bb-" + hex.EncodeToString(buf[:])
}
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
target.Device = strings.TrimSpace(target.Device)
target.FSType = strings.TrimSpace(target.FSType)
target.Size = strings.TrimSpace(target.Size)
target.Label = strings.TrimSpace(target.Label)
target.Model = strings.TrimSpace(target.Model)
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
return target
}
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
target = sanitizeRemovableTarget(target)
if target.Mountpoint != "" {
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
return target.Mountpoint, false, nil
}
}
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
if err := os.MkdirAll(mountpoint, 0755); err != nil {
return "", false, err
}
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
}
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
_ = unmountTarget(mountpoint)
return "", false, err
}
return mountpoint, true, nil
}
func unmountTarget(mountpoint string) error {
_ = blackboxExecCommand("sync").Run()
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
if err != nil {
msg := strings.TrimSpace(string(raw))
if msg == "" {
return err
}
return fmt.Errorf("%s: %w", msg, err)
}
return nil
}
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return BlackboxMarker{}, false, os.ErrNotExist
}
return BlackboxMarker{}, false, err
}
var marker BlackboxMarker
if err := json.Unmarshal(raw, &marker); err != nil {
return BlackboxMarker{}, false, err
}
return marker, true, nil
}
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
if marker.Version == 0 {
marker.Version = 1
}
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
}
func syncDirectoryTree(srcDir, dstDir string) error {
seen := make(map[string]struct{})
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
rel, err := filepath.Rel(srcDir, path)
if err != nil {
return err
}
rel = filepath.Clean(rel)
if rel == "." {
seen["."] = struct{}{}
return os.MkdirAll(dstDir, 0755)
}
seen[rel] = struct{}{}
dstPath := filepath.Join(dstDir, rel)
if d.IsDir() {
info, err := d.Info()
if err != nil {
return err
}
return os.MkdirAll(dstPath, info.Mode().Perm())
}
return copyFileIfChanged(path, dstPath)
})
if err != nil {
return err
}
return removeMissingPaths(dstDir, seen)
}
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
rel, err := filepath.Rel(dstDir, path)
if err != nil {
return err
}
rel = filepath.Clean(rel)
if rel == "." {
return nil
}
if _, ok := seen[rel]; ok {
return nil
}
return os.RemoveAll(path)
})
}
func copyFileIfChanged(src, dst string) error {
info, err := os.Stat(src)
if err != nil {
return err
}
if info.IsDir() {
return os.MkdirAll(dst, info.Mode().Perm())
}
srcData, err := os.ReadFile(src)
if err != nil {
return err
}
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
return nil
}
return writeFileAtomic(dst, srcData, info.Mode().Perm())
}
func captureCommandAtomic(dst string, name string, args ...string) error {
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
if len(raw) == 0 {
if err != nil {
raw = []byte(err.Error() + "\n")
} else {
raw = []byte("no output\n")
}
}
return writeFileAtomic(dst, raw, 0644)
}
func writeJSONAtomic(path string, v any) error {
raw, err := json.MarshalIndent(v, "", " ")
if err != nil {
return err
}
raw = append(raw, '\n')
return writeFileAtomic(path, raw, 0644)
}
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
return nil
}
tmp := path + ".tmp"
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
if err != nil {
return err
}
if _, err := f.Write(data); err != nil {
_ = f.Close()
return err
}
if err := f.Sync(); err != nil {
_ = f.Close()
return err
}
if err := f.Close(); err != nil {
return err
}
if err := os.Rename(tmp, path); err != nil {
return err
}
return syncFilesystem(filepath.Dir(path))
}
func syncFilesystem(path string) error {
return blackboxExecCommand("sync").Run()
}
func ensureWritableBlackboxMountpoint(mountpoint string) error {
probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
if err != nil {
return fmt.Errorf("target filesystem is not writable: %w", err)
}
name := probe.Name()
if closeErr := probe.Close(); closeErr != nil {
_ = os.Remove(name)
return closeErr
}
if err := os.Remove(name); err != nil {
return err
}
return nil
}
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
msg := strings.TrimSpace(raw)
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
}
if msg == "" {
return err
}
return fmt.Errorf("%s: %w", msg, err)
}


@@ -0,0 +1,52 @@
package app
import (
"path/filepath"
"testing"
"time"
)
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
current := 2 * time.Second
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
if got <= current {
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
}
}
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
current := 10 * time.Second
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
if got >= current {
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
}
if got < blackboxMinFlushPeriod {
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
}
}
func TestReadBlackboxState(t *testing.T) {
path := filepath.Join(t.TempDir(), "blackbox-state.json")
want := BlackboxState{
Status: "running",
BootStartedAtUTC: "2026-04-24T00:00:00Z",
BootFolder: "boot-folder",
UpdatedAtUTC: "2026-04-24T00:00:01Z",
Targets: []BlackboxTargetStatus{{
EnrollmentID: "bb-1",
Device: "/dev/sdb1",
Status: "running",
FlushPeriod: "1s",
}},
}
if err := writeJSONAtomic(path, want); err != nil {
t.Fatalf("writeJSONAtomic: %v", err)
}
got, err := ReadBlackboxState(path)
if err != nil {
t.Fatalf("ReadBlackboxState: %v", err)
}
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
t.Fatalf("state=%+v", got)
}
}


@@ -15,6 +15,7 @@ import (
)
var supportBundleServices = []string{
"bee-blackbox.service",
"bee-audit.service",
"bee-web.service",
"bee-network.service",
@@ -256,11 +257,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
}
now := time.Now().UTC()
date := now.Format("2006-01-02")
tod := now.Format("150405")
ver := bundleVersion()
model := serverModelForBundle()
sn := serverSerialForBundle()
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
if err := os.MkdirAll(stageRoot, 0755); err != nil {
@@ -294,7 +290,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
return "", err
}
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
archiveName := SupportBundleBaseName(now) + ".tar.gz"
archivePath := filepath.Join(os.TempDir(), archiveName)
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
return "", err
@@ -302,6 +298,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
return archivePath, nil
}
func SupportBundleBaseName(at time.Time) string {
at = at.UTC()
date := at.Format("2006-01-02")
tod := at.Format("150405")
ver := bundleVersion()
model := serverModelForBundle()
sn := serverSerialForBundle()
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
}
func LatestSupportBundlePath() (string, error) {
return latestSupportBundlePath(os.TempDir())
}


@@ -4,7 +4,9 @@ import (
"bee/audit/internal/schema"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
dev.NUMANode = &numaNode
}
if group, ok := readPCIIOMMUGroup(bdf); ok {
dev.IOMMUGroup = &group
}
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
dev.LinkWidth = &width
}
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
return dev
}
// readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the
// iommu_group symlink in sysfs: .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>
func readPCIIOMMUGroup(bdf string) (int, bool) {
link := "/sys/bus/pci/devices/" + bdf + "/iommu_group"
target, err := os.Readlink(link)
if err != nil {
return 0, false
}
n, err := strconv.Atoi(filepath.Base(target))
if err != nil {
return 0, false
}
return n, true
}
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
func readPCIIDs(bdf string) (vendorID, deviceID int) {
base := "/sys/bus/pci/devices/" + bdf


@@ -250,6 +250,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
}
var info smartctlInfo
var raw map[string]any
_ = json.Unmarshal(out, &raw)
if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v
@@ -302,8 +304,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
value := float64(attr.Raw.Value)
s.LifeRemainingPct = &value
case 241:
value := attr.Raw.Value
value := smartLBAsToBytes(attr.Raw.Value)
s.WrittenBytes = &value
case 242:
value := smartLBAsToBytes(attr.Raw.Value)
s.ReadBytes = &value
case 197:
pending = attr.Raw.Value
s.CurrentPendingSectors = &pending
@@ -321,6 +326,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining,
}
applySCSISmartctlTelemetry(&s, raw, &status)
setStorageHealthStatus(&s, status)
return s
}
@@ -477,6 +483,127 @@ func nvmeDataUnitsToBytes(units int64) int64 {
return units * 512000
}
func smartLBAsToBytes(lbas int64) int64 {
if lbas <= 0 {
return 0
}
return lbas * 512
}
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
if s == nil || len(raw) == 0 {
return
}
if v, ok := firstInt64(raw,
"path:power_on_time.hours",
"path:accumulated_power_on_time.hours",
"path:power_on_time.hour",
"path:accumulated_power_on_time.hour",
); ok && v > 0 && s.PowerOnHours == nil {
s.PowerOnHours = &v
}
if v, ok := firstInt64(raw,
"path:power_cycle_count",
"path:start_stop_cycle_count",
"path:accumulated_start_stop_cycles",
); ok && v > 0 && s.PowerCycles == nil {
s.PowerCycles = &v
}
if v, ok := firstInt64(raw,
"path:scsi_grown_defect_list",
"path:grown_defect_list",
); ok && v > 0 && s.ReallocatedSectors == nil {
s.ReallocatedSectors = &v
if status != nil && status.reallocatedSectors == 0 {
status.reallocatedSectors = v
}
}
if v, ok := firstInt64(raw,
"path:percentage_used_endurance_indicator",
"path:scsi_percentage_used_endurance_indicator",
); ok && v > 0 {
if s.LifeUsedPct == nil {
fv := float64(v)
s.LifeUsedPct = &fv
}
if s.LifeRemainingPct == nil && v <= 100 {
remaining := float64(100 - v)
s.LifeRemainingPct = &remaining
if status != nil && status.lifeRemainingPct == 0 {
status.lifeRemainingPct = int64(remaining)
}
}
}
blockSize, hasBlockSize := firstInt64(raw,
"path:logical_block_size",
"path:block_size",
"path:user_capacity.block_size",
)
if hasBlockSize && blockSize > 0 {
if v, ok := firstInt64(raw,
"path:logical_blocks_written",
"path:total_lbas_written",
); ok && v > 0 && s.WrittenBytes == nil {
bytes := v * blockSize
s.WrittenBytes = &bytes
}
if v, ok := firstInt64(raw,
"path:logical_blocks_read",
"path:total_lbas_read",
); ok && v > 0 && s.ReadBytes == nil {
bytes := v * blockSize
s.ReadBytes = &bytes
}
}
}
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
for _, candidate := range candidates {
if !strings.HasPrefix(candidate, "path:") {
continue
}
path := strings.TrimPrefix(candidate, "path:")
if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
return v, true
}
}
return 0, false
}
func nestedInt64(root map[string]any, path []string) (int64, bool) {
var current any = root
for _, key := range path {
obj, ok := current.(map[string]any)
if !ok {
return 0, false
}
current, ok = obj[key]
if !ok {
return 0, false
}
}
switch v := current.(type) {
case float64:
return int64(v), true
case float32:
return int64(v), true
case int:
return int64(v), true
case int64:
return v, true
case int32:
return int64(v), true
case json.Number:
n, err := v.Int64()
return n, err == nil
case string:
n, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
return n, err == nil
default:
return 0, false
}
}
type storageHealthStatus struct {
hasOverall bool
overallPassed bool


@@ -0,0 +1,89 @@
package collector
import (
"testing"
"bee/audit/internal/schema"
)
func TestApplySCSISmartctlTelemetry(t *testing.T) {
t.Parallel()
raw := map[string]any{
"power_on_time": map[string]any{
"hours": float64(32123),
},
"accumulated_start_stop_cycles": float64(17),
"scsi_grown_defect_list": float64(4),
"percentage_used_endurance_indicator": float64(12),
"logical_block_size": float64(4096),
"logical_blocks_written": float64(1000),
"logical_blocks_read": float64(2000),
}
var disk schema.HardwareStorage
status := storageHealthStatus{}
applySCSISmartctlTelemetry(&disk, raw, &status)
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
}
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
}
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
}
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
}
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
}
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
}
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
}
if status.reallocatedSectors != 4 {
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
}
if status.lifeRemainingPct != 88 {
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
}
}
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
t.Parallel()
powerOnHours := int64(10)
writtenBytes := int64(20)
lifeRemaining := 30.0
disk := schema.HardwareStorage{
PowerOnHours: &powerOnHours,
WrittenBytes: &writtenBytes,
LifeRemainingPct: &lifeRemaining,
}
raw := map[string]any{
"power_on_time": map[string]any{"hours": float64(999)},
"logical_block_size": float64(512),
"logical_blocks_written": float64(999),
"percentage_used_endurance_indicator": float64(50),
}
applySCSISmartctlTelemetry(&disk, raw, nil)
if *disk.PowerOnHours != 10 {
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
}
if *disk.WrittenBytes != 20 {
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
}
if *disk.LifeRemainingPct != 30 {
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
}
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
}
}


@@ -0,0 +1,25 @@
package collector
import "testing"
func TestSmartLBAsToBytes(t *testing.T) {
t.Parallel()
tests := []struct {
name string
lbas int64
want int64
}{
{name: "zero", lbas: 0, want: 0},
{name: "single lba", lbas: 1, want: 512},
{name: "multiple lbas", lbas: 2048, want: 1048576},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
}
})
}
}


@@ -67,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
MetricRows []GPUMetricRow
}
type benchmarkPowerCalibrationRunSummary struct {
LoadedSDR benchmarkSDRSeriesSummary
AvgFanRPM float64
AvgFanDutyCyclePct float64
FanSamples int
}
type benchmarkBurnProfile struct {
name string
category string
@@ -98,6 +105,7 @@ var (
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkResetNvidiaGPU = resetNvidiaGPU
benchmarkSleep = time.Sleep
)
@@ -242,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
}
out, err := benchmarkResetNvidiaGPU(gpuIndex)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
)
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
for _, line := range strings.Split(trimmed, "\n") {
line = strings.TrimSpace(line)
if line != "" {
logFunc(line)
}
}
}
rc := 0
if err != nil {
rc = 1
}
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
fmt.Sprintf("rc: %d", rc),
"",
)
return err
}
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
if len(gpuIndices) == 0 {
return nil
@@ -259,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
}
var failed []int
for _, idx := range gpuIndices {
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
failed = append(failed, idx)
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
@@ -2413,6 +2449,16 @@ type sdrPowerSnapshot struct {
SkippedSensors []string // sensors rejected during self-healing
}
type benchmarkSDRSeriesSummary struct {
PSUInW float64
PSUOutW float64
GPUSlotW float64
PSUSlots map[string]BenchmarkPSUSlotPower
Samples int
SkippedSensors []string
}
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
type sdrSensor struct {
name string
@@ -2542,6 +2588,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
return snap
}
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
if intervalSec <= 0 {
intervalSec = benchmarkPowerAutotuneSampleInterval
}
ch := make(chan []sdrPowerSnapshot, 1)
go func() {
defer close(ch)
var samples []sdrPowerSnapshot
record := func() {
snap := sampleIPMISDRPowerSensors()
if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
return
}
samples = append(samples, snap)
}
record()
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
ch <- samples
return
case <-ticker.C:
record()
}
}
}()
return ch
}
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
var summary benchmarkSDRSeriesSummary
if len(samples) == 0 {
return summary
}
type slotAggregate struct {
inputs []float64
outputs []float64
status string
}
slotAgg := make(map[string]*slotAggregate)
skippedSet := make(map[string]struct{})
var inputTotals []float64
var outputTotals []float64
var gpuSlotTotals []float64
for _, sample := range samples {
if sample.PSUInW > 0 {
inputTotals = append(inputTotals, sample.PSUInW)
}
if sample.PSUOutW > 0 {
outputTotals = append(outputTotals, sample.PSUOutW)
}
if sample.GPUSlotW > 0 {
gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
}
for _, skipped := range sample.SkippedSensors {
if skipped != "" {
skippedSet[skipped] = struct{}{}
}
}
for slot, reading := range sample.PSUSlots {
agg := slotAgg[slot]
if agg == nil {
agg = &slotAggregate{}
slotAgg[slot] = agg
}
if reading.InputW != nil && *reading.InputW > 0 {
agg.inputs = append(agg.inputs, *reading.InputW)
}
if reading.OutputW != nil && *reading.OutputW > 0 {
agg.outputs = append(agg.outputs, *reading.OutputW)
}
switch {
case reading.Status == "":
case agg.status == "":
agg.status = reading.Status
case agg.status == "OK" && reading.Status != "OK":
agg.status = reading.Status
}
}
}
summary.PSUInW = benchmarkMean(inputTotals)
summary.PSUOutW = benchmarkMean(outputTotals)
summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
summary.Samples = len(samples)
if len(slotAgg) > 0 {
summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
for slot, agg := range slotAgg {
reading := BenchmarkPSUSlotPower{Status: agg.status}
if mean := benchmarkMean(agg.inputs); mean > 0 {
v := mean
reading.InputW = &v
}
if mean := benchmarkMean(agg.outputs); mean > 0 {
v := mean
reading.OutputW = &v
}
summary.PSUSlots[slot] = reading
}
}
if len(skippedSet) > 0 {
summary.SkippedSensors = make([]string, 0, len(skippedSet))
for skipped := range skippedSet {
summary.SkippedSensors = append(summary.SkippedSensors, skipped)
}
sort.Strings(summary.SkippedSensors)
}
return summary
}
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
if durationSec <= 0 {
return benchmarkSDRSeriesSummary{}
}
stopCh := make(chan struct{})
doneCh := startIPMISDRSampler(stopCh, intervalSec)
select {
case <-ctx.Done():
case <-time.After(time.Duration(durationSec) * time.Second):
}
close(stopCh)
return summarizeSDRPowerSeries(<-doneCh)
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
@@ -3086,8 +3263,9 @@ func runBenchmarkPowerCalibration(
logFunc func(string),
seedLimits map[int]int,
durationSec int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
calibDurationSec := durationSec
var runSummary benchmarkPowerCalibrationRunSummary
if calibDurationSec <= 0 {
calibDurationSec = 120
}
@@ -3105,12 +3283,12 @@ func runBenchmarkPowerCalibration(
if engine == BenchmarkPowerEngineTargetedPower {
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
}
} else {
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
}
}
if killed := KillTestWorkers(); len(killed) > 0 {
@@ -3275,6 +3453,10 @@ calibDone:
}
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
sdrStopCh := make(chan struct{})
sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
fanStopCh := make(chan struct{})
fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
@@ -3314,6 +3496,10 @@ calibDone:
}
ticker.Stop()
cancelAttempt()
close(sdrStopCh)
close(fanStopCh)
attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
attemptFanSummary := <-fanDoneCh
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
// Accumulate telemetry rows with attempt stage label.
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
@@ -3351,10 +3537,14 @@ calibDone:
busyDelaySec = 1
// Per-GPU analysis and binary search update.
attemptStable := ar.err == nil
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU)
throttle := throttleReasons[s.idx]
if throttle != "" || summary.P95PowerW <= 0 {
attemptStable = false
}
// Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
@@ -3487,6 +3677,16 @@ calibDone:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
}
if attemptStable {
if attemptSDRSummary.Samples > 0 {
runSummary.LoadedSDR = attemptSDRSummary
}
if attemptFanSummary.FanSamples > 0 {
runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
runSummary.FanSamples = attemptFanSummary.FanSamples
}
}
}
for _, s := range states {
@@ -3495,7 +3695,7 @@ calibDone:
}
}
writeBenchmarkMetricsFiles(runDir, allCalibRows)
return results, restore, allCalibRows
return results, restore, allCalibRows, runSummary
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
@@ -3540,6 +3740,47 @@ func meanFanRPM(fans []FanReading) float64 {
return sum / float64(len(fans))
}
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
if intervalSec <= 0 {
intervalSec = benchmarkPowerAutotuneSampleInterval
}
ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
go func() {
defer close(ch)
var rpmSamples []float64
var dutySamples []float64
record := func() {
fans, err := sampleFanSpeeds()
if err != nil || len(fans) == 0 {
return
}
if rpm := meanFanRPM(fans); rpm > 0 {
rpmSamples = append(rpmSamples, rpm)
}
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
dutySamples = append(dutySamples, duty)
}
}
record()
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
ch <- benchmarkPowerCalibrationRunSummary{
AvgFanRPM: benchmarkMean(rpmSamples),
AvgFanDutyCyclePct: benchmarkMean(dutySamples),
FanSamples: len(rpmSamples),
}
return
case <-ticker.C:
record()
}
}
}()
return ch
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
@@ -3568,41 +3809,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
if sp.PSUInputLoadedW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
sourceLabel := "autotuned source"
switch normalizeBenchmarkPowerSource(sp.Source) {
case BenchmarkPowerSourceSDRPSUInput:
sourceLabel = "autotuned source (SDR PSU AC input)"
case BenchmarkPowerSourceDCMI:
sourceLabel = "autotuned source (DCMI)"
}
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
selectedSourceLabel := "Selected source"
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
selectedSourceLabel = "Selected source (SDR PSU AC input)"
} else if selectedSource == BenchmarkPowerSourceDCMI {
selectedSourceLabel = "Selected source (DCMI)"
}
var spRows [][]string
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.GPUSlotTotalW > 0 {
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
}
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.Available {
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
}
if sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.PSUOutputLoadedW > 0 {
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
}
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.Available {
ratio := sp.ReportingRatio
@@ -3619,8 +3858,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
sdrRatio := psuDelta / sp.GPUReportedSumW
sdrNote := ""
@@ -3632,12 +3871,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
sdrNote = "✗ significant discrepancy"
}
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
}
} else {
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
}
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
@@ -3689,11 +3928,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
psuDistRows = append(psuDistRows, []string{
slot,
fmtW(idle.InputW), fmtW(loaded.InputW),
fmtW(idle.OutputW), fmtW(loaded.OutputW),
deltaStr, status,
})
}
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
b.WriteString("\n")
}
}
@@ -3741,7 +3979,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fan,
})
}
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
@@ -3850,7 +4088,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
for _, slot := range psuSlots {
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
}
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
var psuRows [][]string
for _, step := range result.RampSteps {
@@ -3931,7 +4169,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append(pdRows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
fmt.Sprintf("%.0f W", stable),
realization,
@@ -3944,13 +4181,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
pdRows = append(pdRows, []string{
"**Platform**",
fmt.Sprintf("**%.0f W**", totalDefault),
"—",
fmt.Sprintf("**%.0f W**", totalStable),
fmt.Sprintf("**%s**", platformReal),
"",
})
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString("\n")
// Balance across GPUs — only meaningful with 2+ GPUs.
@@ -4100,7 +4336,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
{"Avg Temp °C", singleTemp},
{"Power W", singlePwr},
{"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan},
{"Avg Fan RPM (duty%)", singleFan},
}
if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
@@ -4208,18 +4444,22 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Sample server idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
idleSDRStopCh := make(chan struct{})
idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
}
sdrIdle := sampleIPMISDRPowerSensors()
close(idleSDRStopCh)
sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
psuBefore := psuStatusSnapshot()
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
singleIPMILoadedW := make(map[int]float64, len(selected))
singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
var allRestoreActions []benchmarkRestoreAction
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
var allPowerRows []GPUMetricRow
@@ -4229,27 +4469,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
}
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
close(singlePowerStopCh)
sdrSingle := sampleIPMISDRPowerSensors()
if samples := <-singlePowerCh; len(samples) > 0 {
singleIPMILoadedW[idx] = benchmarkMean(samples)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
singleIPMILoadedW[idx] = sdrSingle.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
}
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
singleRunSummaryByIndex[idx] = singleRun
}
defer func() {
for i := len(allRestoreActions) - 1; i >= 0; i-- {
@@ -4292,11 +4531,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t
}
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
gpu.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
gpu.AvgFanDutyCyclePct = duty
}
if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
gpu.AvgFanRPM = singleRun.AvgFanRPM
gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
gpus = append(gpus, gpu)
}
@@ -4352,10 +4589,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
var serverLoadedW float64
var serverLoadedOK bool
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
// after the test when GPUs have already returned to idle.
var sdrLastStep sdrPowerSnapshot
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
// while GPUs are loaded. Used in the summary instead of re-sampling after the
// test when GPUs have already returned to idle.
var sdrLastStep benchmarkSDRSeriesSummary
// Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 {
@@ -4376,6 +4613,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.ServerLoadedW = w
ramp.ServerDeltaW = w - serverIdleW
}
if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
ramp.AvgFanRPM = singleRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -4426,7 +4667,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepPowerStopCh := make(chan struct{})
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
close(stepPowerStopCh)
var stepIPMILoadedW float64
@@ -4497,10 +4738,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
// Per-step PSU slot snapshot — also used as the authoritative loaded power
// source when SDR PSU sensors are available (more accurate than DCMI on
// servers where DCMI covers only a subset of installed PSUs).
sdrStep := sampleIPMISDRPowerSensors()
// Per-step PSU slot readings are averaged over the whole load phase rather
// than captured as a single end-of-phase snapshot.
sdrStep := stepRun.LoadedSDR
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
@@ -4518,7 +4758,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
ramp.ServerLoadedW = sdrStep.PSUInW
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
if step == len(result.RecommendedSlotOrder) {
serverLoadedW = sdrStep.PSUInW
serverLoadedOK = true
@@ -4526,12 +4766,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
}
// Fan state at end of ramp step.
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
ramp.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
ramp.AvgFanDutyCyclePct = duty
}
// Fan values are phase averages over the same load window.
if stepRun.AvgFanRPM > 0 {
ramp.AvgFanRPM = stepRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
}
// Per-GPU telemetry from this ramp step's calibration.
@@ -4584,8 +4822,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
if result.ServerPower != nil {
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
// than re-sampling here, which would capture post-test idle state.
// Use the SDR phase average from the last ramp step (GPUs still loaded)
// rather than re-sampling here, which would capture post-test idle state.
sdrLoaded := sdrLastStep
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
@@ -4605,6 +4843,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.ServerPower.Notes = append(result.ServerPower.Notes,
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
}
if sdrLoaded.Samples > 0 {
result.ServerPower.Notes = append(result.ServerPower.Notes,
fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
}
// Detect DCMI partial coverage: direct SDR comparison first,
// ramp heuristic as fallback when SDR PSU sensors are absent.
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||


@@ -2,7 +2,7 @@ package platform
import (
"context"
"os"
"fmt"
"os/exec"
"path/filepath"
"strings"
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
}
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
t.Parallel()
oldGeteuid := benchmarkGeteuid
oldExec := satExecCommand
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 1000 }
satExecCommand = func(name string, args ...string) *exec.Cmd {
t.Fatalf("unexpected command: %s %v", name, args)
return nil
benchmarkResetNvidiaGPU = func(int) (string, error) {
t.Fatal("unexpected reset call")
return "", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
satExecCommand = oldExec
benchmarkResetNvidiaGPU = oldReset
})
var logs []string
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
}
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
t.Parallel()
dir := t.TempDir()
script := filepath.Join(dir, "nvidia-smi")
argsLog := filepath.Join(dir, "args.log")
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
t.Fatalf("write script: %v", err)
}
oldGeteuid := benchmarkGeteuid
oldSleep := benchmarkSleep
oldLookPath := satLookPath
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 0 }
benchmarkSleep = func(time.Duration) {}
satLookPath = func(file string) (string, error) {
if file == "nvidia-smi" {
return script, nil
}
return exec.LookPath(file)
var calls []int
benchmarkResetNvidiaGPU = func(index int) (string, error) {
calls = append(calls, index)
return "ok\n", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
benchmarkSleep = oldSleep
satLookPath = oldLookPath
benchmarkResetNvidiaGPU = oldReset
})
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
if len(failed) != 0 {
t.Fatalf("failed=%v want no failures", failed)
}
raw, err := os.ReadFile(argsLog)
if err != nil {
t.Fatalf("read args log: %v", err)
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
t.Fatalf("calls=%v want %s", calls, want)
}
got := strings.Fields(string(raw))
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
if strings.Join(got, " ") != strings.Join(want, " ") {
t.Fatalf("args=%v want %v", got, want)
}
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
oldGeteuid := benchmarkGeteuid
oldSleep := benchmarkSleep
oldReset := benchmarkResetNvidiaGPU
benchmarkGeteuid = func() int { return 0 }
benchmarkSleep = func(time.Duration) {}
benchmarkResetNvidiaGPU = func(index int) (string, error) {
if index == 5 {
return "busy\n", exec.ErrNotFound
}
return "ok\n", nil
}
t.Cleanup(func() {
benchmarkGeteuid = oldGeteuid
benchmarkSleep = oldSleep
benchmarkResetNvidiaGPU = oldReset
})
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
if got, want := fmt.Sprint(failed), "[5]"; got != want {
t.Fatalf("failed=%v want %s", failed, want)
}
}


@@ -18,6 +18,7 @@ var workerPatterns = []string{
"stress-ng",
"stressapptest",
"memtester",
"nvbandwidth",
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
"nvvs",
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
base = exe[idx+1:]
}
for _, pat := range workerPatterns {
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
_ = syscall.Kill(pid, syscall.SIGKILL)
killed = append(killed, KilledProcess{PID: pid, Name: base})
break
}
if shouldKillWorkerProcess(exe, base) {
_ = syscall.Kill(pid, syscall.SIGKILL)
killed = append(killed, KilledProcess{PID: pid, Name: base})
}
}
return killed
}
func shouldKillWorkerProcess(exe, base string) bool {
for _, pat := range workerPatterns {
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
return true
}
}
return false
}


@@ -0,0 +1,39 @@
package platform
import "testing"
func TestShouldKillWorkerProcess(t *testing.T) {
tests := []struct {
name string
exe string
base string
want bool
}{
{
name: "nvbandwidth executable",
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
base: "nvbandwidth",
want: true,
},
{
name: "dcgmi executable",
exe: "/usr/bin/dcgmi",
base: "dcgmi",
want: true,
},
{
name: "unrelated process",
exe: "/usr/bin/bash",
base: "bash",
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
}
})
}
}


@@ -3,6 +3,8 @@ package platform
import (
"fmt"
"os/exec"
"strconv"
"strings"
"time"
)
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
return string(raw), err
}
func resetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return out, err
}
func restartNvidiaDrivers() (string, error) {
out, err := runNvidiaRecover("restart-drivers")
if strings.TrimSpace(out) == "" && err == nil {
out = "NVIDIA drivers restarted.\n"
}
return out, err
}


@@ -30,10 +30,10 @@ import (
// Sources:
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
@@ -48,15 +48,15 @@ const (
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
SATEstimatedMemoryStressSec = 140
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
SATEstimatedNvidiaGPUStressPerGPUSec = 450
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
SATEstimatedNvidiaGPUValidateSec = 85
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
SATEstimatedNvidiaGPUStressSec = 450
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
SATEstimatedNvidiaTargetedStressSec = 350
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
SATEstimatedNvidiaTargetedPowerSec = 350
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
SATEstimatedNvidiaPulseTestSec = 5000
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
}
func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return out, err
return resetNvidiaGPU(index)
}
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.


@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
if name == "bee-nvidia" && action == ServiceRestart {
return runNvidiaRecover("restart-drivers")
return restartNvidiaDrivers()
}
// bee-web runs as the bee user; sudo is required to control system services.
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.


@@ -211,6 +211,7 @@ type HardwarePCIeDevice struct {
Firmware *string `json:"firmware,omitempty"`
MacAddresses []string `json:"mac_addresses,omitempty"`
Present *bool `json:"present,omitempty"`
IOMMUGroup *int `json:"iommu_group,omitempty"`
Telemetry map[string]any `json:"-"`
}


@@ -44,3 +44,48 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
t.Fatalf("missing event_logs payload: %s", text)
}
}
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
powerOnHours := int64(12450)
writtenBytes := int64(9876543210)
readBytes := int64(1234567890)
lifeRemainingPct := 91.0
payload := HardwareIngestRequest{
CollectedAt: "2026-03-15T15:00:00Z",
Hardware: HardwareSnapshot{
Board: HardwareBoard{SerialNumber: "SRV-001"},
Storage: []HardwareStorage{
{
SerialNumber: stringPtr("DISK-001"),
Model: stringPtr("TestDisk"),
PowerOnHours: &powerOnHours,
WrittenBytes: &writtenBytes,
ReadBytes: &readBytes,
LifeRemainingPct: &lifeRemainingPct,
},
},
},
}
data, err := json.Marshal(payload)
if err != nil {
t.Fatalf("marshal: %v", err)
}
text := string(data)
for _, needle := range []string{
`"storage":[{`,
`"power_on_hours":12450`,
`"written_bytes":9876543210`,
`"read_bytes":1234567890`,
`"life_remaining_pct":91`,
} {
if !strings.Contains(text, needle) {
t.Fatalf("missing %q in payload: %s", needle, text)
}
}
}
func stringPtr(v string) *string {
return &v
}


@@ -806,15 +806,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
now := time.Now()
t.DoneAt = &now
case TaskRunning:
if t.job != nil {
t.job.abort()
if t.job == nil || !t.job.abort() {
globalQueue.mu.Unlock()
writeJSON(w, map[string]string{"status": "not_running"})
return
}
if taskMayLeaveOrphanWorkers(t.Target) {
platform.KillTestWorkers()
}
t.Status = TaskCancelled
now := time.Now()
t.DoneAt = &now
globalQueue.mu.Unlock()
writeJSON(w, map[string]string{"status": "aborting"})
return
}
globalQueue.mu.Unlock()
writeJSON(w, map[string]string{"status": "aborted"})
@@ -1039,6 +1038,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
}
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
if err != nil {
if errors.Is(err, os.ErrNotExist) {
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
return
}
writeError(w, http.StatusInternalServerError, err.Error())
return
}
if state.Targets == nil {
state.Targets = []app.BlackboxTargetStatus{}
}
writeJSON(w, state)
}
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var target platform.RemovableTarget
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
writeError(w, http.StatusBadRequest, "device is required")
return
}
targets, err := h.opts.App.ListRemovableTargets()
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
allowed := false
for _, candidate := range targets {
if candidate.Device == target.Device {
target = candidate
allowed = true
break
}
}
if !allowed {
writeError(w, http.StatusBadRequest, "device not in removable target list")
return
}
marker, err := app.EnableBlackboxTarget(target)
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
writeJSON(w, map[string]any{
"status": "ok",
"message": "Black-box marker written.",
"enrollment_id": marker.EnrollmentID,
})
}
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
var req struct {
Device string `json:"device"`
EnrollmentID string `json:"enrollment_id"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
if errors.Is(err, os.ErrNotExist) {
writeError(w, http.StatusNotFound, "black-box target not found")
return
}
writeError(w, http.StatusInternalServerError, err.Error())
return
}
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
}
// ── GPU presence ──────────────────────────────────────────────────────────────
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {


@@ -3,6 +3,8 @@ package webui
import (
"encoding/json"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
@@ -44,6 +46,45 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
}
}
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
rec := httptest.NewRecorder()
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
h.handleAPIBlackboxStatus(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
var state app.BlackboxState
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
t.Fatalf("decode state: %v", err)
}
if state.Status != "disabled" {
t.Fatalf("status=%q want disabled", state.Status)
}
}
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
exportDir := t.TempDir()
statePath := filepath.Join(exportDir, "blackbox-state.json")
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
t.Fatalf("write state: %v", err)
}
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
rec := httptest.NewRecorder()
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
h.handleAPIBlackboxStatus(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
t.Fatalf("body=%s", rec.Body.String())
}
}
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks


@@ -20,7 +20,7 @@ type jobState struct {
cancel func() // optional cancel function; nil if job is not cancellable
logPath string
serialPrefix string
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
logBuf *bufio.Writer
}
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
}
func (j *jobState) append(line string) {
j.appendWithOptions(line, true, true)
}
func (j *jobState) appendFromLog(line string) {
j.appendWithOptions(line, false, false)
}
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
j.mu.Lock()
defer j.mu.Unlock()
j.lines = append(j.lines, line)
if j.logPath != "" {
if persistLog && j.logPath != "" {
j.writeLogLineLocked(line)
}
if j.serialPrefix != "" {
if serialMirror && j.serialPrefix != "" {
taskSerialWriteLine(j.serialPrefix + line)
}
for _, ch := range j.subs {


@@ -102,47 +102,69 @@ window.supportBundleDownload = function() {
func renderUSBExportCard() string {
return `<div class="card" style="margin-top:16px">
<div class="card-head">Export to USB
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">&#8635; Refresh</button>
<div class="card-head">USB Black-Box
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">&#8635; Refresh</button>
</div>
<div class="card-body">` + renderUSBExportInline() + `</div>
</div>`
}
func renderUSBExportInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
function usbRefresh() {
function blackboxRefresh() {
document.getElementById('usb-status').textContent = 'Scanning...';
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
document.getElementById('usb-targets').innerHTML = '';
document.getElementById('usb-msg').textContent = '';
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
window._usbTargets = Array.isArray(targets) ? targets : [];
Promise.all([
fetch('/api/export/usb').then(r=>r.json()),
fetch('/api/blackbox/status').then(r=>r.json())
]).then(function(values) {
const targets = Array.isArray(values[0]) ? values[0] : [];
const state = values[1] || {};
const active = Array.isArray(state.targets) ? state.targets : [];
window._usbTargets = targets;
window._blackboxTargets = active;
const st = document.getElementById('usb-status');
const ct = document.getElementById('usb-targets');
const summary = document.getElementById('blackbox-summary');
if (state.boot_folder) {
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
} else {
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
}
if (!targets || targets.length === 0) {
st.textContent = 'No removable USB devices found.';
return;
} else {
st.textContent = targets.length + ' device(s) found:';
}
st.textContent = targets.length + ' device(s) found:';
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
const byDevice = {};
active.forEach(function(item) { byDevice[item.device] = item; });
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
targets.map((t, idx) => {
const dev = t.device || '';
const label = t.label || '';
const model = t.model || '';
const state = byDevice[dev];
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
return '<tr>' +
'<td style="font-family:monospace">'+dev+'</td>' +
'<td>'+t.fs_type+'</td>' +
'<td>'+t.size+'</td>' +
'<td>'+label+'</td>' +
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
'<td style="font-size:12px">'+status+detail+'</td>' +
'<td style="white-space:nowrap">' +
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
(state
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
'</td></tr>';
}).join('') + '</table>';
@@ -150,7 +172,7 @@ function usbRefresh() {
document.getElementById('usb-status').textContent = 'Error: ' + e;
});
}
window.usbExport = function(type, targetIndex, btn) {
window.blackboxEnable = function(targetIndex, btn) {
const target = (window._usbTargets || [])[targetIndex];
if (!target) {
const msg = document.getElementById('usb-msg');
@@ -164,15 +186,15 @@ window.usbExport = function(type, targetIndex, btn) {
const originalText = btn ? btn.textContent : '';
if (btn) {
btn.disabled = true;
btn.textContent = 'Exporting...';
btn.textContent = 'Enabling...';
}
if (rowMsg) {
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Working...';
}
msg.style.color = 'var(--muted)';
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
fetch('/api/export/usb/'+type, {
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
fetch('/api/blackbox/enable', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(target)
@@ -199,10 +221,64 @@ window.usbExport = function(type, targetIndex, btn) {
btn.disabled = false;
btn.textContent = originalText;
}
setTimeout(blackboxRefresh, 300);
});
};
window.usbRefresh = usbRefresh;
usbRefresh();
window.blackboxDisable = function(targetIndex, btn) {
const target = (window._usbTargets || [])[targetIndex];
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
if (!target || !active) {
const msg = document.getElementById('usb-msg');
msg.style.color = 'var(--err,red)';
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
return;
}
const msg = document.getElementById('usb-msg');
const row = btn ? btn.closest('td') : null;
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
const originalText = btn ? btn.textContent : '';
if (btn) {
btn.disabled = true;
btn.textContent = 'Disabling...';
}
if (rowMsg) {
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Working...';
}
msg.style.color = 'var(--muted)';
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
fetch('/api/blackbox/disable', {
method:'POST',
headers:{'Content-Type':'application/json'},
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
}).then(async r => {
const d = await r.json();
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
return d;
}).then(d => {
msg.style.color = 'var(--ok,green)';
msg.textContent = d.message || 'Done.';
if (rowMsg) {
rowMsg.style.color = 'var(--ok,green)';
rowMsg.textContent = d.message || 'Done.';
}
}).catch(e => {
msg.style.color = 'var(--err,red)';
msg.textContent = 'Error: '+e;
if (rowMsg) {
rowMsg.style.color = 'var(--err,red)';
rowMsg.textContent = 'Error: ' + e;
}
}).finally(() => {
if (btn) {
btn.disabled = false;
btn.textContent = originalText;
}
setTimeout(blackboxRefresh, 300);
});
};
window.blackboxRefresh = blackboxRefresh;
blackboxRefresh();
})();
</script>`
}
@@ -382,7 +458,7 @@ function installToRAM() {
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
` + renderSupportBundleInline() + `
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
` + renderUSBExportInline() + `
</div>
</div></div>


@@ -207,7 +207,7 @@ func renderInstall() string {
func renderTasks() string {
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
@@ -289,7 +289,7 @@ function cancelAll() {
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
fetch('/api/tasks/kill-workers',{method:'POST'})
.then(r=>r.json())
.then(d=>{


@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
}
total := platform.SATEstimatedCPUValidateSec +
platform.SATEstimatedMemoryValidateSec +
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
platform.SATEstimatedNvidiaInterconnectSec +
platform.SATEstimatedNvidiaBandwidthSec
if n > 0 {
total += platform.SATEstimatedNvidiaGPUValidateSec
}
return total
}
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
}
total := platform.SATEstimatedCPUStressSec +
platform.SATEstimatedMemoryStressSec +
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
platform.SATEstimatedNvidiaPulseTestSec +
platform.SATEstimatedNvidiaInterconnectSec +
platform.SATEstimatedNvidiaBandwidthSec
if n > 0 {
total += platform.SATEstimatedNvidiaGPUStressSec +
platform.SATEstimatedNvidiaTargetedStressSec +
platform.SATEstimatedNvidiaTargetedPowerSec
}
return total
}
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
func() string {
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
if n > 0 {
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
validateFmtDur(perV), n, validateFmtDur(perV*n),
validateFmtDur(perS), n, validateFmtDur(perS*n))
}
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
validateFmtDur(perV), validateFmtDur(perS))
}(),
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
)) +
`<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
func() string {
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-targeted-power">` +
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`,
func() string {
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
s := "Skipped in Validate. "
if n > 0 {
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
} else {
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
}
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
}(),
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-pulse">` +
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
return enqueueSATTarget(target, overrides)
.then(d => streamSATTask(d.task_id, title, false));
}
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
const nvidiaPerGPUTargets = [];
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
function satAllGPUIndicesForMulti() {
return Promise.resolve(satSelectedGPUIndices());
}
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
});
}
function runNvidiaValidateSet(target) {
return loadSatNvidiaGPUs().then(gpus => {
const selected = satSelectedGPUIndices();
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
if (!picked.length) {
throw new Error('Select at least one NVIDIA GPU.');
}
if (picked.length === 1) {
const gpu = picked[0];
return runSATWithOverrides(target, {
gpu_indices: [Number(gpu.index)],
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
});
}
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal');
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
const labelBase = satLabels()[target] || ('Validate ' + target);
const runNext = (idx) => {
if (idx >= picked.length) return Promise.resolve();
const gpu = picked[idx];
const gpuLabel = satGPUDisplayName(gpu);
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
return enqueueSATTarget(target, {
gpu_indices: [Number(gpu.index)],
display_name: labelBase + ' (' + gpuLabel + ')',
}).then(d => {
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
}).then(function() {
return runNext(idx + 1);
});
};
return runNext(0);
});
const selected = satSelectedGPUIndices();
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
}
function runAMDValidateSet() {
const targets = selectedAMDValidateTargets();


@@ -301,8 +301,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Export
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
// Tools
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)


@@ -671,11 +671,11 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body)
}
if !strings.Contains(body, `Export to USB`) {
t.Fatalf("tools page missing export to usb section: %s", body)
if !strings.Contains(body, `USB Black-Box`) {
t.Fatalf("tools page missing usb black-box section: %s", body)
}
if !strings.Contains(body, `Support Bundle</button>`) {
t.Fatalf("tools page missing support bundle usb button: %s", body)
if !strings.Contains(body, `/api/blackbox/status`) {
t.Fatalf("tools page missing black-box status api usage: %s", body)
}
}


@@ -0,0 +1,505 @@
package webui
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/runtimeenv"
)
type taskRunnerState struct {
PID int `json:"pid"`
Status string `json:"status"`
Error string `json:"error,omitempty"`
UpdatedAt time.Time `json:"updated_at"`
}
func taskRunnerStatePath(t *Task) string {
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
return ""
}
return filepath.Join(t.ArtifactsDir, "runner-state.json")
}
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
path := taskRunnerStatePath(t)
if path == "" {
return nil
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
data, err := json.MarshalIndent(state, "", " ")
if err != nil {
return err
}
tmp := path + ".tmp"
if err := os.WriteFile(tmp, data, 0644); err != nil {
return err
}
return os.Rename(tmp, path)
}
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
path := taskRunnerStatePath(t)
if path == "" {
return taskRunnerState{}, false
}
data, err := os.ReadFile(path)
if err != nil || len(data) == 0 {
return taskRunnerState{}, false
}
var state taskRunnerState
if err := json.Unmarshal(data, &state); err != nil {
return taskRunnerState{}, false
}
return state, true
}
func processAlive(pid int) bool {
if pid <= 0 {
return false
}
err := syscall.Kill(pid, 0)
return err == nil || err == syscall.EPERM
}
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
now := time.Now()
t.DoneAt = &now
switch {
case cancelled:
t.Status = TaskCancelled
t.ErrMsg = "aborted"
case strings.TrimSpace(errMsg) != "":
t.Status = TaskFailed
t.ErrMsg = errMsg
default:
t.Status = TaskDone
t.ErrMsg = ""
}
}
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
if opts == nil {
j.append("ERROR: handler options not configured")
j.finish("handler options not configured")
return
}
a := opts.App
recovered := len(j.lines) > 0
j.append(fmt.Sprintf("Starting %s...", t.Name))
if recovered {
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
}
var (
archive string
err error
)
switch t.Target {
case "nvidia":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
diagLevel := 2
if t.params.StressMode {
diagLevel = 3
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
if e != nil {
err = e
} else {
archive = result.Body
}
} else {
archive, err = a.RunNvidiaAcceptancePack("", j.append)
}
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bench-perf":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL,
ParallelGPUs: t.params.ParallelGPUs,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-autotune":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
}, t.params.BenchmarkKind, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
dur = rampPlan.DurationSec
}
if rampPlan.StaggerSeconds > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: t.params.Loader,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
StaggerSeconds: rampPlan.StaggerSeconds,
}, j.append)
case "memory":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
case "storage":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
if dur <= 0 {
if t.params.StressMode {
dur = 1800
} else {
dur = 60
}
}
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
case "amd-mem":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
case "amd-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
case "amd-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
case "memory-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
case "sat-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
case "platform-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
runOpts.Components = t.params.PlatformComponents
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
case "audit":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
result, e := a.RunAuditNow(opts.RuntimeMode)
if e != nil {
err = e
} else {
for _, line := range splitLines(result.Body) {
j.append(line)
}
}
case "support-bundle":
j.append("Building support bundle...")
archive, err = buildSupportBundle(opts.ExportDir)
case "install":
if strings.TrimSpace(t.params.Device) == "" {
err = fmt.Errorf("device is required")
break
}
installLogPath := platform.InstallLogPath(t.params.Device)
j.append("Install log: " + installLogPath)
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
case "install-to-ram":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
err = a.RunInstallToRAM(ctx, j.append)
default:
j.append("ERROR: unknown target: " + t.Target)
j.finish("unknown target")
return
}
if archive != "" {
archivePath := app.ExtractArchivePath(archive)
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
}
if opts.App != nil && opts.App.StatusDB != nil {
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
}
}
if err != nil {
if ctx.Err() != nil {
j.append("Aborted.")
j.finish("aborted")
} else {
j.append("ERROR: " + err.Error())
j.finish(err.Error())
}
return
}
if archive != "" {
j.append("Archive: " + archive)
}
j.finish("")
}
func loadPersistedTask(statePath, taskID string) (*Task, error) {
data, err := os.ReadFile(statePath)
if err != nil {
return nil, err
}
var persisted []persistedTask
if err := json.Unmarshal(data, &persisted); err != nil {
return nil, err
}
for _, pt := range persisted {
if pt.ID != taskID {
continue
}
t := &Task{
ID: pt.ID,
Name: pt.Name,
Target: pt.Target,
Priority: pt.Priority,
Status: pt.Status,
CreatedAt: pt.CreatedAt,
StartedAt: pt.StartedAt,
DoneAt: pt.DoneAt,
ErrMsg: pt.ErrMsg,
LogPath: pt.LogPath,
ArtifactsDir: pt.ArtifactsDir,
ReportJSONPath: pt.ReportJSONPath,
ReportHTMLPath: pt.ReportHTMLPath,
params: pt.Params,
}
ensureTaskReportPaths(t)
return t, nil
}
return nil, fmt.Errorf("task %s not found", taskID)
}
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
return 2
}
runtimeInfo, err := runtimeenv.Detect("auto")
if err != nil {
slog.Warn("resolve runtime for task-run", "err", err)
}
opts := &HandlerOptions{
ExportDir: exportDir,
App: app.New(platform.New()),
RuntimeMode: runtimeInfo.Mode,
}
statePath := filepath.Join(exportDir, "tasks-state.json")
task, err := loadPersistedTask(statePath, taskID)
if err != nil {
fmt.Fprintln(stderr, err.Error())
return 1
}
if task.StartedAt == nil || task.StartedAt.IsZero() {
now := time.Now()
task.StartedAt = &now
}
if task.Status == "" {
task.Status = TaskRunning
}
if err := writeTaskRunnerState(task, taskRunnerState{
PID: os.Getpid(),
Status: TaskRunning,
UpdatedAt: time.Now().UTC(),
}); err != nil {
fmt.Fprintln(stderr, err.Error())
return 1
}
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
executeTaskWithOptions(opts, task, j, ctx)
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
if err := writeTaskReportArtifacts(task); err != nil {
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
}
j.closeLog()
if err := writeTaskRunnerState(task, taskRunnerState{
PID: os.Getpid(),
Status: task.Status,
Error: task.ErrMsg,
UpdatedAt: time.Now().UTC(),
}); err != nil {
fmt.Fprintln(stderr, err.Error())
}
if task.ErrMsg != "" {
return 1
}
return 0
}


@@ -4,6 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"os"
@@ -13,6 +14,7 @@ import (
"sort"
"strings"
"sync"
"syscall"
"time"
"bee/audit/internal/app"
@@ -110,8 +112,9 @@ type Task struct {
ReportHTMLPath string `json:"report_html_path,omitempty"`
// runtime fields (not serialised)
job *jobState
params taskParams
job *jobState
runnerPID int
params taskParams
}
// taskParams holds optional parameters parsed from the run request.
@@ -328,6 +331,13 @@ var (
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
return exec.CommandContext(ctx, "bee-install", device, logPath)
}
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
exe, err := os.Executable()
if err != nil {
return nil, err
}
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
}
)
// enqueue adds a task to the queue and notifies the worker.
@@ -365,6 +375,11 @@ func (q *taskQueue) prune() {
// nextPending returns the highest-priority pending task (nil if none).
func (q *taskQueue) nextPending() *Task {
for _, t := range q.tasks {
if t.Status == TaskRunning {
return nil
}
}
var best *Task
for _, t := range q.tasks {
if t.Status != TaskPending {
@@ -484,6 +499,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
if !q.started {
q.loadLocked()
q.started = true
q.resumeRunningTasksLocked()
goRecoverLoop("task worker", 2*time.Second, q.worker)
}
hasPending := q.nextPending() != nil
@@ -517,15 +533,12 @@ func (q *taskQueue) worker() {
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
j := newTaskJobState(t.LogPath)
t.job = j
q.persistLocked()
q.mu.Unlock()
taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel
q.executeTask(t, j, taskCtx)
taskCancel()
q.runTaskExternal(t, j)
q.mu.Lock()
q.prune()
@@ -537,6 +550,207 @@ func (q *taskQueue) worker() {
}
}
func (q *taskQueue) resumeRunningTasksLocked() {
for _, t := range q.tasks {
if t.Status != TaskRunning {
continue
}
if t.job == nil {
t.job = newTaskJobState(t.LogPath)
}
q.attachExternalTaskControlsLocked(t, t.job)
q.startRecoveredTaskMonitorLocked(t, t.job)
}
}
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
if t == nil || j == nil {
return
}
j.cancel = func() {
pid := t.runnerPID
if pid <= 0 {
if state, ok := readTaskRunnerState(t); ok {
pid = state.PID
}
}
if pid > 0 {
_ = syscall.Kill(pid, syscall.SIGTERM)
}
}
}
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
if t == nil || j == nil || t.runnerPID <= 0 {
return
}
goRecoverOnce("task runner monitor", func() {
stopTail := make(chan struct{})
doneTail := make(chan struct{})
go q.followTaskLog(t, j, stopTail, doneTail)
for processAlive(t.runnerPID) {
time.Sleep(500 * time.Millisecond)
}
close(stopTail)
<-doneTail
q.finishExternalTask(t, j, nil)
})
}
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
stopTail := make(chan struct{})
doneTail := make(chan struct{})
defer func() {
close(stopTail)
<-doneTail
}()
go q.followTaskLog(t, j, stopTail, doneTail)
cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
if err != nil {
j.appendFromLog("ERROR: " + err.Error())
q.finishExternalTask(t, j, err)
return
}
if err := cmd.Start(); err != nil {
j.appendFromLog("ERROR: " + err.Error())
q.finishExternalTask(t, j, err)
return
}
q.mu.Lock()
t.runnerPID = cmd.Process.Pid
q.attachExternalTaskControlsLocked(t, j)
q.persistLocked()
q.mu.Unlock()
waitErr := cmd.Wait()
time.Sleep(200 * time.Millisecond)
q.finishExternalTask(t, j, waitErr)
}
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
defer close(done)
path := ""
if t != nil {
path = t.LogPath
}
if strings.TrimSpace(path) == "" {
return
}
offset := int64(0)
if info, err := os.Stat(path); err == nil {
offset = info.Size()
}
var partial string
ticker := time.NewTicker(250 * time.Millisecond)
defer ticker.Stop()
flush := func() {
data, newOffset, err := readTaskLogDelta(path, offset)
if err != nil || len(data) == 0 {
offset = newOffset
return
}
offset = newOffset
text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
lines := strings.Split(text, "\n")
partial = lines[len(lines)-1]
for _, line := range lines[:len(lines)-1] {
if line == "" {
continue
}
j.appendFromLog(line)
}
}
for {
select {
case <-ticker.C:
flush()
case <-stop:
flush()
if strings.TrimSpace(partial) != "" {
j.appendFromLog(partial)
}
return
}
}
}
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
f, err := os.Open(path)
if err != nil {
return nil, offset, err
}
defer f.Close()
info, err := f.Stat()
if err != nil {
return nil, offset, err
}
if info.Size() < offset {
offset = 0
}
if _, err := f.Seek(offset, io.SeekStart); err != nil {
return nil, offset, err
}
data, err := io.ReadAll(io.LimitReader(f, 1<<20))
return data, offset + int64(len(data)), err
}
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
q.mu.Lock()
defer q.mu.Unlock()
if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
if j != nil && !j.isDone() {
j.finish(t.ErrMsg)
j.closeLog()
}
select {
case q.trigger <- struct{}{}:
default:
}
return
}
state, ok := readTaskRunnerState(t)
switch {
case ok && state.Status != TaskRunning:
t.Status = state.Status
t.ErrMsg = state.Error
now := state.UpdatedAt
if now.IsZero() {
now = time.Now()
}
t.DoneAt = &now
case waitErr != nil:
now := time.Now()
t.Status = TaskFailed
t.ErrMsg = waitErr.Error()
t.DoneAt = &now
default:
now := time.Now()
t.Status = TaskFailed
t.ErrMsg = "task runner exited without final state"
t.DoneAt = &now
}
t.runnerPID = 0
q.finalizeTaskArtifactPathsLocked(t)
q.persistLocked()
if j != nil && !j.isDone() {
j.finish(t.ErrMsg)
j.closeLog()
}
if t.ErrMsg != "" {
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
} else {
taskSerialEvent(t, "finished with status="+t.Status)
}
select {
case q.trigger <- struct{}{}:
default:
}
}
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
startedKmsgWatch := false
defer q.finalizeTaskRun(t, j)
@@ -985,15 +1199,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
taskSerialEvent(t, "finished with status="+t.Status)
writeJSON(w, map[string]string{"status": "cancelled"})
case TaskRunning:
if t.job != nil {
t.job.abort()
if t.job == nil || !t.job.abort() {
writeError(w, http.StatusConflict, "task is not cancellable")
return
}
t.Status = TaskCancelled
now := time.Now()
t.DoneAt = &now
globalQueue.persistLocked()
taskSerialEvent(t, "finished with status="+t.Status)
writeJSON(w, map[string]string{"status": "cancelled"})
writeJSON(w, map[string]string{"status": "aborting"})
default:
writeError(w, http.StatusConflict, "task is not running or pending")
}
@@ -1039,12 +1249,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
if t.job != nil {
t.job.abort()
}
if taskMayLeaveOrphanWorkers(t.Target) {
platform.KillTestWorkers()
}
t.Status = TaskCancelled
t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
n++
}
}
@@ -1175,18 +1379,29 @@ func (q *taskQueue) loadLocked() {
}
q.assignTaskLogPathLocked(t)
if t.Status == TaskRunning {
// The task was interrupted by a bee-web restart. Child processes
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
// their own process groups. Kill any matching stale workers before
// marking the task failed so the next GPU test does not inherit a
// busy DCGM slot or duplicate workers.
if taskMayLeaveOrphanWorkers(t.Target) {
_ = platform.KillTestWorkers()
state, ok := readTaskRunnerState(t)
switch {
case ok && state.Status == TaskRunning && processAlive(state.PID):
t.runnerPID = state.PID
t.job = newTaskJobState(t.LogPath)
case ok && state.Status != TaskRunning:
t.runnerPID = state.PID
t.Status = state.Status
t.ErrMsg = state.Error
now := state.UpdatedAt
if now.IsZero() {
now = time.Now()
}
t.DoneAt = &now
default:
if taskMayLeaveOrphanWorkers(t.Target) {
_ = platform.KillTestWorkers()
}
now := time.Now()
t.Status = TaskFailed
t.DoneAt = &now
t.ErrMsg = "interrupted by bee-web restart"
}
now := time.Now()
t.Status = TaskFailed
t.DoneAt = &now
t.ErrMsg = "interrupted by bee-web restart"
} else if t.Status == TaskPending {
t.StartedAt = nil
t.DoneAt = nil

bible

Submodule bible updated: 1d89a4918e...d2600f1279


@@ -10,4 +10,4 @@ Generic engineering rules live in `bible/rules/patterns/`.
| `architecture/system-overview.md` | What bee does, scope, tech stack |
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
| `decisions/` | Architectural decision log |
| `decisions/` | Architectural decision log, including read-only submodule policy |


@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
- Contract fields that have no honest local source on a generic Linux host may remain empty.
- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.
## Tech stack
@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
| `iso/builder/` | ISO build scripts and `live-build` profile |
| `iso/overlay/` | Source overlay copied into a staged build overlay |
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |


@@ -0,0 +1,39 @@
# Decision: Treat embedded submodules as read-only
## Context
`bee` embeds external git submodules such as:
- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
- `bible/` — shared engineering rules and contracts
These repositories are reused by other projects. A local feature request in `bee`
must not be solved by silently changing shared submodule behavior.
The concrete failure mode here was attempting to add project-specific storage
telemetry presentation by editing `internal/chart/`. That couples a shared viewer
to one host application's needs and creates hidden cross-project regressions.
## Decision
Embedded submodules are read-only from the point of view of `bee`.
- Do not implement `bee`-specific behavior by editing `internal/chart/`.
- Do not implement `bee`-specific behavior by editing `bible/`.
- If `bee` needs new data in the report, produce it in the standard audit JSON
emitted by `bee` itself.
- `chart` must continue to consume the canonical snapshot as an external viewer,
without host-specific forks.
- Updating a submodule pointer to an upstream commit is allowed.
- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
## Consequences
- Audit/report features must be expressed through the contract in
`bible-local/docs/hardware-ingest-contract.md`.
- `bee` owns collection, normalization, and serialization of storage telemetry in
`hardware.storage[]`.
- `chart` remains a pure visualization module that reads the snapshot it is given.
- If a capability is genuinely missing in a shared submodule, it must be proposed
and landed upstream as a generic change first, then pulled into `bee` via a
normal submodule update.


@@ -6,3 +6,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|---|---|---|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
| 2026-04-01 | Treat memtest as explicit ISO content | active |
| 2026-04-29 | Treat embedded submodules as read-only | active |


@@ -0,0 +1,31 @@
# Contract: ASCII-Safe Text in Scripts and Boot Configs
Version: 1.0
## Principle
Shell scripts, bootloader configs, and any text rendered on serial/SOL consoles must use only printable ASCII characters. Non-ASCII Unicode — including typographic punctuation such as the em-dash (U+2014 `—`), en-dash (U+2013 `–`), curly quotes, and ellipsis (U+2026 `…`) — breaks rendering on serial terminals, GRUB text/serial mode, IPMI SOL, and tooling that assumes ASCII.
## Rules
- Never use em-dash (`—`) or en-dash (`–`) in any shell script, GRUB config, syslinux/isolinux config, or service unit file. Use ASCII double-hyphen `--` or single hyphen `-` instead.
- Never use curly quotes (`“` `”` `‘` `’`) in shell scripts or configs. Use straight quotes `"` and `'`.
- Never use the Unicode ellipsis (`…`). Use `...`.
- GRUB `menuentry` and `submenu` titles must be ASCII-only — GRUB serial terminal output is ASCII; non-ASCII characters render as garbage or are dropped.
- Comments in GRUB theme files (`.txt`) must also be ASCII-only, as GRUB may parse the entire file.
## Why
GRUB renders menus over both `gfxterm` (graphical, Unicode-capable) and `serial` (ASCII-only) simultaneously when `terminal_output gfxterm serial` is set. The serial output — used by IPMI SOL and BMC remote consoles — cannot display multi-byte UTF-8 sequences and shows raw bytes or drops characters. A menuentry title `"EASY-BEE — GSP=off"` appears as `"EASY-BEE â€” GSP=off"` or `"EASY-BEE  GSP=off"` on SOL, making the menu unreadable.
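The garbling described above is reproducible with standard tools. A minimal sketch, assuming `iconv` (glibc) is available on the build host:

```shell
# U+2014 (em-dash) encodes to three UTF-8 bytes:
printf '\342\200\224' | od -An -tx1    # e2 80 94

# A console that interprets those three bytes as cp1252 renders three
# unrelated characters instead of one dash -- the mojibake seen on SOL:
printf '\342\200\224' | iconv -f CP1252 -t UTF-8; echo
```

This is why a single em-dash in a menuentry title becomes a three-character mess on an 8-bit serial console.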
## Anti-patterns
- `menuentry "EASY-BEE — GSP=off"` — em-dash in GRUB title
- `# bee logo — centered` — em-dash in GRUB theme comment
- `echo "done — reboot"` in a shell script displayed over serial
## Correct form
- `menuentry "EASY-BEE -- GSP=off"`
- `# bee logo - centered`
- `echo "done - reboot"`
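A minimal lint for this contract can be written as a byte-level grep; the sketch below uses temporary files for illustration, and the function name `ascii_clean` is hypothetical:

```shell
# Sketch: reject files containing bytes outside printable ASCII.
# LC_ALL=C makes the character classes byte-based, so every UTF-8
# multi-byte sequence (em-dash, curly quotes, ellipsis) is flagged.
tmp=$(mktemp -d)
printf 'menuentry "EASY-BEE -- GSP=off"\n' > "$tmp/ok.cfg"
printf 'menuentry "EASY-BEE \342\200\224 GSP=off"\n' > "$tmp/bad.cfg"

ascii_clean() {
    ! LC_ALL=C grep -q '[^[:print:][:blank:]]' "$1"
}

ascii_clean "$tmp/ok.cfg"  && echo "PASS: ok.cfg"
ascii_clean "$tmp/bad.cfg" || echo "FAIL: bad.cfg has non-ASCII"
rm -rf "$tmp"
```

Run against `config/bootloaders` and the overlay scripts before a build, this catches violations before they reach an ISO.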


@@ -31,10 +31,10 @@ Build with explicit SSH keys baked into the ISO:
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
```
Rebuild the builder image:
Force a clean rebuild of the builder image and build caches:
```sh
sh iso/builder/build-in-container.sh --rebuild-image
sh iso/builder/build-in-container.sh --clean-build
```
Use a custom cache directory:


@@ -10,7 +10,6 @@ IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
AUTH_KEYS=""
REBUILD_IMAGE=0
CLEAN_CACHE=0
VARIANT="all"
@@ -22,17 +21,12 @@ while [ $# -gt 0 ]; do
CACHE_DIR="$2"
shift 2
;;
--rebuild-image)
REBUILD_IMAGE=1
shift
;;
--authorized-keys)
AUTH_KEYS="$2"
shift 2
;;
--clean-build)
CLEAN_CACHE=1
REBUILD_IMAGE=1
shift
;;
--variant)
@@ -41,7 +35,7 @@ while [ $# -gt 0 ]; do
;;
*)
echo "unknown arg: $1" >&2
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
echo "usage: $0 [--cache-dir /path] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
exit 1
;;
esac
@@ -105,7 +99,7 @@ image_matches_platform() {
}
NEED_BUILD_IMAGE=0
if [ "$REBUILD_IMAGE" = "1" ]; then
if [ "$CLEAN_CACHE" = "1" ]; then
NEED_BUILD_IMAGE=1
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
NEED_BUILD_IMAGE=1


@@ -126,6 +126,37 @@ resolve_iso_version() {
resolve_audit_version
}
sync_builder_workdir() {
src_dir="$1"
dst_dir="$2"
mkdir -p "$dst_dir"
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
# so reused workdirs cannot leak stale templates into a new ISO build.
rm -rf "$dst_dir/config/bootloaders"
rsync -a --delete \
--exclude='cache/' \
--exclude='chroot/' \
--exclude='.build/' \
--exclude='*.iso' \
--exclude='*.packages' \
--exclude='*.contents' \
--exclude='*.files' \
"$src_dir/" "$dst_dir/"
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
exit 1
fi
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
exit 1
fi
}
iso_list_files() {
iso_path="$1"
@@ -466,6 +497,75 @@ validate_iso_memtest() {
echo "=== memtest validation OK ==="
}
validate_iso_live_boot_entries() {
iso_path="$1"
echo "=== validating live boot entries in ISO ==="
[ -f "$iso_path" ] || {
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
exit 1
}
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
echo "ERROR: ISO reader unavailable for live boot validation" >&2
exit 1
}
grub_cfg="$(mktemp)"
isolinux_cfg="$(mktemp)"
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
fi
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live ' "$grub_cfg" || {
echo "ERROR: GRUB live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
echo "ERROR: isolinux live entry is missing boot=live" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
rm -f "$grub_cfg" "$isolinux_cfg"
exit 1
}
rm -f "$grub_cfg" "$isolinux_cfg"
echo "=== live boot validation OK ==="
}
validate_iso_nvidia_runtime() {
iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
return 0
}
load_live_build_append() {
lb_dir="$1"
binary_cfg="$lb_dir/config/binary"
[ -f "$binary_cfg" ] || return 1
# config/binary is generated by live-build and contains shell variable
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
# shellcheck disable=SC1090
. "$binary_cfg"
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
live_build_append="$LB_BOOTAPPEND_LIVE"
return 0
}
extract_live_isolinux_entry() {
cfg="$1"
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
echo ""
menuentry "EASY-BEE" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE load to RAM (toram)" {
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
menuentry "EASY-BEE -- load to RAM (toram)" {
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — KMS + GSP=off" {
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}
menuentry "EASY-BEE — fail-safe" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd ${initrd}
}
}
if [ "\${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
grub_dir="$lb_dir/binary/boot/grub"
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
if ! load_live_build_append "$lb_dir"; then
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
live_build_append=""
fi
if [ -f "$grub_cfg" ]; then
if extract_live_grub_entry "$grub_cfg"; then
mkdir -p "$grub_dir/live-theme"
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
else
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
if [ -f "$isolinux_cfg" ]; then
if extract_live_isolinux_entry "$isolinux_cfg"; then
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
else
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
@@ -749,6 +848,73 @@ reset_live_build_stage() {
done
}
# Marker written after every successful full lb build for this variant
FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
# Returns 0 if full lb build is needed, 1 if fast-path is safe.
# Fast-path is safe when only light files changed since the last full build
# (Go source, overlay scripts/configs). Heavy changes (VERSIONS, package lists,
# hooks, archives, Dockerfile, auto/config) require a full lb build.
needs_full_build() {
[ -f "${FULL_BUILD_MARKER}" ] || return 0
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
_heavy=$(find \
"${BUILDER_DIR}/VERSIONS" \
"${BUILDER_DIR}/auto/config" \
"${BUILDER_DIR}/Dockerfile" \
"${BUILDER_DIR}/config/package-lists" \
"${BUILDER_DIR}/config/hooks" \
"${BUILDER_DIR}/config/archives" \
"${BUILDER_DIR}/config/bootloaders" \
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
if [ -n "$_heavy" ]; then
echo "=== full build required: heavy config changed: $(basename "$_heavy") ==="
return 0
fi
return 1
}
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
fast_path_repack_squashfs() {
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
rm -rf "$_tmp"
unsquashfs -d "$_tmp" "$_sq"
echo "=== fast-path: syncing overlay stage ==="
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
echo "=== fast-path: repacking squashfs ==="
_sq_new="${_sq}.new"
rm -f "$_sq_new"
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
mv "$_sq_new" "$_sq"
rm -rf "$_tmp"
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
}
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
fast_path_rebuild_iso() {
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
echo "=== fast-path: rebuilding ISO with xorriso ==="
rm -f "$_new"
xorriso \
-indev "$_prior" \
-outdev "$_new" \
-map "$_sq" /live/filesystem.squashfs \
-boot_image any replay \
-commit
mv "$_new" "$_prior"
echo "=== fast-path: ISO rebuilt ==="
}
recover_iso_memtest() {
lb_dir="$1"
iso_path="$2"
@@ -1112,15 +1278,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
# Sync builder config into variant work dir, preserving lb cache.
rsync -a --delete \
--exclude='cache/' \
--exclude='chroot/' \
--exclude='.build/' \
--exclude='*.iso' \
--exclude='*.packages' \
--exclude='*.contents' \
--exclude='*.files' \
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
# Share deb package cache across variants.
# Restore: populate work dir cache from shared cache before build.
@@ -1396,6 +1554,21 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
fi
# --- auto fast-path: squashfs surgery if only light files changed ---
if ! needs_full_build; then
echo "=== fast-path build (no heavy config changes since last full build) ==="
fast_path_repack_squashfs
fast_path_rebuild_iso
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
validate_iso_live_boot_entries "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done (${BUILD_VARIANT}, fast-path) ==="
echo "ISO: $ISO_OUT"
exit 0
fi
# --- build ISO using live-build ---
echo ""
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
@@ -1411,8 +1584,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
echo "=== enforcing canonical bootloader assets ==="
enforce_live_build_bootloader_assets "${LB_DIR}"
reset_live_build_stage "${LB_DIR}" "binary_checksums"
reset_live_build_stage "${LB_DIR}" "binary_iso"
reset_live_build_stage "${LB_DIR}" "binary_zsync"
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
# --- persist deb package cache back to shared location ---
@@ -1438,8 +1614,10 @@ if [ -f "$ISO_RAW" ]; then
fi
fi
validate_iso_memtest "$ISO_RAW"
validate_iso_live_boot_entries "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT"
touch "${FULL_BUILD_MARKER}"
echo ""
echo "=== done (${BUILD_VARIANT}) ==="
echo "ISO: $ISO_OUT"


@@ -23,9 +23,9 @@ insmod serial
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
insmod gfxterm
insmod png
source /boot/grub/theme.cfg
terminal_input console serial
terminal_output gfxterm serial
insmod png
source /boot/grub/theme.cfg


@@ -1,47 +1,16 @@
source /boot/grub/config.cfg
echo ""
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
echo " Hardware Audit LiveCD"
echo ""
menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — load to RAM (toram)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — KMS + GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — fail-safe" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE -- load to RAM (toram)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi

Binary image file changed (not shown): 70 KiB before, 77 KiB after.


@@ -5,12 +5,10 @@ title-text: ""
message-font: "Unifont Regular 16"
terminal-font: "Unifont Regular 16"
#bee logo centered, upper third of screen
#bee logo - centered, upper third of screen
+ image {
top = 4%
left = 50%-200
width = 400
height = 400
file = "bee-logo.png"
}
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
item_font = "Unifont Regular 16"
selected_item_color= "#f5a800"
selected_item_font = "Unifont Regular 16"
item_height = 16
item_padding = 0
item_height = 20
item_padding = 2
item_spacing = 4
icon_width = 0
icon_heigh = 0
icon_height = 0
item_icon_space = 0
}


@@ -47,18 +47,30 @@ vim-tiny
mc
htop
nvtop
btop
sudo
zstd
mstflint
memtester
stress-ng
stressapptest
fio
iperf3
iotop
nload
tcpdump
hdparm
sysstat
lsscsi
sg3-utils
jq
curl
net-tools
# QR codes (for displaying audit results)
qrencode
# Local desktop (openbox + chromium kiosk)
gparted
openbox
tint2
feh


@@ -1,6 +1,6 @@
[Unit]
Description=Bee: hardware audit
After=bee-preflight.service bee-network.service bee-nvidia.service
After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service
[Service]
Type=oneshot


@@ -0,0 +1,18 @@
[Unit]
Description=Bee: USB black-box log mirror
After=local-fs.target
Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
StartLimitIntervalSec=0
[Service]
Type=simple
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
Restart=always
RestartSec=1
StandardOutput=journal
StandardError=journal
OOMScoreAdjust=-900
Nice=0
[Install]
WantedBy=multi-user.target


@@ -1,6 +1,6 @@
[Unit]
Description=Bee: bring up network interfaces via DHCP
After=local-fs.target
After=local-fs.target bee-blackbox.service
Before=network-online.target bee-audit.service
[Service]


@@ -1,6 +1,6 @@
[Unit]
Description=Bee: load NVIDIA kernel modules and create device nodes
After=local-fs.target udev.service
After=local-fs.target udev.service bee-blackbox.service
Before=bee-audit.service
[Service]


@@ -1,6 +1,6 @@
[Unit]
Description=Bee: runtime preflight self-check
After=bee-network.service bee-nvidia.service
After=bee-network.service bee-nvidia.service bee-blackbox.service
Before=bee-audit.service
[Service]


@@ -1,5 +1,6 @@
[Unit]
Description=Bee: hardware audit web viewer
After=bee-blackbox.service
StartLimitIntervalSec=0
[Service]


@@ -60,35 +60,129 @@ wait_for_process_exit() {
return 0
}
kill_pattern() {
pattern="$1"
if pgrep -f "$pattern" >/dev/null 2>&1; then
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
log_pid_details() {
pid="$1"
line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
if [ -n "$line" ]; then
log_blocker "$line"
else
log_blocker "pid $pid"
fi
}
collect_gpu_compute_pids() {
index="$1"
if ! command -v nvidia-smi >/dev/null 2>&1; then
return 0
fi
nvidia-smi --id="$index" \
--query-compute-apps=pid \
--format=csv,noheader,nounits 2>/dev/null \
| sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
| grep -E '^[0-9]+$' || true
}
collect_gpu_device_pids() {
index="$1"
dev="/dev/nvidia$index"
[ -e "$dev" ] || return 0
if command -v fuser >/dev/null 2>&1; then
fuser "$dev" 2>/dev/null \
| tr ' ' '\n' \
| sed 's/[^0-9].*$//' \
| grep -E '^[0-9]+$' || true
fi
}
collect_gpu_holder_pids() {
index="$1"
{
collect_gpu_compute_pids "$index"
collect_gpu_device_pids "$index"
} | awk 'NF' | sort -u
}
kill_pid_list() {
pids="$1"
[ -n "$pids" ] || return 0
for pid in $pids; do
log_pid_details "$pid"
done
log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"
for pid in $pids; do
kill -TERM "$pid" >/dev/null 2>&1 || true
done
sleep 1
for pid in $pids; do
if kill -0 "$pid" >/dev/null 2>&1; then
log "forcing GPU holder PID $pid to exit"
kill -KILL "$pid" >/dev/null 2>&1 || true
fi
done
}
gpu_has_display_holders() {
index="$1"
holders=$(collect_gpu_device_pids "$index")
[ -n "$holders" ] || return 1
for pid in $holders; do
comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
case "$comm" in
Xorg|Xwayland|X|gnome-shell)
return 0
;;
esac
done
return 1
}
stop_nv_hostengine_if_running() {
if pgrep -x nv-hostengine >/dev/null 2>&1; then
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
[ -n "$line" ] || continue
log_blocker "$line"
done
log "killing processes matching: $pattern"
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
sleep 1
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
log "stopping nv-hostengine"
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
hostengine_was_active=1
return 0
fi
return 1
}
stop_fabricmanager_if_active() {
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
return 0
fi
return 1
}
stop_display_stack_if_active() {
stopped=1
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
stopped=0
fi
done
return "$stopped"
}
try_gpu_reset() {
index="$1"
log "resetting GPU $index"
nvidia-smi -r -i "$index"
}
drain_gpu_clients() {
display_was_active=0
fabric_was_active=0
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
fi
done
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
fi
hostengine_was_active=0
if pgrep -x nv-hostengine >/dev/null 2>&1; then
pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
@@ -98,21 +192,25 @@ drain_gpu_clients() {
log "stopping nv-hostengine"
pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
hostengine_was_active=1
fi
for pattern in \
"nvidia-smi" \
"dcgmi" \
"nvvs" \
"dcgmproftester" \
"all_reduce_perf" \
"nvtop" \
"bee-gpu-burn" \
"bee-john-gpu-stress" \
"bee-nccl-gpu-stress" \
"Xorg" \
"Xwayland"; do
kill_pattern "$pattern"
if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
log_blocker "service nvidia-fabricmanager.service"
fabric_was_active=1
fi
for unit in display-manager.service lightdm.service; do
if unit_exists "$unit" && stop_unit_if_active "$unit"; then
log_blocker "service $unit"
display_was_active=1
fi
done
for dev in /dev/nvidia[0-9]*; do
[ -e "$dev" ] || continue
holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
kill_pid_list "$holders"
done
}
@@ -125,7 +223,7 @@ restore_gpu_clients() {
fi
fi
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
log "starting nv-hostengine"
nv-hostengine
fi
@@ -153,10 +251,60 @@ restart_drivers() {
reset_gpu() {
index="$1"
drain_gpu_clients
log "resetting GPU $index"
nvidia-smi -r -i "$index"
display_was_active=0
fabric_was_active=0
hostengine_was_active=0
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
stop_nv_hostengine_if_running || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
stop_fabricmanager_if_active || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
if gpu_has_display_holders "$index"; then
stop_display_stack_if_active || true
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
kill_pid_list "$holders"
fi
if try_gpu_reset "$index"; then
restore_gpu_clients
return 0
fi
fi
holders=$(collect_gpu_holder_pids "$index")
if [ -n "$holders" ]; then
log "GPU $index still has holders after targeted drain"
kill_pid_list "$holders"
fi
try_gpu_reset "$index"
rc=$?
restore_gpu_clients
return "$rc"
}
cmd="${1:-}"

Binary file not shown.

BIN: iso/vendor/sas2ircu (new vendored executable; binary not shown)
BIN: iso/vendor/sas3ircu (new vendored executable; binary not shown)
BIN: iso/vendor/ssacli (new vendored executable; binary not shown)
BIN: iso/vendor/storcli64 (new vendored executable; binary not shown)


@@ -47,6 +47,13 @@ echo "==> Building the binary..."
)
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \\$1}'; fi" 2>/dev/null || true)"
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
echo "==> Binary unchanged (${LOCAL_SHA}); skipping copy and service restart."
exit 0
fi
# --- Deploy ---
echo "==> Copying to ${REMOTE}..."
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"


@@ -1,74 +0,0 @@
#!/bin/sh
# fetch-vendor.sh — download proprietary vendor utilities into iso/vendor.
#
# Usage:
# STORCLI_URL=... STORCLI_SHA256=... \
# SAS2IRCU_URL=... SAS2IRCU_SHA256=... \
# SAS3IRCU_URL=... SAS3IRCU_SHA256=... \
# MSTFLINT_URL=... MSTFLINT_SHA256=... \
# sh scripts/fetch-vendor.sh
set -eu
ROOT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
OUT_DIR="$ROOT_DIR/iso/vendor"
mkdir -p "$OUT_DIR"
need_cmd() {
command -v "$1" >/dev/null 2>&1 || { echo "ERROR: required command not found: $1" >&2; exit 1; }
}
need_cmd sha256sum
download_to() {
url="$1"
out="$2"
if command -v wget >/dev/null 2>&1; then
wget -O "$out" "$url"
return 0
fi
if command -v curl >/dev/null 2>&1; then
curl -fsSL "$url" -o "$out"
return 0
fi
echo "ERROR: required command not found: wget or curl" >&2
exit 1
}
fetch_one() {
name="$1"
url="$2"
sha="$3"
if [ -z "$url" ] || [ -z "$sha" ]; then
echo "[vendor] skip $name (URL/SHA not provided)"
return 0
fi
dst="$OUT_DIR/$name"
tmp="$dst.tmp"
echo "[vendor] downloading $name"
download_to "$url" "$tmp"
got=$(sha256sum "$tmp" | awk '{print $1}')
want=$(echo "$sha" | tr '[:upper:]' '[:lower:]')
if [ "$got" != "$want" ]; then
rm -f "$tmp"
echo "ERROR: checksum mismatch for $name" >&2
echo " got: $got" >&2
echo " want: $want" >&2
exit 1
fi
mv "$tmp" "$dst"
chmod +x "$dst" || true
echo "[vendor] ok: $name"
}
fetch_one "storcli64" "${STORCLI_URL:-}" "${STORCLI_SHA256:-}"
fetch_one "sas2ircu" "${SAS2IRCU_URL:-}" "${SAS2IRCU_SHA256:-}"
fetch_one "sas3ircu" "${SAS3IRCU_URL:-}" "${SAS3IRCU_SHA256:-}"
fetch_one "mstflint" "${MSTFLINT_URL:-}" "${MSTFLINT_SHA256:-}"
echo "[vendor] done. output dir: $OUT_DIR"