Compare commits
24 Commits
0c16616cc9
...
iso/v1.0.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f | ||
|
|
1e62f828c6 | ||
|
|
f8c997d272 |
@@ -80,6 +80,7 @@ type satRunner interface {
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -105,6 +106,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -173,11 +175,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||
}
|
||||
driverLabel := "Driver ready"
|
||||
accelLabel := "CUDA ready"
|
||||
switch a.sat.DetectGPUVendor() {
|
||||
case "amd":
|
||||
driverLabel = "AMDGPU ready"
|
||||
accelLabel = "ROCm SMI ready"
|
||||
case "nvidia":
|
||||
driverLabel = "NVIDIA ready"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
|
||||
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||
if len(health.Issues) > 0 {
|
||||
body.WriteString("\n\nIssues:\n")
|
||||
@@ -220,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit exported."
|
||||
if path != "" {
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
@@ -238,9 +252,12 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle exported."
|
||||
if path != "" {
|
||||
body = "Support bundle exported to " + path
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
@@ -481,6 +498,67 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
@@ -57,13 +60,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
||||
return f.serviceDoFn(name, action)
|
||||
}
|
||||
|
||||
type fakeExports struct{}
|
||||
type fakeExports struct {
|
||||
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
if f.listTargetsFn != nil {
|
||||
return f.listTargetsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||
if f.exportToTargetFn != nil {
|
||||
return f.exportToTargetFn(src, target)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
@@ -97,10 +109,14 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
@@ -112,6 +128,9 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -130,11 +149,30 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) DetectGPUVendor() string { return "" }
|
||||
func (f fakeSAT) DetectGPUVendor() string {
|
||||
if f.detectVendorFn != nil {
|
||||
return f.detectVendorFn()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
if f.listAMDGPUsFn != nil {
|
||||
return f.listAMDGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -394,6 +432,79 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||
if filepath.Base(src) == "" {
|
||||
t.Fatalf("expected non-empty source path")
|
||||
}
|
||||
return "/media/bee/" + filepath.Base(src), nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||
}
|
||||
if result.Title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||
}
|
||||
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||
if err == nil {
|
||||
t.Fatal("expected export error")
|
||||
}
|
||||
if contains(result.Body, "exported to") {
|
||||
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||
}
|
||||
if result.Body != "Support bundle export failed." {
|
||||
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -516,6 +627,9 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
archive, err := BuildSupportBundle(exportDir)
|
||||
if err != nil {
|
||||
@@ -524,6 +638,44 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if _, err := os.Stat(archive); err != nil {
|
||||
t.Fatalf("archive stat: %v", err)
|
||||
}
|
||||
|
||||
file, err := os.Open(archive)
|
||||
if err != nil {
|
||||
t.Fatalf("open archive: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gzr, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
t.Fatalf("gzip reader: %v", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
foundRaw = true
|
||||
}
|
||||
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||
}
|
||||
}
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -600,6 +752,44 @@ func TestMainBanner(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldRuntimePath := DefaultRuntimeJSONPath
|
||||
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
|
||||
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
|
||||
|
||||
raw, err := json.Marshal(schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: "/appdata/bee/export",
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
NetworkStatus: "OK",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal runtime health: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
|
||||
t.Fatalf("write runtime health: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
detectVendorFn: func() string { return "amd" },
|
||||
},
|
||||
}
|
||||
|
||||
result := a.RuntimeHealthResult()
|
||||
if !contains(result.Body, "AMDGPU ready: true") {
|
||||
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
|
||||
}
|
||||
if !contains(result.Body, "ROCm SMI ready: true") {
|
||||
t.Fatalf("body missing ROCm label:\n%s", result.Body)
|
||||
}
|
||||
if contains(result.Body, "CUDA ready") {
|
||||
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func intPtr(v int) *int { return &v }
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
|
||||
214
audit/internal/app/sat_overlay.go
Normal file
214
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
|
||||
applyCPUSAT(snap.CPUs, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
kv map[string]string
|
||||
}
|
||||
|
||||
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return satSummary{}, false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return satSummary{}, false
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
return satSummary{
|
||||
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||
kv: kv,
|
||||
}, true
|
||||
}
|
||||
|
||||
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if !matchesGPUVendor(devs[i], vendor) {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range dimms {
|
||||
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range cpus {
|
||||
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
||||
byDevice := parseStorageSATStatus(summary)
|
||||
for i := range disks {
|
||||
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
||||
devName := filepath.Base(strings.TrimSpace(devPath))
|
||||
if devName == "" {
|
||||
continue
|
||||
}
|
||||
result, ok := byDevice[devName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
||||
}
|
||||
}
|
||||
|
||||
type satStatusResult struct {
|
||||
status string
|
||||
description string
|
||||
ok bool
|
||||
}
|
||||
|
||||
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
||||
result := map[string]satStatusResult{}
|
||||
for key, value := range summary.kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
||||
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
current := result[devName]
|
||||
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
||||
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
||||
return satKeyStatus(summary.overall, label)
|
||||
}
|
||||
|
||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||
case "OK":
|
||||
return "OK", label + " passed", true
|
||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||
return "Warning", label + " incomplete", true
|
||||
case "FAILED":
|
||||
return "Critical", label + " failed", true
|
||||
default:
|
||||
return "", "", false
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
}
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
}
|
||||
return *v
|
||||
}
|
||||
|
||||
func appStringPtr(value string) *string {
|
||||
return &value
|
||||
}
|
||||
61
audit/internal/app/sat_overlay_test.go
Normal file
61
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "storage-20260325-161151")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
}
|
||||
if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
|
||||
t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
@@ -56,7 +56,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
defer os.RemoveAll(stageRoot)
|
||||
|
||||
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||
@@ -214,6 +214,40 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyPath(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
@@ -254,6 +288,36 @@ func copyPath(src, dst string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(rootSrc, src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if keep != nil && !keep(rel, info) {
|
||||
return nil
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return copyPath(src, dst)
|
||||
}
|
||||
|
||||
func createSupportTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
|
||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
amdSMIExecCommand = exec.Command
|
||||
amdSMILookPath = exec.LookPath
|
||||
amdSMIGlob = filepath.Glob
|
||||
)
|
||||
|
||||
var amdSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
"/usr/local/bin/rocm-smi",
|
||||
}
|
||||
|
||||
type amdGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
Product string
|
||||
Firmware string
|
||||
PowerW *float64
|
||||
TempC *float64
|
||||
}
|
||||
|
||||
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasAMDGPUDevices(devs) {
|
||||
return devs
|
||||
}
|
||||
infoByBDF, err := queryAMDGPUs()
|
||||
if err != nil {
|
||||
slog.Info("amdgpu: enrichment skipped", "err", err)
|
||||
return devs
|
||||
}
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(info.Serial) != "" {
|
||||
devs[i].SerialNumber = &info.Serial
|
||||
}
|
||||
if strings.TrimSpace(info.Firmware) != "" {
|
||||
devs[i].Firmware = &info.Firmware
|
||||
}
|
||||
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
||||
devs[i].Model = &info.Product
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
devs[i].PowerW = info.PowerW
|
||||
}
|
||||
if info.TempC != nil {
|
||||
devs[i].TemperatureC = info.TempC
|
||||
}
|
||||
enriched++
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("amdgpu: enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
for _, dev := range devs {
|
||||
if isAMDGPUDevice(dev) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
busByCard, err := queryAMDField("--showbus")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
infoByCard := map[string]amdGPUInfo{}
|
||||
for card, bus := range busByCard {
|
||||
bdf := normalizePCIeBDF(bus)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
||||
}
|
||||
if len(infoByCard) == 0 {
|
||||
return map[string]amdGPUInfo{}, nil
|
||||
}
|
||||
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
||||
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
||||
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
||||
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
||||
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
||||
|
||||
result := make(map[string]amdGPUInfo, len(infoByCard))
|
||||
for _, info := range infoByCard {
|
||||
if info.BDF == "" {
|
||||
continue
|
||||
}
|
||||
result[info.BDF] = info
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||
values, err := queryAMDNumericField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func queryAMDField(flag string) (map[string]string, error) {
|
||||
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseROCmSingleValueCSV(string(out)), nil
|
||||
}
|
||||
|
||||
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]float64{}
|
||||
for card, raw := range values {
|
||||
if value, ok := firstFloat(raw); ok {
|
||||
out[card] = value
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, pattern := range amdSMIExecutableGlobs {
|
||||
matches, err := amdSMIGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
return append([]string{match}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, exec.ErrNotFound
|
||||
}
|
||||
|
||||
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||
rows := map[string]string{}
|
||||
reader := csv.NewReader(strings.NewReader(raw))
|
||||
reader.FieldsPerRecord = -1
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return rows
|
||||
}
|
||||
for _, rec := range records {
|
||||
if len(rec) < 2 {
|
||||
continue
|
||||
}
|
||||
card := normalizeROCmCardKey(rec[0])
|
||||
if card == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||
continue
|
||||
}
|
||||
rows[card] = value
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
// normalizeROCmCardKey canonicalizes the first CSV column of rocm-smi output
// to a "cardN" key. Header tokens ("device", "gpu", "card") and anything
// unrecognized map to the empty string; bare numbers become "card<N>".
func normalizeROCmCardKey(raw string) string {
	key := strings.Trim(strings.ToLower(strings.TrimSpace(raw)), "\"")
	switch key {
	case "", "device", "gpu", "card":
		return ""
	}
	if strings.HasPrefix(key, "card") {
		return key
	}
	if _, err := strconv.Atoi(key); err == nil {
		return "card" + key
	}
	return ""
}
|
||||
|
||||
// looksLikeCSVHeaderValue reports whether a CSV cell looks like a header
// label rather than real data (e.g. "Serial Number", "PCI Bus").
func looksLikeCSVHeaderValue(value string) bool {
	lowered := strings.ToLower(strings.TrimSpace(value))
	for _, marker := range []string{"product", "serial", "vbios", "bus"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseROCmSingleValueCSV(t *testing.T) {
|
||||
raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
|
||||
got := parseROCmSingleValueCSV(raw)
|
||||
if got["card0"] != "ABC123" {
|
||||
t.Fatalf("card0=%q want ABC123", got["card0"])
|
||||
}
|
||||
if got["card1"] != "XYZ789" {
|
||||
t.Fatalf("card1=%q want XYZ789", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
|
||||
origExec := amdSMIExecCommand
|
||||
origLookPath := amdSMILookPath
|
||||
t.Cleanup(func() {
|
||||
amdSMIExecCommand = origExec
|
||||
amdSMILookPath = origLookPath
|
||||
})
|
||||
|
||||
amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
|
||||
amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
|
||||
}
|
||||
|
||||
got, err := queryAMDNumericField("--showtemp")
|
||||
if err != nil {
|
||||
t.Fatalf("queryAMDNumericField: %v", err)
|
||||
}
|
||||
if got["card0"] != 45.5 {
|
||||
t.Fatalf("card0=%v want 45.5", got["card0"])
|
||||
}
|
||||
if got["card1"] != 67.0 {
|
||||
t.Fatalf("card1=%v want 67.0", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeROCmCardKey(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"0": "card0",
|
||||
"card1": "card1",
|
||||
"Device": "",
|
||||
"": "",
|
||||
}
|
||||
for input, want := range tests {
|
||||
if got := normalizeROCmCardKey(input); got != want {
|
||||
t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -36,6 +36,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
|
||||
@@ -41,7 +41,18 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
hasIdentity := false
|
||||
switch {
|
||||
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||
hasIdentity = true
|
||||
case psu.Slot != nil && *psu.Slot != "":
|
||||
hasIdentity = true
|
||||
case psu.Model != nil && *psu.Model != "":
|
||||
hasIdentity = true
|
||||
case psu.Vendor != nil && *psu.Vendor != "":
|
||||
hasIdentity = true
|
||||
}
|
||||
if !hasIdentity {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
|
||||
@@ -61,3 +61,20 @@ func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
|
||||
t.Fatalf("duplicate serial should stay unchanged: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
|
||||
slot := "0"
|
||||
status := statusOK
|
||||
|
||||
got := filterPSUs([]schema.HardwarePowerSupply{
|
||||
{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("unexpected kept PSU: %+v", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
if devs[i].SerialNumber == nil {
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSFPDOM(t *testing.T) {
|
||||
raw := `
|
||||
@@ -29,6 +33,74 @@ func TestParseSFPDOM(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||
raw := `
|
||||
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||
Serial number: NIC-SN-12345
|
||||
`
|
||||
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePCIVPDSerial(t *testing.T) {
|
||||
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:18:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: NIC-SN-98765\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
if iface != "eth0" {
|
||||
t.Fatalf("unexpected iface: %s", iface)
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||
}
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
val := strings.TrimSpace(line[idx+2:])
|
||||
fields[key] = val
|
||||
}
|
||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
||||
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||
continue
|
||||
}
|
||||
dev := parseLspciDevice(fields)
|
||||
@@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldIncludePCIeDevice(class string) bool {
|
||||
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
c := strings.ToLower(strings.TrimSpace(class))
|
||||
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||
d := strings.ToLower(strings.TrimSpace(device))
|
||||
if c == "" {
|
||||
return true
|
||||
}
|
||||
@@ -68,12 +70,28 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"audio device",
|
||||
"serial bus controller",
|
||||
"unassigned class",
|
||||
"non-essential instrumentation",
|
||||
}
|
||||
for _, bad := range excluded {
|
||||
if strings.Contains(c, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
"reserved spp",
|
||||
"ptdma",
|
||||
"cryptographic coprocessor pspcpp",
|
||||
"pspcpp",
|
||||
}
|
||||
for _, bad := range internalAMDPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -98,6 +116,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
@@ -165,6 +185,18 @@ func readPCINumaNode(bdf string) (int, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// parsePCINumaNode parses a NUMA node value taken from lspci output.
// It returns (node, true) for a non-negative integer, (0, false) otherwise.
func parsePCINumaNode(raw string) (int, bool) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return 0, false
	}
	node, err := strconv.Atoi(trimmed)
	if err != nil || node < 0 {
		return 0, false
	}
	return node, true
}
|
||||
|
||||
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -8,32 +8,42 @@ import (
|
||||
|
||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
class string
|
||||
want bool
|
||||
name string
|
||||
class string
|
||||
vendor string
|
||||
device string
|
||||
want bool
|
||||
}{
|
||||
{"USB controller", false},
|
||||
{"System peripheral", false},
|
||||
{"Audio device", false},
|
||||
{"Host bridge", false},
|
||||
{"PCI bridge", false},
|
||||
{"SMBus", false},
|
||||
{"Performance counters", false},
|
||||
{"Ethernet controller", true},
|
||||
{"RAID bus controller", true},
|
||||
{"Non-Volatile memory controller", true},
|
||||
{"VGA compatible controller", true},
|
||||
{name: "usb", class: "USB controller", want: false},
|
||||
{name: "system peripheral", class: "System peripheral", want: false},
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := shouldIncludePCIeDevice(tt.class)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
@@ -51,6 +61,21 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
@@ -68,6 +93,18 @@ func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
tests := []struct {
|
||||
raw string
|
||||
|
||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Test seams for PCIe identity probing: production code shells out to lspci
// and reads the sysfs VPD file; tests replace these variables with stubs.
var (
	// queryPCILSPCIDetail returns the verbose lspci output for one device.
	queryPCILSPCIDetail = func(bdf string) (string, error) {
		out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
		if err != nil {
			return "", err
		}
		return string(out), nil
	}
	// readPCIVPDFile returns the raw Vital Product Data blob for one device.
	readPCIVPDFile = func(bdf string) ([]byte, error) {
		return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
	}
)
|
||||
|
||||
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !shouldProbePCIeSerial(devs[i]) {
|
||||
continue
|
||||
}
|
||||
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("pcie: serials enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(class) || isGPUClass(class)
|
||||
}
|
||||
|
||||
func queryPCIDeviceSerial(bdf string) string {
|
||||
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||
return parsePCIVPDSerial(raw)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parseLSPCIDetailSerial scans lspci -vv output line by line for a
// "Serial number:" entry and returns the first non-empty value found,
// or "" when there is none.
func parseLSPCIDetailSerial(raw string) string {
	for _, line := range strings.Split(raw, "\n") {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" || !strings.Contains(strings.ToLower(trimmed), "serial number:") {
			continue
		}
		colon := strings.Index(trimmed, ":")
		if colon < 0 {
			continue
		}
		if serial := strings.TrimSpace(trimmed[colon+1:]); serial != "" {
			return serial
		}
	}
	return ""
}
|
||||
|
||||
func parsePCIVPDSerial(raw []byte) string {
|
||||
for i := 0; i+3 < len(raw); i++ {
|
||||
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||
continue
|
||||
}
|
||||
length := int(raw[i+2])
|
||||
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||
if !looksLikeSerial(value) {
|
||||
continue
|
||||
}
|
||||
return value
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// looksLikeSerial reports whether a candidate string plausibly is a serial
// number: at least 4 characters, only alphanumerics plus " -_./:" separators,
// and at least one alphanumeric character.
func looksLikeSerial(value string) bool {
	if len(value) < 4 {
		return false
	}
	sawAlnum := false
	for _, r := range value {
		isAlnum := (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')
		if isAlnum {
			sawAlnum = true
			continue
		}
		if !strings.ContainsRune(" -_./:", r) {
			return false
		}
	}
	return sawAlnum
}
|
||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:11:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: GPU-SN-12345\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
bdf := "0000:11:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithPCISerials(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||
class := "StorageController"
|
||||
bdf := "0000:19:00.0"
|
||||
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||
if shouldProbePCIeSerial(dev) {
|
||||
t.Fatal("unexpected probe for storage controller")
|
||||
}
|
||||
}
|
||||
@@ -190,6 +190,7 @@ type smartctlInfo struct {
|
||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -348,6 +349,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
|
||||
@@ -9,8 +9,50 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
|
||||
// removableTargetReadOnly reports whether lsblk fields describe a read-only
// medium: either the RO flag is set or the filesystem type is inherently
// read-only (iso9660, squashfs).
func removableTargetReadOnly(fields map[string]string) bool {
	if fields["RO"] == "1" {
		return true
	}
	fstype := strings.ToLower(strings.TrimSpace(fields["FSTYPE"]))
	return fstype == "iso9660" || fstype == "squashfs"
}
|
||||
|
||||
// ensureWritableMountpoint verifies the mounted filesystem accepts writes by
// creating and removing a temp probe file. Returns a descriptive error when
// the filesystem is read-only or the probe cannot be cleaned up.
func ensureWritableMountpoint(mountpoint string) error {
	probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
	if err != nil {
		return fmt.Errorf("target filesystem is not writable: %w", err)
	}
	name := probe.Name()
	closeErr := probe.Close()
	removeErr := os.Remove(name)
	if closeErr != nil {
		return closeErr
	}
	return removeErr
}
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -34,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !removable || fields["FSTYPE"] == "" {
|
||||
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -52,7 +94,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||
if src == "" || target.Device == "" {
|
||||
return "", fmt.Errorf("source and target are required")
|
||||
}
|
||||
@@ -62,20 +104,43 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
|
||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||
mountedHere := false
|
||||
mounted := mountpoint != ""
|
||||
if mountpoint == "" {
|
||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
return "", formatMountTargetError(target, string(raw), err)
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
}
|
||||
defer func() {
|
||||
if !mounted {
|
||||
return
|
||||
}
|
||||
_ = exportExecCommand("sync").Run()
|
||||
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
retErr = err
|
||||
} else {
|
||||
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
}
|
||||
if mountedHere {
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst := filepath.Join(mountpoint, filename)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -83,12 +148,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = exec.Command("sync").Run()
|
||||
|
||||
if mountedHere {
|
||||
_ = exec.Command("umount", mountpoint).Run()
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
112
audit/internal/platform/export_test.go
Normal file
112
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
|
||||
var calls [][]string
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
calls = append(calls, append([]string{name}, args...))
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||
}
|
||||
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||
t.Fatalf("dst=%q want %q", got, want)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||
t.Fatalf("exported file missing: %v", err)
|
||||
}
|
||||
|
||||
foundUmount := false
|
||||
for _, call := range calls {
|
||||
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||
foundUmount = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundUmount {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||
t.Fatalf("chmod mountpoint: %v", err)
|
||||
}
|
||||
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-writable mountpoint")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||
t.Fatalf("err=%q want writable message", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||
oldExec := exportExecCommand
|
||||
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||
return cmd
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
targets, err := s.ListRemovableTargets()
|
||||
if err != nil {
|
||||
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||
}
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||
}
|
||||
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||
}
|
||||
}
|
||||
@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
|
||||
return v
|
||||
}
|
||||
|
||||
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
|
||||
// Each series is normalised to its own min–max and drawn in a different colour.
|
||||
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
|
||||
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
|
||||
if chartWidth < 20 {
|
||||
chartWidth = 70
|
||||
}
|
||||
const chartHeight = 14
|
||||
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
label string
|
||||
color string
|
||||
unit string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
elapsed := gr[len(gr)-1].ElapsedSec
|
||||
|
||||
// Build value slices for each series.
|
||||
type seriesData struct {
|
||||
seriesDef
|
||||
vals []float64
|
||||
mn float64
|
||||
mx float64
|
||||
}
|
||||
var series []seriesData
|
||||
for _, d := range defs {
|
||||
vals := extractGPUField(gr, d.fn)
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
series = append(series, seriesData{d, vals, mn, mx})
|
||||
}
|
||||
|
||||
// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
|
||||
type cell struct {
|
||||
ch rune
|
||||
color string
|
||||
}
|
||||
grid := make([][]cell, chartHeight+1)
|
||||
for r := range grid {
|
||||
grid[r] = make([]cell, chartWidth)
|
||||
for c := range grid[r] {
|
||||
grid[r][c] = cell{' ', ""}
|
||||
}
|
||||
}
|
||||
|
||||
// Plot each series onto the shared grid.
|
||||
for _, s := range series {
|
||||
w := chartWidth
|
||||
if len(s.vals) < w {
|
||||
w = len(s.vals)
|
||||
}
|
||||
data := gpuDownsample(s.vals, w)
|
||||
prevRow := -1
|
||||
for x, v := range data {
|
||||
row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
|
||||
if row < 0 {
|
||||
row = 0
|
||||
}
|
||||
if row > chartHeight {
|
||||
row = chartHeight
|
||||
}
|
||||
if prevRow < 0 || prevRow == row {
|
||||
grid[row][x] = cell{'─', s.color}
|
||||
} else {
|
||||
lo, hi := prevRow, row
|
||||
if lo > hi {
|
||||
lo, hi = hi, lo
|
||||
}
|
||||
for y := lo + 1; y < hi; y++ {
|
||||
grid[y][x] = cell{'│', s.color}
|
||||
}
|
||||
if prevRow < row {
|
||||
grid[prevRow][x] = cell{'╮', s.color}
|
||||
grid[row][x] = cell{'╰', s.color}
|
||||
} else {
|
||||
grid[prevRow][x] = cell{'╯', s.color}
|
||||
grid[row][x] = cell{'╭', s.color}
|
||||
}
|
||||
}
|
||||
prevRow = row
|
||||
}
|
||||
}
|
||||
|
||||
// Render: Y axis + data rows.
|
||||
fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
|
||||
for r := 0; r <= chartHeight; r++ {
|
||||
// Y axis label: 100% at top, 50% in middle, 0% at bottom.
|
||||
switch r {
|
||||
case 0:
|
||||
fmt.Fprintf(&b, "%4s┤", "100%")
|
||||
case chartHeight / 2:
|
||||
fmt.Fprintf(&b, "%4s┤", "50%")
|
||||
case chartHeight:
|
||||
fmt.Fprintf(&b, "%4s┤", "0%")
|
||||
default:
|
||||
fmt.Fprintf(&b, "%4s│", "")
|
||||
}
|
||||
for c := 0; c < chartWidth; c++ {
|
||||
cl := grid[r][c]
|
||||
if cl.color != "" {
|
||||
b.WriteString(cl.color)
|
||||
b.WriteRune(cl.ch)
|
||||
b.WriteString(ansiReset)
|
||||
} else {
|
||||
b.WriteRune(' ')
|
||||
}
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
// Bottom axis.
|
||||
b.WriteString(" └")
|
||||
b.WriteString(strings.Repeat("─", chartWidth))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Legend with current (last) values.
|
||||
b.WriteString(" ")
|
||||
for i, s := range series {
|
||||
last := s.vals[len(s.vals)-1]
|
||||
b.WriteString(s.color)
|
||||
fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
|
||||
b.WriteString(ansiReset)
|
||||
if i < len(series)-1 {
|
||||
b.WriteString(" ")
|
||||
}
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
|
||||
@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
|
||||
"smartctl",
|
||||
"nvme",
|
||||
"ipmitool",
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"dhclient",
|
||||
"mount",
|
||||
}
|
||||
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
}
|
||||
|
||||
for _, tool := range s.CheckTools(runtimeRequiredTools) {
|
||||
vendor := s.DetectGPUVendor()
|
||||
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||
Name: tool.Name,
|
||||
Path: tool.Path,
|
||||
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
})
|
||||
}
|
||||
|
||||
lsmodText := commandText("lsmod")
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
health.CUDAReady = false
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -162,3 +128,87 @@ func commandText(name string, args ...string) string {
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools := s.CheckTools(runtimeRequiredTools)
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
tool.Path = cmd[0]
|
||||
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||
tool.Path = cmd[1]
|
||||
}
|
||||
tool.OK = true
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
return tools
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
case "amd":
|
||||
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "amdgpu_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "AMD GPU driver is not loaded.",
|
||||
})
|
||||
}
|
||||
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.CUDAReady = true
|
||||
health.DriverReady = true
|
||||
return
|
||||
}
|
||||
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "rocm_smi_unavailable",
|
||||
Severity: "warning",
|
||||
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
@@ -15,6 +16,22 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
}
|
||||
rocmSMIScriptGlobs = []string{
|
||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||
}
|
||||
)
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
@@ -41,7 +58,7 @@ func (s *System) DetectGPUVendor() string {
|
||||
|
||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||
}
|
||||
@@ -337,12 +354,22 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -362,19 +389,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) != 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices, nil
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
@@ -445,12 +464,22 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -465,6 +494,91 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return out, err
|
||||
}
|
||||
|
||||
func runROCmSMI(args ...string) ([]byte, error) {
|
||||
cmd, err := resolveROCmSMICommand(args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
}
|
||||
|
||||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
if len(cmd) == 0 {
|
||||
return nil, errors.New("empty SAT command")
|
||||
}
|
||||
if cmd[0] != "rocm-smi" {
|
||||
return cmd, nil
|
||||
}
|
||||
return resolveROCmSMICommand(cmd[1:]...)
|
||||
}
|
||||
|
||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
for _, path := range rocmSMIExecutableCandidates() {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
pythonPath, pyErr := satLookPath("python3")
|
||||
if pyErr == nil {
|
||||
for _, script := range rocmSMIScriptCandidates() {
|
||||
cmd := []string{pythonPath, script}
|
||||
cmd = append(cmd, args...)
|
||||
return cmd, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func rocmSMIExecutableCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||
}
|
||||
|
||||
func rocmSMIScriptCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||
}
|
||||
|
||||
func expandExistingPaths(patterns []string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var paths []string
|
||||
for _, pattern := range patterns {
|
||||
matches, err := satGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
if _, err := satStat(match); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[match]; ok {
|
||||
continue
|
||||
}
|
||||
seen[match] = struct{}{}
|
||||
paths = append(paths, match)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
func parseStorageDevices(raw string) []string {
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) < 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
|
||||
587
audit/internal/platform/sat_fan_stress.go
Normal file
587
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,587 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FanStressOptions configures the fan-stress / thermal cycling test.
|
||||
type FanStressOptions struct {
|
||||
BaselineSec int // idle monitoring before and after load (default 30)
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
// FanReading holds one fan sensor reading.
|
||||
type FanReading struct {
|
||||
Name string
|
||||
RPM float64
|
||||
}
|
||||
|
||||
// GPUStressMetric holds per-GPU metrics during the stress test.
|
||||
type GPUStressMetric struct {
|
||||
Index int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
Throttled bool // true if any throttle reason is active
|
||||
}
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
applyFanStressDefaults(&opts)
|
||||
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
// Phase name shared between sampler goroutine and main goroutine.
|
||||
var phaseMu sync.Mutex
|
||||
currentPhase := "init"
|
||||
setPhase := func(name string) {
|
||||
phaseMu.Lock()
|
||||
currentPhase = name
|
||||
phaseMu.Unlock()
|
||||
}
|
||||
getPhase := func() string {
|
||||
phaseMu.Lock()
|
||||
defer phaseMu.Unlock()
|
||||
return currentPhase
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
var rowsMu sync.Mutex
|
||||
var allRows []FanStressRow
|
||||
|
||||
// Start background sampler (every second).
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||
rowsMu.Lock()
|
||||
allRows = append(allRows, row)
|
||||
rowsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var summary strings.Builder
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
|
||||
stats := satStats{}
|
||||
|
||||
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durSec) * time.Second):
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||
)
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
var env []string
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
ids := make([]string, len(opts.GPUIndices))
|
||||
for i, idx := range opts.GPUIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
}
|
||||
cmd := []string{
|
||||
"bee-gpu-stress",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
stats.Failed++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
}
|
||||
|
||||
// Execute test phases.
|
||||
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||
|
||||
// Stop sampler and collect rows.
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
rowsMu.Lock()
|
||||
rows := allRows
|
||||
rowsMu.Unlock()
|
||||
|
||||
// Analysis.
|
||||
throttled := analyzeThrottling(rows)
|
||||
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
var m float64
|
||||
for _, g := range r.GPUs {
|
||||
if g.TempC > m {
|
||||
m = g.TempC
|
||||
}
|
||||
}
|
||||
return m
|
||||
})
|
||||
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
return r.CPUMaxTempC
|
||||
})
|
||||
fanResponseSec := analyzeFanResponse(rows)
|
||||
|
||||
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||
if fanResponseSec >= 0 {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||
}
|
||||
|
||||
// Throttling failure counts against overall result.
|
||||
if throttled {
|
||||
stats.Failed++
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
|
||||
// Write CSV outputs.
|
||||
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.BaselineSec <= 0 {
|
||||
opts.BaselineSec = 30
|
||||
}
|
||||
if opts.Phase1DurSec <= 0 {
|
||||
opts.Phase1DurSec = 300
|
||||
}
|
||||
if opts.PauseSec <= 0 {
|
||||
opts.PauseSec = 60
|
||||
}
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||
row := FanStressRow{
|
||||
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||
ElapsedSec: elapsed,
|
||||
Phase: phase,
|
||||
}
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
return row
|
||||
}
|
||||
|
||||
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||
// clock frequency, and active throttle reasons for each GPU.
|
||||
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var metrics []GPUStressMetric
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
throttleVal := strings.TrimSpace(parts[5])
|
||||
// Throttled if active reasons bitmask is non-zero.
|
||||
throttled := throttleVal != "0x0000000000000000" &&
|
||||
throttleVal != "0x0" &&
|
||||
throttleVal != "0" &&
|
||||
throttleVal != "" &&
|
||||
throttleVal != "N/A"
|
||||
metrics = append(metrics, GPUStressMetric{
|
||||
Index: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
Throttled: throttled,
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||
func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseFanSpeeds(string(out)), nil
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
if !strings.EqualFold(unit, "RPM") {
|
||||
continue
|
||||
}
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(valStr, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
fans = append(fans, FanReading{
|
||||
Name: strings.TrimSpace(parts[0]),
|
||||
RPM: val,
|
||||
})
|
||||
}
|
||||
return fans
|
||||
}
|
||||
|
||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||
func sampleCPUMaxTemp() float64 {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil {
|
||||
return sampleCPUTempViaSensors()
|
||||
}
|
||||
return parseIPMIMaxTemp(string(out))
|
||||
}
|
||||
|
||||
// parseIPMIMaxTemp scans "ipmitool sdr type Temperature" output and returns
// the largest temperature found, or 0 when no sensor line parses.
func parseIPMIMaxTemp(raw string) float64 {
	highest := 0.0
	for _, row := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Split(row, "|")
		if len(cols) < 3 {
			continue
		}
		// Temperature sensors report a unit like "degrees C".
		if !strings.Contains(strings.ToLower(strings.TrimSpace(cols[2])), "degrees") {
			continue
		}
		value := strings.TrimSpace(cols[1])
		if value == "" || strings.EqualFold(value, "na") {
			continue
		}
		if temp, parseErr := strconv.ParseFloat(value, 64); parseErr == nil && temp > highest {
			highest = temp
		}
	}
	return highest
}
|
||||
|
||||
// sampleCPUTempViaSensors falls back to "sensors -u" (lm-sensors) when
// ipmitool is unavailable, returning the highest plausible temperature.
func sampleCPUTempViaSensors() float64 {
	raw, runErr := exec.Command("sensors", "-u").Output()
	if runErr != nil {
		return 0
	}
	highest := 0.0
	for _, row := range strings.Split(string(raw), "\n") {
		fields := strings.Fields(strings.TrimSpace(row))
		if len(fields) < 2 || !strings.HasSuffix(fields[0], "_input:") {
			continue
		}
		value, parseErr := strconv.ParseFloat(fields[1], 64)
		if parseErr != nil {
			continue
		}
		// Discard implausible readings (<=0 or >=150 °C) so broken sensor
		// channels do not dominate the maximum.
		if value > 0 && value < 150 && value > highest {
			highest = value
		}
	}
	return highest
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous wattage from
// "ipmitool dcmi power reading" output, e.g.
// "    Instantaneous power reading:   500 Watts". Returns 0 if absent.
func parseDCMIPowerReading(raw string) float64 {
	for _, row := range strings.Split(raw, "\n") {
		if !strings.Contains(strings.ToLower(row), "instantaneous") {
			continue
		}
		tokens := strings.Fields(row)
		for i := 1; i < len(tokens); i++ {
			// The numeric value immediately precedes the "Watts" token.
			if !strings.EqualFold(tokens[i], "Watts") {
				continue
			}
			if watts, parseErr := strconv.ParseFloat(tokens[i-1], 64); parseErr == nil {
				return watts
			}
		}
	}
	return 0
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" && row.Phase != "load2" {
|
||||
continue
|
||||
}
|
||||
for _, gpu := range row.GPUs {
|
||||
if gpu.Throttled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||
var max float64
|
||||
for _, row := range rows {
|
||||
if v := extract(row); v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||
// Compute baseline average fan RPM.
|
||||
var baseTotal, baseCount float64
|
||||
for _, row := range rows {
|
||||
if row.Phase != "baseline" {
|
||||
continue
|
||||
}
|
||||
for _, f := range row.Fans {
|
||||
baseTotal += f.RPM
|
||||
baseCount++
|
||||
}
|
||||
}
|
||||
if baseCount == 0 || baseTotal == 0 {
|
||||
return -1
|
||||
}
|
||||
baseAvg := baseTotal / baseCount
|
||||
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||
|
||||
// Find elapsed time when load1 started.
|
||||
var load1Start float64 = -1
|
||||
for _, row := range rows {
|
||||
if row.Phase == "load1" {
|
||||
load1Start = row.ElapsedSec
|
||||
break
|
||||
}
|
||||
}
|
||||
if load1Start < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Find first load1 row where average RPM crosses the threshold.
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" {
|
||||
continue
|
||||
}
|
||||
var total, count float64
|
||||
for _, f := range row.Fans {
|
||||
total += f.RPM
|
||||
count++
|
||||
}
|
||||
if count > 0 && total/count >= threshold {
|
||||
return row.ElapsedSec - load1Start
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
// GPU columns are generated per index in gpuIndices order.
//
// When rows is empty the file is still written, containing the sentinel text
// "no data" so downstream tooling can distinguish "ran, nothing sampled" from
// "never ran". A GPU index missing from a row is emitted as zero-value
// columns (0.0 temp/usage/power/clock, throttled=0).
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
	if len(rows) == 0 {
		return os.WriteFile(path, []byte("no data\n"), 0644)
	}

	var b strings.Builder

	// Header: fixed system columns + per-GPU columns.
	b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
	for _, idx := range gpuIndices {
		fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
			idx, idx, idx, idx, idx)
	}
	b.WriteRune('\n')

	for _, row := range rows {
		// Fixed system columns first; fan stats are aggregated per sample.
		favg, fmin, fmax := fanRPMStats(row.Fans)
		fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
			row.TimestampUTC,
			row.ElapsedSec,
			row.Phase,
			favg, fmin, fmax,
			row.CPUMaxTempC,
			row.SysPowerW,
		)
		// Index this row's GPU metrics so columns follow gpuIndices order
		// regardless of the order the GPUs were sampled in.
		gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
		for _, g := range row.GPUs {
			gpuByIdx[g.Index] = g
		}
		for _, idx := range gpuIndices {
			g := gpuByIdx[idx] // zero value when this GPU was not sampled
			throttled := 0
			if g.Throttled {
				throttled = 1
			}
			fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
				g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
		}
		b.WriteRune('\n')
	}

	return os.WriteFile(path, []byte(b.String()), 0644)
}
|
||||
|
||||
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||
var b strings.Builder
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||
for _, row := range rows {
|
||||
for _, f := range row.Fans {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||
}
|
||||
}
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||
if len(fans) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
min = fans[0].RPM
|
||||
max = fans[0].RPM
|
||||
var total float64
|
||||
for _, f := range fans {
|
||||
total += f.RPM
|
||||
if f.RPM < min {
|
||||
min = f.RPM
|
||||
}
|
||||
if f.RPM > max {
|
||||
max = f.RPM
|
||||
}
|
||||
}
|
||||
return total / float64(len(fans)), min, max
|
||||
}
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -91,3 +93,90 @@ func TestClassifySATResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||
got := parseStorageDevices(raw)
|
||||
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
t.Setenv("PATH", t.TempDir())
|
||||
|
||||
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != toolPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
oldGlob := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
rocmSMIExecutableGlobs = []string{execPath}
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
rocmSMIExecutableGlobs = oldGlob
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
t.Setenv("PATH", "")
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != execPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
oldExecGlobs := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||
rocmSMIExecutableGlobs = nil
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
satLookPath = oldLookPath
|
||||
rocmSMIExecutableGlobs = oldExecGlobs
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||
t.Fatal("expected missing rocm-smi error")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
var techDumpNvidiaCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
Blockdevices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Tran string `json:"tran"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
|
||||
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
|
||||
for _, cmd := range techDumpFixedCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
switch s.DetectGPUVendor() {
|
||||
case "nvidia":
|
||||
for _, cmd := range techDumpNvidiaCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
case "amd":
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||
}
|
||||
|
||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func writeROCmSMIDump(path string, args ...string) {
|
||||
out, err := runROCmSMI(args...)
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func lsblkDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
|
||||
}
|
||||
var devices []string
|
||||
for _, dev := range root.Blockdevices {
|
||||
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||
continue
|
||||
}
|
||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||
}
|
||||
|
||||
@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "lsblk.json")
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write lsblk fixture: %v", err)
|
||||
}
|
||||
|
||||
got := lsblkDumpDevices(path)
|
||||
want := []string{"nvme0n1", "sda"}
|
||||
want := []string{"nvme0n1", "sdb"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package tui
|
||||
import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
@@ -137,6 +138,8 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
},
|
||||
pollSATProgress("gpu-amd", since),
|
||||
)
|
||||
case actionRunFanStress:
|
||||
return m.startGPUStressTest()
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -150,7 +153,54 @@ func (m model) confirmCancelTarget() screen {
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
case actionRunFanStress:
|
||||
return screenBurnInTests
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
|
||||
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
|
||||
func hcFanStressOpts(hcMode int, application interface {
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
}) platform.FanStressOptions {
|
||||
// Phase durations per mode: [baseline, load1, pause, load2]
|
||||
type durations struct{ baseline, load1, pause, load2 int }
|
||||
modes := [3]durations{
|
||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||
{60, 600, 120, 600}, // Express: ~24 min total
|
||||
}
|
||||
if hcMode < 0 || hcMode >= len(modes) {
|
||||
hcMode = 0
|
||||
}
|
||||
d := modes[hcMode]
|
||||
|
||||
// Use all detected NVIDIA GPUs.
|
||||
var indices []int
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
indices = append(indices, g.Index)
|
||||
}
|
||||
}
|
||||
|
||||
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
|
||||
sizeMB := 64
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
free := g.MemoryMB - 512
|
||||
if free > 0 && (sizeMB == 64 || free < sizeMB) {
|
||||
sizeMB = free
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return platform.FanStressOptions{
|
||||
BaselineSec: d.baseline,
|
||||
Phase1DurSec: d.load1,
|
||||
PauseSec: d.pause,
|
||||
Phase2DurSec: d.load2,
|
||||
SizeMB: sizeMB,
|
||||
GPUIndices: indices,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,8 +27,9 @@ type exportTargetsMsg struct {
|
||||
err error
|
||||
}
|
||||
|
||||
type panelMsg struct {
|
||||
data app.HardwarePanelData
|
||||
type snapshotMsg struct {
|
||||
banner string
|
||||
panel app.HardwarePanelData
|
||||
}
|
||||
|
||||
type nvidiaGPUsMsg struct {
|
||||
@@ -43,3 +44,14 @@ type nvidiaSATDoneMsg struct {
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
type gpuStressDoneMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
type gpuLiveTickMsg struct {
|
||||
rows []platform.GPUMetricRow
|
||||
indices []int
|
||||
}
|
||||
|
||||
131
audit/internal/tui/sat_progress.go
Normal file
131
audit/internal/tui/sat_progress.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
type satProgressMsg struct {
|
||||
lines []string
|
||||
}
|
||||
|
||||
// pollSATProgress returns a Cmd that waits 300ms then reads the latest verbose.log
|
||||
// for the given SAT prefix and returns parsed step progress lines.
|
||||
func pollSATProgress(prefix string, since time.Time) tea.Cmd {
|
||||
return tea.Tick(300*time.Millisecond, func(_ time.Time) tea.Msg {
|
||||
return satProgressMsg{lines: readSATProgressLines(prefix, since)}
|
||||
})
|
||||
}
|
||||
|
||||
func readSATProgressLines(prefix string, since time.Time) []string {
|
||||
pattern := filepath.Join(app.DefaultSATBaseDir, prefix+"-*/verbose.log")
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
sort.Strings(matches)
|
||||
// Find the latest file created at or after (since - 5s) to account for clock skew.
|
||||
cutoff := since.Add(-5 * time.Second)
|
||||
candidate := ""
|
||||
for _, m := range matches {
|
||||
info, statErr := os.Stat(m)
|
||||
if statErr == nil && info.ModTime().After(cutoff) {
|
||||
candidate = m
|
||||
}
|
||||
}
|
||||
if candidate == "" {
|
||||
return nil
|
||||
}
|
||||
raw, err := os.ReadFile(candidate)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return parseSATVerboseProgress(string(raw))
|
||||
}
|
||||
|
||||
// parseSATVerboseProgress parses verbose.log content and returns display lines like:
|
||||
//
|
||||
// "PASS lscpu (234ms)"
|
||||
// "FAIL stress-ng (60.0s)"
|
||||
// "... sensors-after"
|
||||
func parseSATVerboseProgress(content string) []string {
|
||||
type step struct {
|
||||
name string
|
||||
rc int
|
||||
durationMs int
|
||||
done bool
|
||||
}
|
||||
|
||||
lines := strings.Split(content, "\n")
|
||||
var steps []step
|
||||
stepIdx := map[string]int{}
|
||||
|
||||
for i, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if idx := strings.Index(line, "] start "); idx >= 0 {
|
||||
name := strings.TrimSpace(line[idx+len("] start "):])
|
||||
if _, exists := stepIdx[name]; !exists {
|
||||
stepIdx[name] = len(steps)
|
||||
steps = append(steps, step{name: name})
|
||||
}
|
||||
} else if idx := strings.Index(line, "] finish "); idx >= 0 {
|
||||
name := strings.TrimSpace(line[idx+len("] finish "):])
|
||||
si, exists := stepIdx[name]
|
||||
if !exists {
|
||||
continue
|
||||
}
|
||||
steps[si].done = true
|
||||
for j := i + 1; j < len(lines) && j <= i+3; j++ {
|
||||
l := strings.TrimSpace(lines[j])
|
||||
if strings.HasPrefix(l, "rc: ") {
|
||||
steps[si].rc, _ = strconv.Atoi(strings.TrimPrefix(l, "rc: "))
|
||||
} else if strings.HasPrefix(l, "duration_ms: ") {
|
||||
steps[si].durationMs, _ = strconv.Atoi(strings.TrimPrefix(l, "duration_ms: "))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var result []string
|
||||
for _, s := range steps {
|
||||
display := cleanSATStepName(s.name)
|
||||
if s.done {
|
||||
status := "PASS"
|
||||
if s.rc != 0 {
|
||||
status = "FAIL"
|
||||
}
|
||||
result = append(result, fmt.Sprintf("%-4s %s (%s)", status, display, fmtDurMs(s.durationMs)))
|
||||
} else {
|
||||
result = append(result, fmt.Sprintf("... %s", display))
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// cleanSATStepName strips a trailing ".log" plus an all-digit "NN-" ordering
// prefix: "01-lscpu.log" → "lscpu". Names without such a prefix are returned
// with only the extension removed.
func cleanSATStepName(name string) string {
	base := strings.TrimSuffix(name, ".log")
	prefix, rest, found := strings.Cut(base, "-")
	if !found {
		return base
	}
	// Only strip when everything before the first dash is digits
	// (an empty prefix, as in "-foo", also counts).
	for _, c := range prefix {
		if c < '0' || c > '9' {
			return base
		}
	}
	return rest
}
|
||||
|
||||
// fmtDurMs renders a millisecond count compactly: "234ms" below one second,
// otherwise seconds with one decimal place ("1.5s").
func fmtDurMs(ms int) string {
	if ms >= 1000 {
		return fmt.Sprintf("%.1fs", float64(ms)/1000)
	}
	return fmt.Sprintf("%dms", ms)
}
|
||||
117
audit/internal/tui/screen_burn_in.go
Normal file
117
audit/internal/tui/screen_burn_in.go
Normal file
@@ -0,0 +1,117 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
const (
|
||||
burnCurGPUStress = 0
|
||||
burnCurModeQuick = 1
|
||||
burnCurModeStd = 2
|
||||
burnCurModeExpr = 3
|
||||
burnCurRun = 4
|
||||
burnCurTotal = 5
|
||||
)
|
||||
|
||||
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
||||
m.screen = screenBurnInTests
|
||||
m.cursor = 0
|
||||
if !m.burnInitialized {
|
||||
m.burnMode = 0
|
||||
m.burnCursor = 0
|
||||
m.burnInitialized = true
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.burnCursor > 0 {
|
||||
m.burnCursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.burnCursor < burnCurTotal-1 {
|
||||
m.burnCursor++
|
||||
}
|
||||
case " ":
|
||||
switch m.burnCursor {
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
}
|
||||
case "enter":
|
||||
switch m.burnCursor {
|
||||
case burnCurGPUStress, burnCurRun:
|
||||
return m.burnRunSelected()
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
}
|
||||
case "f", "F", "r", "R":
|
||||
return m.burnRunSelected()
|
||||
case "1":
|
||||
m.burnMode = 0
|
||||
case "2":
|
||||
m.burnMode = 1
|
||||
case "3":
|
||||
m.burnMode = 2
|
||||
case "esc":
|
||||
m.screen = screenMain
|
||||
m.cursor = 1
|
||||
case "q", "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
|
||||
func renderBurnInTests(m model) string {
|
||||
var b strings.Builder
|
||||
|
||||
fmt.Fprintln(&b, "BURN-IN TESTS")
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, " Stress tests:")
|
||||
fmt.Fprintln(&b)
|
||||
|
||||
pfx := " "
|
||||
if m.burnCursor == burnCurGPUStress {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, " Mode:")
|
||||
modes := []struct{ label, key string }{
|
||||
{"Quick", "1"},
|
||||
{"Standard", "2"},
|
||||
{"Express", "3"},
|
||||
}
|
||||
for i, mode := range modes {
|
||||
pfx := " "
|
||||
if m.burnCursor == burnCurModeQuick+i {
|
||||
pfx = "> "
|
||||
}
|
||||
radio := "( )"
|
||||
if m.burnMode == i {
|
||||
radio = "(*)"
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
pfx = " "
|
||||
if m.burnCursor == burnCurRun {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.targets) == 0 {
|
||||
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
|
||||
return m, resultCmd(
|
||||
"Export support bundle",
|
||||
"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
|
||||
nil,
|
||||
screenMain,
|
||||
)
|
||||
}
|
||||
target := m.targets[m.cursor]
|
||||
m.selectedTarget = &target
|
||||
|
||||
@@ -4,6 +4,9 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -143,6 +146,79 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||
m.pendingAction = actionRunFanStress
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
|
||||
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
opts := hcFanStressOpts(m.burnMode, m.app)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.gpuStressCancel = cancel
|
||||
m.gpuStressAborted = false
|
||||
m.gpuLiveRows = nil
|
||||
m.gpuLiveIndices = opts.GPUIndices
|
||||
m.gpuLiveStart = time.Now()
|
||||
m.screen = screenGPUStressRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
stressCmd := func() tea.Msg {
|
||||
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
||||
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
|
||||
}
|
||||
|
||||
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
|
||||
// The update handler reschedules it to achieve continuous 1s polling.
|
||||
func pollGPULive(indices []int) tea.Cmd {
|
||||
return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
|
||||
rows, _ := platform.SampleGPUMetrics(indices)
|
||||
return gpuLiveTickMsg{rows: rows, indices: indices}
|
||||
})
|
||||
}
|
||||
|
||||
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
||||
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "a", "A":
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.gpuStressAborted = true
|
||||
m.screen = screenBurnInTests
|
||||
m.burnCursor = burnCurGPUStress
|
||||
m.cursor = 0
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderGPUStressRunning(m model) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
|
||||
fmt.Fprintln(&b)
|
||||
if len(m.gpuLiveRows) == 0 {
|
||||
fmt.Fprintln(&b, "Collecting metrics...")
|
||||
} else {
|
||||
chartWidth := m.width - 8
|
||||
if chartWidth < 40 {
|
||||
chartWidth = 70
|
||||
}
|
||||
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
b.WriteString("[a] Abort test [ctrl+c] quit")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
for _, sel := range m.hcSel {
|
||||
if sel {
|
||||
|
||||
@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0: // Health Check
|
||||
return m.enterHealthCheck()
|
||||
case 1: // Export support bundle
|
||||
case 1: // Burn-in tests
|
||||
return m.enterBurnInTests()
|
||||
case 2: // Export support bundle
|
||||
m.pendingAction = actionExportBundle
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
targets, err := m.app.ListRemovableTargets()
|
||||
return exportTargetsMsg{targets: targets, err: err}
|
||||
}
|
||||
case 2: // Settings
|
||||
case 3: // Settings
|
||||
m.screen = screenSettings
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 3: // Exit
|
||||
case 4: // Exit
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
|
||||
@@ -3,7 +3,6 @@ package tui
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -102,7 +101,7 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startNvidiaSAT launches the SAT and nvtop.
|
||||
// startNvidiaSAT launches the NVIDIA acceptance pack.
|
||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
var selectedGPUs []platform.NvidiaGPU
|
||||
for i, sel := range m.nvidiaGPUSel {
|
||||
@@ -142,31 +141,12 @@ func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
// nvtop not available: just run the SAT, show running screen
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
satCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
// updateNvidiaSATRunning handles keys on the running screen.
|
||||
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.nvidiaSATCancel != nil {
|
||||
m.nvidiaSATCancel()
|
||||
@@ -234,5 +214,5 @@ func renderNvidiaSATSetup(m model) string {
|
||||
|
||||
// renderNvidiaSATRunning renders the running screen.
|
||||
func renderNvidiaSATRunning() string {
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
|
||||
}
|
||||
|
||||
30
audit/internal/tui/snapshot.go
Normal file
30
audit/internal/tui/snapshot.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/app"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) refreshSnapshotCmd() tea.Cmd {
|
||||
if m.app == nil {
|
||||
return nil
|
||||
}
|
||||
return func() tea.Msg {
|
||||
return snapshotMsg{
|
||||
banner: m.app.MainBanner(),
|
||||
panel: m.app.LoadHardwarePanel(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func shouldRefreshSnapshot(prev, next model) bool {
|
||||
return prev.screen != next.screen || prev.busy != next.busy
|
||||
}
|
||||
|
||||
func emptySnapshot() snapshotMsg {
|
||||
return snapshotMsg{
|
||||
banner: "",
|
||||
panel: app.HardwarePanelData{},
|
||||
}
|
||||
}
|
||||
@@ -53,10 +53,11 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
|
||||
wantBusy bool
|
||||
wantCmd bool
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
||||
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
||||
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
|
||||
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
|
||||
{name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
|
||||
{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
||||
{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
|
||||
{name: "settings", cursor: 3, wantScreen: screenSettings},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1 // Export support bundle
|
||||
m.cursor = 2 // Export support bundle
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1
|
||||
m.cursor = 2
|
||||
|
||||
view := m.View()
|
||||
for _, want := range []string{
|
||||
"bee",
|
||||
"Health Check",
|
||||
"Burn-in tests",
|
||||
"> Export support bundle",
|
||||
"Settings",
|
||||
"Exit",
|
||||
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
|
||||
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunFanStress
|
||||
if got := m.confirmCancelTarget(); got != screenBurnInTests {
|
||||
t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
|
||||
}
|
||||
|
||||
m.pendingAction = actionNone
|
||||
if got := m.confirmCancelTarget(); got != screenMain {
|
||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
||||
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnInTestsEscReturnsToMain(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
m.burnCursor = 3
|
||||
|
||||
next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenMain {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenMain)
|
||||
}
|
||||
if got.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnInTestsRunOpensConfirm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
m.burnInitialized = true
|
||||
m.burnMode = 2
|
||||
|
||||
next, _ := m.burnRunSelected()
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionRunFanStress {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenBurnInTests
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"BURN-IN TESTS",
|
||||
"GPU PLATFORM STRESS TEST",
|
||||
"Quick",
|
||||
"Standard",
|
||||
"Express",
|
||||
"[ RUN SELECTED [R] ]",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -460,6 +530,55 @@ func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewRendersBannerModuleAboveScreenBody(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.banner = "System: Demo Server\nIP: 10.0.0.10"
|
||||
m.width = 60
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"┌ MOTD ",
|
||||
"System: Demo Server",
|
||||
"IP: 10.0.0.10",
|
||||
"Health Check",
|
||||
"Export support bundle",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotMsgUpdatesBannerAndPanel(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
next, cmd := m.Update(snapshotMsg{
|
||||
banner: "System: Demo",
|
||||
panel: app.HardwarePanelData{
|
||||
Header: []string{"Demo header"},
|
||||
Rows: []app.ComponentRow{
|
||||
{Key: "CPU", Status: "PASS", Detail: "ok"},
|
||||
},
|
||||
},
|
||||
})
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.banner != "System: Demo" {
|
||||
t.Fatalf("banner=%q want %q", got.banner, "System: Demo")
|
||||
}
|
||||
if len(got.panel.Rows) != 1 || got.panel.Rows[0].Key != "CPU" {
|
||||
t.Fatalf("panel rows=%+v", got.panel.Rows)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -479,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
|
||||
for _, want := range []string{
|
||||
"Export support bundle",
|
||||
"Select removable filesystem",
|
||||
"Select writable removable filesystem (read-only/boot media hidden)",
|
||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
@@ -488,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
|
||||
next, _ := m.Update(exportTargetsMsg{})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenOutput {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
||||
}
|
||||
if got.title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", got.title, "Export support bundle")
|
||||
}
|
||||
for _, want := range []string{
|
||||
"No writable removable filesystems found.",
|
||||
"Read-only or boot media are hidden from this list.",
|
||||
} {
|
||||
if !strings.Contains(got.body, want) {
|
||||
t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ type screen string
|
||||
const (
|
||||
screenMain screen = "main"
|
||||
screenHealthCheck screen = "health_check"
|
||||
screenBurnInTests screen = "burn_in_tests"
|
||||
screenSettings screen = "settings"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
@@ -27,37 +28,39 @@ const (
|
||||
screenConfirm screen = "confirm"
|
||||
screenNvidiaSATSetup screen = "nvidia_sat_setup"
|
||||
screenNvidiaSATRunning screen = "nvidia_sat_running"
|
||||
screenGPUStressRunning screen = "gpu_stress_running"
|
||||
)
|
||||
|
||||
type actionKind string
|
||||
|
||||
const (
|
||||
actionNone actionKind = ""
|
||||
actionDHCPOne actionKind = "dhcp_one"
|
||||
actionStaticIPv4 actionKind = "static_ipv4"
|
||||
actionExportBundle actionKind = "export_bundle"
|
||||
actionRunAll actionKind = "run_all"
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionNone actionKind = ""
|
||||
actionDHCPOne actionKind = "dhcp_one"
|
||||
actionStaticIPv4 actionKind = "static_ipv4"
|
||||
actionExportBundle actionKind = "export_bundle"
|
||||
actionRunAll actionKind = "run_all"
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
app *app.App
|
||||
runtimeMode runtimeenv.Mode
|
||||
|
||||
screen screen
|
||||
prevScreen screen
|
||||
cursor int
|
||||
busy bool
|
||||
busyTitle string
|
||||
title string
|
||||
body string
|
||||
mainMenu []string
|
||||
screen screen
|
||||
prevScreen screen
|
||||
cursor int
|
||||
busy bool
|
||||
busyTitle string
|
||||
title string
|
||||
body string
|
||||
mainMenu []string
|
||||
settingsMenu []string
|
||||
networkMenu []string
|
||||
serviceMenu []string
|
||||
networkMenu []string
|
||||
serviceMenu []string
|
||||
|
||||
services []string
|
||||
interfaces []platform.InterfaceInfo
|
||||
@@ -74,6 +77,7 @@ type model struct {
|
||||
panel app.HardwarePanelData
|
||||
panelFocus bool
|
||||
panelCursor int
|
||||
banner string
|
||||
|
||||
// Health Check screen
|
||||
hcSel [4]bool
|
||||
@@ -81,6 +85,11 @@ type model struct {
|
||||
hcCursor int
|
||||
hcInitialized bool
|
||||
|
||||
// Burn-in tests screen
|
||||
burnMode int
|
||||
burnCursor int
|
||||
burnInitialized bool
|
||||
|
||||
// NVIDIA SAT setup
|
||||
nvidiaGPUs []platform.NvidiaGPU
|
||||
nvidiaGPUSel []bool
|
||||
@@ -91,10 +100,20 @@ type model struct {
|
||||
nvidiaSATCancel func()
|
||||
nvidiaSATAborted bool
|
||||
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
gpuLiveRows []platform.GPUMetricRow
|
||||
gpuLiveIndices []int
|
||||
gpuLiveStart time.Time
|
||||
|
||||
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
||||
progressLines []string
|
||||
progressPrefix string
|
||||
progressSince time.Time
|
||||
|
||||
// Terminal size
|
||||
width int
|
||||
}
|
||||
|
||||
type formField struct {
|
||||
@@ -119,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
screen: screenMain,
|
||||
mainMenu: []string{
|
||||
"Health Check",
|
||||
"Burn-in tests",
|
||||
"Export support bundle",
|
||||
"Settings",
|
||||
"Exit",
|
||||
@@ -151,9 +171,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
}
|
||||
|
||||
func (m model) Init() tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return panelMsg{data: m.app.LoadHardwarePanel()}
|
||||
}
|
||||
return m.refreshSnapshotCmd()
|
||||
}
|
||||
|
||||
func (m model) confirmBody() (string, string) {
|
||||
@@ -186,6 +204,11 @@ func (m model) confirmBody() (string, string) {
|
||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||
case actionRunAMDGPUSAT:
|
||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||
case actionRunFanStress:
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
"Monitors fans, temps, power — detects throttling.\n" +
|
||||
"Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
|
||||
@@ -3,12 +3,16 @@ package tui
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
switch msg := msg.(type) {
|
||||
case tea.WindowSizeMsg:
|
||||
m.width = msg.Width
|
||||
return m, nil
|
||||
case tea.KeyMsg:
|
||||
if m.busy {
|
||||
if msg.String() == "ctrl+c" {
|
||||
@@ -16,7 +20,12 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
return m.updateKey(msg)
|
||||
next, cmd := m.updateKey(msg)
|
||||
nextModel := next.(model)
|
||||
if shouldRefreshSnapshot(m, nextModel) {
|
||||
return nextModel, tea.Batch(cmd, nextModel.refreshSnapshotCmd())
|
||||
}
|
||||
return nextModel, cmd
|
||||
case satProgressMsg:
|
||||
if m.busy && m.progressPrefix != "" {
|
||||
if len(msg.lines) > 0 {
|
||||
@@ -25,6 +34,10 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
return m, pollSATProgress(m.progressPrefix, m.progressSince)
|
||||
}
|
||||
return m, nil
|
||||
case snapshotMsg:
|
||||
m.banner = msg.banner
|
||||
m.panel = msg.panel
|
||||
return m, nil
|
||||
case resultMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
@@ -49,7 +62,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
}
|
||||
m.screen = screenOutput
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case servicesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
@@ -58,12 +71,12 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenSettings
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
m.services = msg.services
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case interfacesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
@@ -72,12 +85,12 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenNetwork
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
m.interfaces = msg.ifaces
|
||||
m.screen = screenInterfacePick
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case exportTargetsMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
@@ -86,19 +99,61 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
if len(msg.targets) == 0 {
|
||||
m.title = "Export support bundle"
|
||||
m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
m.targets = msg.targets
|
||||
m.screen = screenExportTargets
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case panelMsg:
|
||||
m.panel = msg.data
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case nvidiaGPUsMsg:
|
||||
return m.handleNvidiaGPUsMsg(msg)
|
||||
case nvtopClosedMsg:
|
||||
return m, nil
|
||||
case gpuStressDoneMsg:
|
||||
if m.gpuStressAborted {
|
||||
return m, nil
|
||||
}
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
m.gpuStressCancel = nil
|
||||
}
|
||||
m.prevScreen = screenBurnInTests
|
||||
m.screen = screenOutput
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
body := strings.TrimSpace(msg.body)
|
||||
if body == "" {
|
||||
m.body = fmt.Sprintf("ERROR: %v", msg.err)
|
||||
} else {
|
||||
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
|
||||
}
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case gpuLiveTickMsg:
|
||||
if m.screen == screenGPUStressRunning {
|
||||
if len(msg.rows) > 0 {
|
||||
elapsed := time.Since(m.gpuLiveStart).Seconds()
|
||||
for i := range msg.rows {
|
||||
msg.rows[i].ElapsedSec = elapsed
|
||||
}
|
||||
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
|
||||
n := max(1, len(msg.indices))
|
||||
if len(m.gpuLiveRows) > 60*n {
|
||||
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
|
||||
}
|
||||
}
|
||||
return m, pollGPULive(msg.indices)
|
||||
}
|
||||
return m, nil
|
||||
case nvidiaSATDoneMsg:
|
||||
if m.nvidiaSATAborted {
|
||||
return m, nil
|
||||
@@ -120,7 +175,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, nil
|
||||
return m, m.refreshSnapshotCmd()
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -131,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.updateMain(msg)
|
||||
case screenHealthCheck:
|
||||
return m.updateHealthCheck(msg)
|
||||
case screenBurnInTests:
|
||||
return m.updateBurnInTests(msg)
|
||||
case screenSettings:
|
||||
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
|
||||
case screenNetwork:
|
||||
@@ -143,6 +200,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.updateNvidiaSATSetup(msg)
|
||||
case screenNvidiaSATRunning:
|
||||
return m.updateNvidiaSATRunning(msg)
|
||||
case screenGPUStressRunning:
|
||||
return m.updateGPUStressRunning(msg)
|
||||
case screenExportTargets:
|
||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
||||
case screenInterfacePick:
|
||||
@@ -154,10 +213,6 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
m.body = ""
|
||||
m.title = ""
|
||||
m.pendingAction = actionNone
|
||||
// Refresh panel when returning to main screen.
|
||||
if m.prevScreen == screenMain {
|
||||
return m, func() tea.Msg { return panelMsg{data: m.app.LoadHardwarePanel()} }
|
||||
}
|
||||
return m, nil
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
|
||||
@@ -6,8 +6,8 @@ import (
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
"github.com/charmbracelet/lipgloss"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
"github.com/charmbracelet/lipgloss"
|
||||
)
|
||||
|
||||
// Column widths for two-column main layout.
|
||||
@@ -34,6 +34,7 @@ func colorStatus(status string) string {
|
||||
}
|
||||
|
||||
func (m model) View() string {
|
||||
var body string
|
||||
if m.busy {
|
||||
title := "bee"
|
||||
if m.busyTitle != "" {
|
||||
@@ -46,41 +47,53 @@ func (m model) View() string {
|
||||
fmt.Fprintf(&b, " %s\n", l)
|
||||
}
|
||||
b.WriteString("\n[ctrl+c] quit\n")
|
||||
return b.String()
|
||||
body = b.String()
|
||||
} else {
|
||||
body = fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
|
||||
}
|
||||
} else {
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
body = renderTwoColumnMain(m)
|
||||
case screenHealthCheck:
|
||||
body = renderHealthCheck(m)
|
||||
case screenBurnInTests:
|
||||
body = renderBurnInTests(m)
|
||||
case screenSettings:
|
||||
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
body = renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
||||
case screenServices:
|
||||
body = renderMenu("Services", "Select service", m.services, m.cursor)
|
||||
case screenServiceAction:
|
||||
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
|
||||
case screenExportTargets:
|
||||
body = renderMenu(
|
||||
"Export support bundle",
|
||||
"Select writable removable filesystem (read-only/boot media hidden)",
|
||||
renderTargetItems(m.targets),
|
||||
m.cursor,
|
||||
)
|
||||
case screenInterfacePick:
|
||||
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
||||
case screenStaticForm:
|
||||
body = renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
|
||||
case screenConfirm:
|
||||
title, confirmBody := m.confirmBody()
|
||||
body = renderConfirm(title, confirmBody, m.cursor)
|
||||
case screenNvidiaSATSetup:
|
||||
body = renderNvidiaSATSetup(m)
|
||||
case screenNvidiaSATRunning:
|
||||
body = renderNvidiaSATRunning()
|
||||
case screenGPUStressRunning:
|
||||
body = renderGPUStressRunning(m)
|
||||
case screenOutput:
|
||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
body = "bee\n"
|
||||
}
|
||||
return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
|
||||
}
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return renderTwoColumnMain(m)
|
||||
case screenHealthCheck:
|
||||
return renderHealthCheck(m)
|
||||
case screenSettings:
|
||||
return renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
||||
case screenServices:
|
||||
return renderMenu("Services", "Select service", m.services, m.cursor)
|
||||
case screenServiceAction:
|
||||
return renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
|
||||
case screenExportTargets:
|
||||
return renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
|
||||
case screenInterfacePick:
|
||||
return renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
||||
case screenStaticForm:
|
||||
return renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
|
||||
case screenConfirm:
|
||||
title, body := m.confirmBody()
|
||||
return renderConfirm(title, body, m.cursor)
|
||||
case screenNvidiaSATSetup:
|
||||
return renderNvidiaSATSetup(m)
|
||||
case screenNvidiaSATRunning:
|
||||
return renderNvidiaSATRunning()
|
||||
case screenOutput:
|
||||
return fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
return "bee\n"
|
||||
}
|
||||
return m.renderWithBanner(body)
|
||||
}
|
||||
|
||||
// renderTwoColumnMain renders the main screen with menu on the left and hardware panel on the right.
|
||||
@@ -231,3 +244,60 @@ func resultCmd(title, body string, err error, back screen) tea.Cmd {
|
||||
return resultMsg{title: title, body: body, err: err, back: back}
|
||||
}
|
||||
}
|
||||
|
||||
func (m model) renderWithBanner(body string) string {
|
||||
body = strings.TrimRight(body, "\n")
|
||||
banner := renderBannerModule(m.banner, m.width)
|
||||
if banner == "" {
|
||||
if body == "" {
|
||||
return ""
|
||||
}
|
||||
return body + "\n"
|
||||
}
|
||||
if body == "" {
|
||||
return banner + "\n"
|
||||
}
|
||||
return banner + "\n\n" + body + "\n"
|
||||
}
|
||||
|
||||
func renderBannerModule(banner string, width int) string {
|
||||
banner = strings.TrimSpace(banner)
|
||||
if banner == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
lines := strings.Split(banner, "\n")
|
||||
contentWidth := 0
|
||||
for _, line := range lines {
|
||||
if w := lipgloss.Width(line); w > contentWidth {
|
||||
contentWidth = w
|
||||
}
|
||||
}
|
||||
if width > 0 && width-4 > contentWidth {
|
||||
contentWidth = width - 4
|
||||
}
|
||||
if contentWidth < 20 {
|
||||
contentWidth = 20
|
||||
}
|
||||
|
||||
label := " MOTD "
|
||||
topFill := contentWidth + 2 - lipgloss.Width(label)
|
||||
if topFill < 0 {
|
||||
topFill = 0
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString("┌" + label + strings.Repeat("─", topFill) + "┐\n")
|
||||
for _, line := range lines {
|
||||
b.WriteString("│ " + padRight(line, contentWidth) + " │\n")
|
||||
}
|
||||
b.WriteString("└" + strings.Repeat("─", contentWidth+2) + "┘")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func padRight(value string, width int) string {
|
||||
if gap := width - lipgloss.Width(value); gap > 0 {
|
||||
return value + strings.Repeat(" ", gap)
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
||||
|
||||
## Boot sequence (single ISO)
|
||||
|
||||
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||
|
||||
`systemd` boot order:
|
||||
|
||||
```
|
||||
@@ -25,6 +27,7 @@ local-fs.target
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
||||
d. build kernel modules against Debian headers
|
||||
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
|
||||
10. write staged `/etc/bee-release` (versions + git commit)
|
||||
11. patch staged `motd` with build metadata
|
||||
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
13. sync staged overlay into workdir `config/includes.chroot/`
|
||||
14. run `lb config && lb build` inside the privileged builder container
|
||||
7. `build-cublas.sh`:
|
||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||
b. verify packages against repo `Packages.gz`
|
||||
c. extract headers for `bee-gpu-stress` build
|
||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||
12. write staged `/etc/bee-release` (versions + git commit)
|
||||
13. patch staged `motd` with build metadata
|
||||
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||
16. run `lb config && lb build` inside the privileged builder container
|
||||
```
|
||||
|
||||
Build host notes:
|
||||
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||
|
||||
**Critical invariants:**
|
||||
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||
|
||||
## Post-boot smoke test
|
||||
|
||||
@@ -131,10 +149,15 @@ Current validation state:
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||
- Ada / Hopper: add `fp8`
|
||||
- Blackwell+: add `fp4`
|
||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
|
||||
@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||
- Machine-readable health summary derived from collector verdicts
|
||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
|
||||
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
||||
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- Automatic boot audit with operator-facing local console and SSH access
|
||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||
- SSH access (OpenSSH) always available for inspection and debugging
|
||||
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| SSH | OpenSSH server |
|
||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||
|
||||
## Operator UX
|
||||
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- The TUI itself executes privileged actions as `root` via `sudo -n`
|
||||
- SSH remains available independently of the local console path
|
||||
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||
|
||||
## Runtime split
|
||||
|
||||
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
|
||||
|
||||
## Key paths
|
||||
|
||||
|
||||
58
iso/README.md
Normal file
58
iso/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# ISO Build
|
||||
|
||||
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Docker Desktop or another Docker-compatible container runtime
|
||||
- Privileged containers enabled
|
||||
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||
|
||||
## Build On macOS
|
||||
|
||||
From the repository root:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||
|
||||
- Intel Mac
|
||||
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||
|
||||
You do not need to pass `--platform` manually for normal ISO builds.
|
||||
|
||||
## Useful Options
|
||||
|
||||
Build with explicit SSH keys baked into the ISO:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
- The shipped ISO is still `amd64`.
|
||||
- Output ISO artifacts are written under `dist/`.
|
||||
@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -32,6 +32,6 @@ lb config noauto \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
"${@}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
170
iso/builder/build-cublas.sh
Normal file
170
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
CUBLAS_VERSION="$1"
|
||||
CUDA_USERSPACE_VERSION="$2"
|
||||
CUDA_SERIES="$3"
|
||||
DIST_DIR="$4"
|
||||
|
||||
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
|
||||
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
|
||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||
|
||||
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
|
||||
|
||||
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
|
||||
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== cuBLAS cached, skipping download ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
|
||||
|
||||
echo "=== downloading Packages.gz ==="
|
||||
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||
|
||||
lookup_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
|
||||
/^Package: / { cur_pkg=$2 }
|
||||
/^Version: / { cur_ver=$2 }
|
||||
/^Filename: / { cur_file=$2 }
|
||||
/^SHA256: / { cur_sha=$2 }
|
||||
/^$/ {
|
||||
if (cur_pkg == pkg && cur_ver == ver) {
|
||||
print cur_file " " cur_sha
|
||||
exit
|
||||
}
|
||||
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
|
||||
}
|
||||
END {
|
||||
if (cur_pkg == pkg && cur_ver == ver) {
|
||||
print cur_file " " cur_sha
|
||||
}
|
||||
}'
|
||||
}
|
||||
|
||||
download_verified_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
|
||||
meta="$(lookup_pkg "$pkg" "$ver")"
|
||||
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
|
||||
|
||||
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
|
||||
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
|
||||
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
|
||||
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
|
||||
|
||||
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
|
||||
if [ -f "$out" ]; then
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" = "$repo_sha" ]; then
|
||||
echo "=== using cached $(basename "$repo_file") ==="
|
||||
printf '%s\n' "$out"
|
||||
return 0
|
||||
fi
|
||||
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
|
||||
rm -f "$out"
|
||||
fi
|
||||
|
||||
echo "=== downloading $(basename "$repo_file") ==="
|
||||
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
|
||||
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" != "$repo_sha" ]; then
|
||||
echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
|
||||
echo " expected: $repo_sha"
|
||||
echo " actual: $actual_sha"
|
||||
rm -f "$out"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK: $(basename "$repo_file")"
|
||||
printf '%s\n' "$out"
|
||||
}
|
||||
|
||||
extract_deb() {
|
||||
deb="$1"
|
||||
dst="$2"
|
||||
mkdir -p "$dst"
|
||||
(
|
||||
cd "$dst"
|
||||
ar x "$deb"
|
||||
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
|
||||
tar xf "$data_tar"
|
||||
)
|
||||
}
|
||||
|
||||
copy_headers() {
|
||||
from="$1"
|
||||
if [ -d "${from}/usr/include" ]; then
|
||||
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
}
|
||||
|
||||
copy_libs() {
|
||||
from="$1"
|
||||
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
|
||||
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
}
|
||||
|
||||
make_links() {
|
||||
base="$1"
|
||||
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
|
||||
[ -n "$versioned" ] || return 0
|
||||
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
|
||||
target=$(basename "$versioned")
|
||||
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
|
||||
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
|
||||
}
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
|
||||
|
||||
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
|
||||
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
|
||||
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
|
||||
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
|
||||
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
|
||||
|
||||
copy_headers "${TMP_DIR}/cublas-dev"
|
||||
copy_headers "${TMP_DIR}/cudart-dev"
|
||||
copy_libs "${TMP_DIR}/cublas-rt"
|
||||
copy_libs "${TMP_DIR}/cudart-rt"
|
||||
|
||||
make_links "libcublas"
|
||||
make_links "libcublasLt"
|
||||
make_links "libcudart"
|
||||
|
||||
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
|
||||
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
|
||||
|
||||
echo "=== cuBLAS extraction complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
|
||||
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||
@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||
@@ -56,17 +64,35 @@ mkdir -p \
|
||||
|
||||
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||
|
||||
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
image_matches_platform() {
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
|
||||
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! image_matches_platform; then
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||
NEED_BUILD_IMAGE=1
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||
"$CONTAINER_TOOL" build \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||
-t "${IMAGE_REF}" \
|
||||
"${BUILDER_DIR}"
|
||||
else
|
||||
echo "=== using existing builder image ${IMAGE_REF} ==="
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
@@ -80,6 +106,7 @@ set -- \
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
|
||||
@@ -46,7 +46,8 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
@@ -129,8 +130,10 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
@@ -147,7 +150,7 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
|
||||
@@ -34,6 +34,63 @@ mkdir -p "${CACHE_ROOT}"
|
||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||
export GOCACHE GOMODCACHE
|
||||
|
||||
resolve_audit_version() {
|
||||
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||
echo "${BEE_AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
if [ -z "${tag}" ]; then
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
fi
|
||||
case "${tag}" in
|
||||
audit/v*)
|
||||
echo "${tag#audit/v}"
|
||||
return 0
|
||||
;;
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
;;
|
||||
"")
|
||||
;;
|
||||
*)
|
||||
echo "${tag}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -n "${AUDIT_VERSION:-}" ]; then
|
||||
echo "${AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
date +%Y%m%d
|
||||
}
|
||||
|
||||
# ISO image versioned separately from the audit binary (iso/v* tags).
|
||||
resolve_iso_version() {
|
||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||
echo "${BEE_ISO_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
iso/v*)
|
||||
echo "${tag#iso/v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Fall back to audit version so the name is still meaningful
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||
@@ -64,6 +121,7 @@ fi
|
||||
|
||||
echo "=== bee ISO build ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
echo "=== syncing git submodules ==="
|
||||
@@ -83,7 +141,7 @@ if [ "$NEED_BUILD" = "1" ]; then
|
||||
cd "${REPO_ROOT}/audit"
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||
-o "$BEE_BIN" \
|
||||
./cmd/bee
|
||||
echo "binary: $BEE_BIN"
|
||||
@@ -101,6 +159,16 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
@@ -109,6 +177,7 @@ fi
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl
|
||||
@@ -225,13 +294,17 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${AUDIT_VERSION}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION}
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
@@ -239,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
EOF
|
||||
|
||||
# Patch motd with build info
|
||||
@@ -272,7 +347,7 @@ lb build 2>&1
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${AUDIT_VERSION}-amd64.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
|
||||
@@ -10,12 +10,17 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (fail-safe)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
18
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
18
iso/builder/config/bootloaders/isolinux/live.cfg.in
Normal file
@@ -0,0 +1,18 @@
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
@@ -5,6 +5,21 @@ set -e
|
||||
|
||||
echo "=== bee chroot setup ==="
|
||||
|
||||
ensure_bee_console_user() {
|
||||
if id bee >/dev/null 2>&1; then
|
||||
usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
|
||||
else
|
||||
useradd -d /home/bee -m -s /bin/sh -U bee
|
||||
fi
|
||||
|
||||
mkdir -p /home/bee
|
||||
chown -R bee:bee /home/bee
|
||||
echo "bee:eeb" | chpasswd
|
||||
usermod -aG sudo bee 2>/dev/null || true
|
||||
}
|
||||
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable bee services
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-nvidia.service
|
||||
@@ -15,6 +30,8 @@ systemctl enable bee-sshsetup.service
|
||||
systemctl enable ssh.service
|
||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
@@ -23,6 +40,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-tui 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
|
||||
# Reload udev rules
|
||||
udevadm control --reload-rules 2>/dev/null || true
|
||||
|
||||
@@ -2,36 +2,100 @@
|
||||
# 9001-amd-rocm.hook.chroot — install AMD ROCm SMI tool for Instinct GPU monitoring.
|
||||
# Runs inside the live-build chroot. Adds AMD's apt repository and installs
|
||||
# rocm-smi-lib which provides the `rocm-smi` CLI (analogous to nvidia-smi).
|
||||
#
|
||||
# AMD does NOT publish Debian Bookworm packages. The repo uses Ubuntu codenames
|
||||
# (jammy/noble). We use jammy (Ubuntu 22.04) — its packages install cleanly on
|
||||
# Debian 12 (Bookworm) due to compatible glibc/libstdc++.
|
||||
# Tried versions newest-first; falls back if a point release is missing.
|
||||
|
||||
set -e
|
||||
|
||||
ROCM_VERSION="6.4"
|
||||
# Ubuntu codename to use for the AMD repo (Debian has no AMD packages).
|
||||
ROCM_UBUNTU_DIST="jammy"
|
||||
|
||||
# ROCm point-releases to try newest-first. AMD drops old point releases
|
||||
# from the repo, so we walk backwards until one responds 200.
|
||||
ROCM_CANDIDATES="6.3.4 6.3.3 6.3.2 6.3.1 6.3 6.2.4 6.2.3 6.2.2 6.2.1 6.2"
|
||||
|
||||
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
|
||||
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
|
||||
|
||||
echo "=== AMD ROCm ${ROCM_VERSION}: adding repository ==="
|
||||
APT_UPDATED=0
|
||||
|
||||
mkdir -p /etc/apt/keyrings
|
||||
|
||||
ensure_tool() {
|
||||
tool="$1"
|
||||
pkg="$2"
|
||||
if command -v "${tool}" >/dev/null 2>&1; then
|
||||
return 0
|
||||
fi
|
||||
if [ "${APT_UPDATED}" -eq 0 ]; then
|
||||
apt-get update -qq
|
||||
APT_UPDATED=1
|
||||
fi
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
|
||||
}
|
||||
|
||||
ensure_cert_bundle() {
|
||||
if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
|
||||
return 0
|
||||
fi
|
||||
if [ "${APT_UPDATED}" -eq 0 ]; then
|
||||
apt-get update -qq
|
||||
APT_UPDATED=1
|
||||
fi
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
|
||||
}
|
||||
|
||||
# live-build chroot may not include fetch/signing tools yet
|
||||
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
|
||||
echo "WARN: failed to install wget/gpg/ca-certificates prerequisites — skipping ROCm install"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Download and import AMD GPG key
|
||||
if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \
|
||||
| gpg --dearmor > "${ROCM_KEYRING}"; then
|
||||
| gpg --dearmor --yes --output "${ROCM_KEYRING}"; then
|
||||
echo "WARN: failed to fetch AMD ROCm GPG key — skipping ROCm install"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
cat > "${ROCM_LIST}" <<EOF
|
||||
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} bookworm main
|
||||
# Try each ROCm version until apt-get update succeeds.
|
||||
# AMD repo uses Ubuntu codenames; bookworm is not published — use jammy.
|
||||
ROCM_VERSION=""
|
||||
for candidate in ${ROCM_CANDIDATES}; do
|
||||
cat > "${ROCM_LIST}" <<EOF
|
||||
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${candidate} ${ROCM_UBUNTU_DIST} main
|
||||
EOF
|
||||
if apt-get update -qq 2>/dev/null; then
|
||||
ROCM_VERSION="${candidate}"
|
||||
echo "=== AMD ROCm ${ROCM_VERSION} (${ROCM_UBUNTU_DIST}): repository available ==="
|
||||
break
|
||||
fi
|
||||
echo "WARN: ROCm ${candidate} not available, trying next..."
|
||||
rm -f "${ROCM_LIST}"
|
||||
done
|
||||
|
||||
apt-get update -qq
|
||||
if [ -z "${ROCM_VERSION}" ]; then
|
||||
echo "WARN: no ROCm apt repository available — skipping ROCm install"
|
||||
rm -f "${ROCM_KEYRING}"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
|
||||
if apt-get install -y --no-install-recommends rocm-smi-lib 2>/dev/null; then
|
||||
echo "=== AMD ROCm: rocm-smi installed ==="
|
||||
if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib; then
|
||||
echo "=== AMD ROCm: rocm-smi-lib installed ==="
|
||||
if [ -x /opt/rocm/bin/rocm-smi ]; then
|
||||
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
|
||||
else
|
||||
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
|
||||
if [ -n "${smi_path}" ]; then
|
||||
ln -sf "${smi_path}" /usr/local/bin/rocm-smi
|
||||
fi
|
||||
fi
|
||||
rocm-smi --version 2>/dev/null || true
|
||||
else
|
||||
echo "WARN: rocm-smi-lib install failed — GPU monitoring unavailable"
|
||||
echo "WARN: rocm-smi-lib install failed — AMD GPU monitoring unavailable"
|
||||
fi
|
||||
|
||||
# Clean up apt lists to keep ISO size down
|
||||
|
||||
@@ -20,6 +20,7 @@ openssh-server
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
exfat-fuse
|
||||
ntfs-3g
|
||||
|
||||
# Utilities
|
||||
|
||||
@@ -26,6 +26,15 @@ echo ""
|
||||
|
||||
KVER=$(uname -r)
|
||||
info "kernel: $KVER"
|
||||
NVIDIA_BOOT_MODE="normal"
|
||||
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||
case "$arg" in
|
||||
bee.nvidia.mode=*)
|
||||
NVIDIA_BOOT_MODE="${arg#*=}"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||
|
||||
# --- PATH & binaries ---
|
||||
echo "-- PATH & binaries --"
|
||||
@@ -53,17 +62,25 @@ else
|
||||
fail "NVIDIA ko dir missing: $KO_DIR"
|
||||
fi
|
||||
|
||||
for mod in nvidia nvidia_modeset nvidia_uvm; do
|
||||
if /sbin/lsmod 2>/dev/null | grep -q "^nvidia "; then
|
||||
ok "module loaded: nvidia"
|
||||
else
|
||||
fail "module NOT loaded: nvidia"
|
||||
fi
|
||||
|
||||
for mod in nvidia_modeset nvidia_uvm; do
|
||||
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
||||
ok "module loaded: $mod"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
fail "module NOT loaded in normal mode: $mod"
|
||||
else
|
||||
fail "module NOT loaded: $mod"
|
||||
warn "module not loaded in GSP-off mode: $mod"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA device nodes --"
|
||||
for dev in nvidiactl nvidia0 nvidia-uvm; do
|
||||
for dev in nvidiactl nvidia0; do
|
||||
if [ -e "/dev/$dev" ]; then
|
||||
ok "/dev/$dev exists"
|
||||
else
|
||||
@@ -71,6 +88,14 @@ for dev in nvidiactl nvidia0 nvidia-uvm; do
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -e /dev/nvidia-uvm ]; then
|
||||
ok "/dev/nvidia-uvm exists"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
fail "/dev/nvidia-uvm missing in normal mode"
|
||||
else
|
||||
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- nvidia-smi --"
|
||||
if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
export PATH="$PATH:/usr/local/bin"
|
||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
|
||||
menu() {
|
||||
if [ -x /usr/local/bin/bee-tui ]; then
|
||||
@@ -17,4 +17,5 @@ if [ -z "${SSH_CONNECTION:-}" ] \
|
||||
&& [ "$(tty 2>/dev/null)" = "/dev/tty1" ]; then
|
||||
echo "Bee live environment ready."
|
||||
echo "Run 'menu' to open the TUI."
|
||||
echo "Kernel logs: Alt+F2 | Extra shell: Alt+F3"
|
||||
fi
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
[Journal]
|
||||
ForwardToConsole=yes
|
||||
TTYPath=/dev/ttyS0
|
||||
MaxLevelConsole=info
|
||||
@@ -5,9 +5,9 @@ Before=bee-web.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=append:/appdata/bee/export/bee-audit.log
|
||||
StandardError=append:/appdata/bee/export/bee-audit.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
16
iso/overlay/etc/systemd/system/bee-journal-mirror@.service
Normal file
16
iso/overlay/etc/systemd/system/bee-journal-mirror@.service
Normal file
@@ -0,0 +1,16 @@
|
||||
[Unit]
|
||||
Description=Bee: mirror system journal to %I
|
||||
After=systemd-journald.service
|
||||
Requires=systemd-journald.service
|
||||
ConditionPathExists=/dev/%I
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/bin/sh -c 'exec journalctl -f -n 200 -o short-monotonic > /dev/%I'
|
||||
Restart=always
|
||||
RestartSec=1
|
||||
StandardOutput=null
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -5,9 +5,9 @@ Before=network-online.target bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-network.sh
|
||||
StandardOutput=append:/appdata/bee/export/bee-network.log
|
||||
StandardError=append:/appdata/bee/export/bee-network.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-network.log /usr/local/bin/bee-network.sh
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-nvidia-load
|
||||
StandardOutput=append:/appdata/bee/export/bee-nvidia.log
|
||||
StandardError=append:/appdata/bee/export/bee-nvidia.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-nvidia.log /usr/local/bin/bee-nvidia-load
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=append:/appdata/bee/export/runtime-health.log
|
||||
StandardError=append:/appdata/bee/export/runtime-health.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/runtime-health.log /bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,9 +5,9 @@ Before=ssh.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-sshsetup
|
||||
StandardOutput=append:/appdata/bee/export/bee-sshsetup.log
|
||||
StandardError=append:/appdata/bee/export/bee-sshsetup.log
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-sshsetup.log /usr/local/bin/bee-sshsetup
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -5,11 +5,11 @@ Wants=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
StandardOutput=append:/appdata/bee/export/bee-web.log
|
||||
StandardError=append:/appdata/bee/export/bee-web.log
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
29
iso/overlay/usr/local/bin/bee-log-run
Normal file
29
iso/overlay/usr/local/bin/bee-log-run
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
|
||||
# bee-log-run — run a command, append its output to a file, and keep stdout/stderr
|
||||
# connected to systemd so journald and the serial console also receive the logs.
|
||||
|
||||
set -o pipefail
|
||||
|
||||
log_file="$1"
|
||||
shift
|
||||
|
||||
if [ -z "$log_file" ] || [ "$#" -eq 0 ]; then
|
||||
echo "usage: $0 <log-file> <command> [args...]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
mkdir -p "$(dirname "$log_file")"
|
||||
|
||||
serial_sink() {
|
||||
local tty="$1"
|
||||
if [ -w "$tty" ]; then
|
||||
cat > "$tty"
|
||||
else
|
||||
cat > /dev/null
|
||||
fi
|
||||
}
|
||||
|
||||
"$@" 2>&1 | tee -a "$log_file" \
|
||||
>(serial_sink /dev/ttyS0) \
|
||||
>(serial_sink /dev/ttyS1)
|
||||
exit "${PIPESTATUS[0]}"
|
||||
@@ -22,24 +22,61 @@ fi
|
||||
log "module dir: $NVIDIA_KO_DIR"
|
||||
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true
|
||||
|
||||
# Some kernels expose backlight helper symbols only after loading `video`.
|
||||
modprobe video >/dev/null 2>&1 && log "loaded helper module: video" || log "helper module unavailable: video"
|
||||
cmdline_param() {
|
||||
key="$1"
|
||||
for token in $(cat /proc/cmdline 2>/dev/null); do
|
||||
case "$token" in
|
||||
"$key"=*)
|
||||
echo "${token#*=}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
# Load modules via insmod (direct load — no depmod needed)
|
||||
for mod in nvidia nvidia-modeset nvidia-uvm; do
|
||||
nvidia_mode="$(cmdline_param bee.nvidia.mode || true)"
|
||||
if [ -z "$nvidia_mode" ]; then
|
||||
nvidia_mode="normal"
|
||||
fi
|
||||
log "boot mode: $nvidia_mode"
|
||||
|
||||
load_module() {
|
||||
mod="$1"
|
||||
shift
|
||||
ko="$NVIDIA_KO_DIR/${mod}.ko"
|
||||
[ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod//-/_}.ko"
|
||||
if [ -f "$ko" ]; then
|
||||
if insmod "$ko"; then
|
||||
log "loaded: $mod"
|
||||
else
|
||||
log "WARN: failed to load: $mod"
|
||||
dmesg | tail -n 5 | sed 's/^/ dmesg: /' || true
|
||||
fi
|
||||
else
|
||||
if [ ! -f "$ko" ]; then
|
||||
log "WARN: not found: $ko"
|
||||
return 1
|
||||
fi
|
||||
done
|
||||
if insmod "$ko" "$@"; then
|
||||
log "loaded: $mod $*"
|
||||
return 0
|
||||
fi
|
||||
log "WARN: failed to load: $mod"
|
||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||
return 1
|
||||
}
|
||||
|
||||
case "$nvidia_mode" in
|
||||
normal|full)
|
||||
if ! load_module nvidia; then
|
||||
exit 1
|
||||
fi
|
||||
load_module nvidia-modeset || true
|
||||
load_module nvidia-uvm || true
|
||||
;;
|
||||
gsp-off|safe|*)
|
||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
||||
exit 1
|
||||
fi
|
||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
||||
@@ -61,8 +98,11 @@ if [ -n "$uvm_major" ]; then
|
||||
&& log "created /dev/nvidia-uvm (major $uvm_major)" \
|
||||
|| log "WARN: /dev/nvidia-uvm already exists"
|
||||
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
|
||||
else
|
||||
log "WARN: nvidia-uvm not in /proc/devices"
|
||||
fi
|
||||
|
||||
# Refresh dynamic linker cache so that NVIDIA/NCCL libs injected into /usr/lib/
|
||||
# are visible to dlopen() calls (libcuda, libnvidia-ptxjitcompiler, libnccl, etc.)
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
log "done"
|
||||
|
||||
Reference in New Issue
Block a user