diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go
index 6e4d842..7418811 100644
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -231,8 +231,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 
 func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
 	path, err := a.ExportLatestAudit(target)
-	body := "Audit exported."
-	if path != "" {
+	body := "Audit export failed."
+	if err == nil {
+		body = "Audit exported."
+	}
+	if err == nil && path != "" {
 		body = "Audit exported to " + path
 	}
 	return ActionResult{Title: "Export audit", Body: body}, err
@@ -249,8 +252,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
 
 func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
 	path, err := a.ExportSupportBundle(target)
-	body := "Support bundle exported. USB target unmounted and safe to remove."
-	if path != "" {
+	body := "Support bundle export failed."
+	if err == nil {
+		body = "Support bundle exported. USB target unmounted and safe to remove."
+	}
+	if err == nil && path != "" {
 		body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
 	}
 	return ActionResult{Title: "Export support bundle", Body: body}, err
diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go
index b606005..0af6708 100644
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -470,6 +470,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
 	}
 }
 
+func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
+	t.Parallel()
+
+	tmp := t.TempDir()
+	oldExportDir := DefaultExportDir
+	DefaultExportDir = tmp
+	t.Cleanup(func() { DefaultExportDir = oldExportDir })
+
+	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
+		t.Fatalf("write bee-audit.json: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
+		t.Fatalf("write bee-audit.log: %v", err)
+	}
+
+	a := &App{
+		exports: fakeExports{
+			exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
+				return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
+			},
+		},
+	}
+
+	result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
+	if err == nil {
+		t.Fatal("expected export error")
+	}
+	if contains(result.Body, "exported to") {
+		t.Fatalf("body should not claim success:\n%s", result.Body)
+	}
+	if result.Body != "Support bundle export failed." {
+		t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
+	}
+}
+
 func TestRunNvidiaAcceptancePackResult(t *testing.T) {
 	t.Parallel()
 
diff --git a/audit/internal/platform/export.go b/audit/internal/platform/export.go
index 5ded3fa..3030642 100644
--- a/audit/internal/platform/export.go
+++ b/audit/internal/platform/export.go
@@ -11,8 +11,48 @@ import (
 
 var exportExecCommand = exec.Command
 
+func formatMountTargetError(target RemovableTarget, raw string, err error) error {
+	msg := strings.TrimSpace(raw)
+	fstype := strings.ToLower(strings.TrimSpace(target.FSType))
+	if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
+		return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
+	}
+	if msg == "" {
+		return err
+	}
+	return fmt.Errorf("%s: %w", msg, err)
+}
+
+func removableTargetReadOnly(fields map[string]string) bool {
+	if fields["RO"] == "1" {
+		return true
+	}
+	switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
+	case "iso9660", "squashfs":
+		return true
+	default:
+		return false
+	}
+}
+
+func ensureWritableMountpoint(mountpoint string) error {
+	probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
+	if err != nil {
+		return fmt.Errorf("target filesystem is not writable: %w", err)
+	}
+	name := probe.Name()
+	if closeErr := probe.Close(); closeErr != nil {
+		_ = os.Remove(name)
+		return closeErr
+	}
+	if err := os.Remove(name); err != nil {
+		return err
+	}
+	return nil
+}
+
 func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
-	raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
+	raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
 	if err != nil {
 		return nil, err
 	}
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
 				}
 			}
 		}
-		if !removable || fields["FSTYPE"] == "" {
+		if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
 			continue
 		}
 
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
 		}
 		if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
 			_ = os.Remove(mountpoint)
-			return string(raw), err
+			return "", formatMountTargetError(target, string(raw), err)
 		}
 		mountedHere = true
 		mounted = true
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
 		}
 	}()
 
+	if err := ensureWritableMountpoint(mountpoint); err != nil {
+		return "", err
+	}
+
 	filename := filepath.Base(src)
 	dst = filepath.Join(mountpoint, filename)
 	data, err := os.ReadFile(src)
diff --git a/audit/internal/platform/export_test.go b/audit/internal/platform/export_test.go
index 35df2b7..54da24f 100644
--- a/audit/internal/platform/export_test.go
+++ b/audit/internal/platform/export_test.go
@@ -4,12 +4,11 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
 )
 
 func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
-	t.Parallel()
-
 	tmp := t.TempDir()
 	src := filepath.Join(tmp, "bundle.tar.gz")
 	mountpoint := filepath.Join(tmp, "mnt")
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
 		t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
 	}
 }
+
+func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
+	tmp := t.TempDir()
+	src := filepath.Join(tmp, "bundle.tar.gz")
+	mountpoint := filepath.Join(tmp, "mnt")
+	if err := os.MkdirAll(mountpoint, 0755); err != nil {
+		t.Fatalf("mkdir mountpoint: %v", err)
+	}
+	if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
+		t.Fatalf("write src: %v", err)
+	}
+	if err := os.Chmod(mountpoint, 0555); err != nil {
+		t.Fatalf("chmod mountpoint: %v", err)
+	}
+
+	oldExec := exportExecCommand
+	exportExecCommand = func(name string, args ...string) *exec.Cmd {
+		return exec.Command("sh", "-c", "exit 0")
+	}
+	t.Cleanup(func() { exportExecCommand = oldExec })
+
+	s := &System{}
+	_, err := s.ExportFileToTarget(src, RemovableTarget{
+		Device:     "/dev/sdb1",
+		Mountpoint: mountpoint,
+	})
+	if err == nil {
+		t.Fatal("expected error for non-writable mountpoint")
+	}
+	if !strings.Contains(err.Error(), "target filesystem is not writable") {
+		t.Fatalf("err=%q want writable message", err)
+	}
+}
+
+func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
+	oldExec := exportExecCommand
+	lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
+NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
+	exportExecCommand = func(name string, args ...string) *exec.Cmd {
+		cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
+		cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
+		return cmd
+	}
+	t.Cleanup(func() { exportExecCommand = oldExec })
+
+	s := &System{}
+	targets, err := s.ListRemovableTargets()
+	if err != nil {
+		t.Fatalf("ListRemovableTargets error: %v", err)
+	}
+	if len(targets) != 1 {
+		t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
+	}
+	if got := targets[0].Device; got != "/dev/sdb1" {
+		t.Fatalf("device=%q want /dev/sdb1", got)
+	}
+}
diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go
index 6836668..936d5c5 100644
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -151,8 +151,10 @@ func (m model) confirmCancelTarget() screen {
 	switch m.pendingAction {
 	case actionExportBundle:
 		return screenExportTargets
-	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
+	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
 		return screenHealthCheck
+	case actionRunFanStress:
+		return screenBurnInTests
 	default:
 		return screenMain
 	}
@@ -165,9 +167,9 @@ func hcFanStressOpts(hcMode int, application interface {
 	// Phase durations per mode: [baseline, load1, pause, load2]
 	type durations struct{ baseline, load1, pause, load2 int }
 	modes := [3]durations{
-		{30, 120, 30, 120},   // Quick:    ~5 min total
-		{60, 300, 60, 300},   // Standard: ~12 min total
-		{60, 600, 120, 600},  // Express:  ~24 min total
+		{30, 120, 30, 120},  // Quick:    ~5 min total
+		{60, 300, 60, 300},  // Standard: ~12 min total
+		{60, 600, 120, 600}, // Express:  ~24 min total
 	}
 	if hcMode < 0 || hcMode >= len(modes) {
 		hcMode = 0
diff --git a/audit/internal/tui/screen_burn_in.go b/audit/internal/tui/screen_burn_in.go
new file mode 100644
index 0000000..7013962
--- /dev/null
+++ b/audit/internal/tui/screen_burn_in.go
@@ -0,0 +1,117 @@
+package tui
+
+import (
+	"fmt"
+	"strings"
+
+	tea "github.com/charmbracelet/bubbletea"
+)
+
+const (
+	burnCurGPUStress = 0
+	burnCurModeQuick = 1
+	burnCurModeStd   = 2
+	burnCurModeExpr  = 3
+	burnCurRun       = 4
+	burnCurTotal     = 5
+)
+
+func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
+	m.screen = screenBurnInTests
+	m.cursor = 0
+	if !m.burnInitialized {
+		m.burnMode = 0
+		m.burnCursor = 0
+		m.burnInitialized = true
+	}
+	return m, nil
+}
+
+func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
+	switch msg.String() {
+	case "up", "k":
+		if m.burnCursor > 0 {
+			m.burnCursor--
+		}
+	case "down", "j":
+		if m.burnCursor < burnCurTotal-1 {
+			m.burnCursor++
+		}
+	case " ":
+		switch m.burnCursor {
+		case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
+			m.burnMode = m.burnCursor - burnCurModeQuick
+		}
+	case "enter":
+		switch m.burnCursor {
+		case burnCurGPUStress, burnCurRun:
+			return m.burnRunSelected()
+		case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
+			m.burnMode = m.burnCursor - burnCurModeQuick
+		}
+	case "f", "F", "r", "R":
+		return m.burnRunSelected()
+	case "1":
+		m.burnMode = 0
+	case "2":
+		m.burnMode = 1
+	case "3":
+		m.burnMode = 2
+	case "esc":
+		m.screen = screenMain
+		m.cursor = 1
+	case "q", "ctrl+c":
+		return m, tea.Quit
+	}
+	return m, nil
+}
+
+func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
+	return m.hcRunFanStress()
+}
+
+func renderBurnInTests(m model) string {
+	var b strings.Builder
+
+	fmt.Fprintln(&b, "BURN-IN TESTS")
+	fmt.Fprintln(&b)
+	fmt.Fprintln(&b, "  Stress tests:")
+	fmt.Fprintln(&b)
+
+	pfx := "  "
+	if m.burnCursor == burnCurGPUStress {
+		pfx = "> "
+	}
+	fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ]   (thermal cycling, fan lag, throttle check)\n", pfx)
+
+	fmt.Fprintln(&b)
+	fmt.Fprintln(&b, "  Mode:")
+	modes := []struct{ label, key string }{
+		{"Quick", "1"},
+		{"Standard", "2"},
+		{"Express", "3"},
+	}
+	for i, mode := range modes {
+		pfx := "  "
+		if m.burnCursor == burnCurModeQuick+i {
+			pfx = "> "
+		}
+		radio := "( )"
+		if m.burnMode == i {
+			radio = "(*)"
+		}
+		fmt.Fprintf(&b, "%s%s  %-10s  [%s]\n", pfx, radio, mode.label, mode.key)
+	}
+
+	fmt.Fprintln(&b)
+	pfx = "  "
+	if m.burnCursor == burnCurRun {
+		pfx = "> "
+	}
+	fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
+
+	fmt.Fprintln(&b)
+	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
+	fmt.Fprint(&b, "[↑↓] move  [space/enter] select  [1/2/3] mode  [R/F] run  [Esc] back")
+	return b.String()
+}
diff --git a/audit/internal/tui/screen_export.go b/audit/internal/tui/screen_export.go
index 5624a12..4df5b3b 100644
--- a/audit/internal/tui/screen_export.go
+++ b/audit/internal/tui/screen_export.go
@@ -4,7 +4,12 @@ import tea "github.com/charmbracelet/bubbletea"
 
 func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
 	if len(m.targets) == 0 {
-		return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
+		return m, resultCmd(
+			"Export support bundle",
+			"No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list.",
+			nil,
+			screenMain,
+		)
 	}
 	target := m.targets[m.cursor]
 	m.selectedTarget = &target
diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go
index d2a09be..9f0def1 100644
--- a/audit/internal/tui/screen_health_check.go
+++ b/audit/internal/tui/screen_health_check.go
@@ -21,17 +21,16 @@ const (
 
 // Cursor positions in Health Check screen.
 const (
-	hcCurGPU        = 0
-	hcCurMemory     = 1
-	hcCurStorage    = 2
-	hcCurCPU        = 3
-	hcCurSelectAll  = 4
-	hcCurModeQuick  = 5
-	hcCurModeStd    = 6
-	hcCurModeExpr   = 7
-	hcCurRunAll     = 8
-	hcCurFanStress  = 9
-	hcCurTotal      = 10
+	hcCurGPU       = 0
+	hcCurMemory    = 1
+	hcCurStorage   = 2
+	hcCurCPU       = 3
+	hcCurSelectAll = 4
+	hcCurModeQuick = 5
+	hcCurModeStd   = 6
+	hcCurModeExpr  = 7
+	hcCurRunAll    = 8
+	hcCurTotal     = 9
 )
 
 // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
@@ -86,8 +85,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			m.hcMode = m.hcCursor - hcCurModeQuick
 		case hcCurRunAll:
 			return m.hcRunAll()
-		case hcCurFanStress:
-			return m.hcRunFanStress()
 		}
 	case "g", "G":
 		return m.hcRunSingle(hcGPU)
@@ -99,8 +96,6 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 		return m.hcRunSingle(hcCPU)
 	case "r", "R":
 		return m.hcRunAll()
-	case "f", "F":
-		return m.hcRunFanStress()
 	case "a", "A":
 		allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
 		for i := range m.hcSel {
@@ -160,7 +155,7 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
 
 // startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
 func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
-	opts := hcFanStressOpts(m.hcMode, m.app)
+	opts := hcFanStressOpts(m.burnMode, m.app)
 
 	ctx, cancel := context.WithCancel(context.Background())
 	m.gpuStressCancel = cancel
@@ -197,7 +192,8 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			m.gpuStressCancel = nil
 		}
 		m.gpuStressAborted = true
-		m.screen = screenHealthCheck
+		m.screen = screenBurnInTests
+		m.burnCursor = burnCurGPUStress
 		m.cursor = 0
 	case "ctrl+c":
 		return m, tea.Quit
@@ -380,16 +376,8 @@ func renderHealthCheck(m model) string {
 		fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
 	}
 
-	{
-		pfx := "  "
-		if m.hcCursor == hcCurFanStress {
-			pfx = "> "
-		}
-		fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ]   (thermal cycling, fan lag, throttle check)\n", pfx)
-	}
-
 	fmt.Fprintln(&b)
 	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
-	fmt.Fprint(&b, "[↑↓] move  [space/enter] toggle  [letter] single test  [R] run all  [F] gpu stress  [Esc] back")
+	fmt.Fprint(&b, "[↑↓] move  [space/enter] toggle  [letter] single test  [R] run all  [Esc] back")
 	return b.String()
 }
diff --git a/audit/internal/tui/screen_main.go b/audit/internal/tui/screen_main.go
index a330eb6..50f6187 100644
--- a/audit/internal/tui/screen_main.go
+++ b/audit/internal/tui/screen_main.go
@@ -8,7 +8,9 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
 	switch m.cursor {
 	case 0: // Health Check
 		return m.enterHealthCheck()
-	case 1: // Export support bundle
+	case 1: // Burn-in tests
+		return m.enterBurnInTests()
+	case 2: // Export support bundle
 		m.pendingAction = actionExportBundle
 		m.busy = true
 		m.busyTitle = "Export support bundle"
@@ -16,11 +18,11 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
 			targets, err := m.app.ListRemovableTargets()
 			return exportTargetsMsg{targets: targets, err: err}
 		}
-	case 2: // Settings
+	case 3: // Settings
 		m.screen = screenSettings
 		m.cursor = 0
 		return m, nil
-	case 3: // Exit
+	case 4: // Exit
 		return m, tea.Quit
 	}
 	return m, nil
diff --git a/audit/internal/tui/tui_test.go b/audit/internal/tui/tui_test.go
index 5e02faa..b71b451 100644
--- a/audit/internal/tui/tui_test.go
+++ b/audit/internal/tui/tui_test.go
@@ -54,9 +54,10 @@ func TestUpdateMainMenuEnterActions(t *testing.T) {
 		wantCmd    bool
 	}{
 		{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
-		{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
-		{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true},
-		{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
+		{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests, wantCmd: true},
+		{name: "export", cursor: 2, wantScreen: screenMain, wantBusy: true, wantCmd: true},
+		{name: "settings", cursor: 3, wantScreen: screenSettings, wantCmd: true},
+		{name: "exit", cursor: 4, wantScreen: screenMain, wantCmd: true},
 	}
 
 	for _, test := range tests {
@@ -115,7 +116,8 @@ func TestMainMenuSimpleTransitions(t *testing.T) {
 		wantScreen screen
 	}{
 		{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
-		{name: "settings", cursor: 2, wantScreen: screenSettings},
+		{name: "burn_in_tests", cursor: 1, wantScreen: screenBurnInTests},
+		{name: "settings", cursor: 3, wantScreen: screenSettings},
 	}
 
 	for _, test := range tests {
@@ -146,7 +148,7 @@ func TestMainMenuExportSetsBusy(t *testing.T) {
 	t.Parallel()
 
 	m := newTestModel()
-	m.cursor = 1 // Export support bundle
+	m.cursor = 2 // Export support bundle
 
 	next, cmd := m.handleMainMenu()
 	got := next.(model)
@@ -163,12 +165,13 @@ func TestMainViewRendersTwoColumns(t *testing.T) {
 	t.Parallel()
 
 	m := newTestModel()
-	m.cursor = 1
+	m.cursor = 2
 
 	view := m.View()
 	for _, want := range []string{
 		"bee",
 		"Health Check",
+		"Burn-in tests",
 		"> Export support bundle",
 		"Settings",
 		"Exit",
@@ -400,6 +403,11 @@ func TestConfirmCancelTarget(t *testing.T) {
 		t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
 	}
 
+	m.pendingAction = actionRunFanStress
+	if got := m.confirmCancelTarget(); got != screenBurnInTests {
+		t.Fatalf("fan stress cancel target=%q want %q", got, screenBurnInTests)
+	}
+
 	m.pendingAction = actionNone
 	if got := m.confirmCancelTarget(); got != screenMain {
 		t.Fatalf("default cancel target=%q want %q", got, screenMain)
@@ -439,6 +447,68 @@ func TestViewBusyStateUsesBusyTitle(t *testing.T) {
 	}
 }
 
+func TestBurnInTestsEscReturnsToMain(t *testing.T) {
+	t.Parallel()
+
+	m := newTestModel()
+	m.screen = screenBurnInTests
+	m.burnCursor = 3
+
+	next, _ := m.updateBurnInTests(tea.KeyMsg{Type: tea.KeyEsc})
+	got := next.(model)
+
+	if got.screen != screenMain {
+		t.Fatalf("screen=%q want %q", got.screen, screenMain)
+	}
+	if got.cursor != 1 {
+		t.Fatalf("cursor=%d want 1", got.cursor)
+	}
+}
+
+func TestBurnInTestsRunOpensConfirm(t *testing.T) {
+	t.Parallel()
+
+	m := newTestModel()
+	m.screen = screenBurnInTests
+	m.burnInitialized = true
+	m.burnMode = 2
+
+	next, _ := m.burnRunSelected()
+	got := next.(model)
+
+	if got.screen != screenConfirm {
+		t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
+	}
+	if got.pendingAction != actionRunFanStress {
+		t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunFanStress)
+	}
+	if got.cursor != 0 {
+		t.Fatalf("cursor=%d want 0", got.cursor)
+	}
+}
+
+func TestViewBurnInTestsRendersGPUStressEntry(t *testing.T) {
+	t.Parallel()
+
+	m := newTestModel()
+	m.screen = screenBurnInTests
+
+	view := m.View()
+
+	for _, want := range []string{
+		"BURN-IN TESTS",
+		"GPU PLATFORM STRESS TEST",
+		"Quick",
+		"Standard",
+		"Express",
+		"[ RUN SELECTED [R] ]",
+	} {
+		if !strings.Contains(view, want) {
+			t.Fatalf("view missing %q\nview:\n%s", want, view)
+		}
+	}
+}
+
 func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
 	t.Parallel()
 
@@ -528,7 +598,7 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
 
 	for _, want := range []string{
 		"Export support bundle",
-		"Select removable filesystem",
+		"Select writable removable filesystem (read-only/boot media hidden)",
 		"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
 	} {
 		if !strings.Contains(view, want) {
@@ -537,6 +607,32 @@ func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
 	}
 }
 
+func TestExportTargetsMsgEmptyShowsHiddenBootMediaHint(t *testing.T) {
+	t.Parallel()
+
+	m := newTestModel()
+	m.busy = true
+	m.busyTitle = "Export support bundle"
+
+	next, _ := m.Update(exportTargetsMsg{})
+	got := next.(model)
+
+	if got.screen != screenOutput {
+		t.Fatalf("screen=%q want %q", got.screen, screenOutput)
+	}
+	if got.title != "Export support bundle" {
+		t.Fatalf("title=%q want %q", got.title, "Export support bundle")
+	}
+	for _, want := range []string{
+		"No writable removable filesystems found.",
+		"Read-only or boot media are hidden from this list.",
+	} {
+		if !strings.Contains(got.body, want) {
+			t.Fatalf("body missing %q\nbody:\n%s", want, got.body)
+		}
+	}
+}
+
 func TestViewStaticFormRendersFields(t *testing.T) {
 	t.Parallel()
 
diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go
index 9c10778..3f0c148 100644
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -16,6 +16,7 @@ type screen string
 const (
 	screenMain             screen = "main"
 	screenHealthCheck      screen = "health_check"
+	screenBurnInTests      screen = "burn_in_tests"
 	screenSettings         screen = "settings"
 	screenNetwork          screen = "network"
 	screenInterfacePick    screen = "interface_pick"
@@ -41,8 +42,8 @@ const (
 	actionRunMemorySAT  actionKind = "run_memory_sat"
 	actionRunStorageSAT actionKind = "run_storage_sat"
 	actionRunCPUSAT     actionKind = "run_cpu_sat"
-	actionRunAMDGPUSAT   actionKind = "run_amd_gpu_sat"
-	actionRunFanStress   actionKind = "run_fan_stress"
+	actionRunAMDGPUSAT  actionKind = "run_amd_gpu_sat"
+	actionRunFanStress  actionKind = "run_fan_stress"
 )
 
 type model struct {
@@ -84,6 +85,11 @@ type model struct {
 	hcCursor      int
 	hcInitialized bool
 
+	// Burn-in tests screen
+	burnMode        int
+	burnCursor      int
+	burnInitialized bool
+
 	// NVIDIA SAT setup
 	nvidiaGPUs      []platform.NvidiaGPU
 	nvidiaGPUSel    []bool
@@ -97,9 +103,9 @@ type model struct {
 	// GPU Platform Stress Test running
 	gpuStressCancel  func()
 	gpuStressAborted bool
-	gpuLiveRows    []platform.GPUMetricRow
-	gpuLiveIndices []int
-	gpuLiveStart   time.Time
+	gpuLiveRows      []platform.GPUMetricRow
+	gpuLiveIndices   []int
+	gpuLiveStart     time.Time
 
 	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
 	progressLines  []string
@@ -132,6 +138,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
 		screen:      screenMain,
 		mainMenu: []string{
 			"Health Check",
+			"Burn-in tests",
 			"Export support bundle",
 			"Settings",
 			"Exit",
@@ -201,7 +208,7 @@ func (m model) confirmBody() (string, string) {
 		modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
 		return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
 			"Monitors fans, temps, power — detects throttling.\n" +
-			"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
+			"Mode: " + modes[m.burnMode] + "\n\nAll NVIDIA GPUs will be stressed."
 	default:
 		return "Confirm", "Proceed?"
 	}
diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go
index ecea4af..ce7d79f 100644
--- a/audit/internal/tui/update.go
+++ b/audit/internal/tui/update.go
@@ -101,6 +101,13 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			m.screen = screenOutput
 			return m, m.refreshSnapshotCmd()
 		}
+		if len(msg.targets) == 0 {
+			m.title = "Export support bundle"
+			m.body = "No writable removable filesystems found.\n\nRead-only or boot media are hidden from this list."
+			m.prevScreen = screenMain
+			m.screen = screenOutput
+			return m, m.refreshSnapshotCmd()
+		}
 		m.targets = msg.targets
 		m.screen = screenExportTargets
 		m.cursor = 0
@@ -117,7 +124,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			m.gpuStressCancel()
 			m.gpuStressCancel = nil
 		}
-		m.prevScreen = screenHealthCheck
+		m.prevScreen = screenBurnInTests
 		m.screen = screenOutput
 		m.title = msg.title
 		if msg.err != nil {
@@ -179,6 +186,8 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 		return m.updateMain(msg)
 	case screenHealthCheck:
 		return m.updateHealthCheck(msg)
+	case screenBurnInTests:
+		return m.updateBurnInTests(msg)
 	case screenSettings:
 		return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
 	case screenNetwork:
diff --git a/audit/internal/tui/view.go b/audit/internal/tui/view.go
index 62b0b67..487ee03 100644
--- a/audit/internal/tui/view.go
+++ b/audit/internal/tui/view.go
@@ -57,6 +57,8 @@ func (m model) View() string {
 			body = renderTwoColumnMain(m)
 		case screenHealthCheck:
 			body = renderHealthCheck(m)
+		case screenBurnInTests:
+			body = renderBurnInTests(m)
 		case screenSettings:
 			body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
 		case screenNetwork:
@@ -66,7 +68,12 @@ func (m model) View() string {
 		case screenServiceAction:
 			body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
 		case screenExportTargets:
-			body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
+			body = renderMenu(
+				"Export support bundle",
+				"Select writable removable filesystem (read-only/boot media hidden)",
+				renderTargetItems(m.targets),
+				m.cursor,
+			)
 		case screenInterfacePick:
 			body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
 		case screenStaticForm:
diff --git a/bible-local/architecture/runtime-flows.md b/bible-local/architecture/runtime-flows.md
index 8c795d6..e0305d1 100644
--- a/bible-local/architecture/runtime-flows.md
+++ b/bible-local/architecture/runtime-flows.md
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
 
 ## Boot sequence (single ISO)
 
+The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
+
 `systemd` boot order:
 
 ```
@@ -25,6 +27,7 @@ local-fs.target
 ```
 
 **Critical invariants:**
+- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
 - OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
 - `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
 - `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
@@ -71,24 +74,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
        d. build kernel modules against Debian headers
        e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
        f. cache in `dist/nvidia-<version>-<kver>/`
-  7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
-  8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
-  9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
-  10. write staged `/etc/bee-release` (versions + git commit)
-  11. patch staged `motd` with build metadata
-  12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
-  13. sync staged overlay into workdir `config/includes.chroot/`
-  14. run `lb config && lb build` inside the privileged builder container
+  7. `build-cublas.sh`:
+       a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
+       b. verify packages against repo `Packages.gz`
+       c. extract headers for `bee-gpu-stress` build
+       d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
+  8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
+  9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
+  10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
+  11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
+  12. write staged `/etc/bee-release` (versions + git commit)
+  13. patch staged `motd` with build metadata
+  14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
+  15. sync staged overlay into workdir `config/includes.chroot/`
+  16. run `lb config && lb build` inside the privileged builder container
 ```
 
+Build host notes:
+- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
+- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
+- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
+
 **Critical invariants:**
 - `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
   1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
   2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
 - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
+- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
+- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
 - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
 - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
 - Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
+- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
+- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
 
 ## Post-boot smoke test
 
@@ -131,10 +149,15 @@ Current validation state:
 Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
 
 Acceptance flows:
-- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
+- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
 - `bee sat memory` → `memtester` archive
 - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
 - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
+- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
+  - Ampere: `fp16` + `fp32`/TF32 tensor-core load
+  - Ada / Hopper: add `fp8`
+  - Blackwell+: add `fp4`
+  - PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
 - Runtime overrides:
   - `BEE_GPU_STRESS_SECONDS`
   - `BEE_GPU_STRESS_SIZE_MB`
diff --git a/bible-local/architecture/system-overview.md b/bible-local/architecture/system-overview.md
index 960db4a..92d1e82 100644
--- a/bible-local/architecture/system-overview.md
+++ b/bible-local/architecture/system-overview.md
@@ -21,7 +21,8 @@ Fills gaps where Redfish/logpile is blind:
 - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
 - Machine-readable health summary derived from collector verdicts
 - Operator-triggered acceptance tests for NVIDIA, memory, and storage
-- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
+- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
+- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
 - Automatic boot audit with operator-facing local console and SSH access
 - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
 - SSH access (OpenSSH) always available for inspection and debugging
@@ -69,6 +70,7 @@ Fills gaps where Redfish/logpile is blind:
 | SSH | OpenSSH server |
 | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
 | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
+| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
 | Builder | Debian 12 host/VM or Debian 12 container image |
 
 ## Operator UX
@@ -78,6 +80,7 @@ Fills gaps where Redfish/logpile is blind:
 - The TUI itself executes privileged actions as `root` via `sudo -n`
 - SSH remains available independently of the local console path
 - VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
+- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
 
 ## Runtime split
 
@@ -85,6 +88,7 @@ Fills gaps where Redfish/logpile is blind:
 - Live-ISO-only responsibilities stay in `iso/` integration code
 - Live ISO launches the Go CLI with `--runtime livecd`
 - Local/manual runs use `--runtime auto` or `--runtime local`
+- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
 
 ## Key paths
 
diff --git a/iso/README.md b/iso/README.md
new file mode 100644
index 0000000..31792b7
--- /dev/null
+++ b/iso/README.md
@@ -0,0 +1,58 @@
+# ISO Build
+
+`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
+
+## Requirements
+
+- Docker Desktop or another Docker-compatible container runtime
+- Privileged containers enabled
+- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
+
+## Build On macOS
+
+From the repository root:
+
+```sh
+sh iso/builder/build-in-container.sh
+```
+
+The script defaults to `linux/amd64` builder containers, so it works on:
+
+- Intel Mac
+- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
+
+You do not need to pass `--platform` manually for normal ISO builds.
+
+## Useful Options
+
+Build with explicit SSH keys baked into the ISO:
+
+```sh
+sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
+```
+
+Rebuild the builder image:
+
+```sh
+sh iso/builder/build-in-container.sh --rebuild-image
+```
+
+Use a custom cache directory:
+
+```sh
+sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
+```
+
+## Notes
+
+- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
+- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
+- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
+- Override the container platform only if you know why:
+
+```sh
+BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
+```
+
+- The shipped ISO is still `amd64`.
+- Output ISO artifacts are written under `dist/`.
diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS
index bf48d6e..db45732 100644
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -4,5 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
 NCCL_VERSION=2.28.9-1
 NCCL_CUDA_VERSION=13.0
 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
+CUBLAS_VERSION=13.0.2.14-1
+CUDA_USERSPACE_VERSION=13.0.96-1
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
diff --git a/iso/builder/auto/config b/iso/builder/auto/config
index a0845cb..bf1d7cc 100755
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,6 +32,6 @@ lb config noauto \
     --memtest none \
     --iso-volume "EASY-BEE" \
     --iso-application "EASY-BEE" \
-    --bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
+    --bootappend-live "boot=live toram components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
     --apt-recommends false \
     "${@}"
diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c
index b11f41b..2d0c212 100644
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -1,12 +1,25 @@
 #define _POSIX_C_SOURCE 200809L
 
 #include <dlfcn.h>
+#include <math.h>
+#include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 
+#if defined(__has_include)
+#if __has_include(<cublasLt.h>)
+#include <cublasLt.h>
+#define HAVE_CUBLASLT_HEADERS 1
+#else
+#define HAVE_CUBLASLT_HEADERS 0
+#endif
+#else
+#define HAVE_CUBLASLT_HEADERS 0
+#endif
+
 typedef int CUdevice;
 typedef uint64_t CUdeviceptr;
 typedef int CUresult;
@@ -16,6 +29,8 @@ typedef void *CUfunction;
 typedef void *CUstream;
 
 #define CU_SUCCESS 0
+#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
+#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
 
 static const char *ptx_source =
     ".version 6.0\n"
@@ -60,11 +75,13 @@ typedef CUresult (*cuInit_fn)(unsigned int);
 typedef CUresult (*cuDeviceGetCount_fn)(int *);
 typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
 typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
+typedef CUresult (*cuDeviceGetAttribute_fn)(int *, int, CUdevice);
 typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
 typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
 typedef CUresult (*cuCtxSynchronize_fn)(void);
 typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
 typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
+typedef CUresult (*cuMemsetD8_fn)(CUdeviceptr, unsigned char, size_t);
 typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
 typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
 typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
@@ -89,11 +106,13 @@ struct cuda_api {
     cuDeviceGetCount_fn cuDeviceGetCount;
     cuDeviceGet_fn cuDeviceGet;
     cuDeviceGetName_fn cuDeviceGetName;
+    cuDeviceGetAttribute_fn cuDeviceGetAttribute;
     cuCtxCreate_fn cuCtxCreate;
     cuCtxDestroy_fn cuCtxDestroy;
     cuCtxSynchronize_fn cuCtxSynchronize;
     cuMemAlloc_fn cuMemAlloc;
     cuMemFree_fn cuMemFree;
+    cuMemsetD8_fn cuMemsetD8;
     cuMemcpyHtoD_fn cuMemcpyHtoD;
     cuMemcpyDtoH_fn cuMemcpyDtoH;
     cuModuleLoadDataEx_fn cuModuleLoadDataEx;
@@ -103,6 +122,17 @@ struct cuda_api {
     cuGetErrorString_fn cuGetErrorString;
 };
 
+struct stress_report {
+    char backend[32];
+    char device[128];
+    int cc_major;
+    int cc_minor;
+    int buffer_mb;
+    unsigned long iterations;
+    uint64_t checksum;
+    char details[1024];
+};
+
 static int load_symbol(void *lib, const char *name, void **out) {
     *out = dlsym(lib, name);
     return *out != NULL;
@@ -119,11 +149,13 @@ static int load_cuda(struct cuda_api *api) {
         load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
         load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
         load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) &&
+        load_symbol(api->lib, "cuDeviceGetAttribute", (void **)&api->cuDeviceGetAttribute) &&
         load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) &&
         load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) &&
         load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) &&
         load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) &&
         load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) &&
+        load_symbol(api->lib, "cuMemsetD8_v2", (void **)&api->cuMemsetD8) &&
         load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) &&
         load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
         load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
@@ -161,6 +193,905 @@ static double now_seconds(void) {
     return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
 }
 
+#if HAVE_CUBLASLT_HEADERS
+static size_t round_down_size(size_t value, size_t multiple) {
+    if (multiple == 0 || value < multiple) {
+        return value;
+    }
+    return value - (value % multiple);
+}
+#endif
+
+static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
+    int cc_major = 0;
+    int cc_minor = 0;
+    if (!check_rc(api,
+                  "cuDeviceGetAttribute(major)",
+                  api->cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev))) {
+        return 0;
+    }
+    if (!check_rc(api,
+                  "cuDeviceGetAttribute(minor)",
+                  api->cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev))) {
+        return 0;
+    }
+    *major = cc_major;
+    *minor = cc_minor;
+    return 1;
+}
+
+#if HAVE_CUBLASLT_HEADERS
+static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
+    size_t len = strlen(buf);
+    if (len >= cap) {
+        return;
+    }
+    va_list ap;
+    va_start(ap, fmt);
+    vsnprintf(buf + len, cap - len, fmt, ap);
+    va_end(ap);
+}
+#endif
+
+static int run_ptx_fallback(struct cuda_api *api,
+                            CUdevice dev,
+                            const char *device_name,
+                            int cc_major,
+                            int cc_minor,
+                            int seconds,
+                            int size_mb,
+                            struct stress_report *report) {
+    CUcontext ctx = NULL;
+    CUdeviceptr device_mem = 0;
+    CUmodule module = NULL;
+    CUfunction kernel = NULL;
+    uint32_t sample[256];
+    uint32_t words = 0;
+    unsigned long iterations = 0;
+
+    memset(report, 0, sizeof(*report));
+    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
+    snprintf(report->device, sizeof(report->device), "%s", device_name);
+    report->cc_major = cc_major;
+    report->cc_minor = cc_minor;
+    report->buffer_mb = size_mb;
+
+    if (!check_rc(api, "cuCtxCreate", api->cuCtxCreate(&ctx, 0, dev))) {
+        return 0;
+    }
+
+    size_t bytes = (size_t)size_mb * 1024u * 1024u;
+    if (bytes < 4u * 1024u * 1024u) {
+        bytes = 4u * 1024u * 1024u;
+    }
+    if (bytes > (size_t)1024u * 1024u * 1024u) {
+        bytes = (size_t)1024u * 1024u * 1024u;
+    }
+    words = (uint32_t)(bytes / sizeof(uint32_t));
+
+    if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+    if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
+        api->cuMemFree(device_mem);
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+    if (!check_rc(api,
+                  "cuModuleLoadDataEx",
+                  api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
+        api->cuMemFree(device_mem);
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+    if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
+        api->cuMemFree(device_mem);
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    unsigned int threads = 256;
+    unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
+    uint32_t rounds = 1024;
+    void *params[] = {&device_mem, &words, &rounds};
+
+    double start = now_seconds();
+    double deadline = start + (double)seconds;
+    while (now_seconds() < deadline) {
+        if (!check_rc(api,
+                      "cuLaunchKernel",
+                      api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
+            api->cuMemFree(device_mem);
+            api->cuCtxDestroy(ctx);
+            return 0;
+        }
+        iterations++;
+    }
+
+    if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+        api->cuMemFree(device_mem);
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
+        api->cuMemFree(device_mem);
+        api->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
+        report->checksum += sample[i];
+    }
+    report->iterations = iterations;
+    snprintf(report->details,
+             sizeof(report->details),
+             "profile_int32_fallback=OK iterations=%lu\n",
+             iterations);
+
+    api->cuMemFree(device_mem);
+    api->cuCtxDestroy(ctx);
+    return 1;
+}
+
+#if HAVE_CUBLASLT_HEADERS
+typedef cublasStatus_t (*cublasLtCreate_fn)(cublasLtHandle_t *);
+typedef cublasStatus_t (*cublasLtDestroy_fn)(cublasLtHandle_t);
+typedef cublasStatus_t (*cublasLtMatmulDescCreate_fn)(cublasLtMatmulDesc_t *,
+                                                      cublasComputeType_t,
+                                                      cudaDataType_t);
+typedef cublasStatus_t (*cublasLtMatmulDescDestroy_fn)(cublasLtMatmulDesc_t);
+typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_fn)(cublasLtMatmulDesc_t,
+                                                            cublasLtMatmulDescAttributes_t,
+                                                            const void *,
+                                                            size_t);
+typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_fn)(cublasLtMatrixLayout_t *,
+                                                        cudaDataType_t,
+                                                        uint64_t,
+                                                        uint64_t,
+                                                        int64_t);
+typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_fn)(cublasLtMatrixLayout_t);
+typedef cublasStatus_t (*cublasLtMatmulPreferenceCreate_fn)(cublasLtMatmulPreference_t *);
+typedef cublasStatus_t (*cublasLtMatmulPreferenceDestroy_fn)(cublasLtMatmulPreference_t);
+typedef cublasStatus_t (*cublasLtMatmulPreferenceSetAttribute_fn)(cublasLtMatmulPreference_t,
+                                                                  cublasLtMatmulPreferenceAttributes_t,
+                                                                  const void *,
+                                                                  size_t);
+typedef cublasStatus_t (*cublasLtMatmulAlgoGetHeuristic_fn)(
+    cublasLtHandle_t,
+    cublasLtMatmulDesc_t,
+    cublasLtMatrixLayout_t,
+    cublasLtMatrixLayout_t,
+    cublasLtMatrixLayout_t,
+    cublasLtMatrixLayout_t,
+    cublasLtMatmulPreference_t,
+    int,
+    cublasLtMatmulHeuristicResult_t *,
+    int *);
+typedef cublasStatus_t (*cublasLtMatmul_fn)(cublasLtHandle_t,
+                                            cublasLtMatmulDesc_t,
+                                            const void *,
+                                            const void *,
+                                            cublasLtMatrixLayout_t,
+                                            const void *,
+                                            cublasLtMatrixLayout_t,
+                                            const void *,
+                                            const void *,
+                                            cublasLtMatrixLayout_t,
+                                            void *,
+                                            cublasLtMatrixLayout_t,
+                                            const cublasLtMatmulAlgo_t *,
+                                            void *,
+                                            size_t,
+                                            cudaStream_t);
+
+struct cublaslt_api {
+    void *lib;
+    cublasLtCreate_fn cublasLtCreate;
+    cublasLtDestroy_fn cublasLtDestroy;
+    cublasLtMatmulDescCreate_fn cublasLtMatmulDescCreate;
+    cublasLtMatmulDescDestroy_fn cublasLtMatmulDescDestroy;
+    cublasLtMatmulDescSetAttribute_fn cublasLtMatmulDescSetAttribute;
+    cublasLtMatrixLayoutCreate_fn cublasLtMatrixLayoutCreate;
+    cublasLtMatrixLayoutDestroy_fn cublasLtMatrixLayoutDestroy;
+    cublasLtMatmulPreferenceCreate_fn cublasLtMatmulPreferenceCreate;
+    cublasLtMatmulPreferenceDestroy_fn cublasLtMatmulPreferenceDestroy;
+    cublasLtMatmulPreferenceSetAttribute_fn cublasLtMatmulPreferenceSetAttribute;
+    cublasLtMatmulAlgoGetHeuristic_fn cublasLtMatmulAlgoGetHeuristic;
+    cublasLtMatmul_fn cublasLtMatmul;
+};
+
+struct profile_desc {
+    const char *name;
+    const char *block_label;
+    int min_cc;
+    int enabled;
+    int needs_scalar_scale;
+    int needs_block_scale;
+    int min_multiple;
+    cudaDataType_t a_type;
+    cudaDataType_t b_type;
+    cudaDataType_t c_type;
+    cudaDataType_t d_type;
+    cublasComputeType_t compute_type;
+};
+
+struct prepared_profile {
+    struct profile_desc desc;
+    cublasLtMatmulDesc_t op_desc;
+    cublasLtMatrixLayout_t a_layout;
+    cublasLtMatrixLayout_t b_layout;
+    cublasLtMatrixLayout_t c_layout;
+    cublasLtMatrixLayout_t d_layout;
+    cublasLtMatmulPreference_t preference;
+    cublasLtMatmulHeuristicResult_t heuristic;
+    CUdeviceptr a_dev;
+    CUdeviceptr b_dev;
+    CUdeviceptr c_dev;
+    CUdeviceptr d_dev;
+    CUdeviceptr a_scale_dev;
+    CUdeviceptr b_scale_dev;
+    CUdeviceptr workspace_dev;
+    size_t workspace_size;
+    uint64_t m;
+    uint64_t n;
+    uint64_t k;
+    unsigned long iterations;
+    int ready;
+};
+
+static const struct profile_desc k_profiles[] = {
+    {
+        "fp32_tf32",
+        "fp32",
+        80,
+        1,
+        0,
+        0,
+        128,
+        CUDA_R_32F,
+        CUDA_R_32F,
+        CUDA_R_32F,
+        CUDA_R_32F,
+        CUBLAS_COMPUTE_32F_FAST_TF32,
+    },
+    {
+        "fp16_tensor",
+        "fp16",
+        80,
+        1,
+        0,
+        0,
+        128,
+        CUDA_R_16F,
+        CUDA_R_16F,
+        CUDA_R_16F,
+        CUDA_R_16F,
+        CUBLAS_COMPUTE_32F_FAST_16F,
+    },
+    {
+        "fp8_e4m3",
+        "fp8",
+        89,
+        1,
+        1,
+        0,
+        128,
+        CUDA_R_8F_E4M3,
+        CUDA_R_8F_E4M3,
+        CUDA_R_16BF,
+        CUDA_R_16BF,
+        CUBLAS_COMPUTE_32F,
+    },
+    {
+        "fp8_e5m2",
+        "fp8",
+        89,
+        1,
+        1,
+        0,
+        128,
+        CUDA_R_8F_E5M2,
+        CUDA_R_8F_E5M2,
+        CUDA_R_16BF,
+        CUDA_R_16BF,
+        CUBLAS_COMPUTE_32F,
+    },
+#if defined(CUDA_R_4F_E2M1) && defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
+    {
+        "fp4_e2m1",
+        "fp4",
+        100,
+        1,
+        0,
+        1,
+        128,
+        CUDA_R_4F_E2M1,
+        CUDA_R_4F_E2M1,
+        CUDA_R_16BF,
+        CUDA_R_16BF,
+        CUBLAS_COMPUTE_32F,
+    },
+#endif
+};
+
+static int load_cublaslt(struct cublaslt_api *api) {
+    memset(api, 0, sizeof(*api));
+    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
+    if (!api->lib) {
+        api->lib = dlopen("libcublasLt.so", RTLD_NOW | RTLD_LOCAL);
+    }
+    if (!api->lib) {
+        return 0;
+    }
+    return
+        load_symbol(api->lib, "cublasLtCreate", (void **)&api->cublasLtCreate) &&
+        load_symbol(api->lib, "cublasLtDestroy", (void **)&api->cublasLtDestroy) &&
+        load_symbol(api->lib, "cublasLtMatmulDescCreate", (void **)&api->cublasLtMatmulDescCreate) &&
+        load_symbol(api->lib, "cublasLtMatmulDescDestroy", (void **)&api->cublasLtMatmulDescDestroy) &&
+        load_symbol(api->lib,
+                    "cublasLtMatmulDescSetAttribute",
+                    (void **)&api->cublasLtMatmulDescSetAttribute) &&
+        load_symbol(api->lib, "cublasLtMatrixLayoutCreate", (void **)&api->cublasLtMatrixLayoutCreate) &&
+        load_symbol(api->lib, "cublasLtMatrixLayoutDestroy", (void **)&api->cublasLtMatrixLayoutDestroy) &&
+        load_symbol(api->lib,
+                    "cublasLtMatmulPreferenceCreate",
+                    (void **)&api->cublasLtMatmulPreferenceCreate) &&
+        load_symbol(api->lib,
+                    "cublasLtMatmulPreferenceDestroy",
+                    (void **)&api->cublasLtMatmulPreferenceDestroy) &&
+        load_symbol(api->lib,
+                    "cublasLtMatmulPreferenceSetAttribute",
+                    (void **)&api->cublasLtMatmulPreferenceSetAttribute) &&
+        load_symbol(api->lib,
+                    "cublasLtMatmulAlgoGetHeuristic",
+                    (void **)&api->cublasLtMatmulAlgoGetHeuristic) &&
+        load_symbol(api->lib, "cublasLtMatmul", (void **)&api->cublasLtMatmul);
+}
+
+static const char *cublas_status_text(cublasStatus_t status) {
+    switch (status) {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        default:
+            return "CUBLAS_STATUS_UNKNOWN";
+    }
+}
+
+static int check_cublas(const char *step, cublasStatus_t status) {
+    if (status == CUBLAS_STATUS_SUCCESS) {
+        return 1;
+    }
+    fprintf(stderr, "%s failed: %s (%d)\n", step, cublas_status_text(status), (int)status);
+    return 0;
+}
+
+static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
+    switch (type) {
+        case CUDA_R_32F:
+            return (size_t)(elements * 4u);
+        case CUDA_R_16F:
+        case CUDA_R_16BF:
+            return (size_t)(elements * 2u);
+        case CUDA_R_8F_E4M3:
+        case CUDA_R_8F_E5M2:
+            return (size_t)(elements);
+#if defined(CUDA_R_4F_E2M1)
+        case CUDA_R_4F_E2M1:
+            return (size_t)((elements + 1u) / 2u);
+#endif
+        default:
+            return (size_t)(elements * 4u);
+    }
+}
+
+static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
+    uint64_t row_tiles = (rows + 127u) / 128u;
+    uint64_t col_tiles = (cols + 63u) / 64u;
+    return (size_t)(row_tiles * col_tiles * 128u);
+}
+
+static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, int multiple) {
+    double approx = sqrt((double)budget_bytes / (double)bytes_per_cell);
+    uint64_t dim = (uint64_t)approx;
+    if (dim < (uint64_t)multiple) {
+        dim = (uint64_t)multiple;
+    }
+    dim = (uint64_t)round_down_size((size_t)dim, (size_t)multiple);
+    if (dim < (uint64_t)multiple) {
+        dim = (uint64_t)multiple;
+    }
+    if (dim > 8192u) {
+        dim = 8192u;
+    }
+    return dim;
+}
+
+static int device_upload(struct cuda_api *cuda, CUdeviceptr dev, const void *src, size_t bytes) {
+    return check_rc(cuda, "cuMemcpyHtoD", cuda->cuMemcpyHtoD(dev, src, bytes));
+}
+
+static int alloc_filled(struct cuda_api *cuda, CUdeviceptr *ptr, size_t bytes, unsigned char pattern) {
+    if (!check_rc(cuda, "cuMemAlloc", cuda->cuMemAlloc(ptr, bytes))) {
+        return 0;
+    }
+    if (!check_rc(cuda, "cuMemsetD8", cuda->cuMemsetD8(*ptr, pattern, bytes))) {
+        cuda->cuMemFree(*ptr);
+        *ptr = 0;
+        return 0;
+    }
+    return 1;
+}
+
+static size_t profile_scale_bytes(const struct profile_desc *desc, uint64_t m, uint64_t n, uint64_t k) {
+    size_t bytes = 0;
+    if (desc->needs_scalar_scale) {
+        bytes += 2u * sizeof(float);
+    }
+#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
+    if (desc->needs_block_scale) {
+        bytes += fp4_scale_bytes(k, m);
+        bytes += fp4_scale_bytes(k, n);
+    }
+#else
+    (void)m;
+    (void)n;
+    (void)k;
+#endif
+    return bytes;
+}
+
+static void destroy_profile(struct cublaslt_api *cublas, struct cuda_api *cuda, struct prepared_profile *profile) {
+    if (profile->workspace_dev) {
+        cuda->cuMemFree(profile->workspace_dev);
+    }
+    if (profile->a_scale_dev) {
+        cuda->cuMemFree(profile->a_scale_dev);
+    }
+    if (profile->b_scale_dev) {
+        cuda->cuMemFree(profile->b_scale_dev);
+    }
+    if (profile->d_dev) {
+        cuda->cuMemFree(profile->d_dev);
+    }
+    if (profile->c_dev) {
+        cuda->cuMemFree(profile->c_dev);
+    }
+    if (profile->b_dev) {
+        cuda->cuMemFree(profile->b_dev);
+    }
+    if (profile->a_dev) {
+        cuda->cuMemFree(profile->a_dev);
+    }
+    if (profile->preference) {
+        cublas->cublasLtMatmulPreferenceDestroy(profile->preference);
+    }
+    if (profile->d_layout) {
+        cublas->cublasLtMatrixLayoutDestroy(profile->d_layout);
+    }
+    if (profile->c_layout) {
+        cublas->cublasLtMatrixLayoutDestroy(profile->c_layout);
+    }
+    if (profile->b_layout) {
+        cublas->cublasLtMatrixLayoutDestroy(profile->b_layout);
+    }
+    if (profile->a_layout) {
+        cublas->cublasLtMatrixLayoutDestroy(profile->a_layout);
+    }
+    if (profile->op_desc) {
+        cublas->cublasLtMatmulDescDestroy(profile->op_desc);
+    }
+    memset(profile, 0, sizeof(*profile));
+}
+
+static int prepare_profile(struct cublaslt_api *cublas,
+                           cublasLtHandle_t handle,
+                           struct cuda_api *cuda,
+                           const struct profile_desc *desc,
+                           size_t profile_budget_bytes,
+                           struct prepared_profile *out) {
+    memset(out, 0, sizeof(*out));
+    out->desc = *desc;
+
+    size_t bytes_per_cell = 0;
+    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
+    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
+    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
+    bytes_per_cell += bytes_for_elements(desc->d_type, 1);
+    if (bytes_per_cell == 0) {
+        return 0;
+    }
+
+    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
+    out->m = dim;
+    out->n = dim;
+    out->k = dim;
+
+    size_t desired_workspace = profile_budget_bytes / 8u;
+    if (desired_workspace > 32u * 1024u * 1024u) {
+        desired_workspace = 32u * 1024u * 1024u;
+    }
+    desired_workspace = round_down_size(desired_workspace, 256u);
+
+    size_t a_bytes = 0;
+    size_t b_bytes = 0;
+    size_t c_bytes = 0;
+    size_t d_bytes = 0;
+    size_t scale_bytes = 0;
+    while (1) {
+        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+        if (matrix_bytes <= profile_budget_bytes) {
+            size_t remaining = profile_budget_bytes - matrix_bytes;
+            out->workspace_size = desired_workspace;
+            if (out->workspace_size > remaining) {
+                out->workspace_size = round_down_size(remaining, 256u);
+            }
+            break;
+        }
+
+        if (out->m <= (uint64_t)desc->min_multiple) {
+            return 0;
+        }
+        out->m -= (uint64_t)desc->min_multiple;
+        out->n = out->m;
+        out->k = out->m;
+    }
+
+    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    if (!check_cublas("cublasLtMatmulDescCreate",
+                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    cublasOperation_t transa = CUBLAS_OP_T;
+    cublasOperation_t transb = CUBLAS_OP_N;
+    if (!check_cublas("set TRANSA",
+                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                             CUBLASLT_MATMUL_DESC_TRANSA,
+                                                             &transa,
+                                                             sizeof(transa))) ||
+        !check_cublas("set TRANSB",
+                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                             CUBLASLT_MATMUL_DESC_TRANSB,
+                                                             &transb,
+                                                             sizeof(transb)))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    if (desc->needs_scalar_scale) {
+        float one = 1.0f;
+        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+        if (!check_cublas("set A scale ptr",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                 &a_scale_ptr,
+                                                                 sizeof(a_scale_ptr))) ||
+            !check_cublas("set B scale ptr",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                 &b_scale_ptr,
+                                                                 sizeof(b_scale_ptr)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+    }
+
+#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
+    if (desc->needs_block_scale) {
+        size_t a_scale_bytes = fp4_scale_bytes(out->k, out->m);
+        size_t b_scale_bytes = fp4_scale_bytes(out->k, out->n);
+        if (!alloc_filled(cuda, &out->a_scale_dev, a_scale_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_scale_dev, b_scale_bytes, 0x11)) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+        cublasLtMatmulMatrixScale_t scale_mode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
+        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+        if (!check_cublas("set A scale mode",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
+                                                                 &scale_mode,
+                                                                 sizeof(scale_mode))) ||
+            !check_cublas("set B scale mode",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_MODE,
+                                                                 &scale_mode,
+                                                                 sizeof(scale_mode))) ||
+            !check_cublas("set A block scale ptr",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                 &a_scale_ptr,
+                                                                 sizeof(a_scale_ptr))) ||
+            !check_cublas("set B block scale ptr",
+                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                 &b_scale_ptr,
+                                                                 sizeof(b_scale_ptr)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+    }
+#endif
+
+    if (!check_cublas("create A layout",
+                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+        !check_cublas("create B layout",
+                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+        !check_cublas("create C layout",
+                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+        !check_cublas("create D layout",
+                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    if (out->workspace_size > 0) {
+        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+    }
+
+    if (!check_cublas("set workspace",
+                      cublas->cublasLtMatmulPreferenceSetAttribute(
+                          out->preference,
+                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                          &out->workspace_size,
+                          sizeof(out->workspace_size)))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    int found = 0;
+    if (!check_cublas("heuristic",
+                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                             out->op_desc,
+                                                             out->a_layout,
+                                                             out->b_layout,
+                                                             out->c_layout,
+                                                             out->d_layout,
+                                                             out->preference,
+                                                             1,
+                                                             &out->heuristic,
+                                                             &found))) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+    if (found <= 0) {
+        destroy_profile(cublas, cuda, out);
+        return 0;
+    }
+
+    out->ready = 1;
+    return 1;
+}
+
+static int run_cublas_profile(cublasLtHandle_t handle,
+                              struct cublaslt_api *cublas,
+                              struct prepared_profile *profile) {
+    float alpha = 1.0f;
+    float beta = 0.0f;
+    return check_cublas(profile->desc.name,
+                        cublas->cublasLtMatmul(handle,
+                                               profile->op_desc,
+                                               &alpha,
+                                               (const void *)(uintptr_t)profile->a_dev,
+                                               profile->a_layout,
+                                               (const void *)(uintptr_t)profile->b_dev,
+                                               profile->b_layout,
+                                               &beta,
+                                               (const void *)(uintptr_t)profile->c_dev,
+                                               profile->c_layout,
+                                               (void *)(uintptr_t)profile->d_dev,
+                                               profile->d_layout,
+                                               &profile->heuristic.algo,
+                                               (void *)(uintptr_t)profile->workspace_dev,
+                                               profile->workspace_size,
+                                               (cudaStream_t)0));
+}
+
+static int run_cublaslt_stress(struct cuda_api *cuda,
+                               CUdevice dev,
+                               const char *device_name,
+                               int cc_major,
+                               int cc_minor,
+                               int seconds,
+                               int size_mb,
+                               struct stress_report *report) {
+    struct cublaslt_api cublas;
+    struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
+    cublasLtHandle_t handle = NULL;
+    CUcontext ctx = NULL;
+    uint16_t sample[256];
+    int cc = cc_major * 10 + cc_minor;
+    int planned = 0;
+    int active = 0;
+
+    memset(report, 0, sizeof(*report));
+    snprintf(report->backend, sizeof(report->backend), "cublasLt");
+    snprintf(report->device, sizeof(report->device), "%s", device_name);
+    report->cc_major = cc_major;
+    report->cc_minor = cc_minor;
+    report->buffer_mb = size_mb;
+
+    if (!load_cublaslt(&cublas)) {
+        snprintf(report->details, sizeof(report->details), "cublasLt=unavailable\n");
+        return 0;
+    }
+    if (!check_rc(cuda, "cuCtxCreate", cuda->cuCtxCreate(&ctx, 0, dev))) {
+        return 0;
+    }
+    if (!check_cublas("cublasLtCreate", cublas.cublasLtCreate(&handle))) {
+        cuda->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+            planned++;
+        }
+    }
+    if (planned <= 0) {
+        snprintf(report->details, sizeof(report->details), "cublasLt_profiles=unsupported\n");
+        cublas.cublasLtDestroy(handle);
+        cuda->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    size_t total_budget = (size_t)size_mb * 1024u * 1024u;
+    if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
+        total_budget = (size_t)planned * 4u * 1024u * 1024u;
+    }
+    size_t per_profile_budget = total_budget / (size_t)planned;
+    if (per_profile_budget < 4u * 1024u * 1024u) {
+        per_profile_budget = 4u * 1024u * 1024u;
+    }
+
+    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+        const struct profile_desc *desc = &k_profiles[i];
+        if (!(desc->enabled && cc >= desc->min_cc)) {
+            append_detail(report->details,
+                          sizeof(report->details),
+                          "%s=SKIPPED cc<%d\n",
+                          desc->name,
+                          desc->min_cc);
+            continue;
+        }
+        if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
+            active++;
+            append_detail(report->details,
+                          sizeof(report->details),
+                          "%s=READY dim=%llux%llux%llu block=%s\n",
+                          desc->name,
+                          (unsigned long long)prepared[i].m,
+                          (unsigned long long)prepared[i].n,
+                          (unsigned long long)prepared[i].k,
+                          desc->block_label);
+        } else {
+            append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
+        }
+    }
+
+    if (active <= 0) {
+        cublas.cublasLtDestroy(handle);
+        cuda->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    double deadline = now_seconds() + (double)seconds;
+    while (now_seconds() < deadline) {
+        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+            if (!prepared[i].ready) {
+                continue;
+            }
+            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s=FAILED runtime\n",
+                              prepared[i].desc.name);
+                for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
+                    destroy_profile(&cublas, cuda, &prepared[j]);
+                }
+                cublas.cublasLtDestroy(handle);
+                cuda->cuCtxDestroy(ctx);
+                return 0;
+            }
+            prepared[i].iterations++;
+            report->iterations++;
+            if (now_seconds() >= deadline) {
+                break;
+            }
+        }
+    }
+
+    if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+            destroy_profile(&cublas, cuda, &prepared[i]);
+        }
+        cublas.cublasLtDestroy(handle);
+        cuda->cuCtxDestroy(ctx);
+        return 0;
+    }
+
+    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+        if (!prepared[i].ready) {
+            continue;
+        }
+        append_detail(report->details,
+                      sizeof(report->details),
+                      "%s_iterations=%lu\n",
+                      prepared[i].desc.name,
+                      prepared[i].iterations);
+    }
+
+    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+        if (prepared[i].ready) {
+            if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
+                for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
+                    report->checksum += sample[j];
+                }
+            }
+            break;
+        }
+    }
+
+    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+        destroy_profile(&cublas, cuda, &prepared[i]);
+    }
+    cublas.cublasLtDestroy(handle);
+    cuda->cuCtxDestroy(ctx);
+    return 1;
+}
+#endif
+
 int main(int argc, char **argv) {
     int seconds = 5;
     int size_mb = 64;
@@ -181,20 +1112,20 @@ int main(int argc, char **argv) {
         size_mb = 64;
     }
 
-    struct cuda_api api;
-    if (!load_cuda(&api)) {
+    struct cuda_api cuda;
+    if (!load_cuda(&cuda)) {
         fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
         return 1;
     }
-    load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName);
-    load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString);
+    load_symbol(cuda.lib, "cuGetErrorName", (void **)&cuda.cuGetErrorName);
+    load_symbol(cuda.lib, "cuGetErrorString", (void **)&cuda.cuGetErrorString);
 
-    if (!check_rc(&api, "cuInit", api.cuInit(0))) {
+    if (!check_rc(&cuda, "cuInit", cuda.cuInit(0))) {
         return 1;
     }
 
     int count = 0;
-    if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) {
+    if (!check_rc(&cuda, "cuDeviceGetCount", cuda.cuDeviceGetCount(&count))) {
         return 1;
     }
     if (count <= 0) {
@@ -203,112 +1134,43 @@ int main(int argc, char **argv) {
     }
 
     CUdevice dev = 0;
-    if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) {
+    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
         return 1;
     }
+
     char name[128] = {0};
-    if (!check_rc(&api, "cuDeviceGetName", api.cuDeviceGetName(name, (int)sizeof(name), dev))) {
+    if (!check_rc(&cuda, "cuDeviceGetName", cuda.cuDeviceGetName(name, (int)sizeof(name), dev))) {
         return 1;
     }
 
-    CUcontext ctx = NULL;
-    if (!check_rc(&api, "cuCtxCreate", api.cuCtxCreate(&ctx, 0, dev))) {
+    int cc_major = 0;
+    int cc_minor = 0;
+    if (!query_compute_capability(&cuda, dev, &cc_major, &cc_minor)) {
         return 1;
     }
 
-    size_t bytes = (size_t)size_mb * 1024 * 1024;
-    uint32_t words = (uint32_t)(bytes / sizeof(uint32_t));
-    if (words < 1024) {
-        words = 1024;
-        bytes = (size_t)words * sizeof(uint32_t);
-    }
+    struct stress_report report;
+    int ok = 0;
 
-    uint32_t *host = (uint32_t *)malloc(bytes);
-    if (!host) {
-        fprintf(stderr, "malloc failed\n");
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-    for (uint32_t i = 0; i < words; i++) {
-        host[i] = i ^ 0x12345678u;
-    }
-
-    CUdeviceptr device_mem = 0;
-    if (!check_rc(&api, "cuMemAlloc", api.cuMemAlloc(&device_mem, bytes))) {
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-    if (!check_rc(&api, "cuMemcpyHtoD", api.cuMemcpyHtoD(device_mem, host, bytes))) {
-        api.cuMemFree(device_mem);
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-
-    CUmodule module = NULL;
-    if (!check_rc(&api, "cuModuleLoadDataEx", api.cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
-        api.cuMemFree(device_mem);
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-
-    CUfunction kernel = NULL;
-    if (!check_rc(&api, "cuModuleGetFunction", api.cuModuleGetFunction(&kernel, module, "burn"))) {
-        api.cuMemFree(device_mem);
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-
-    unsigned int threads = 256;
-    unsigned int blocks = (words + threads - 1) / threads;
-    uint32_t rounds = 256;
-    void *params[] = {&device_mem, &words, &rounds};
-
-    double start = now_seconds();
-    double deadline = start + (double)seconds;
-    unsigned long iterations = 0;
-    while (now_seconds() < deadline) {
-        if (!check_rc(&api, "cuLaunchKernel",
-                      api.cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
-            api.cuMemFree(device_mem);
-            free(host);
-            api.cuCtxDestroy(ctx);
+#if HAVE_CUBLASLT_HEADERS
+    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
+#endif
+    if (!ok) {
+        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
             return 1;
         }
-        iterations++;
     }
 
-    if (!check_rc(&api, "cuCtxSynchronize", api.cuCtxSynchronize())) {
-        api.cuMemFree(device_mem);
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
+    printf("device=%s\n", report.device);
+    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
+    printf("backend=%s\n", report.backend);
+    printf("duration_s=%d\n", seconds);
+    printf("buffer_mb=%d\n", report.buffer_mb);
+    printf("iterations=%lu\n", report.iterations);
+    printf("checksum=%llu\n", (unsigned long long)report.checksum);
+    if (report.details[0] != '\0') {
+        printf("%s", report.details);
     }
-    if (!check_rc(&api, "cuMemcpyDtoH", api.cuMemcpyDtoH(host, device_mem, bytes))) {
-        api.cuMemFree(device_mem);
-        free(host);
-        api.cuCtxDestroy(ctx);
-        return 1;
-    }
-
-    uint64_t checksum = 0;
-    for (uint32_t i = 0; i < words; i += words / 256 ? words / 256 : 1) {
-        checksum += host[i];
-    }
-
-    double elapsed = now_seconds() - start;
-    printf("device=%s\n", name);
-    printf("duration_s=%.2f\n", elapsed);
-    printf("buffer_mb=%d\n", size_mb);
-    printf("iterations=%lu\n", iterations);
-    printf("checksum=%llu\n", (unsigned long long)checksum);
     printf("status=OK\n");
-
-    api.cuMemFree(device_mem);
-    free(host);
-    api.cuCtxDestroy(ctx);
     return 0;
 }
diff --git a/iso/builder/build-cublas.sh b/iso/builder/build-cublas.sh
new file mode 100644
index 0000000..dc9aab6
--- /dev/null
+++ b/iso/builder/build-cublas.sh
@@ -0,0 +1,170 @@
+#!/bin/sh
+# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
+#
+# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
+# verifies them against Packages.gz, and extracts the small subset we need:
+#   - headers for compiling bee-gpu-stress against cuBLASLt
+#   - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
+
+set -e
+
+CUBLAS_VERSION="$1"
+CUDA_USERSPACE_VERSION="$2"
+CUDA_SERIES="$3"
+DIST_DIR="$4"
+
+[ -n "$CUBLAS_VERSION" ]        || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
+[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
+[ -n "$CUDA_SERIES" ]            || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
+[ -n "$DIST_DIR" ]               || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
+
+CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
+REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
+CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
+CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
+DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
+PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
+
+echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
+
+if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
+    && [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
+    echo "=== cuBLAS cached, skipping download ==="
+    echo "cache: $CACHE_DIR"
+    exit 0
+fi
+
+mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
+
+echo "=== downloading Packages.gz ==="
+wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
+
+lookup_pkg() {
+    pkg="$1"
+    ver="$2"
+    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
+        /^Package: / { cur_pkg=$2 }
+        /^Version: / { cur_ver=$2 }
+        /^Filename: / { cur_file=$2 }
+        /^SHA256: / { cur_sha=$2 }
+        /^$/ {
+            if (cur_pkg == pkg && cur_ver == ver) {
+                print cur_file " " cur_sha
+                exit
+            }
+            cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
+        }
+        END {
+            if (cur_pkg == pkg && cur_ver == ver) {
+                print cur_file " " cur_sha
+            }
+        }'
+}
+
+download_verified_pkg() {
+    pkg="$1"
+    ver="$2"
+
+    meta="$(lookup_pkg "$pkg" "$ver")"
+    [ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
+
+    repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
+    repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
+    [ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
+    [ -n "$repo_sha" ]  || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
+
+    out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
+    if [ -f "$out" ]; then
+        actual_sha="$(sha256sum "$out" | awk '{print $1}')"
+        if [ "$actual_sha" = "$repo_sha" ]; then
+            echo "=== using cached $(basename "$repo_file") ==="
+            printf '%s\n' "$out"
+            return 0
+        fi
+        echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ==="
+        rm -f "$out"
+    fi
+
+    echo "=== downloading $(basename "$repo_file") ==="
+    wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
+
+    actual_sha="$(sha256sum "$out" | awk '{print $1}')"
+    if [ "$actual_sha" != "$repo_sha" ]; then
+        echo "ERROR: sha256 mismatch for $(basename "$repo_file")"
+        echo "  expected: $repo_sha"
+        echo "  actual:   $actual_sha"
+        rm -f "$out"
+        exit 1
+    fi
+    echo "sha256 OK: $(basename "$repo_file")"
+    printf '%s\n' "$out"
+}
+
+extract_deb() {
+    deb="$1"
+    dst="$2"
+    mkdir -p "$dst"
+    (
+        cd "$dst"
+        ar x "$deb"
+        data_tar=$(ls data.tar.* 2>/dev/null | head -1)
+        [ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
+        tar xf "$data_tar"
+    )
+}
+
+copy_headers() {
+    from="$1"
+    if [ -d "${from}/usr/include" ]; then
+        cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
+    fi
+}
+
+copy_libs() {
+    from="$1"
+    find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
+        \( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
+}
+
+make_links() {
+    base="$1"
+    versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
+    [ -n "$versioned" ] || return 0
+    soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
+    target=$(basename "$versioned")
+    ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
+    ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
+}
+
+TMP_DIR=$(mktemp -d)
+trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
+
+CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
+CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
+CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
+CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
+
+extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
+extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
+extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
+extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
+
+copy_headers "${TMP_DIR}/cublas-dev"
+copy_headers "${TMP_DIR}/cudart-dev"
+copy_libs "${TMP_DIR}/cublas-rt"
+copy_libs "${TMP_DIR}/cudart-rt"
+
+make_links "libcublas"
+make_links "libcublasLt"
+make_links "libcudart"
+
+[ -f "${CACHE_DIR}/include/cublasLt.h" ]      || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
+[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
+[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
+[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
+[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
+
+echo "=== cuBLAS extraction complete ==="
+echo "cache: $CACHE_DIR"
+echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
+echo "libs:    $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
diff --git a/iso/builder/build-in-container.sh b/iso/builder/build-in-container.sh
index 26940bc..2b2ef70 100755
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -7,6 +7,7 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
 BUILDER_DIR="${REPO_ROOT}/iso/builder"
 CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
 IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
+BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
 CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
 AUTH_KEYS=""
 REBUILD_IMAGE=0
@@ -40,6 +41,13 @@ if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
     exit 1
 fi
 
+PLATFORM_OS="${BUILDER_PLATFORM%/*}"
+PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
+if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
+    echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
+    exit 1
+fi
+
 if [ -n "$AUTH_KEYS" ]; then
     [ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
     AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
@@ -56,17 +64,35 @@ mkdir -p \
 
 IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
 
-if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
+image_matches_platform() {
+    actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
+    [ "$actual_platform" = "${BUILDER_PLATFORM}" ]
+}
+
+NEED_BUILD_IMAGE=0
+if [ "$REBUILD_IMAGE" = "1" ]; then
+    NEED_BUILD_IMAGE=1
+elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
+    NEED_BUILD_IMAGE=1
+elif ! image_matches_platform; then
+    actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
+    echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
+    NEED_BUILD_IMAGE=1
+fi
+
+if [ "$NEED_BUILD_IMAGE" = "1" ]; then
     "$CONTAINER_TOOL" build \
+        --platform "${BUILDER_PLATFORM}" \
         --build-arg GO_VERSION="${GO_VERSION}" \
         -t "${IMAGE_REF}" \
         "${BUILDER_DIR}"
 else
-    echo "=== using existing builder image ${IMAGE_REF} ==="
+    echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
 fi
 
 set -- \
     run --rm --privileged \
+    --platform "${BUILDER_PLATFORM}" \
     -v "${REPO_ROOT}:/work" \
     -v "${CACHE_DIR}:/cache" \
     -e BEE_CONTAINER_BUILD=1 \
@@ -80,6 +106,7 @@ set -- \
 
 if [ -n "$AUTH_KEYS" ]; then
     set -- run --rm --privileged \
+        --platform "${BUILDER_PLATFORM}" \
         -v "${REPO_ROOT}:/work" \
         -v "${CACHE_DIR}:/cache" \
         -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
diff --git a/iso/builder/build.sh b/iso/builder/build.sh
index a5611fa..5ec4120 100755
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -159,6 +159,16 @@ else
     echo "=== bee binary up to date, skipping build ==="
 fi
 
+echo ""
+echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
+sh "${BUILDER_DIR}/build-cublas.sh" \
+    "${CUBLAS_VERSION}" \
+    "${CUDA_USERSPACE_VERSION}" \
+    "${NCCL_CUDA_VERSION}" \
+    "${DIST_DIR}"
+
+CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
+
 GPU_STRESS_NEED_BUILD=1
 if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
     GPU_STRESS_NEED_BUILD=0
@@ -167,6 +177,7 @@ fi
 if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
     echo "=== building bee-gpu-stress ==="
     gcc -O2 -s -Wall -Wextra \
+        -I"${CUBLAS_CACHE}/include" \
         -o "$GPU_STRESS_BIN" \
         "${BUILDER_DIR}/bee-gpu-stress.c" \
         -ldl
@@ -283,6 +294,10 @@ NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
 cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
 echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
 
+# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
+cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
+echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
+
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
@@ -297,6 +312,8 @@ DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
 NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
 NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
+CUBLAS_VERSION=${CUBLAS_VERSION}
+CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
 EOF
 
 # Patch motd with build info
diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot
index 1d9ee0e..654f796 100644
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -20,6 +20,7 @@ openssh-server
 
 # Filesystem support for USB export targets
 exfatprogs
+exfat-fuse
 ntfs-3g
 
 # Utilities