feat(iso): split into nvidia and amd variants, fix KVM graphics and PATH

- build.sh: add --variant nvidia|amd; separate work dirs per variant (live-build-work-nvidia / live-build-work-amd); GPU-specific steps (modules, NCCL, cuBLAS, nccl-tests) run only for nvidia; deb package cache synced back to shared location after each lb build so second variant reuses downloaded packages; ISO output named easy-bee-{variant}-v{ver}-amd64.iso - build-in-container.sh: add --variant nvidia|amd|all (default: all); runs build.sh twice in one container for 'all'; --clean-build wipes both variant work dirs - package-lists: remove GPU packages from bee.list.chroot; add bee-nvidia.list.chroot (DCGM) and bee-amd.list.chroot (ROCm) - 9000-bee-setup hook: read /etc/bee-gpu-vendor; enable bee-nvidia.service and DCGM only for nvidia; set up ROCm symlinks only for amd - auto/config: --iso-volume uses BEE_GPU_VENDOR_UPPER env var - grub.cfg: add nomodeset to EASY-BEE and EASY-BEE (load to RAM) entries — fixes X/lightdm on BMC KVM (ASPEED AST chip requires nomodeset for fbdev to work; NVIDIA H100 compute does not need KMS) - bee.sh / smoketest.sh: add /usr/sbin to PATH so dmidecode, smartctl, nvme are found - 9100-memtest hook: add diagnostic listing of chroot/boot/memtest* files Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
fix(iso): substitute all ROCm package version placeholders in build.sh
2026-03-30 22:24:37 +03:00 · 2026-03-29 22:00:05 +03:00 · 2026-03-29 21:57:33 +03:00 · 2026-03-29 21:24:06 +03:00 · 2026-03-29 12:28:06 +03:00 · 2026-03-29 12:03:50 +03:00
22 changed files with 1422 additions and 210 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -114,10 +114,13 @@ type satRunner interface {
 	DetectGPUVendor() string
 	ListAMDGPUs() ([]platform.AMDGPUInfo, error)
 	RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
+	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
 	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 }

@@ -577,6 +580,20 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
 	return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
 }

+// RunAMDMemIntegrityPackCtx forwards to the SAT runner's RunAMDMemIntegrityPack.
+// When baseDir is empty or whitespace-only, DefaultSATBaseDir is used instead.
+// The runner's result string and error are returned unchanged.
+func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
+}
+
+// RunAMDMemBandwidthPackCtx forwards to the SAT runner's RunAMDMemBandwidthPack.
+// When baseDir is empty or whitespace-only, DefaultSATBaseDir is used instead.
+// The runner's result string and error are returned unchanged.
+func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
+}
+
 func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
 	return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
 }
@@ -611,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
 	return a.sat.RunFanStressTest(ctx, baseDir, opts)
 }

+// RunPlatformStress forwards to the SAT runner's RunPlatformStress with the
+// given cycle options. When baseDir is empty or whitespace-only,
+// DefaultSATBaseDir is used instead. The runner's result string and error are
+// returned unchanged.
+func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
+}
+
 func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
 	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
 	body := "Results: " + path
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -181,6 +181,14 @@ func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(
 	return "", nil
 }

+func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
+func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
+	return "", nil
+}
+
 func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
 	return "", nil
 }
@@ -195,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
 	return "", nil
 }

+func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
+	return "", nil
+}
+
 func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
 	return "", nil
 }
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -0,0 +1,476 @@
+package platform
+
+import (
+	"archive/tar"
+	"bytes"
+	"compress/gzip"
+	"context"
+	"encoding/csv"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+)
+
+// PlatformStressCycle defines one load+idle cycle.
+type PlatformStressCycle struct {
+	LoadSec int // seconds of simultaneous CPU+GPU stress
+	IdleSec int // seconds of idle monitoring after load cut
+}
+
+// PlatformStressOptions controls the thermal cycling test.
+type PlatformStressOptions struct {
+	Cycles []PlatformStressCycle
+}
+
+// platformStressRow is one second of telemetry.
+type platformStressRow struct {
+	ElapsedSec   float64
+	Cycle        int
+	Phase        string // "load" | "idle"
+	CPULoadPct   float64
+	MaxCPUTempC  float64
+	MaxGPUTempC  float64
+	SysPowerW    float64
+	FanMinRPM    float64
+	FanMaxRPM    float64
+	GPUThrottled bool
+}
+
+// RunPlatformStress runs repeated load+idle thermal cycling.
+// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
+// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
+//
+// Telemetry rows are written to metrics.csv and the per-cycle verdict to
+// summary.txt inside a timestamped run directory under baseDir; the directory
+// is then packed into platform-stress-<stamp>.tar.gz, whose path is returned.
+// A nil logFunc is replaced by a no-op. If ctx is cancelled, the remaining
+// cycles are skipped and whatever was collected so far is still archived.
+//
+// An error is returned when opts has no cycles, when a directory cannot be
+// created, or when a result file or the archive cannot be written. On a
+// write/pack failure the run directory is left in place for inspection.
+func (s *System) RunPlatformStress(
+	ctx context.Context,
+	baseDir string,
+	opts PlatformStressOptions,
+	logFunc func(string),
+) (string, error) {
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if len(opts.Cycles) == 0 {
+		return "", fmt.Errorf("no cycles defined")
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+
+	stamp := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir run dir: %w", err)
+	}
+
+	vendor := s.DetectGPUVendor()
+	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
+
+	var rows []platformStressRow
+	start := time.Now()
+
+	var analyses []cycleAnalysis
+
+	for i, cycle := range opts.Cycles {
+		if ctx.Err() != nil {
+			break
+		}
+		cycleNum := i + 1
+		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
+
+		// ── LOAD PHASE ───────────────────────────────────────────────────────
+		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
+		var wg sync.WaitGroup
+
+		// CPU stress
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			cpuCmd, err := buildCPUStressCmd(loadCtx)
+			if err != nil {
+				logFunc("CPU stress: " + err.Error())
+				return
+			}
+			_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
+		}()
+
+		// GPU stress
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+			if gpuCmd == nil {
+				return
+			}
+			_ = gpuCmd.Wait() // exit status is expected to be non-zero after the kill
+		}()
+
+		// Sample on this goroutine until the load window closes.
+		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
+		for _, r := range loadRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, loadRows...)
+		loadCancel()
+		wg.Wait() // make sure both stressors are gone before the idle phase starts
+
+		if len(loadRows) > 0 {
+			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
+		}
+
+		// ── IDLE PHASE ───────────────────────────────────────────────────────
+		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
+		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
+		for _, r := range idleRows {
+			logFunc(formatPlatformRow(r))
+		}
+		rows = append(rows, idleRows...)
+		idleCancel()
+
+		// Per-cycle analysis
+		an := analyzePlatformCycle(loadRows, idleRows)
+		analyses = append(analyses, an)
+		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
+			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
+	}
+
+	// Write CSV. A failed write must surface: otherwise the archive would be
+	// reported as a success while missing the telemetry.
+	csvPath := filepath.Join(runDir, "metrics.csv")
+	if err := os.WriteFile(csvPath, writePlatformCSV(rows), 0644); err != nil {
+		return "", fmt.Errorf("write %s: %w", csvPath, err)
+	}
+
+	// Write summary
+	summary := writePlatformSummary(opts, analyses)
+	logFunc("--- Summary ---")
+	for _, line := range strings.Split(summary, "\n") {
+		if line != "" {
+			logFunc(line)
+		}
+	}
+	summaryPath := filepath.Join(runDir, "summary.txt")
+	if err := os.WriteFile(summaryPath, []byte(summary), 0644); err != nil {
+		return "", fmt.Errorf("write %s: %w", summaryPath, err)
+	}
+
+	// Pack tar.gz
+	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
+	if err := packPlatformDir(runDir, archivePath); err != nil {
+		return "", fmt.Errorf("pack archive: %w", err)
+	}
+	// Best effort: the archive already holds everything from the run directory.
+	_ = os.RemoveAll(runDir)
+	return archivePath, nil
+}
+
+// collectPhase samples live metrics every second until ctx is done.
+// Each sample is converted to a platformStressRow tagged with the given cycle
+// number and phase name; testStart is the reference for the row's elapsed time.
+// The first sample is taken on the first ticker fire (one second in), so a
+// phase whose context ends earlier than that returns no rows.
+func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
+	var rows []platformStressRow
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return rows
+		case <-ticker.C:
+			sample := SampleLiveMetrics()
+			rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
+		}
+	}
+}
+
+// sampleToPlatformRow condenses one LiveMetricSample into a telemetry row.
+// ElapsedSec is measured from testStart at the moment of conversion. CPU and
+// GPU temperatures are reduced to their maximum; fan speeds to a min/max pair
+// (both left at zero when the sample carries no fans).
+//
+// NOTE(review): GPUThrottled is never assigned here, so rows built by this
+// function always report false and the throttle verdict downstream cannot
+// trigger — confirm whether LiveMetricSample exposes a throttle signal.
+func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
+	r := platformStressRow{
+		ElapsedSec: time.Since(testStart).Seconds(),
+		Cycle:      cycle,
+		Phase:      phase,
+		CPULoadPct: s.CPULoadPct,
+		SysPowerW:  s.PowerW,
+	}
+	// Sensor readings are bucketed by group; other groups are ignored.
+	for _, t := range s.Temps {
+		switch t.Group {
+		case "cpu":
+			if t.Celsius > r.MaxCPUTempC {
+				r.MaxCPUTempC = t.Celsius
+			}
+		case "gpu":
+			if t.Celsius > r.MaxGPUTempC {
+				r.MaxGPUTempC = t.Celsius
+			}
+		}
+	}
+	// Per-GPU temperatures feed the same GPU maximum as the "gpu" sensor group.
+	for _, g := range s.GPUs {
+		if g.TempC > r.MaxGPUTempC {
+			r.MaxGPUTempC = g.TempC
+		}
+	}
+	// Seed min/max from the first fan so the minimum is not stuck at zero.
+	if len(s.Fans) > 0 {
+		r.FanMinRPM = s.Fans[0].RPM
+		r.FanMaxRPM = s.Fans[0].RPM
+		for _, f := range s.Fans[1:] {
+			if f.RPM < r.FanMinRPM {
+				r.FanMinRPM = f.RPM
+			}
+			if f.RPM > r.FanMaxRPM {
+				r.FanMaxRPM = f.RPM
+			}
+		}
+	}
+	return r
+}
+
+func formatPlatformRow(r platformStressRow) string {
+	throttle := ""
+	if r.GPUThrottled {
+		throttle = " THROTTLE"
+	}
+	fans := ""
+	if r.FanMinRPM > 0 {
+		fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
+	}
+	return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
+		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
+}
+
+// analyzePlatformCycle derives the per-cycle figures used in the summary.
+// Peak CPU/GPU temperature, peak power and the throttle flag come from the
+// load rows only. Fan behaviour compares the fan speed just before the load
+// was cut with the lowest speed seen in the first 15 seconds of idle, where
+// "fan speed" for a row is the midpoint of its min and max RPM.
+// With no usable fan data all fan fields stay zero.
+func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
+	var an cycleAnalysis
+	for _, r := range loadRows {
+		if r.MaxCPUTempC > an.maxCPUTemp {
+			an.maxCPUTemp = r.MaxCPUTempC
+		}
+		if r.MaxGPUTempC > an.maxGPUTemp {
+			an.maxGPUTemp = r.MaxGPUTempC
+		}
+		if r.SysPowerW > an.maxPower {
+			an.maxPower = r.SysPowerW
+		}
+		if r.GPUThrottled {
+			an.throttled = true
+		}
+	}
+	// Fan RPM at cut = avg of last 5 load rows
+	// (fewer if the load phase was shorter; rows without fan data are skipped).
+	if n := len(loadRows); n > 0 {
+		window := loadRows
+		if n > 5 {
+			window = loadRows[n-5:]
+		}
+		var sum float64
+		var cnt int
+		for _, r := range window {
+			if r.FanMinRPM > 0 {
+				sum += (r.FanMinRPM + r.FanMaxRPM) / 2
+				cnt++
+			}
+		}
+		if cnt > 0 {
+			an.fanAtCutAvg = sum / float64(cnt)
+		}
+	}
+	// Fan RPM min in first 15s of idle
+	// Starts from the at-cut average so the drop is never negative.
+	an.fanMin15s = an.fanAtCutAvg
+	var cutElapsed float64
+	if len(loadRows) > 0 {
+		cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
+	}
+	for _, r := range idleRows {
+		// Rows are in time order, so everything after the window can be skipped.
+		if r.ElapsedSec > cutElapsed+15 {
+			break
+		}
+		avg := (r.FanMinRPM + r.FanMaxRPM) / 2
+		if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
+			an.fanMin15s = avg
+		}
+	}
+	// Drop is expressed as a percentage of the at-cut speed.
+	if an.fanAtCutAvg > 0 {
+		an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
+	}
+	return an
+}
+
+type cycleAnalysis struct {
+	maxCPUTemp  float64
+	maxGPUTemp  float64
+	maxPower    float64
+	throttled   bool
+	fanAtCutAvg float64
+	fanMin15s   float64
+	fanDropPct  float64
+}
+
+func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
+	fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
+
+	totalThrottle := 0
+	totalFanWarn := 0
+	for i, an := range analyses {
+		cycle := opts.Cycles[i]
+		fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
+		fmt.Fprintf(&b, "  Max CPU temp: %.1f°C\n", an.maxCPUTemp)
+		fmt.Fprintf(&b, "  Max GPU temp: %.1f°C\n", an.maxGPUTemp)
+		fmt.Fprintf(&b, "  Max sys power: %.0f W\n", an.maxPower)
+		if an.throttled {
+			fmt.Fprintf(&b, "  Throttle: DETECTED\n")
+			totalThrottle++
+		} else {
+			fmt.Fprintf(&b, "  Throttle: none\n")
+		}
+		if an.fanAtCutAvg > 0 {
+			fmt.Fprintf(&b, "  Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
+			fmt.Fprintf(&b, "  Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
+			if an.fanDropPct > 20 {
+				fmt.Fprintf(&b, "  Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
+				totalFanWarn++
+			} else {
+				fmt.Fprintf(&b, "  Fan response: OK\n")
+			}
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
+	if totalThrottle > 0 {
+		fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
+	} else if totalFanWarn > 0 {
+		fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
+	} else {
+		fmt.Fprintf(&b, "Overall: PASS\n")
+	}
+	return b.String()
+}
+
+func writePlatformCSV(rows []platformStressRow) []byte {
+	var buf bytes.Buffer
+	w := csv.NewWriter(&buf)
+	_ = w.Write([]string{
+		"elapsed_sec", "cycle", "phase",
+		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
+		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
+	})
+	for _, r := range rows {
+		throttled := "0"
+		if r.GPUThrottled {
+			throttled = "1"
+		}
+		_ = w.Write([]string{
+			strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
+			strconv.Itoa(r.Cycle),
+			r.Phase,
+			strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
+			strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
+			strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
+			strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
+			strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
+			strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
+			throttled,
+		})
+	}
+	w.Flush()
+	return buf.Bytes()
+}
+
+// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
+func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
+	path, err := satLookPath("stressapptest")
+	if err != nil {
+		return nil, fmt.Errorf("stressapptest not found: %w", err)
+	}
+	// Use a very long duration; the context timeout will kill it at the right time.
+	cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := cmd.Start(); err != nil {
+		return nil, fmt.Errorf("stressapptest start: %w", err)
+	}
+	return cmd, nil
+}
+
+// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
+// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
+func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+	switch strings.ToLower(vendor) {
+	case "amd":
+		return buildAMDGPUStressCmd(ctx)
+	case "nvidia":
+		return buildNvidiaGPUStressCmd(ctx)
+	}
+	return nil
+}
+
+// buildAMDGPUStressCmd writes an RVS GST config to /tmp and starts rvs with it.
+// The configured duration is deliberately huge; cancelling ctx is what stops
+// the process. Returns nil when rvs cannot be resolved, the config file cannot
+// be written, or the process fails to start, so callers can treat nil as
+// "no GPU stress running".
+func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+	rvsArgs, err := resolveRVSCommand()
+	if err != nil || len(rvsArgs) == 0 {
+		return nil
+	}
+	rvsPath := rvsArgs[0]
+	cfg := `actions:
+- name: gst_platform
+  device: all
+  module: gst
+  parallel: true
+  duration: 86400000
+  copy_matrix: false
+  target_stress: 90
+  matrix_size_a: 8640
+  matrix_size_b: 8640
+  matrix_size_c: 8640
+`
+	cfgFile := "/tmp/bee-platform-gst.conf"
+	// Without the config rvs would run against a missing or stale file.
+	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := cmd.Start(); err != nil {
+		return nil
+	}
+	return cmd
+}
+
+// buildNvidiaGPUStressCmd starts bee-gpu-stress bound to ctx.
+// The requested run time is deliberately huge; cancelling ctx is what stops
+// the process. Returns nil when the binary cannot be found or fails to start,
+// so callers can treat nil as "no GPU stress running".
+func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+	path, err := satLookPath("bee-gpu-stress")
+	if err != nil {
+		return nil
+	}
+	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
+	cmd.Stdout = nil
+	cmd.Stderr = nil
+	if err := cmd.Start(); err != nil {
+		return nil
+	}
+	return cmd
+}
+
+// packPlatformDir writes the regular files directly inside dir into a gzip
+// compressed tar archive at dest. Entries are stored under the base name of
+// dir; subdirectories are not descended into, and files that cannot be read
+// are skipped.
+//
+// The tar, gzip and file writers are closed explicitly and the first close
+// error is returned: those closes flush the archive trailer, so ignoring them
+// could report a truncated archive as a success. On any error the partial
+// archive is removed.
+func packPlatformDir(dir, dest string) (err error) {
+	f, err := os.Create(dest)
+	if err != nil {
+		return err
+	}
+	gz := gzip.NewWriter(f)
+	tw := tar.NewWriter(gz)
+	defer func() {
+		// Innermost writer first so each layer flushes into the next.
+		for _, closeFn := range []func() error{tw.Close, gz.Close, f.Close} {
+			if cerr := closeFn(); cerr != nil && err == nil {
+				err = cerr
+			}
+		}
+		if err != nil {
+			_ = os.Remove(dest) // best effort: do not leave a corrupt archive behind
+		}
+	}()
+
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+	base := filepath.Base(dir)
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		fpath := filepath.Join(dir, e.Name())
+		data, err := os.ReadFile(fpath)
+		if err != nil {
+			// Best effort: one unreadable file should not lose the rest.
+			continue
+		}
+		hdr := &tar.Header{
+			Name:    filepath.Join(base, e.Name()),
+			Size:    int64(len(data)),
+			Mode:    0644,
+			ModTime: time.Now(),
+		}
+		if err := tw.WriteHeader(hdr); err != nil {
+			return err
+		}
+		if _, err := tw.Write(data); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
 	}, logFunc)
 }

+// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
+func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if err := ensureAMDRuntimeReady(); err != nil {
+		return "", err
+	}
+	cfgFile := "/tmp/bee-amd-mem.conf"
+	cfg := `actions:
+- name: mem_integrity
+  device: all
+  module: mem
+  parallel: true
+  duration: 60000
+  copy_matrix: false
+  target_stress: 90
+  matrix_size: 8640
+`
+	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
+		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
+		{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
+		{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
+	}, logFunc)
+}
+
+// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
+func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if err := ensureAMDRuntimeReady(); err != nil {
+		return "", err
+	}
+	cfgFile := "/tmp/bee-amd-babel.conf"
+	cfg := `actions:
+- name: babel_mem_bw
+  device: all
+  module: babel
+  parallel: true
+  copy_matrix: true
+  target_stress: 90
+  matrix_size: 134217728
+`
+	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
+		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
+		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
+		{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
+		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
+	}, logFunc)
+}
+
 // RunAMDStressPack runs an AMD GPU burn-in pack.
 // Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
 func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
@@ -146,8 +194,16 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
 	if err := ensureAMDRuntimeReady(); err != nil {
 		return "", err
 	}
-	// Write RVS GST config to a temp file
-	rvsCfg := fmt.Sprintf(`actions:
+	// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
+	rvsCfg := amdStressRVSConfig(seconds)
+	cfgFile := "/tmp/bee-amd-gst.conf"
+	_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
+
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
+}
+
+func amdStressRVSConfig(seconds int) string {
+	return fmt.Sprintf(`actions:
 - name: gst_stress
  device: all
  module: gst
@@ -159,15 +215,15 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
  matrix_size_b: 8640
  matrix_size_c: 8640
 `, seconds*1000)
-	cfgFile := "/tmp/bee-amd-gst.conf"
-	_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
+}

-	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
+func amdStressJobs(seconds int, cfgFile string) []satJob {
+	return []satJob{
 		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
 		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
 		{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
 		{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
-	}, logFunc)
+	}
 }

 // ListNvidiaGPUs returns GPUs visible to nvidia-smi.
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
 )

@@ -38,6 +39,47 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
 	}
 }

+func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
+	t.Parallel()
+
+	cfg := amdStressRVSConfig(123)
+	if !strings.Contains(cfg, "module: gst") {
+		t.Fatalf("config missing gst module:\n%s", cfg)
+	}
+	if strings.Contains(cfg, "module: mem") {
+		t.Fatalf("config should not include mem module:\n%s", cfg)
+	}
+	if !strings.Contains(cfg, "copy_matrix: false") {
+		t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
+	}
+	if strings.Count(cfg, "duration: 123000") != 1 {
+		t.Fatalf("config should apply duration once:\n%s", cfg)
+	}
+	for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
+		if !strings.Contains(cfg, field) {
+			t.Fatalf("config missing %s:\n%s", field, cfg)
+		}
+	}
+}
+
+func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
+	t.Parallel()
+
+	jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
+	if len(jobs) != 4 {
+		t.Fatalf("jobs=%d want 4", len(jobs))
+	}
+	if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
+		t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
+	}
+	if got := jobs[2].cmd[0]; got != "rvs" {
+		t.Fatalf("jobs[2]=%q want rvs", got)
+	}
+	if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
+		t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
+	}
+}
+
 func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
 	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
 	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
 		case <-r.Context().Done():
 			return
 		case <-ticker.C:
-			sample := platform.SampleLiveMetrics()
-			h.feedRings(sample)
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+			sample, ok := h.latestMetric()
+			if !ok {
+				continue
 			}
 			b, err := json.Marshal(sample)
 			if err != nil {
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -3,7 +3,6 @@ package webui
 import (
 	"database/sql"
 	"encoding/csv"
-	"fmt"
 	"io"
 	"strconv"
 	"time"
@@ -13,7 +12,6 @@ import (
 )

 const metricsDBPath = "/appdata/bee/metrics.db"
-const metricsKeepDuration = 24 * time.Hour

 // MetricsDB persists live metric samples to SQLite.
 type MetricsDB struct {
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 }

 // LoadRecent returns up to n samples in chronological order (oldest first).
-// It reconstructs LiveMetricSample from the normalized tables.
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	rows, err := m.db.Query(
-		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
-	)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
+}
+
+// LoadAll returns all persisted samples in chronological order (oldest first).
+// NOTE(review): loadSamples' body is not visible here; if it reverses the rows
+// to undo LoadRecent's DESC ordering, this ascending query would come back
+// newest-first — confirm.
+func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
+	// The query has no placeholders, so no bind arguments are passed. A literal
+	// nil here would become a one-element variadic slice, i.e. one bound value.
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`)
+}
+
+// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
+func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
+	rows, err := m.db.Query(query, args...)
 	if err != nil {
 		return nil, err
 	}
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
 	return samples, nil
 }

-// Prune deletes samples older than keepDuration.
-func (m *MetricsDB) Prune(keepDuration time.Duration) {
-	cutoff := time.Now().Add(-keepDuration).Unix()
-	for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
-		_, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff)
-	}
-}
-
 // ExportCSV writes all sys+gpu data as CSV to w.
 func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	rows, err := m.db.Query(`
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -494,7 +494,11 @@ func renderValidate() string {
 		renderSATCard("memory", "Memory", "") +
 		renderSATCard("storage", "Storage", "") +
 		renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
-		renderSATCard("amd", "AMD GPU", "") +
+		renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
+<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
+</div>
+<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
 		`</div>
 <div id="sat-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Test Output <span id="sat-title"></span></div>
@@ -505,7 +509,7 @@ let satES = null;
 function runSAT(target) {
  if (satES) { satES.close(); satES = null; }
  const body = {};
-  const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
+  const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
  body.display_name = labels[target] || ('Validate ' + target);
  if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
  if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -524,7 +528,7 @@ function runSAT(target) {
 }
 function runAllSAT() {
  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
-  const targets = ['nvidia','memory','storage','cpu','amd'];
+  const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
  const total = targets.length * cycles;
  let enqueued = 0;
  const status = document.getElementById('sat-all-status');
@@ -536,7 +540,7 @@ function runAllSAT() {
    const btn = document.getElementById('sat-btn-' + target);
    if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
    const body = {};
-    const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
+    const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
    body.display_name = labels[target] || ('Validate ' + target);
    if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
    if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -554,6 +558,8 @@ function runAllSAT() {
 fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
    if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
+    if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
+    if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
 });
 function disableSATCard(id, reason) {
    const btn = document.getElementById('sat-btn-' + id);
@@ -598,7 +604,7 @@ func renderBurn() string {
 <button class="btn btn-primary" onclick="runBurnIn('cpu')">&#9654; Start CPU Stress</button>
 </div></div>
 <div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
-<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
+<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
 <button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">&#9654; Start AMD Stress</button>
 </div></div>
 <div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
@@ -609,6 +615,10 @@ func renderBurn() string {
 <p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
 <button class="btn btn-primary" onclick="runBurnIn('sat-stress')">&#9654; Start SAT Stress</button>
 </div></div>
+<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
+<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
+<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">&#9654; Start Thermal Cycling</button>
+</div></div>
 </div>
 <div id="bi-output" style="display:none;margin-top:16px" class="card">
  <div class="card-head">Output <span id="bi-title"></span></div>
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -72,29 +72,36 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
 	defer r.mu.Unlock()
 	v := make([]float64, len(r.vals))
 	copy(v, r.vals)
-	now := time.Now()
 	labels := make([]string, len(r.times))
+	if len(r.times) == 0 {
+		return v, labels
+	}
+	sameDay := timestampsSameLocalDay(r.times)
 	for i, t := range r.times {
-		labels[i] = relAgeLabel(now.Sub(t))
+		labels[i] = formatTimelineLabel(t.Local(), sameDay)
 	}
 	return v, labels
 }

-func relAgeLabel(age time.Duration) string {
-	if age <= 0 {
-		return "0"
+// timestampsSameLocalDay reports whether every timestamp in times falls on
+// the same calendar day once converted to local time (same year and same
+// day-of-year as the first element). An empty slice counts as "same day".
+func timestampsSameLocalDay(times []time.Time) bool {
+	if len(times) == 0 {
+		return true
 	}
-	if age < time.Hour {
-		m := int(age.Minutes())
-		if m == 0 {
-			return "-1m"
+	first := times[0].Local()
+	for _, t := range times[1:] {
+		local := t.Local()
+		if local.Year() != first.Year() || local.YearDay() != first.YearDay() {
+			return false
 		}
-		return fmt.Sprintf("-%dm", m)
 	}
-	if age < 24*time.Hour {
-		return fmt.Sprintf("-%dh", int(age.Hours()))
+	return true
+}
+
+// formatTimelineLabel renders ts as an axis label: "HH:MM" when sameDay is
+// true, otherwise "MM-DD HH:MM" so points from different days stay
+// distinguishable. ts is formatted as given; callers pass it already
+// converted to local time.
+func formatTimelineLabel(ts time.Time, sameDay bool) string {
+	if sameDay {
+		return ts.Format("15:04")
 	}
-	return fmt.Sprintf("-%dd", int(age.Hours()/24))
+	return ts.Format("01-02 15:04")
 }

 // gpuRings holds per-GPU ring buffers.
@@ -132,6 +139,8 @@ type handler struct {
 	// per-GPU rings (index = GPU index)
 	gpuRings []*gpuRings
 	ringsMu  sync.Mutex
+	latestMu sync.RWMutex
+	latest   *platform.LiveMetricSample
 	// metrics persistence (nil if DB unavailable)
 	metricsDB *MetricsDB
 	// install job (at most one at a time)
@@ -164,13 +173,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// Open metrics DB and pre-fill ring buffers from history.
 	if db, err := openMetricsDB(metricsDBPath); err == nil {
 		h.metricsDB = db
-		db.Prune(metricsKeepDuration)
 		if samples, err := db.LoadRecent(120); err == nil {
 			for _, s := range samples {
 				h.feedRings(s)
 			}
+			if len(samples) > 0 {
+				h.setLatestMetric(samples[len(samples)-1])
+			}
 		}
 	}
+	h.startMetricsCollector()

 	globalQueue.startWorker(&opts)
 	mux := http.NewServeMux()
@@ -198,9 +210,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
 	mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
 	mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
+	mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
+	mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
 	mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
 	mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
 	mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
+	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)

@@ -260,6 +275,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	return mux
 }

+// startMetricsCollector launches a background goroutine that takes one live
+// metrics sample per second. Each sample is fed into the in-memory ring
+// buffers, recorded as the latest sample and, when a metrics DB is open,
+// written to it.
+//
+// NOTE(review): the goroutine has no stop signal, so it runs until the
+// process exits, and every call starts an additional collector.
+func (h *handler) startMetricsCollector() {
+	const interval = 1 * time.Second
+	collect := func() {
+		s := platform.SampleLiveMetrics()
+		h.feedRings(s)
+		h.setLatestMetric(s)
+		if h.metricsDB == nil {
+			return
+		}
+		// The write error is deliberately discarded, so sampling carries on
+		// regardless of whether persistence succeeded.
+		_ = h.metricsDB.Write(s)
+	}
+	go func() {
+		tick := time.NewTicker(interval)
+		defer tick.Stop()
+		for range tick.C {
+			collect()
+		}
+	}()
+}
+
+// setLatestMetric records sample as the most recent metrics sample. The
+// handler keeps its own (shallow) copy of the struct, published under the
+// write lock.
+func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
+	stored := new(platform.LiveMetricSample)
+	*stored = sample
+
+	h.latestMu.Lock()
+	h.latest = stored
+	h.latestMu.Unlock()
+}
+
+// latestMetric returns a copy of the most recently recorded metrics sample.
+// The boolean is false (and the sample is the zero value) when no sample has
+// been recorded yet.
+func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
+	h.latestMu.RLock()
+	defer h.latestMu.RUnlock()
+	if cur := h.latest; cur != nil {
+		return *cur, true
+	}
+	return platform.LiveMetricSample{}, false
+}
+
 // ListenAndServe starts the HTTP server.
 func ListenAndServe(addr string, opts HandlerOptions) error {
 	return http.ListenAndServe(addr, NewHandler(opts))
@@ -387,6 +433,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
 	path = strings.TrimSuffix(path, ".svg")

+	if h.metricsDB != nil {
+		if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
+			buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
+			if err != nil {
+				http.Error(w, err.Error(), http.StatusInternalServerError)
+				return
+			}
+			w.Header().Set("Content-Type", "image/svg+xml")
+			w.Header().Set("Cache-Control", "no-store")
+			_, _ = w.Write(buf)
+			return
+		}
+	}
+
 	var datasets [][]float64
 	var names []string
 	var labels []string
@@ -601,6 +661,259 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

+// chartDataFromDB builds chart data for path from every sample stored in the
+// metrics DB (loaded via LoadAll on each call). It reports ok=false when the
+// load fails, the DB holds no samples, or chartDataFromSamples cannot build
+// the chart. h.metricsDB is dereferenced without a nil check, so the caller
+// must have verified it.
+func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
+	history, loadErr := h.metricsDB.LoadAll()
+	if loadErr == nil && len(history) > 0 {
+		return chartDataFromSamples(path, history)
+	}
+	return nil, nil, nil, "", nil, nil, false
+}
+
+// chartDataFromSamples builds the series for the chart identified by path
+// from the given samples.
+//
+// It returns, in order: the datasets (one []float64 per series, one value per
+// sample), the legend names, the X-axis time labels, the chart title, the
+// optional Y-axis lower and upper bounds (nil means "not fixed"), and ok.
+//
+// ok is false when path is not a known chart, when a per-GPU path
+// ("gpu/<index>" optionally followed by "-load", "-temp" or another suffix,
+// which is treated as power) has an index that does not parse or names a GPU
+// absent from every sample, or when no dataset could be built.
+func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
+	var datasets [][]float64
+	var names []string
+	var title string
+	var yMin, yMax *float64
+	labels := sampleTimeLabels(samples)
+
+	switch {
+	case path == "server-load":
+		title = "CPU / Memory Load"
+		cpu := make([]float64, len(samples))
+		mem := make([]float64, len(samples))
+		for i, s := range samples {
+			cpu[i] = s.CPULoadPct
+			mem[i] = s.MemLoadPct
+		}
+		datasets = [][]float64{cpu, mem}
+		names = []string{"CPU Load %", "Mem Load %"}
+		yMin = floatPtr(0)
+		yMax = floatPtr(100)
+
+	case path == "server-temp", path == "server-temp-cpu":
+		title = "CPU Temperature"
+		datasets, names = namedTempDatasets(samples, "cpu")
+		yMin = floatPtr(0)
+		yMax = autoMax120(datasets...)
+
+	case path == "server-temp-gpu":
+		title = "GPU Temperature"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
+		yMin = floatPtr(0)
+		yMax = autoMax120(datasets...)
+
+	case path == "server-temp-ambient":
+		title = "Ambient / Other Sensors"
+		datasets, names = namedTempDatasets(samples, "ambient")
+		yMin = floatPtr(0)
+		yMax = autoMax120(datasets...)
+
+	case path == "server-power":
+		title = "System Power"
+		power := make([]float64, len(samples))
+		for i, s := range samples {
+			power[i] = s.PowerW
+		}
+		datasets = [][]float64{power}
+		names = []string{"Power W"}
+		yMin, yMax = autoBounds120(power)
+
+	case path == "server-fans":
+		title = "Fan RPM"
+		datasets, names = namedFanDatasets(samples)
+		yMin, yMax = autoBounds120(datasets...)
+
+	case path == "gpu-all-load":
+		title = "GPU Compute Load"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
+		yMin = floatPtr(0)
+		yMax = floatPtr(100)
+
+	case path == "gpu-all-memload":
+		title = "GPU Memory Load"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
+		yMin = floatPtr(0)
+		yMax = floatPtr(100)
+
+	case path == "gpu-all-power":
+		title = "GPU Power"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
+		yMin, yMax = autoBounds120(datasets...)
+
+	case path == "gpu-all-temp":
+		title = "GPU Temperature"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
+		yMin = floatPtr(0)
+		yMax = autoMax120(datasets...)
+
+	case strings.HasPrefix(path, "gpu/"):
+		// Per-GPU chart: "gpu/<index>" or "gpu/<index>-<kind>". The kind is
+		// whatever follows the last '-'; a '-' at position 0 is not treated
+		// as a separator.
+		rest := strings.TrimPrefix(path, "gpu/")
+		sub := ""
+		if i := strings.LastIndex(rest, "-"); i > 0 {
+			sub = rest[i+1:]
+			rest = rest[:i]
+		}
+		idx := 0
+		// Reject an index that does not parse instead of silently charting
+		// GPU 0 for a malformed path.
+		if _, err := fmt.Sscanf(rest, "%d", &idx); err != nil {
+			return nil, nil, nil, "", nil, nil, false
+		}
+		switch sub {
+		case "load":
+			title = fmt.Sprintf("GPU %d Load", idx)
+			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
+			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
+			if util == nil && mem == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
+			names = []string{"Load %", "Mem %"}
+			yMin = floatPtr(0)
+			yMax = floatPtr(100)
+		case "temp":
+			title = fmt.Sprintf("GPU %d Temperature", idx)
+			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
+			if temp == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{temp}
+			names = []string{"Temp °C"}
+			yMin = floatPtr(0)
+			yMax = autoMax120(temp)
+		default:
+			// Any other (or missing) kind is charted as power.
+			title = fmt.Sprintf("GPU %d Power", idx)
+			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
+			if power == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{power}
+			names = []string{"Power W"}
+			yMin, yMax = autoBounds120(power)
+		}
+
+	default:
+		return nil, nil, nil, "", nil, nil, false
+	}
+
+	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
+}
+
+// sampleTimeLabels returns one X-axis label per sample, derived from the
+// sample's Timestamp in local time. Labels omit the date when all samples
+// fall on the same local day. The result always has len(samples) entries.
+func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
+	n := len(samples)
+	out := make([]string, n)
+	if n == 0 {
+		return out
+	}
+	stamps := make([]time.Time, 0, n)
+	for idx := range samples {
+		stamps = append(stamps, samples[idx].Timestamp)
+	}
+	oneDay := timestampsSameLocalDay(stamps)
+	for idx, ts := range stamps {
+		out[idx] = formatTimelineLabel(ts.Local(), oneDay)
+	}
+	return out
+}
+
+// namedTempDatasets collects one temperature series per distinct sensor name
+// within the given group. Names are returned in order of first appearance
+// and datasets[i] belongs to names[i]. Each series has one value per sample;
+// a sample without that sensor contributes 0, and if a sample lists the same
+// sensor more than once the first reading is used.
+func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
+	var names []string
+	known := make(map[string]bool)
+	for i := range samples {
+		for _, temp := range samples[i].Temps {
+			if temp.Group != group || known[temp.Name] {
+				continue
+			}
+			known[temp.Name] = true
+			names = append(names, temp.Name)
+		}
+	}
+
+	datasets := make([][]float64, len(names))
+	for n, name := range names {
+		series := make([]float64, len(samples))
+		for i := range samples {
+			for _, temp := range samples[i].Temps {
+				if temp.Group == group && temp.Name == name {
+					series[i] = temp.Celsius
+					break
+				}
+			}
+		}
+		datasets[n] = series
+	}
+	return datasets, names
+}
+
+// namedFanDatasets collects one RPM series per distinct fan name. Names are
+// returned in order of first appearance and datasets[i] belongs to names[i].
+// Each series has one value per sample; a sample without that fan
+// contributes 0, and the first matching entry wins within a sample.
+func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
+	var names []string
+	known := make(map[string]bool)
+	for i := range samples {
+		for _, fan := range samples[i].Fans {
+			if known[fan.Name] {
+				continue
+			}
+			known[fan.Name] = true
+			names = append(names, fan.Name)
+		}
+	}
+
+	datasets := make([][]float64, len(names))
+	for n, name := range names {
+		series := make([]float64, len(samples))
+		for i := range samples {
+			for _, fan := range samples[i].Fans {
+				if fan.Name == name {
+					series[i] = fan.RPM
+					break
+				}
+			}
+		}
+		datasets[n] = series
+	}
+	return datasets, names
+}
+
+// gpuDatasets builds one series per GPU index seen in samples, using pick to
+// extract the charted value from each GPU row. GPUs are ordered by first
+// appearance and named "GPU <index>"; datasets[i] belongs to names[i].
+func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
+	var order []int
+	known := make(map[int]bool)
+	for i := range samples {
+		for _, gpu := range samples[i].GPUs {
+			if known[gpu.GPUIndex] {
+				continue
+			}
+			known[gpu.GPUIndex] = true
+			order = append(order, gpu.GPUIndex)
+		}
+	}
+
+	datasets := make([][]float64, 0, len(order))
+	names := make([]string, 0, len(order))
+	for _, gpuIdx := range order {
+		if series := gpuDatasetByIndex(samples, gpuIdx, pick); series != nil {
+			datasets = append(datasets, series)
+			names = append(names, fmt.Sprintf("GPU %d", gpuIdx))
+		}
+	}
+	return datasets, names
+}
+
+// gpuDatasetByIndex builds the series for the GPU with index idx, applying
+// pick to that GPU's row in each sample. Samples in which the GPU is missing
+// contribute 0. It returns nil when the GPU appears in no sample at all.
+func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
+	series := make([]float64, len(samples))
+	hits := 0
+	for i := range samples {
+		for _, row := range samples[i].GPUs {
+			if row.GPUIndex != idx {
+				continue
+			}
+			series[i] = pick(row)
+			hits++
+			break
+		}
+	}
+	if hits == 0 {
+		return nil
+	}
+	return series
+}
+
+// coalesceDataset returns ds unchanged when it is non-nil; otherwise it
+// returns a fresh slice of n zeros to stand in for the missing series.
+func coalesceDataset(ds []float64, n int) []float64 {
+	if ds == nil {
+		return make([]float64, n)
+	}
+	return ds
+}
+
 // floatPtr returns a pointer to a float64 value.
 func floatPtr(v float64) *float64 { return &v }

@@ -621,6 +934,47 @@ func autoMax120(datasets ...[]float64) *float64 {
 	return &v
 }

+// autoBounds120 derives Y-axis bounds from the values in datasets.
+//
+// It returns (nil, nil) when there are no values at all, and (0, nil) when
+// the largest value is not positive. Otherwise the observed range is padded
+// by 20% of its span on both sides, with the lower bound clamped at 0. When
+// all values are equal, 10% of the value is used as the span.
+func autoBounds120(datasets ...[]float64) (*float64, *float64) {
+	var lo, hi float64
+	seen := false
+	for _, series := range datasets {
+		for _, v := range series {
+			if !seen {
+				lo, hi, seen = v, v, true
+				continue
+			}
+			if v < lo {
+				lo = v
+			}
+			if v > hi {
+				hi = v
+			}
+		}
+	}
+	switch {
+	case !seen:
+		return nil, nil
+	case hi <= 0:
+		return floatPtr(0), nil
+	}
+
+	span := hi - lo
+	if span <= 0 {
+		// Flat series: fall back to a fraction of the value, or to 1 should
+		// even that come out non-positive.
+		span = hi * 0.1
+		if span <= 0 {
+			span = 1
+		}
+	}
+	pad := span * 0.2
+	floor := lo - pad
+	if floor < 0 {
+		floor = 0
+	}
+	ceiling := hi + pad
+	return floatPtr(floor), floatPtr(ceiling)
+}
+
 // renderChartSVG renders a line chart SVG with a fixed Y-axis range.
 func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
 	n := len(labels)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -7,6 +7,9 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
+
+	"bee/audit/internal/platform"
 )

 func TestChartLegendNumber(t *testing.T) {
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
 	}
 }

+// TestChartDataFromSamplesUsesFullHistory feeds three samples of a single GPU
+// into chartDataFromSamples and checks that the "gpu-all-power" chart keeps
+// one point per sample, from the first to the last.
+func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
+	now := time.Now()
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp:  now.Add(-3 * time.Minute),
+			CPULoadPct: 10,
+			MemLoadPct: 20,
+			PowerW:     300,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
+			},
+		},
+		{
+			Timestamp:  now.Add(-2 * time.Minute),
+			CPULoadPct: 30,
+			MemLoadPct: 40,
+			PowerW:     320,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
+			},
+		},
+		{
+			Timestamp:  now.Add(-1 * time.Minute),
+			CPULoadPct: 50,
+			MemLoadPct: 60,
+			PowerW:     340,
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
+			},
+		},
+	}
+
+	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
+	switch {
+	case !ok:
+		t.Fatal("chartDataFromSamples returned ok=false")
+	case title != "GPU Power":
+		t.Fatalf("title=%q", title)
+	case len(names) != 1 || names[0] != "GPU 0":
+		t.Fatalf("names=%v", names)
+	case len(labels) != len(samples):
+		t.Fatalf("labels len=%d want %d", len(labels), len(samples))
+	case len(datasets) != 1 || len(datasets[0]) != len(samples):
+		t.Fatalf("datasets shape=%v", datasets)
+	}
+
+	// First and last points must be the oldest and newest GPU power readings.
+	series := datasets[0]
+	if got := series[0]; got != 120 {
+		t.Fatalf("datasets[0][0]=%v want 120", got)
+	}
+	if got := series[2]; got != 130 {
+		t.Fatalf("datasets[0][2]=%v want 130", got)
+	}
+}
+
 func TestRootRendersDashboard(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -12,6 +12,7 @@ import (
 	"time"

 	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
 )

 // Task statuses.
@@ -30,9 +31,12 @@ var taskNames = map[string]string{
 	"storage":        "Storage SAT",
 	"cpu":            "CPU SAT",
 	"amd":            "AMD GPU SAT",
+	"amd-mem":        "AMD GPU MEM Integrity",
+	"amd-bandwidth":  "AMD GPU MEM Bandwidth",
 	"amd-stress":     "AMD GPU Burn-in",
 	"memory-stress":  "Memory Burn-in",
-	"sat-stress":     "SAT Stress (stressapptest)",
+	"sat-stress":       "SAT Stress (stressapptest)",
+	"platform-stress": "Platform Thermal Cycling",
 	"audit":          "Audit",
 	"install":        "Install to Disk",
 	"install-to-ram": "Install to RAM",
@@ -96,6 +100,34 @@ func resolveBurnPreset(profile string) burnPreset {
 	}
 }

+// resolvePlatformStressPreset maps a burn profile name to the load/idle cycle
+// plan for platform thermal cycling: "overnight" is eight 600 s load cycles,
+// "acceptance" is four 300 s load cycles, and anything else (the smoke
+// profile) is two 90 s load cycles. Idle durations vary between cycles.
+func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
+	var cycles []platform.PlatformStressCycle
+	switch profile {
+	case "overnight":
+		cycles = []platform.PlatformStressCycle{
+			{LoadSec: 600, IdleSec: 120},
+			{LoadSec: 600, IdleSec: 60},
+			{LoadSec: 600, IdleSec: 30},
+			{LoadSec: 600, IdleSec: 120},
+			{LoadSec: 600, IdleSec: 60},
+			{LoadSec: 600, IdleSec: 30},
+			{LoadSec: 600, IdleSec: 120},
+			{LoadSec: 600, IdleSec: 60},
+		}
+	case "acceptance":
+		cycles = []platform.PlatformStressCycle{
+			{LoadSec: 300, IdleSec: 60},
+			{LoadSec: 300, IdleSec: 30},
+			{LoadSec: 300, IdleSec: 60},
+			{LoadSec: 300, IdleSec: 30},
+		}
+	default: // smoke
+		cycles = []platform.PlatformStressCycle{
+			{LoadSec: 90, IdleSec: 60},
+			{LoadSec: 90, IdleSec: 30},
+		}
+	}
+	return platform.PlatformStressOptions{Cycles: cycles}
+}
+
 // taskQueue manages a priority-ordered list of tasks and runs them one at a time.
 type taskQueue struct {
 	mu        sync.Mutex
@@ -124,6 +156,12 @@ var (
 	runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 		return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
 	}
+	runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+		return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
+	}
+	runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+		return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
+	}
 	runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
 	}
@@ -380,6 +418,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
 	case "amd":
 		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
+	case "amd-mem":
+		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
+	case "amd-bandwidth":
+		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
 	case "amd-stress":
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
@@ -398,6 +440,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "platform-stress":
+		opts := resolvePlatformStressPreset(t.params.BurnProfile)
+		archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
 	case "audit":
 		result, e := a.RunAuditNow(q.opts.RuntimeMode)
 		if e != nil {
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -30,8 +30,8 @@ lb config noauto \
    --linux-flavours "amd64" \
    --linux-packages "${LB_LINUX_PACKAGES}" \
    --memtest none \
-    --iso-volume "EASY-BEE" \
-    --iso-application "EASY-BEE" \
+    --iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
+    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
--- a/iso/builder/build-in-container.sh
+++ b/iso/builder/build-in-container.sh
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
 AUTH_KEYS=""
 REBUILD_IMAGE=0
 CLEAN_CACHE=0
+VARIANT="all"

 . "${BUILDER_DIR}/VERSIONS"

@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
            REBUILD_IMAGE=1
            shift
            ;;
+        --variant)
+            VARIANT="$2"
+            shift 2
+            ;;
        *)
            echo "unknown arg: $1" >&2
-            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
+            echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
            exit 1
            ;;
    esac
 done

+case "$VARIANT" in
+    nvidia|amd|all) ;;
+    *) echo "unknown variant: $VARIANT (expected nvidia, amd, or all)" >&2; exit 1 ;;
+esac
+
 if [ "$CLEAN_CACHE" = "1" ]; then
    echo "=== cleaning build cache: ${CACHE_DIR} ==="
    rm -rf "${CACHE_DIR:?}/go-build" \
@@ -49,8 +59,9 @@ if [ "$CLEAN_CACHE" = "1" ]; then
           "${CACHE_DIR:?}/tmp" \
           "${CACHE_DIR:?}/bee" \
           "${CACHE_DIR:?}/lb-packages"
-    echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
-    rm -rf "${REPO_ROOT}/dist/live-build-work"
+    echo "=== cleaning live-build work dirs ==="
+    rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
+    rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
    echo "=== caches cleared, proceeding with build ==="
 fi

@@ -108,34 +119,71 @@ else
    echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
 fi

-set -- \
-    run --rm --privileged \
-    --platform "${BUILDER_PLATFORM}" \
-    -v "${REPO_ROOT}:/work" \
-    -v "${CACHE_DIR}:/cache" \
-    -e BEE_CONTAINER_BUILD=1 \
-    -e GOCACHE=/cache/go-build \
-    -e GOMODCACHE=/cache/go-mod \
-    -e TMPDIR=/cache/tmp \
-    -e BEE_CACHE_DIR=/cache/bee \
-    -w /work \
-    "${IMAGE_REF}" \
-    sh /work/iso/builder/build.sh
-
-if [ -n "$AUTH_KEYS" ]; then
-    set -- run --rm --privileged \
-        --platform "${BUILDER_PLATFORM}" \
-        -v "${REPO_ROOT}:/work" \
-        -v "${CACHE_DIR}:/cache" \
-        -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
+# build_run_args VARIANT
+# Prints, as one string on stdout, the container "run ..." arguments that
+# invoke build.sh for VARIANT. When AUTH_KEYS is set, the output also contains
+# the read-only authorized-keys mount and the --authorized-keys option.
+# Values are expanded unquoted into the string, so a path containing
+# whitespace would not survive word-splitting by a caller.
+# NOTE(review): nothing in the visible part of this script calls this helper;
+# run_variant assembles the same arguments itself. Confirm whether it can go.
+build_run_args() {
+    _variant="$1"
+    _auth_arg=""
+    if [ -n "$AUTH_KEYS" ]; then
+        _auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
+    fi
+    echo "run --rm --privileged \
+        --platform ${BUILDER_PLATFORM} \
+        -v ${REPO_ROOT}:/work \
+        -v ${CACHE_DIR}:/cache \
+        ${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
         -e BEE_CONTAINER_BUILD=1 \
         -e GOCACHE=/cache/go-build \
         -e GOMODCACHE=/cache/go-mod \
         -e TMPDIR=/cache/tmp \
         -e BEE_CACHE_DIR=/cache/bee \
         -w /work \
-        "${IMAGE_REF}" \
-        sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
-fi
+        ${IMAGE_REF} \
+        sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
+}

-"$CONTAINER_TOOL" "$@"
+# run_variant VARIANT
+# Runs build.sh for VARIANT inside the builder container. The argument list is
+# assembled step by step in the function's positional parameters; when
+# AUTH_KEYS is set, the read-only authorized-keys mount and the matching
+# --authorized-keys option are added. Returns the container tool's exit status.
+run_variant() {
+    _v="$1"
+    echo "=== building variant: ${_v} ==="
+
+    set -- run --rm --privileged \
+        --platform "${BUILDER_PLATFORM}" \
+        -v "${REPO_ROOT}:/work" \
+        -v "${CACHE_DIR}:/cache"
+    if [ -n "$AUTH_KEYS" ]; then
+        set -- "$@" -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro"
+    fi
+    set -- "$@" \
+        -e BEE_CONTAINER_BUILD=1 \
+        -e GOCACHE=/cache/go-build \
+        -e GOMODCACHE=/cache/go-mod \
+        -e TMPDIR=/cache/tmp \
+        -e BEE_CACHE_DIR=/cache/bee \
+        -w /work \
+        "${IMAGE_REF}" \
+        sh /work/iso/builder/build.sh --variant "${_v}"
+    if [ -n "$AUTH_KEYS" ]; then
+        set -- "$@" --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
+    fi
+
+    "$CONTAINER_TOOL" "$@"
+}
+
+# Expand the requested variant into the list of builds to run, in order:
+# "all" means nvidia first, then amd.
+case "$VARIANT" in
+    all)        _variants="nvidia amd" ;;
+    nvidia|amd) _variants="$VARIANT" ;;
+    *)          _variants="" ;;
+esac
+for _variant_name in $_variants; do
+    run_variant "$_variant_name"
+done
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
 OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
 DIST_DIR="${REPO_ROOT}/dist"
 VENDOR_DIR="${REPO_ROOT}/iso/vendor"
-BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
-OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 AUTH_KEYS=""
+BEE_GPU_VENDOR="nvidia"

 # parse args
 while [ $# -gt 0 ]; do
    case "$1" in
        --authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
+        --variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
        *) echo "unknown arg: $1"; exit 1 ;;
    esac
 done

+case "$BEE_GPU_VENDOR" in
+    nvidia|amd) ;;
+    *) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia or amd)" >&2; exit 1 ;;
+esac
+
+BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
+OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
+
+export BEE_GPU_VENDOR
+
 . "${BUILDER_DIR}/VERSIONS"
 export PATH="$PATH:/usr/local/go/bin"

@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
    apt-get install -y "linux-headers-${KVER}"
 fi

-echo "=== bee ISO build ==="
+echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
 echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
 echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
 echo ""
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
 git -C "${REPO_ROOT}" submodule update --init --recursive

 # --- compile bee binary (static, Linux amd64) ---
+# Shared between variants — built once, reused on second pass.
 BEE_BIN="${DIST_DIR}/bee-linux-amd64"
-GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
 NEED_BUILD=1
 if [ -f "$BEE_BIN" ]; then
    NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
@@ -172,37 +182,41 @@ else
    echo "=== bee binary up to date, skipping build ==="
 fi

-echo ""
-echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
-sh "${BUILDER_DIR}/build-cublas.sh" \
-    "${CUBLAS_VERSION}" \
-    "${CUDA_USERSPACE_VERSION}" \
-    "${NCCL_CUDA_VERSION}" \
-    "${DIST_DIR}"
+# --- NVIDIA-only build steps ---
+GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    echo ""
+    echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
+    sh "${BUILDER_DIR}/build-cublas.sh" \
+        "${CUBLAS_VERSION}" \
+        "${CUDA_USERSPACE_VERSION}" \
+        "${NCCL_CUDA_VERSION}" \
+        "${DIST_DIR}"

-CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
+    CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"

-GPU_STRESS_NEED_BUILD=1
-if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
-    GPU_STRESS_NEED_BUILD=0
+    GPU_STRESS_NEED_BUILD=1
+    if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
+        GPU_STRESS_NEED_BUILD=0
+    fi
+
+    if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
+        echo "=== building bee-gpu-stress ==="
+        gcc -O2 -s -Wall -Wextra \
+            -I"${CUBLAS_CACHE}/include" \
+            -o "$GPU_STRESS_BIN" \
+            "${BUILDER_DIR}/bee-gpu-stress.c" \
+            -ldl -lm
+        echo "binary: $GPU_STRESS_BIN"
+    else
+        echo "=== bee-gpu-stress up to date, skipping build ==="
+    fi
 fi

-if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
-    echo "=== building bee-gpu-stress ==="
-    gcc -O2 -s -Wall -Wextra \
-        -I"${CUBLAS_CACHE}/include" \
-        -o "$GPU_STRESS_BIN" \
-        "${BUILDER_DIR}/bee-gpu-stress.c" \
-        -ldl -lm
-    echo "binary: $GPU_STRESS_BIN"
-else
-    echo "=== bee-gpu-stress up to date, skipping build ==="
-fi
-
-echo "=== preparing staged overlay ==="
-# Sync builder config into work dir, preserving lb cache (chroot + packages).
-# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
+echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
+
+# Sync builder config into variant work dir, preserving lb cache.
 rsync -a --delete \
    --exclude='cache/' \
    --exclude='chroot/' \
@@ -212,7 +226,10 @@ rsync -a --delete \
    --exclude='*.contents' \
    --exclude='*.files' \
    "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
-# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
+
+# Share deb package cache across variants.
+# Restore: populate work dir cache from shared cache before build.
+# Persist: sync back after build (done after lb build below).
 LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
 mkdir -p "${LB_PKG_CACHE}"
 if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi
+
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -231,6 +249,12 @@ rm -f \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"

+# Remove NVIDIA-specific overlay files for AMD variant
+if [ "$BEE_GPU_VENDOR" = "amd" ]; then
+    rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
+    rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
+fi
+
 # --- inject authorized_keys for SSH access ---
 AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
 mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
@@ -268,8 +292,11 @@ fi
 mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
 cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
 chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
-cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
-chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
+
+if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
+    cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
+    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
+fi

 # --- inject smoketest into overlay so it runs directly on the live CD ---
 cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
@@ -286,100 +313,143 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
    fi
 done

-# --- build NVIDIA kernel modules ---
-echo ""
-echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
-sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
+# --- NVIDIA kernel modules and userspace libs ---
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    echo ""
+    echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
+    sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"

-KVER="${DEBIAN_KERNEL_ABI}-amd64"
-NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
+    KVER="${DEBIAN_KERNEL_ABI}-amd64"
+    NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"

-# Inject .ko files into overlay at /usr/local/lib/nvidia/
-OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
-OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
-mkdir -p "${OVERLAY_KMOD_DIR}"
-cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
+    # Inject .ko files into overlay at /usr/local/lib/nvidia/
+    OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
+    mkdir -p "${OVERLAY_KMOD_DIR}"
+    cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"

-# Inject nvidia-smi and libnvidia-ml
-mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
-cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
-chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
-cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
-chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
-cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
+    # Inject nvidia-smi, nvidia-bug-report.sh (if present), and NVIDIA userspace libs
+    mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
+    cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
+    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
+    cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
+    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
+    cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true

-# Inject GSP firmware into /lib/firmware/nvidia/<version>/
-if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
-    mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
-    cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
-    echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
+    # Inject GSP firmware into /lib/firmware/nvidia/<version>/
+    if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
+        mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
+        cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
+        echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
+    fi
+
+    # --- build / download NCCL ---
+    echo ""
+    echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
+    sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
+
+    NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
+
+    # Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
+    cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
+    echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
+
+    # Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
+    cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
+    echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
+
+    # --- build nccl-tests ---
+    echo ""
+    echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
+    sh "${BUILDER_DIR}/build-nccl-tests.sh" \
+        "${NCCL_TESTS_VERSION}" \
+        "${NCCL_VERSION}" \
+        "${NCCL_CUDA_VERSION}" \
+        "${DIST_DIR}" \
+        "${NVCC_VERSION}" \
+        "${DEBIAN_VERSION}"
+
+    NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
+    cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
+    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
+    echo "=== all_reduce_perf injected ==="
 fi

-# --- build / download NCCL ---
-echo ""
-echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
-sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
-
-NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
-
-# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
-cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
-echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
-
-# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
-cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
-echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
-
-# --- build nccl-tests ---
-echo ""
-echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
-sh "${BUILDER_DIR}/build-nccl-tests.sh" \
-    "${NCCL_TESTS_VERSION}" \
-    "${NCCL_VERSION}" \
-    "${NCCL_CUDA_VERSION}" \
-    "${DIST_DIR}" \
-    "${NVCC_VERSION}" \
-    "${DEBIAN_VERSION}"
-
-NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
-cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
-chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
-echo "=== all_reduce_perf injected ==="
-
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
 GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
-cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
-BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
-BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
-BUILD_DATE=${BUILD_DATE}
-GIT_COMMIT=${GIT_COMMIT}
-DEBIAN_VERSION=${DEBIAN_VERSION}
-DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
-NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
+
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
 NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
-NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
+NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
+    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
+else
+    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
+    GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
+fi
+
+cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
+BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
+BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
+BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
+BUILD_DATE=${BUILD_DATE}
+GIT_COMMIT=${GIT_COMMIT}
+DEBIAN_VERSION=${DEBIAN_VERSION}
+DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
+${GPU_VERSION_LINE}
 EOF

+# Write GPU vendor marker (/etc/bee-gpu-vendor), read by the 9000-bee-setup chroot hook
+echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
+
 # Patch motd with build info
-BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
+BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
 if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
    sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
        > "${OVERLAY_STAGE_DIR}/etc/motd.patched"
    mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
 fi

-# --- substitute version placeholders in package list ---
-sed -i \
-    -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
-    -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
-    -e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
-    "${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
-    "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
+# --- select variant-specific package list (copied to bee-gpu.list.chroot) ---
+cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
+   "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
+
+# --- remove archives for the other vendor ---
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
+          "${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
+else
+    rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
+          "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
+fi
+
+# --- substitute version placeholders in package list and archive ---
+if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
+    sed -i \
+        -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
+        "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
+else
+    sed -i \
+        -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
+        -e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
+        -e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
+        -e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
+        -e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
+        -e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
+        -e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
+        -e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
+        -e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
+        "${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
+    if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
+        sed -i \
+            -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
+            "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
+    fi
+fi

 # --- sync overlay into live-build includes.chroot ---
 LB_DIR="${BUILD_WORK_DIR}"
@@ -395,20 +465,31 @@ fi

 # --- build ISO using live-build ---
 echo ""
-echo "=== building ISO (live-build) ==="
+echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
+
+# Export upper-cased vendor name for auto/config (used for --iso-volume)
+BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
+export BEE_GPU_VENDOR_UPPER

 cd "${LB_DIR}"
 lb clean 2>&1 | tail -3
 lb config 2>&1 | tail -5
 lb build 2>&1

+# --- persist deb package cache back to shared location ---
+# This allows the second variant to reuse all downloaded packages.
+if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
+    rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
+    echo "=== package cache synced to ${LB_PKG_CACHE} ==="
+fi
+
 # live-build outputs live-image-amd64.hybrid.iso in LB_DIR
 ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
-ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
+ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
 if [ -f "$ISO_RAW" ]; then
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
-    echo "=== done ==="
+    echo "=== done (${BEE_GPU_VENDOR}) ==="
    echo "ISO: $ISO_OUT"
    if command -v stat >/dev/null 2>&1; then
        ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
--- a/iso/builder/config/bootloaders/grub-pc/grub.cfg
+++ b/iso/builder/config/bootloaders/grub-pc/grub.cfg
@@ -10,12 +10,12 @@ echo "  ╚══════╝╚═╝  ╚═╝╚══════╝
 echo ""

 menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

 menuentry "EASY-BEE (load to RAM)" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
+    linux   @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
    initrd  @INITRD_LIVE@
 }

--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -5,6 +5,9 @@ set -e

 echo "=== bee chroot setup ==="

+GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
+echo "=== GPU vendor: ${GPU_VENDOR} ==="
+
 ensure_bee_console_user() {
    if id bee >/dev/null 2>&1; then
        usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
@@ -21,10 +24,8 @@ ensure_bee_console_user() {

 ensure_bee_console_user

-# Enable bee services
-systemctl enable nvidia-dcgm.service 2>/dev/null || true
+# Enable common bee services
 systemctl enable bee-network.service
-systemctl enable bee-nvidia.service
 systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
@@ -36,25 +37,33 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
 systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
 systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true

+# Enable GPU-vendor-specific services
+if [ "$GPU_VENDOR" = "nvidia" ]; then
+    systemctl enable nvidia-dcgm.service 2>/dev/null || true
+    systemctl enable bee-nvidia.service
+elif [ "$GPU_VENDOR" = "amd" ]; then
+    # Symlink ROCm tools into /usr/local/bin (packages install to /opt/rocm-*/bin/)
+    for tool in rocm-smi rocm-bandwidth-test rvs; do
+        if [ ! -e /usr/local/bin/${tool} ]; then
+            bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
+            [ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
+        fi
+    done
+fi
+
 # Ensure scripts are executable
 chmod +x /usr/local/bin/bee-network.sh  2>/dev/null || true
-chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
 chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
+if [ "$GPU_VENDOR" = "nvidia" ]; then
+    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
+fi

 # Reload udev rules
 udevadm control --reload-rules 2>/dev/null || true

-# rocm symlinks (packages install to /opt/rocm-*/bin/)
-for tool in rocm-smi rocm-bandwidth-test rvs; do
-    if [ ! -e /usr/local/bin/${tool} ]; then
-        bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
-        [ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
-    fi
-done
-
 # Create export directory
 mkdir -p /appdata/bee/export

@@ -62,4 +71,4 @@ if [ -f /etc/sudoers.d/bee ]; then
    chmod 0440 /etc/sudoers.d/bee
 fi

-echo "=== bee chroot setup complete ==="
+echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
--- a/iso/builder/config/hooks/normal/9100-memtest.hook.binary
+++ b/iso/builder/config/hooks/normal/9100-memtest.hook.binary
@@ -4,6 +4,9 @@
 # not inside the squashfs).
 set -e

+echo "memtest: scanning chroot/boot/ for memtest files:"
+ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/"
+
 for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
    src="chroot/boot/${f}"
    if [ -f "${src}" ]; then
--- a/iso/builder/config/package-lists/bee-amd.list.chroot
+++ b/iso/builder/config/package-lists/bee-amd.list.chroot
@@ -0,0 +1,9 @@
+# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
+rocm-smi-lib=%%ROCM_SMI_VERSION%%
+rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
+rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
+rocblas=%%ROCBLAS_VERSION%%
+rocrand=%%ROCRAND_VERSION%%
+hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
+hipblaslt=%%HIPBLASLT_VERSION%%
+comgr=%%COMGR_VERSION%%
--- a/iso/builder/config/package-lists/bee-nvidia.list.chroot
+++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot
@@ -0,0 +1,2 @@
+# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
+datacenter-gpu-manager=1:%%DCGM_VERSION%%
--- a/iso/builder/config/package-lists/bee.list.chroot
+++ b/iso/builder/config/package-lists/bee.list.chroot
@@ -72,18 +72,5 @@ firmware-bnx2x
 firmware-cavium
 firmware-qlogic

-# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
-datacenter-gpu-manager=1:%%DCGM_VERSION%%
-
-# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
-rocm-smi-lib=%%ROCM_SMI_VERSION%%
-rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
-rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
-rocblas=%%ROCBLAS_VERSION%%
-rocrand=%%ROCRAND_VERSION%%
-hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
-hipblaslt=%%HIPBLASLT_VERSION%%
-comgr=%%COMGR_VERSION%%
-
 # glibc compat helpers (for any external binaries that need it)
 libc6
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
 # --- PATH & binaries ---
 echo "-- PATH & binaries --"
 for tool in dmidecode smartctl nvme ipmitool lspci bee; do
-    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
+    if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
    else
        fail "$tool: NOT FOUND"
--- a/iso/overlay/etc/profile.d/bee.sh
+++ b/iso/overlay/etc/profile.d/bee.sh
@@ -1,4 +1,4 @@
-export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
+export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"

 # Print web UI URLs on the local console at login.
 if [ -z "${SSH_CONNECTION:-}" ] \
Author	SHA1	Message	Date
Michael Chus	ace1a9dba6	feat(iso): split into nvidia and amd variants, fix KVM graphics and PATH - build.sh: add --variant nvidia\|amd; separate work dirs per variant (live-build-work-nvidia / live-build-work-amd); GPU-specific steps (modules, NCCL, cuBLAS, nccl-tests) run only for nvidia; deb package cache synced back to shared location after each lb build so second variant reuses downloaded packages; ISO output named easy-bee-{variant}-v{ver}-amd64.iso - build-in-container.sh: add --variant nvidia\|amd\|all (default: all); runs build.sh twice in one container for 'all'; --clean-build wipes both variant work dirs - package-lists: remove GPU packages from bee.list.chroot; add bee-nvidia.list.chroot (DCGM) and bee-amd.list.chroot (ROCm) - 9000-bee-setup hook: read /etc/bee-gpu-vendor; enable bee-nvidia.service and DCGM only for nvidia; set up ROCm symlinks only for amd - auto/config: --iso-volume uses BEE_GPU_VENDOR_UPPER env var - grub.cfg: add nomodeset to EASY-BEE and EASY-BEE (load to RAM) entries — fixes X/lightdm on BMC KVM (ASPEED AST chip requires nomodeset for fbdev to work; NVIDIA H100 compute does not need KMS) - bee.sh / smoketest.sh: add /usr/sbin to PATH so dmidecode, smartctl, nvme are found - 9100-memtest hook: add diagnostic listing of chroot/boot/memtest* files Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-30 22:24:37 +03:00
Michael Chus	905c581ece	fix(iso): substitute all ROCm package version placeholders in build.sh ROCM_BANDWIDTH_TEST_VERSION, ROCM_VALIDATION_SUITE_VERSION, ROCBLAS, ROCRAND, HIP_RUNTIME_AMD, HIPBLASLT, COMGR were defined in VERSIONS and in bee.list.chroot but the sed substitution block only covered 3 of them. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-29 22:00:05 +03:00
Michael Chus	7c2a0135d2	feat(audit): add platform thermal cycling stress test Runs CPU (stressapptest) + GPU stress simultaneously across multiple load/idle cycles with varying idle durations (120s/60s/30s) to detect cooling systems that fail to recover under repeated load. Presets: smoke (~5 min), acceptance (~25 min), overnight (~100 min). Outputs metrics.csv + summary.txt with per-cycle throttle and fan spindown analysis, packed as tar.gz. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-29 21:57:33 +03:00
Michael Chus	407c1cd1c4	fix(charts): unify timeline labels across graphs	2026-03-29 21:24:06 +03:00
Michael Chus	e15bcc91c5	feat(metrics): persist history in sqlite and add AMD memory validate tests	2026-03-29 12:28:06 +03:00
Michael Chus	98f0cf0d52	fix(amd-stress): include VRAM load in GST burn	2026-03-29 12:03:50 +03:00