Add NVIDIA stress loader selection and DCGM 4 support

2026-03-31 11:15:15 +03:00
parent 20f834aa96
commit 6dee8f3509
31 changed files with 789 additions and 111 deletions
@@ -343,9 +343,9 @@ Planned code shape:
 - `bee tui` can rerun the audit manually
 - `bee tui` can export the latest audit JSON to removable media
 - `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
+- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
 - SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
+- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
 - removable export requires explicit target selection, mount, confirmation, copy, and cleanup
 ### 2.6 — Vendor utilities and optional assets
@@ -356,6 +356,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("sat", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
 	diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
 	if err := fs.Parse(args[1:]); err != nil {
 		if err == flag.ErrHelp {
 			return 0
@@ -370,7 +371,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	target := args[0]
 	if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
 		fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
-		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
+		fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
 		return 2
 	}
@@ -382,7 +383,12 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
 	switch target {
 	case "nvidia":
-		archive, err = application.RunNvidiaAcceptancePack("", logLine)
+		level := *diagLevel
 		if level > 0 {
 			_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
 		} else {
 			archive, err = application.RunNvidiaAcceptancePack("", logLine)
 		}
 	case "memory":
 		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
 	case "storage":
@@ -107,6 +107,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 type satRunner interface {
 	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
@@ -508,6 +509,17 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
 	return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
 }
 // RunNvidiaStressPack runs the NVIDIA stress pack without caller-supplied
 // cancellation: it delegates to RunNvidiaStressPackCtx with
 // context.Background() and returns that call's results unchanged.
 func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
 }
 // RunNvidiaStressPackCtx forwards the NVIDIA stress pack request to the
 // configured SAT runner. A baseDir that is empty or whitespace-only is
 // replaced with DefaultSATBaseDir before the call.
 func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultSATBaseDir
 	}
 	return a.sat.RunNvidiaStressPack(ctx, dir, opts, logFunc)
 }
 func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
 	return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
 }
@@ -120,14 +120,15 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }
 type fakeSAT struct {
-	runNvidiaFn      func(string) (string, error)
+	runNvidiaFn       func(string) (string, error)
-	runMemoryFn      func(string) (string, error)
+	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
-	runStorageFn     func(string) (string, error)
+	runMemoryFn       func(string) (string, error)
-	runCPUFn         func(string, int) (string, error)
+	runStorageFn      func(string) (string, error)
-	detectVendorFn   func() string
+	runCPUFn          func(string, int) (string, error)
-	listAMDGPUsFn    func() ([]platform.AMDGPUInfo, error)
+	detectVendorFn    func() string
-	runAMDPackFn     func(string) (string, error)
+	listAMDGPUsFn     func() ([]platform.AMDGPUInfo, error)
-	listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
+	runAMDPackFn      func(string) (string, error)
 	listNvidiaGPUsFn  func() ([]platform.NvidiaGPU, error)
 }
 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -138,6 +139,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaStressPack is the test double for the stress pack. It calls
 // runNvidiaStressFn when one is set and otherwise falls back to runNvidiaFn;
 // the context and log callback are ignored.
 func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
 	if f.runNvidiaStressFn == nil {
 		return f.runNvidiaFn(baseDir)
 	}
 	return f.runNvidiaStressFn(baseDir, opts)
 }
 func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
 	if f.listNvidiaGPUsFn != nil {
 		return f.listNvidiaGPUsFn()
@@ -0,0 +1,178 @@
 package platform
 import (
 	"context"
 	"fmt"
 	"sort"
 	"strconv"
 	"strings"
 )
 // RunNvidiaStressPack normalizes opts, builds the loader-specific stress job
 // and runs it as step 03 of the "gpu-nvidia-stress" acceptance pack, between
 // two nvidia-smi inventory steps and a final nvidia-smi telemetry query.
 // It returns whatever runAcceptancePackCtx returns, or the error from
 // building the stress job.
 func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
 	normalizeNvidiaStressOptions(&opts)
 	stressJob, err := buildNvidiaStressJob(opts)
 	if err != nil {
 		return "", err
 	}
 	const afterQuery = "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total"
 	jobs := []satJob{
 		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
 		stressJob,
 		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", afterQuery, "--format=csv,noheader,nounits"}},
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", jobs, logFunc)
 }
 // buildNvidiaStressJob turns opts into the satJob that drives the chosen
 // stress loader. The GPU selection is resolved first, so a selection error is
 // reported before an unknown-loader error. The loader name is matched
 // case-insensitively after trimming whitespace; an empty name means the
 // builtin loader. Only the builtin loader receives --size-mb.
 func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 	gpus, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
 	if err != nil {
 		return satJob{}, err
 	}
 	var (
 		logName string
 		argv    []string
 	)
 	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
 	case "", NvidiaStressLoaderBuiltin:
 		logName = "03-bee-gpu-burn.log"
 		argv = []string{
 			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
 	case NvidiaStressLoaderJohn:
 		logName = "03-john-gpu-stress.log"
 		argv = []string{
 			"bee-john-gpu-stress",
 			"--seconds", strconv.Itoa(opts.DurationSec),
 		}
 	default:
 		return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
 	}
 	// Both loaders take the same --devices flag with a comma-separated list.
 	if len(gpus) > 0 {
 		argv = append(argv, "--devices", joinIndexList(gpus))
 	}
 	return satJob{
 		name:       logName,
 		cmd:        argv,
 		collectGPU: true,
 		gpuIndices: gpus,
 	}, nil
 }
 // normalizeNvidiaStressOptions fills in defaults and canonicalizes opts in
 // place:
 //   - a non-positive DurationSec becomes 300;
 //   - a non-positive SizeMB becomes 64;
 //   - Loader is matched case-insensitively after trimming whitespace and
 //     rewritten to its canonical constant; an empty Loader selects the
 //     builtin loader;
 //   - GPUIndices and ExcludeGPUIndices are deduplicated, sorted and stripped
 //     of negative values.
 //
 // An unrecognised Loader is deliberately left untouched so that
 // buildNvidiaStressJob rejects it with an "unknown NVIDIA stress loader"
 // error instead of silently running a different tool than was requested.
 func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
 	if opts.DurationSec <= 0 {
 		opts.DurationSec = 300
 	}
 	if opts.SizeMB <= 0 {
 		opts.SizeMB = 64
 	}
 	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
 	case "", NvidiaStressLoaderBuiltin:
 		opts.Loader = NvidiaStressLoaderBuiltin
 	case NvidiaStressLoaderJohn:
 		opts.Loader = NvidiaStressLoaderJohn
 	}
 	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
 	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
 }
 // resolveNvidiaGPUSelection returns the sorted GPU indices to stress. It
 // starts from the indices reported by listNvidiaGPUIndices, keeps only those
 // in include when include is non-empty, then drops those in exclude.
 // Requested indices that are not present on the system are ignored. An error
 // is returned when the listing fails, when no GPUs are found, or when the
 // filters leave nothing selected.
 func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
 	available, err := listNvidiaGPUIndices()
 	if err != nil {
 		return nil, err
 	}
 	if len(available) == 0 {
 		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
 	}
 	asSet := func(indices []int) map[int]struct{} {
 		set := make(map[int]struct{}, len(indices))
 		for _, idx := range indices {
 			set[idx] = struct{}{}
 		}
 		return set
 	}
 	wanted := asSet(include)
 	skipped := asSet(exclude)
 	chosen := make([]int, 0, len(available))
 	for _, idx := range available {
 		// An empty include list means "every detected GPU".
 		if len(include) > 0 {
 			if _, keep := wanted[idx]; !keep {
 				continue
 			}
 		}
 		if _, drop := skipped[idx]; drop {
 			continue
 		}
 		chosen = append(chosen, idx)
 	}
 	if len(chosen) == 0 {
 		return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
 	}
 	sort.Ints(chosen)
 	return chosen, nil
 }
 // listNvidiaGPUIndices runs `nvidia-smi --query-gpu=index` through
 // satExecCommand and parses one integer index per output line. Blank lines
 // and lines that are not integers are skipped. The result is passed through
 // dedupeSortedIndices; a failing command is returned wrapped with an
 // "nvidia-smi: " prefix.
 func listNvidiaGPUIndices() ([]int, error) {
 	raw, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
 	if err != nil {
 		return nil, fmt.Errorf("nvidia-smi: %w", err)
 	}
 	lines := strings.Split(strings.TrimSpace(string(raw)), "\n")
 	found := make([]int, 0, len(lines))
 	for _, ln := range lines {
 		// Atoi also fails on the empty string, so blank lines are skipped here too.
 		if n, convErr := strconv.Atoi(strings.TrimSpace(ln)); convErr == nil {
 			found = append(found, n)
 		}
 	}
 	return dedupeSortedIndices(found), nil
 }
 // dedupeSortedIndices returns a new ascending slice holding each
 // non-negative value of values exactly once. The input slice is not
 // modified. An empty or nil input yields nil.
 func dedupeSortedIndices(values []int) []int {
 	if len(values) == 0 {
 		return nil
 	}
 	unique := make([]int, 0, len(values))
 	visited := make(map[int]struct{}, len(values))
 	for i := range values {
 		v := values[i]
 		_, dup := visited[v]
 		if v < 0 || dup {
 			continue
 		}
 		visited[v] = struct{}{}
 		unique = append(unique, v)
 	}
 	sort.Ints(unique)
 	return unique
 }
 // joinIndexList formats values as decimal integers separated by commas,
 // e.g. [0 2] becomes "0,2". An empty input yields the empty string.
 func joinIndexList(values []int) string {
 	var b strings.Builder
 	for i, v := range values {
 		if i > 0 {
 			b.WriteByte(',')
 		}
 		b.WriteString(strconv.Itoa(v))
 	}
 	return b.String()
 }
@@ -423,7 +423,10 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 }
 func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
-	path, err := satLookPath("bee-gpu-stress")
+	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
 	}
 	if err != nil {
 		return nil
 	}
@@ -136,7 +136,8 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 		tools = append(tools, s.CheckTools([]string{
 			"nvidia-smi",
 			"nvidia-bug-report.sh",
-			"bee-gpu-stress",
+			"bee-gpu-burn",
 			"bee-john-gpu-stress",
 		})...)
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
@@ -176,8 +177,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
 			health.DriverReady = true
 		}
-		if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
+		if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
-			out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
+			out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
 			if err == nil {
 				health.CUDAReady = true
 			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
@@ -425,14 +425,12 @@ type satStats struct {
 }
 func nvidiaSATJobs() []satJob {
 	seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
 	sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
 	return []satJob{
 		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
 		{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
 		{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
-		{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
+		{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
 	}
 }
@@ -130,26 +130,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
 		stats.OK++
 	}
-	// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
+	// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
 	loadPhase := func(phaseName, stepName string, durSec int) {
 		if ctx.Err() != nil {
 			return
 		}
 		setPhase(phaseName)
 		var env []string
 		if len(opts.GPUIndices) > 0 {
 			ids := make([]string, len(opts.GPUIndices))
 			for i, idx := range opts.GPUIndices {
 				ids[i] = strconv.Itoa(idx)
 			}
 			env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
 		}
 		cmd := []string{
-			"bee-gpu-stress",
+			"bee-gpu-burn",
 			"--seconds", strconv.Itoa(durSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 		}
-		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil)
+		if len(opts.GPUIndices) > 0 {
 			cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
 		}
 		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
 		_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
 		if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
 			fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
@@ -323,8 +318,9 @@ func sampleFanSpeeds() ([]FanReading, error) {
 // parseFanSpeeds parses "ipmitool sdr type Fan" output.
 // Handles two formats:
-//   Old: "FAN1 | 2400.000 | RPM | ok"           (value in col[1], unit in col[2])
+//
-//   New: "FAN1 | 41h | ok | 29.1 | 4340 RPM"   (value+unit combined in last col)
+//	Old: "FAN1 | 2400.000 | RPM | ok"           (value in col[1], unit in col[2])
 //	New: "FAN1 | 41h | ok | 29.1 | 4340 RPM"   (value+unit combined in last col)
 func parseFanSpeeds(raw string) []FanReading {
 	var fans []FanReading
 	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
@@ -31,8 +31,8 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
 	if len(jobs) != 5 {
 		t.Fatalf("jobs=%d want 5", len(jobs))
 	}
-	if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
+	if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
-		t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
+		t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
 	}
 	if got := jobs[3].cmd[1]; got != "--output-file" {
 		t.Fatalf("bug report flag=%q want --output-file", got)
@@ -80,13 +80,10 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
 	}
 }
-func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
+func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
 	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
 	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
 	jobs := nvidiaSATJobs()
 	got := jobs[4].cmd
-	want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
+	want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
 	if len(got) != len(want) {
 		t.Fatalf("cmd len=%d want %d", len(got), len(want))
 	}
@@ -97,6 +94,40 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
 	}
 }
 // TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices checks that the John
 // loader is selected, that excluded GPUs are removed from the detected set,
 // and that the remaining indices are passed via --devices.
 //
 // The test swaps the package-level satExecCommand hook, so it must NOT call
 // t.Parallel(): running concurrently with other tests would race on that
 // global and could leak the nvidia-smi stub into them.
 func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
 	oldExecCommand := satExecCommand
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name == "nvidia-smi" {
 			// Pretend the host has three GPUs with indices 0, 1 and 2.
 			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
 		}
 		return exec.Command(name, args...)
 	}
 	t.Cleanup(func() { satExecCommand = oldExecCommand })
 	job, err := buildNvidiaStressJob(NvidiaStressOptions{
 		DurationSec:       600,
 		Loader:            NvidiaStressLoaderJohn,
 		ExcludeGPUIndices: []int{1},
 	})
 	if err != nil {
 		t.Fatalf("buildNvidiaStressJob error: %v", err)
 	}
 	wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
 	if len(job.cmd) != len(wantCmd) {
 		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
 	}
 	for i := range wantCmd {
 		if job.cmd[i] != wantCmd[i] {
 			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
 		}
 	}
 	if got := joinIndexList(job.gpuIndices); got != "0,2" {
 		t.Fatalf("gpuIndices=%q want 0,2", got)
 	}
 }
 func TestEnvIntFallback(t *testing.T) {
 	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
 	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
@@ -122,8 +153,8 @@ func TestClassifySATResult(t *testing.T) {
 	}{
 		{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
 		{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
-		{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
+		{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
-		{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
+		{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
 	}
 	for _, tt := range tests {
@@ -51,6 +51,19 @@ type ToolStatus struct {
 	OK   bool
 }
 // Loader names accepted in NvidiaStressOptions.Loader.
 const (
 	// NvidiaStressLoaderBuiltin selects the bee-gpu-burn command.
 	NvidiaStressLoaderBuiltin = "builtin"
 	// NvidiaStressLoaderJohn selects the bee-john-gpu-stress command.
 	NvidiaStressLoaderJohn    = "john"
 )
 // NvidiaStressOptions configures one NVIDIA GPU stress run.
 type NvidiaStressOptions struct {
 	// DurationSec is passed to the loader as --seconds.
 	DurationSec       int
 	// SizeMB is passed as --size-mb; only the builtin loader receives it.
 	SizeMB            int
 	// Loader names the stress tool: NvidiaStressLoaderBuiltin or
 	// NvidiaStressLoaderJohn. Empty selects the builtin loader.
 	Loader            string
 	// GPUIndices, when non-empty, restricts the run to these GPU indices.
 	GPUIndices        []int
 	// ExcludeGPUIndices lists GPU indices removed from the selection.
 	ExcludeGPUIndices []int
 }
 // New returns a pointer to a zero-value System.
 func New() *System {
 	return &System{}
 }
@@ -171,17 +171,24 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 		}
 		var body struct {
-			Duration    int    `json:"duration"`
+			Duration          int    `json:"duration"`
-			DiagLevel   int    `json:"diag_level"`
+			DiagLevel         int    `json:"diag_level"`
-			GPUIndices  []int  `json:"gpu_indices"`
+			GPUIndices        []int  `json:"gpu_indices"`
-			Profile     string `json:"profile"`
+			ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
-			DisplayName string `json:"display_name"`
+			Loader            string `json:"loader"`
 			Profile           string `json:"profile"`
 			DisplayName       string `json:"display_name"`
 		}
 		if r.ContentLength > 0 {
 			_ = json.NewDecoder(r.Body).Decode(&body)
 		}
 		name := taskNames[target]
 		if body.Profile != "" {
 			if n, ok := burnNames[target]; ok {
 				name = n
 			}
 		}
 		if name == "" {
 			name = target
 		}
@@ -192,11 +199,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			Status:    TaskPending,
 			CreatedAt: time.Now(),
 			params: taskParams{
-				Duration:    body.Duration,
+				Duration:          body.Duration,
-				DiagLevel:   body.DiagLevel,
+				DiagLevel:         body.DiagLevel,
-				GPUIndices:  body.GPUIndices,
+				GPUIndices:        body.GPUIndices,
-				BurnProfile: body.Profile,
+				ExcludeGPUIndices: body.ExcludeGPUIndices,
-				DisplayName: body.DisplayName,
+				Loader:            body.Loader,
 				BurnProfile:       body.Profile,
 				DisplayName:       body.DisplayName,
 			},
 		}
 		if strings.TrimSpace(body.DisplayName) != "" {
@@ -664,12 +664,15 @@ func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
 <div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
-<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke">Smoke: 5 minutes</option><option value="acceptance">Acceptance: 1 hour</option><option value="overnight">Overnight: 8 hours</option></select></div>
+<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke" selected>Smoke: quick check (~5 min CPU / DCGM level 1)</option><option value="acceptance">Acceptance: 1 hour (DCGM level 3)</option><option value="overnight">Overnight: 8 hours (DCGM level 4)</option></select></div>
-<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA uses mapped DCGM levels: smoke=quick, acceptance=targeted stress, overnight=extended stress.</p>
+<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA SAT on the Validate page still uses DCGM. NVIDIA GPU Stress on this page uses the selected stress loader for the preset duration.</p>
 </div></div>
 <div class="grid3">
 <div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
-<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">&#9654; Start NVIDIA Stress</button>
+<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
 <div class="form-row"><label>Exclude GPU indices</label><input type="text" id="nvidia-stress-exclude" placeholder="e.g. 1,3"></div>
 <p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. Use exclusions only when one or more cards must be skipped.</p>
 <button id="sat-btn-nvidia-stress" class="btn btn-primary" onclick="runBurnIn('nvidia-stress')">&#9654; Start NVIDIA Stress</button>
 </div></div>
 <div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
 <button class="btn btn-primary" onclick="runBurnIn('cpu')">&#9654; Start CPU Stress</button>
@@ -697,11 +700,24 @@ func renderBurn() string {
 </div>
 <script>
 let biES = null;
 function parseGPUIndexList(raw) {
  return (raw || '')
    .split(',')
    .map(v => v.trim())
    .filter(v => v !== '')
    .map(v => Number(v))
    .filter(v => Number.isInteger(v) && v >= 0);
 }
 function runBurnIn(target) {
  if (biES) { biES.close(); biES = null; }
  const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
  if (target === 'nvidia-stress') {
    body.loader = document.getElementById('nvidia-stress-loader').value || 'builtin';
    body.exclude_gpu_indices = parseGPUIndexList(document.getElementById('nvidia-stress-exclude').value);
  }
  document.getElementById('bi-output').style.display='block';
-  document.getElementById('bi-title').textContent = '— ' + target + ' [' + body.profile + ']';
+  const loaderLabel = body.loader ? ' / ' + body.loader : '';
  document.getElementById('bi-title').textContent = '— ' + target + loaderLabel + ' [' + body.profile + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Enqueuing ' + target + ' stress...\n';
  fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
@@ -716,7 +732,7 @@ function runBurnIn(target) {
 </script>
 <script>
 fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
-    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-stress', 'No NVIDIA GPU detected');
    if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
 });
 function disableSATCard(id, reason) {
@@ -206,6 +206,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	// SAT
 	mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
 	mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
 	mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
 	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
 	mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
@@ -24,22 +24,31 @@ const (
 	TaskCancelled = "cancelled"
 )
-// taskNames maps target → human-readable name.
+// taskNames maps target → human-readable name for validate (SAT) runs.
 var taskNames = map[string]string{
-	"nvidia":         "NVIDIA SAT",
+	"nvidia":          "NVIDIA SAT",
-	"memory":         "Memory SAT",
+	"nvidia-stress":   "NVIDIA GPU Stress",
-	"storage":        "Storage SAT",
+	"memory":          "Memory SAT",
-	"cpu":            "CPU SAT",
+	"storage":         "Storage SAT",
-	"amd":            "AMD GPU SAT",
+	"cpu":             "CPU SAT",
-	"amd-mem":        "AMD GPU MEM Integrity",
+	"amd":             "AMD GPU SAT",
-	"amd-bandwidth":  "AMD GPU MEM Bandwidth",
+	"amd-mem":         "AMD GPU MEM Integrity",
-	"amd-stress":     "AMD GPU Burn-in",
+	"amd-bandwidth":   "AMD GPU MEM Bandwidth",
-	"memory-stress":  "Memory Burn-in",
+	"amd-stress":      "AMD GPU Burn-in",
-	"sat-stress":       "SAT Stress (stressapptest)",
+	"memory-stress":   "Memory Burn-in",
 	"sat-stress":      "SAT Stress (stressapptest)",
 	"platform-stress": "Platform Thermal Cycling",
-	"audit":          "Audit",
+	"audit":           "Audit",
-	"install":        "Install to Disk",
+	"install":         "Install to Disk",
-	"install-to-ram": "Install to RAM",
+	"install-to-ram":  "Install to RAM",
 }
 // burnNames maps target → human-readable name when a burn profile is set.
 var burnNames = map[string]string{
 	"nvidia": "NVIDIA Burn-in",
 	"memory": "Memory Burn-in",
 	"cpu":    "CPU Burn-in",
 	"amd":    "AMD GPU Burn-in",
 }
 // Task represents one unit of work in the queue.
@@ -62,12 +71,14 @@ type Task struct {
 // taskParams holds optional parameters parsed from the run request.
 type taskParams struct {
-	Duration    int    `json:"duration,omitempty"`
+	Duration          int    `json:"duration,omitempty"`
-	DiagLevel   int    `json:"diag_level,omitempty"`
+	DiagLevel         int    `json:"diag_level,omitempty"`
-	GPUIndices  []int  `json:"gpu_indices,omitempty"`
+	GPUIndices        []int  `json:"gpu_indices,omitempty"`
-	BurnProfile string `json:"burn_profile,omitempty"`
+	ExcludeGPUIndices []int  `json:"exclude_gpu_indices,omitempty"`
-	DisplayName string `json:"display_name,omitempty"`
+	Loader            string `json:"loader,omitempty"`
-	Device      string `json:"device,omitempty"` // for install
+	BurnProfile       string `json:"burn_profile,omitempty"`
 	DisplayName       string `json:"display_name,omitempty"`
 	Device            string `json:"device,omitempty"` // for install
 }
 type persistedTask struct {
@@ -162,6 +173,9 @@ var (
 	runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 		return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
 	}
 	runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 		return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
 	}
 	runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
 	}
@@ -403,6 +417,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		} else {
 			archive, err = a.RunNvidiaAcceptancePack("", j.append)
 		}
 	case "nvidia-stress":
 		dur := t.params.Duration
 		if t.params.BurnProfile != "" && dur <= 0 {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
 			DurationSec:       dur,
 			Loader:            t.params.Loader,
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 		}, j.append)
 	case "memory":
 		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
 	case "storage":
@@ -81,9 +81,9 @@ build-in-container.sh [--authorized-keys /path/to/keys]
  7. `build-cublas.sh`:
       a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
       b. verify packages against repo `Packages.gz`
-       c. extract headers for `bee-gpu-stress` build
+       c. extract headers for `bee-gpu-burn` worker build
       d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
-  8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
+  8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
  9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
  10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
  11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
@@ -104,7 +104,7 @@ Build host notes:
  1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
  2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
 - NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
+- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
 - The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
 - The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
 - The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
@@ -153,18 +153,17 @@ Current validation state:
 Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
 Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
+- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
 - NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
 - `bee sat memory` → `memtester` archive
 - `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
 - SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
+- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
  - Ampere: `fp16` + `fp32`/TF32 tensor-core load
  - Ada / Hopper: add `fp8`
  - Blackwell+: add `fp4`
  - PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
 - Runtime overrides:
  - `BEE_GPU_STRESS_SECONDS`
  - `BEE_GPU_STRESS_SIZE_MB`
  - `BEE_MEMTESTER_SIZE_MB`
  - `BEE_MEMTESTER_PASSES`
@@ -179,6 +178,6 @@ Web UI: Acceptance Tests page → Run Test button
 ```
 **Critical invariants:**
- `bee-gpu-stress` uses `exec.CommandContext` — killed on job context cancel.
+- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
 - Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
 - SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
@@ -21,8 +21,8 @@ Fills gaps where Redfish/logpile is blind:
 - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
 - Machine-readable health summary derived from collector verdicts
 - Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
+- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
+- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
 - Automatic boot audit with operator-facing local console and SSH access
 - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
 - SSH access (OpenSSH) always available for inspection and debugging
@@ -70,7 +70,7 @@ Fills gaps where Redfish/logpile is blind:
 | SSH | OpenSSH server |
 | NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
 | NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
-| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
+| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
 | Builder | Debian 12 host/VM or Debian 12 container image |
 ## Operator UX
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
 - Kernel modules and nvidia-smi come from a single verified source.
 - NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
 - Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
 - DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
 - For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
 - Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
 - Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
 - ISO size increases by ~50MB for .ko files + nvidia-smi.
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
 - The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
 - The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
 - Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
 - The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
 - Override the container platform only if you know why:
 ```sh
@@ -23,6 +23,16 @@ RUN apt-get update -qq && apt-get install -y \
    gcc \
    make \
    perl \
    pkg-config \
    yasm \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libgmp-dev \
    libpcap-dev \
    libsqlite3-dev \
    libcurl4-openssl-dev \
    ocl-icd-opencl-dev \
    linux-headers-amd64 \
    && rm -rf /var/lib/apt/lists/*
@@ -8,7 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
 NVCC_VERSION=12.8
 CUBLAS_VERSION=13.0.2.14-1
 CUDA_USERSPACE_VERSION=13.0.96-1
-DCGM_VERSION=3.3.9
+DCGM_VERSION=4.5.2-1
 JOHN_JUMBO_COMMIT=67fcf9fe5a
 ROCM_VERSION=6.3.4
 ROCM_SMI_VERSION=7.4.0.60304-76~22.04
 ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
@@ -29,6 +29,7 @@ typedef void *CUfunction;
 typedef void *CUstream;
 #define CU_SUCCESS 0
 #define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
@@ -97,6 +98,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
                                      CUstream,
                                      void **,
                                      void **);
 typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
 typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
 typedef CUresult (*cuStreamDestroy_fn)(CUstream);
 typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
 typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
@@ -118,6 +122,9 @@ struct cuda_api {
    cuModuleLoadDataEx_fn cuModuleLoadDataEx;
    cuModuleGetFunction_fn cuModuleGetFunction;
    cuLaunchKernel_fn cuLaunchKernel;
    cuMemGetInfo_fn cuMemGetInfo;
    cuStreamCreate_fn cuStreamCreate;
    cuStreamDestroy_fn cuStreamDestroy;
    cuGetErrorName_fn cuGetErrorName;
    cuGetErrorString_fn cuGetErrorString;
 };
@@ -128,9 +135,10 @@ struct stress_report {
    int cc_major;
    int cc_minor;
    int buffer_mb;
    int stream_count;
    unsigned long iterations;
    uint64_t checksum;
-    char details[1024];
+    char details[16384];
 };
 static int load_symbol(void *lib, const char *name, void **out) {
@@ -144,7 +152,7 @@ static int load_cuda(struct cuda_api *api) {
    if (!api->lib) {
        return 0;
    }
-    return
+    if (!(
        load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
        load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
        load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
@@ -160,7 +168,17 @@ static int load_cuda(struct cuda_api *api) {
        load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
        load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
        load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
-        load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
+        load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
        dlclose(api->lib);
        memset(api, 0, sizeof(*api));
        return 0;
    }
    load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
    load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
    if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
        load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
    }
    return 1;
 }
 static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
@@ -220,6 +238,39 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
    return 1;
 }
 /*
  * Queries CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT for `dev` through
  * cuDeviceGetAttribute and stores the value in *count.
  *
  * Returns 1 when check_rc() accepts the driver result, 0 otherwise.
  * *count is left untouched when 0 is returned.
  */
 static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
    int sm_total = 0;
    CUresult rc = api->cuDeviceGetAttribute(&sm_total, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
    if (check_rc(api, "cuDeviceGetAttribute(multiprocessors)", rc)) {
        *count = sm_total;
        return 1;
    }
    return 0;
 }
 /*
  * Limits a requested byte budget to 90% of the free memory reported by
  * cuMemGetInfo. The limit itself is never allowed to drop below 4 MiB.
  *
  * The request is returned unchanged when cuMemGetInfo was not resolved,
  * when the call does not return CU_SUCCESS, or when it reports zero free bytes.
  */
 static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
    const size_t floor_bytes = (size_t)4u * 1024u * 1024u;
    size_t free_now = 0;
    size_t total_now = 0;
    size_t ceiling;
    /* Short-circuit order matters: the pointer is checked before it is called. */
    if (!api->cuMemGetInfo ||
        api->cuMemGetInfo(&free_now, &total_now) != CU_SUCCESS ||
        free_now == 0) {
        return requested_bytes;
    }
    ceiling = (free_now * 9u) / 10u;
    if (ceiling < floor_bytes) {
        ceiling = floor_bytes;
    }
    return requested_bytes > ceiling ? ceiling : requested_bytes;
 }
 #if HAVE_CUBLASLT_HEADERS
 static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
    size_t len = strlen(buf);
@@ -1095,13 +1146,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
 int main(int argc, char **argv) {
    int seconds = 5;
    int size_mb = 64;
    int device_index = 0;
    for (int i = 1; i < argc; i++) {
        if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
            seconds = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
            size_mb = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
            device_index = atoi(argv[++i]);
        } else {
-            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
+            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
            return 2;
        }
    }
@@ -1111,6 +1165,9 @@ int main(int argc, char **argv) {
    if (size_mb <= 0) {
        size_mb = 64;
    }
    if (device_index < 0) {
        device_index = 0;
    }
    struct cuda_api cuda;
    if (!load_cuda(&cuda)) {
@@ -1133,8 +1190,13 @@ int main(int argc, char **argv) {
        return 1;
    }
    if (device_index >= count) {
        fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
        return 1;
    }
    CUdevice dev = 0;
-    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
+    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
        return 1;
    }
@@ -1162,6 +1224,7 @@ int main(int argc, char **argv) {
    }
    printf("device=%s\n", report.device);
    printf("device_index=%d\n", device_index);
    printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
    printf("backend=%s\n", report.backend);
    printf("duration_s=%d\n", seconds);
@@ -1,9 +1,9 @@
 #!/bin/sh
-# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
+# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
 #
 # Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
 # verifies them against Packages.gz, and extracts the small subset we need:
-#   - headers for compiling bee-gpu-stress against cuBLASLt
+#   - headers for compiling bee-gpu-burn worker against cuBLASLt
 #   - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
 set -e
@@ -0,0 +1,55 @@
 #!/bin/sh
 # build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
 #
 # Downloads a pinned source snapshot from the official openwall/john repository,
 # builds it inside the builder container, and caches the resulting run/ tree.
 #
 # Usage: build-john.sh <john-commit> <dist-dir>
 #   <john-commit>  commit id used in the GitHub archive URL and in the cache names
 #   <dist-dir>     directory that receives the john-<commit>/run cache
 # Environment:
 #   BEE_CACHE_DIR  optional download cache root (default: <dist-dir>/cache)
 set -e
 JOHN_COMMIT="$1"
 DIST_DIR="$2"
 [ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
 [ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
 echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
 CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
 SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
 SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
 # The cache counts as usable only when both the binary and its config exist.
 if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
    echo "=== john cached, skipping build ==="
    echo "run dir: ${CACHE_DIR}/run"
    exit 0
 fi
 mkdir -p "${DOWNLOAD_CACHE_DIR}"
 if [ ! -f "${SRC_TAR}" ]; then
    echo "=== downloading john source snapshot ==="
    # Download to a side file and rename only on success, so an interrupted
    # transfer never leaves a truncated tarball that later runs would trust.
    rm -f "${SRC_TAR}.part"
    if ! wget --show-progress -O "${SRC_TAR}.part" "${SRC_URL}"; then
        rm -f "${SRC_TAR}.part"
        echo "ERROR: failed to download ${SRC_URL}" >&2
        exit 1
    fi
    mv "${SRC_TAR}.part" "${SRC_TAR}"
 fi
 BUILD_TMP=$(mktemp -d)
 # Clean up exactly once on exit; INT/TERM are converted into a real exit so the
 # build stops instead of resuming after the handler returns.
 trap 'rm -rf "${BUILD_TMP}"' EXIT
 trap 'exit 130' INT
 trap 'exit 143' TERM
 cd "${BUILD_TMP}"
 tar xf "${SRC_TAR}"
 SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
 [ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
 cd "${SRC_DIR}/src"
 echo "=== configuring john ==="
 ./configure
 echo "=== building john ==="
 make clean >/dev/null 2>&1 || true
 make -j"$(nproc)"
 # Start from an empty cache dir: with a leftover run/ from an earlier partial
 # build, cp -a would nest the tree as run/run and the cache check would never pass.
 rm -rf "${CACHE_DIR}"
 mkdir -p "${CACHE_DIR}"
 cp -a "../run" "${CACHE_DIR}/run"
 chmod +x "${CACHE_DIR}/run/john"
 echo "=== john build complete ==="
 echo "run dir: ${CACHE_DIR}/run"
@@ -10,7 +10,7 @@
 # Output layout:
 #   $CACHE_DIR/modules/   — nvidia*.ko files
 #   $CACHE_DIR/bin/       — nvidia-smi, nvidia-debugdump
-#   $CACHE_DIR/lib/       — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
+#   $CACHE_DIR/lib/       — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
 set -e
@@ -133,7 +133,14 @@ fi
 # Copy ALL userspace library files.
 # libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
 # (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
-for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
+for lib in \
    libnvidia-ml \
    libcuda \
    libnvidia-ptxjitcompiler \
    libnvidia-opencl \
    libnvidia-compiler \
    libnvidia-nvvm \
    libnvidia-fatbinaryloader; do
    count=0
    for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
        cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
@@ -150,7 +157,14 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
 [ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
 # Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
-for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
+for lib in \
    libnvidia-ml \
    libcuda \
    libnvidia-ptxjitcompiler \
    libnvidia-opencl \
    libnvidia-compiler \
    libnvidia-nvvm \
    libnvidia-fatbinaryloader; do
    versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
    [ -n "$versioned" ] || continue
    base=$(basename "$versioned")
@@ -183,7 +183,7 @@ else
 fi
 # --- NVIDIA-only build steps ---
-GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
+GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
 if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    echo ""
    echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
@@ -196,20 +196,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
    GPU_STRESS_NEED_BUILD=1
-    if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
+    if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
        GPU_STRESS_NEED_BUILD=0
    fi
    if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
-        echo "=== building bee-gpu-stress ==="
+        echo "=== building bee-gpu-burn worker ==="
        gcc -O2 -s -Wall -Wextra \
            -I"${CUBLAS_CACHE}/include" \
-            -o "$GPU_STRESS_BIN" \
+            -o "$GPU_BURN_WORKER_BIN" \
            "${BUILDER_DIR}/bee-gpu-stress.c" \
            -ldl -lm
-        echo "binary: $GPU_STRESS_BIN"
+        echo "binary: $GPU_BURN_WORKER_BIN"
    else
-        echo "=== bee-gpu-stress up to date, skipping build ==="
+        echo "=== bee-gpu-burn worker up to date, skipping build ==="
    fi
 fi
@@ -246,6 +246,9 @@ rm -f \
    "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
    "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
    "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
@@ -293,9 +296,13 @@ mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
 cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
 chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
-if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
+if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
-    cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
+    mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
-    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
+    cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
    ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
 fi
 # --- inject smoketest into overlay so it runs directly on the live CD ---
@@ -334,6 +341,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
    cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
    mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
    printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
    # Inject GSP firmware into /lib/firmware/nvidia/<version>/
    if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
@@ -353,7 +362,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
    echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
-    # Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
+    # Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
    cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
    echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
@@ -372,6 +381,16 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
    echo "=== all_reduce_perf injected ==="
    echo ""
    echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
    sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
    JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
    mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
    rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
    ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
    echo "=== john injected ==="
 fi
 # --- embed build metadata ---
@@ -385,7 +404,8 @@ NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
-NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
+NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
 JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
    GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
 elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
    GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
@@ -60,6 +60,8 @@ chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
    chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
 fi
 # Reload udev rules
@@ -1,2 +1,8 @@
-# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
+# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
-datacenter-gpu-manager=1:%%DCGM_VERSION%%
+# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
 # so install the CUDA 13 build plus proprietary diagnostic components explicitly.
 datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
 ocl-icd-libopencl1
 clinfo
@@ -0,0 +1,93 @@
 #!/bin/sh
 # bee-gpu-burn — start one bee-gpu-burn-worker per selected NVIDIA GPU, in parallel,
 # then print each worker's result and its log prefixed with "[gpu N] ".
 #
 # Options:
 #   --seconds N, -t N   forwarded to every worker as --seconds (default 5)
 #   --size-mb N, -m N   forwarded to every worker as --size-mb (default 64)
 #   --devices LIST      comma-separated GPU indices to load (default: all reported by nvidia-smi)
 #   --exclude LIST      comma-separated GPU indices to skip
 # Exit status: 0 when every worker succeeded, 1 when any worker failed or nothing
 # could be started, 2 on usage errors, 130/143 when interrupted by INT/TERM.
 set -eu
 # Deliberately not named SECONDS: bash treats that name as a self-incrementing timer.
 DURATION_S=5
 SIZE_MB=64
 DEVICES=""
 EXCLUDE=""
 WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
 WORKERS=""
 usage() {
    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
 }
 # Succeeds when $1 is a non-empty string of decimal digits.
 is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
 }
 # Strips whitespace and empty items, sorts numerically, removes duplicates.
 normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
 }
 # Succeeds when item $1 appears in comma-separated list $2.
 contains_csv() {
    needle="$1"
    haystack="${2:-}"
    echo ",${haystack}," | grep -q ",${needle},"
 }
 # Kills workers that are still alive and removes the per-run log directory.
 cleanup() {
    for spec in ${WORKERS}; do
        kill "${spec%%:*}" 2>/dev/null || true
    done
    rm -rf "${TMP_DIR}"
 }
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_S="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
 done
 # The worker parses numbers with atoi(), which silently turns garbage into 0;
 # reject it here so a typo cannot end up stressing the wrong GPU or duration.
 is_uint "${DURATION_S}" || { echo "--seconds must be a non-negative integer: ${DURATION_S}" >&2; exit 2; }
 is_uint "${SIZE_MB}" || { echo "--size-mb must be a non-negative integer: ${SIZE_MB}" >&2; exit 2; }
 [ -x "${WORKER}" ] || { echo "bee-gpu-burn worker not found: ${WORKER}" >&2; exit 1; }
 ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
 [ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
 DEVICES=$(normalize_list "${DEVICES}")
 EXCLUDE=$(normalize_list "${EXCLUDE}")
 # Checked before the lists are expanded unquoted below, so no glob characters survive.
 case "${DEVICES},${EXCLUDE}" in
    *[!0-9,]*) echo "GPU indices must be non-negative integers" >&2; exit 2 ;;
 esac
 SELECTED="${DEVICES}"
 if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
 fi
 FINAL=""
 for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
 done
 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
 echo "loader=bee-gpu-burn"
 echo "selected_gpus=${FINAL}"
 # nvidia-smi numbers GPUs in PCI bus order; make the CUDA driver enumerate the
 # same way so the worker's --device ordinal refers to the same physical GPU.
 CUDA_DEVICE_ORDER=PCI_BUS_ID
 export CUDA_DEVICE_ORDER
 TMP_DIR=$(mktemp -d)
 # INT/TERM become a real exit; the EXIT handler then stops the workers and cleans up.
 trap cleanup EXIT
 trap 'exit 130' INT
 trap 'exit 143' TERM
 for id in $(echo "${FINAL}" | tr ',' ' '); do
    log="${TMP_DIR}/gpu-${id}.log"
    echo "starting gpu ${id}"
    "${WORKER}" --device "${id}" --seconds "${DURATION_S}" --size-mb "${SIZE_MB}" >"${log}" 2>&1 &
    pid=$!
    # Each entry is pid:gpu-index:log-path.
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
 done
 status=0
 for spec in ${WORKERS}; do
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log}" || true
 done
 exit "${status}"
@@ -0,0 +1,100 @@
 #!/bin/sh
 # bee-john-gpu-stress — load selected NVIDIA GPUs with a John the Ripper OpenCL benchmark.
 #
 # Options:
 #   --seconds N, -t N   value passed to john as --test=N (default 300)
 #   --devices LIST      comma-separated GPU indices to load (default: all reported by nvidia-smi)
 #   --exclude LIST      comma-separated GPU indices to skip
 #   --format NAME       john format to benchmark; when omitted the first candidate
 #                       that passes a --test=1 probe is used
 # Exit status: john's own status once it is exec'ed; 1 when nothing could be run;
 # 2 on usage errors.
 set -eu
 # Deliberately not named SECONDS: bash treats that name as a self-incrementing
 # timer, which would inflate the value while the format probing below runs.
 DURATION_S=300
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
 JOHN_DIR="/usr/local/lib/bee/john/run"
 JOHN_BIN="${JOHN_DIR}/john"
 usage() {
    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
 }
 # Succeeds when $1 is a non-empty string of decimal digits.
 is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
 }
 # Strips whitespace and empty items, sorts numerically, removes duplicates.
 normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
 }
 # Succeeds when item $1 appears in comma-separated list $2.
 contains_csv() {
    needle="$1"
    haystack="${2:-}"
    echo ",${haystack}," | grep -q ",${needle},"
 }
 while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_S="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
        *) usage ;;
    esac
 done
 is_uint "${DURATION_S}" || { echo "--seconds must be a non-negative integer: ${DURATION_S}" >&2; exit 2; }
 [ -x "${JOHN_BIN}" ] || { echo "john binary not found: ${JOHN_BIN}" >&2; exit 1; }
 ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
 [ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
 DEVICES=$(normalize_list "${DEVICES}")
 EXCLUDE=$(normalize_list "${EXCLUDE}")
 # Checked before the ids are expanded unquoted and fed into $(( )) below, so
 # neither glob characters nor arbitrary arithmetic expressions get through.
 case "${DEVICES},${EXCLUDE}" in
    *[!0-9,]*) echo "GPU indices must be non-negative integers" >&2; exit 2 ;;
 esac
 SELECTED="${DEVICES}"
 if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
 fi
 FINAL=""
 for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
 done
 [ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
 # john device ids are derived as nvidia-smi index + 1.
 # NOTE(review): this assumes john enumerates the NVIDIA OpenCL devices in the same
 # order as nvidia-smi and starts numbering at 1 — confirm on mixed-GPU hosts.
 JOHN_DEVICES=""
 for id in $(echo "${FINAL}" | tr ',' ' '); do
    opencl_id=$((id + 1))
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
 done
 echo "loader=john"
 echo "selected_gpus=${FINAL}"
 echo "john_devices=${JOHN_DEVICES}"
 cd "${JOHN_DIR}"
 # Prints the format to benchmark: the --format value when given, otherwise the
 # first candidate whose one-second probe succeeds. Returns 1 when none works.
 choose_format() {
    if [ -n "${FORMAT}" ]; then
        echo "${FORMAT}"
        return 0
    fi
    for candidate in sha512crypt-opencl pbkdf2-hmac-sha512-opencl 7z-opencl sha256crypt-opencl md5crypt-opencl; do
        if ./john --test=1 --format="${candidate}" --devices="${JOHN_DEVICES}" >/dev/null 2>&1; then
            echo "${candidate}"
            return 0
        fi
    done
    return 1
 }
 CHOSEN_FORMAT=$(choose_format) || {
    echo "no suitable john OpenCL format found" >&2
    ./john --list=opencl-devices >&2 || true
    exit 1
 }
 echo "format=${CHOSEN_FORMAT}"
 # exec replaces this shell, so a signal sent to the script's pid reaches john itself.
 exec ./john --test="${DURATION_S}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
@@ -114,4 +114,19 @@ fi
 ldconfig 2>/dev/null || true
 log "ldconfig refreshed"
 # Start DCGM host engine so dcgmi can discover GPUs.
 # nv-hostengine must run before any dcgmi command — without it, dcgmi reports
 # "group is empty" even when GPUs and modules are present.
 # Skip if already running (e.g. started by a dcgm systemd service or prior boot).
 # The launch result is checked so the log never claims a start that did not happen.
 if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        log "nv-hostengine already running — skipping"
    elif nv-hostengine; then
        log "nv-hostengine started"
    else
        # $? still holds the exit status of the failed nv-hostengine launch here.
        log "WARN: nv-hostengine failed to start (rc=$?) — dcgmi diagnostics will not work"
    fi
 else
    log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
 fi
 log "done"