Compare commits

..

14 Commits
v5.8 ... v5.13

Author SHA1 Message Date
Mikhail Chusavitin
2354ae367d Normalize task IDs and artifact folder prefixes 2026-04-06 12:26:47 +03:00
Mikhail Chusavitin
0d0e1f55a7 Avoid misleading SAT summaries after task cancellation 2026-04-06 12:24:19 +03:00
Mikhail Chusavitin
35f4c53887 Stabilize NVIDIA GPU device mapping across loaders 2026-04-06 12:22:04 +03:00
Mikhail Chusavitin
981315e6fd Split NVIDIA tasks by homogeneous GPU groups 2026-04-06 11:58:13 +03:00
Mikhail Chusavitin
fc5c100a29 Fix NVIDIA persistence mode and add benchmark results table 2026-04-06 10:47:07 +03:00
6e94216f3b Hide task charts while pending 2026-04-05 22:34:34 +03:00
53455063b9 Stabilize live task detail page 2026-04-05 22:14:52 +03:00
4602f97836 Enforce sequential task orchestration 2026-04-05 22:10:42 +03:00
c65d3ae3b1 Add nomodeset to default GRUB entry — fix black screen on headless servers
Servers with NVIDIA compute GPUs (H100 etc.) have no display output,
so KMS blanks the console. nomodeset disables kernel modesetting and
lets the NVIDIA proprietary driver handle display via Xorg.

KMS variant moved to advanced submenu for cases where it is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 21:40:47 +03:00
7a21c370e4 Handle NVIDIA GSP firmware init hang with timeout fallback
- bee-nvidia-load: run insmod in background, poll /proc/devices for
  nvidiactl; if GSP init doesn't complete in 90s, kill insmod and retry
  with NVreg_EnableGpuFirmware=0. Handles EBUSY case with clear error.
- Write /run/bee-nvidia-mode (gsp-on/gsp-off/gsp-stuck) for audit layer
- Show GSP mode badge in sidebar: yellow for gsp-off, red for gsp-stuck
- Report NvidiaGSPMode in RuntimeHealth with issue entries
- Simplify GRUB menu: default (KMS+GSP), advanced submenu (GSP=off,
  nomodeset, fail-safe), remove load-to-RAM entry
- Add pcmanfm, ristretto, mupdf, mousepad to desktop packages

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 21:00:43 +03:00
a493e3ab5b Fix service control buttons: sudo, real error output, UX feedback
- services.go: use sudo systemctl so bee user can control system services
- api.go: always return 200 with output field even on error, so the
  frontend shows the actual systemctl message instead of "exit status 1"
- pages.go: button shows "..." while pending then restores label;
  output panel is full-width under the table with ✓/✗ status indicator;
  output auto-scrolls to bottom

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:25:41 +03:00
19b4803ec7 Pass exact cycle duration to GPU stress instead of 86400s sentinel
bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally
when the cycle ends, rather than relying solely on context cancellation
to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net
for early cancellation (user stop, context timeout). Same fix for AMD
RVS which now gets duration_ms = LoadSec * 1000.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:22:43 +03:00
1bdfb1e9ca Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34)
nvvs (DCGM validation suite) survives when dcgmi is killed mid-run,
leaving the GPU occupied. The next dcgmi diag invocation then fails
with "affected resource is in use".

Two-part fix:
- Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned
  up by the global cancel handler
- Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack
  to clear any stale processes before dcgmi diag runs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:21:36 +03:00
c5d6b30177 Fix platform thermal cycling leaving GPU load running after test ends
bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children.
exec.CommandContext default cancel only kills the shell parent; the worker
processes survive and keep loading the GPU indefinitely.

Fix: set Setpgid=true and a custom Cancel that sends SIGKILL to the
entire process group (-pid), same pattern already used in runSATCommandCtx.
Applied to Nvidia, AMD, and CPU stress commands for consistency.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:19:20 +03:00
24 changed files with 1497 additions and 274 deletions

View File

@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
"-g", strconv.Itoa(len(gpuIndices)),
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
}
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
env := []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)

View File

@@ -15,6 +15,10 @@ var workerPatterns = []string{
"stress-ng",
"stressapptest",
"memtester",
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
"nvvs",
"dcgmi",
}
// KilledProcess describes a process that was sent SIGKILL.

View File

@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
job,
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func nvidiaStressArchivePrefix(loader string) string {

View File

@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
wg.Add(1)
go func() {
defer wg.Done()
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
if gpuCmd == nil {
return
}
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
}
cmd := exec.CommandContext(ctx, path, cmdArgs...)
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil
cmd.Stderr = nil
if err := startLowPriorityCmd(cmd, 15); err != nil {
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
switch strings.ToLower(vendor) {
case "amd":
return buildAMDGPUStressCmd(ctx)
return buildAMDGPUStressCmd(ctx, durSec)
case "nvidia":
return buildNvidiaGPUStressCmd(ctx)
return buildNvidiaGPUStressCmd(ctx, durSec)
}
return nil
}
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
rvsArgs, err := resolveRVSCommand()
if err != nil {
return nil
}
rvsPath := rvsArgs[0]
cfg := `actions:
cfg := fmt.Sprintf(`actions:
- name: gst_platform
device: all
module: gst
parallel: true
duration: 86400000
duration: %d`, durSec*1000) + `
copy_matrix: false
target_stress: 90
matrix_size_a: 8640
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
cfgFile := "/tmp/bee-platform-gst.conf"
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil
cmd.Stderr = nil
_ = startLowPriorityCmd(cmd, 10)
return cmd
}
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
path, err := satLookPath("bee-gpu-burn")
if err != nil {
path, err = satLookPath("bee-gpu-stress")
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
if err != nil {
return nil
}
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
// where the context is cancelled early (user stop, parent timeout).
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil
cmd.Stderr = nil
_ = startLowPriorityCmd(cmd, 10)

View File

@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
switch vendor {
case "nvidia":
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
if health.NvidiaGSPMode == "gsp-stuck" {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_gsp_stuck",
Severity: "critical",
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
})
} else if health.NvidiaGSPMode == "gsp-off" {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_gsp_disabled",
Severity: "warning",
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
})
}
}
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{

View File

@@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
if gpuCount < 1 {
gpuCount = 1
}
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-all-reduce-perf.log", cmd: []string{
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
}, logFunc)
), logFunc)
}
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -382,16 +382,23 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
for _, p := range killed {
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
@@ -561,14 +568,24 @@ type satStats struct {
Unsupported int
}
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
out := make([]satJob, 0, len(jobs)+1)
out = append(out, satJob{
name: "00-nvidia-smi-persistence-mode.log",
cmd: []string{"nvidia-smi", "-pm", "1"},
})
out = append(out, jobs...)
return out
}
func nvidiaSATJobs() []satJob {
return []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
}
return withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
)
}
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -583,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
}
return []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-dcgmi-diag.log", cmd: diagArgs},
}
return withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
)
}
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
@@ -613,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
return []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
@@ -654,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
return "", writeErr
}
if ctx.Err() != nil {
return "", ctx.Err()
}
status, rc := classifySATResult(job.name, out, err)
stats.Add(status)
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")

View File

@@ -1,12 +1,14 @@
package platform
import (
"context"
"errors"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"time"
)
func TestStorageSATCommands(t *testing.T) {
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
jobs := nvidiaSATJobs()
if len(jobs) != 5 {
t.Fatalf("jobs=%d want 5", len(jobs))
if len(jobs) != 6 {
t.Fatalf("jobs=%d want 6", len(jobs))
}
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
t.Fatalf("preflight command=%q want nvidia-smi", got)
}
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
}
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
}
if got := jobs[3].cmd[1]; got != "--output-file" {
if got := jobs[4].cmd[1]; got != "--output-file" {
t.Fatalf("bug report flag=%q want --output-file", got)
}
}
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
jobs := nvidiaSATJobs()
got := jobs[4].cmd
got := jobs[5].cmd
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
if len(got) != len(want) {
t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
}
}
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
jobs := nvidiaDCGMJobs(3, []int{2, 0})
if len(jobs) != 5 {
t.Fatalf("jobs=%d want 5", len(jobs))
}
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
}
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
}
}
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
t.Parallel()
@@ -234,11 +255,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
if len(env) != 2 {
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
}
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
}
}
@@ -331,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
}
}
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
dir := t.TempDir()
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
done := make(chan struct{})
go func() {
time.Sleep(100 * time.Millisecond)
cancel()
close(done)
}()
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
}, nil)
<-done
if !errors.Is(err, context.Canceled) {
t.Fatalf("err=%v want context.Canceled", err)
}
if archive != "" {
t.Fatalf("archive=%q want empty", archive)
}
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
if globErr != nil {
t.Fatalf("Glob error: %v", globErr)
}
if len(matches) != 0 {
t.Fatalf("archives=%v want none", matches)
}
}
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
t.Parallel()

View File

@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
}
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
// bee-web runs as the bee user; sudo is required to control system services.
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
return string(raw), err
}

View File

@@ -20,6 +20,7 @@ type RuntimeHealth struct {
ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"`
Issues []RuntimeIssue `json:"issues,omitempty"`
Tools []RuntimeToolStatus `json:"tools,omitempty"`

View File

@@ -11,6 +11,7 @@ import (
"os/exec"
"path/filepath"
"regexp"
"sort"
"strings"
"sync/atomic"
"syscall"
@@ -21,13 +22,238 @@ import (
)
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
if a == nil {
return nil, fmt.Errorf("app not configured")
}
return a.ListNvidiaGPUs()
}
// ── Job ID counter ────────────────────────────────────────────────────────────
var jobCounter atomic.Uint64
func newJobID(prefix string) string {
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
func newJobID(_ string) string {
start := int((jobCounter.Add(1) - 1) % 1000)
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
for offset := 0; offset < 1000; offset++ {
n := (start + offset) % 1000
id := fmt.Sprintf("TASK-%03d", n)
if !taskIDInUseLocked(id) {
return id
}
}
return fmt.Sprintf("TASK-%03d", start)
}
func taskIDInUseLocked(id string) bool {
for _, t := range globalQueue.tasks {
if t != nil && t.ID == id {
return true
}
}
return false
}
type taskRunResponse struct {
TaskID string `json:"task_id,omitempty"`
JobID string `json:"job_id,omitempty"`
TaskIDs []string `json:"task_ids,omitempty"`
JobIDs []string `json:"job_ids,omitempty"`
TaskCount int `json:"task_count,omitempty"`
}
type nvidiaTaskSelection struct {
GPUIndices []int
Label string
}
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
if len(tasks) == 0 {
writeJSON(w, taskRunResponse{})
return
}
ids := make([]string, 0, len(tasks))
for _, t := range tasks {
if t == nil || strings.TrimSpace(t.ID) == "" {
continue
}
ids = append(ids, t.ID)
}
resp := taskRunResponse{TaskCount: len(ids)}
if len(ids) > 0 {
resp.TaskID = ids[0]
resp.JobID = ids[0]
resp.TaskIDs = ids
resp.JobIDs = ids
}
writeJSON(w, resp)
}
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
switch strings.TrimSpace(target) {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
"nvidia-bandwidth", "nvidia-stress":
return true
default:
return false
}
}
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
if len(gpus) == 0 {
return nil, fmt.Errorf("no NVIDIA GPUs detected")
}
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
allIndices := make([]int, 0, len(gpus))
for _, gpu := range gpus {
indexed[gpu.Index] = gpu
allIndices = append(allIndices, gpu.Index)
}
sort.Ints(allIndices)
selected := allIndices
if len(include) > 0 {
selected = make([]int, 0, len(include))
seen := make(map[int]struct{}, len(include))
for _, idx := range include {
if _, ok := indexed[idx]; !ok {
continue
}
if _, dup := seen[idx]; dup {
continue
}
seen[idx] = struct{}{}
selected = append(selected, idx)
}
sort.Ints(selected)
}
if len(exclude) > 0 {
skip := make(map[int]struct{}, len(exclude))
for _, idx := range exclude {
skip[idx] = struct{}{}
}
filtered := selected[:0]
for _, idx := range selected {
if _, ok := skip[idx]; ok {
continue
}
filtered = append(filtered, idx)
}
selected = filtered
}
if len(selected) == 0 {
return nil, fmt.Errorf("no NVIDIA GPUs selected")
}
modelGroups := make(map[string][]platform.NvidiaGPU)
modelOrder := make([]string, 0)
for _, idx := range selected {
gpu := indexed[idx]
model := strings.TrimSpace(gpu.Name)
if model == "" {
model = fmt.Sprintf("GPU %d", gpu.Index)
}
if _, ok := modelGroups[model]; !ok {
modelOrder = append(modelOrder, model)
}
modelGroups[model] = append(modelGroups[model], gpu)
}
sort.Slice(modelOrder, func(i, j int) bool {
left := modelGroups[modelOrder[i]]
right := modelGroups[modelOrder[j]]
if len(left) == 0 || len(right) == 0 {
return modelOrder[i] < modelOrder[j]
}
return left[0].Index < right[0].Index
})
var groups []nvidiaTaskSelection
var singles []nvidiaTaskSelection
for _, model := range modelOrder {
group := modelGroups[model]
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
indices := make([]int, 0, len(group))
for _, gpu := range group {
indices = append(indices, gpu.Index)
}
if len(indices) >= 2 {
groups = append(groups, nvidiaTaskSelection{
GPUIndices: indices,
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
})
continue
}
gpu := group[0]
singles = append(singles, nvidiaTaskSelection{
GPUIndices: []int{gpu.Index},
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
})
}
return append(groups, singles...), nil
}
func joinTaskIndices(indices []int) string {
parts := make([]string, 0, len(indices))
for _, idx := range indices {
parts = append(parts, fmt.Sprintf("%d", idx))
}
return strings.Join(parts, ",")
}
func formatSplitTaskName(baseName, selectionLabel string) string {
baseName = strings.TrimSpace(baseName)
selectionLabel = strings.TrimSpace(selectionLabel)
if baseName == "" {
return selectionLabel
}
if selectionLabel == "" {
return baseName
}
return baseName + " (" + selectionLabel + ")"
}
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
if !shouldSplitHomogeneousNvidiaTarget(target) {
t := &Task{
ID: newJobID(idPrefix),
Name: baseName,
Target: target,
Priority: priority,
Status: TaskPending,
CreatedAt: createdAt,
params: params,
}
return []*Task{t}, nil
}
gpus, err := apiListNvidiaGPUs(appRef)
if err != nil {
return nil, err
}
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
if err != nil {
return nil, err
}
tasks := make([]*Task, 0, len(selections))
for _, selection := range selections {
taskParamsCopy := params
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
taskParamsCopy.ExcludeGPUIndices = nil
displayName := formatSplitTaskName(baseName, selection.Label)
taskParamsCopy.DisplayName = displayName
tasks = append(tasks, &Task{
ID: newJobID(idPrefix),
Name: displayName,
Target: target,
Priority: priority,
Status: TaskPending,
CreatedAt: createdAt,
params: taskParamsCopy,
})
}
return tasks, nil
}
// ── SSE helpers ───────────────────────────────────────────────────────────────
@@ -207,28 +433,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
}
name := taskDisplayName(target, body.Profile, body.Loader)
t := &Task{
ID: newJobID("sat-" + target),
Name: name,
Target: target,
Status: TaskPending,
CreatedAt: time.Now(),
params: taskParams{
Duration: body.Duration,
DiagLevel: body.DiagLevel,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
Loader: body.Loader,
BurnProfile: body.Profile,
DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents,
},
}
if strings.TrimSpace(body.DisplayName) != "" {
t.Name = body.DisplayName
name = body.DisplayName
}
globalQueue.enqueue(t)
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
params := taskParams{
Duration: body.Duration,
DiagLevel: body.DiagLevel,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
Loader: body.Loader,
BurnProfile: body.Profile,
DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents,
}
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
}
}
@@ -257,27 +483,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
if body.RunNCCL != nil {
runNCCL = *body.RunNCCL
}
t := &Task{
ID: newJobID("benchmark-nvidia"),
Name: taskDisplayName("nvidia-benchmark", "", ""),
Target: "nvidia-benchmark",
Priority: 15,
Status: TaskPending,
CreatedAt: time.Now(),
params: taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
DisplayName: body.DisplayName,
},
}
name := taskDisplayName("nvidia-benchmark", "", "")
if strings.TrimSpace(body.DisplayName) != "" {
t.Name = body.DisplayName
name = body.DisplayName
}
globalQueue.enqueue(t)
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
DisplayName: body.DisplayName,
}, name, h.opts.App, "benchmark-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
}
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
@@ -383,11 +608,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
return
}
result, err := h.opts.App.ServiceActionResult(req.Name, action)
status := "ok"
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
status = "error"
}
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
// Always return 200 with output so the frontend can display the actual
// systemctl error message instead of a generic "exit status 1".
writeJSON(w, map[string]string{"status": status, "output": result.Body})
}
// ── Network ───────────────────────────────────────────────────────────────────

View File

@@ -1,6 +1,7 @@
package webui
import (
"encoding/json"
"net/http/httptest"
"strings"
"testing"
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
prevList := apiListNvidiaGPUs
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
return []platform.NvidiaGPU{
{Index: 1, Name: "NVIDIA H100 PCIe"},
{Index: 3, Name: "NVIDIA H100 PCIe"},
}, nil
}
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
}
}
// TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels verifies that a benchmark
// request covering GPUs of different models is split into one queued task per
// homogeneous model group (H100 pair first, lone H200 second).
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
	// Isolate the shared global queue for the duration of the test.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})

	// Stub GPU discovery: indices 0-1 are H100s, index 2 is an H200.
	savedList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		return []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H200 NVL"},
		}, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = savedList })

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
	h.handleAPIBenchmarkNvidiaRun(rec, req)
	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}

	// The response must advertise both split tasks.
	var resp taskRunResponse
	if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
		t.Fatalf("decode response: %v", err)
	}
	if len(resp.TaskIDs) != 2 {
		t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
	}

	// The queue must hold the same two tasks with the expected GPU groups.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 2 {
		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
	}
	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
		t.Fatalf("task[1] gpu indices=%v want [2]", got)
	}
}
// TestHandleAPISATRunSplitsMixedNvidiaTaskSet verifies that a SAT run over a
// mixed set of GPU models is enqueued as one task per homogeneous model group.
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
	// Isolate the shared global queue for the duration of the test.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})

	// Stub GPU discovery: indices 0-1 are H100s, index 2 is an H200.
	savedList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		return []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H200 NVL"},
		}, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = savedList })

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}

	// The queue must hold two tasks split by GPU model group.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 2 {
		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
	}
	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
		t.Fatalf("task[1] gpu indices=%v want [2]", got)
	}
}
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
h := &handler{}
h.pushFanRings([]platform.FanReading{

View File

@@ -8,9 +8,12 @@ import (
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/schema"
)
@@ -33,6 +36,9 @@ a{color:var(--accent);text-decoration:none}
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
.nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
@@ -107,6 +113,15 @@ func layoutNav(active string, buildLabel string) string {
buildLabel = "dev"
}
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
gspMode := strings.TrimSpace(string(raw))
switch gspMode {
case "gsp-off":
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
case "gsp-stuck":
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
}
}
b.WriteString(`<nav class="nav">`)
for _, item := range items {
cls := "nav-item"
@@ -149,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string {
case "benchmark":
pageID = "benchmark"
title = "Benchmark"
body = renderBenchmark()
body = renderBenchmark(opts)
case "tasks":
pageID = "tasks"
title = "Tasks"
@@ -1056,17 +1071,23 @@ func renderValidate(opts HandlerOptions) string {
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Runs one GPU at a time. Diag level is taken from Validate Profile.`,
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
)) +
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
)) +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`,
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
)) +
`</div>
<div class="grid3" style="margin-top:16px">
@@ -1088,6 +1109,8 @@ func renderValidate(opts HandlerOptions) string {
.validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style>
<script>
@@ -1116,6 +1139,59 @@ function loadSatNvidiaGPUs() {
}
return satNvidiaGPUsPromise;
}
// Collect the indices of every checked, enabled NVIDIA GPU checkbox as a
// numerically sorted array of integers (non-numeric values are dropped).
function satSelectedGPUIndices() {
  const indices = [];
  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) {
    if (!el.checked || el.disabled) return;
    const idx = parseInt(el.value, 10);
    if (!Number.isNaN(idx)) indices.push(idx);
  });
  indices.sort(function(a, b) { return a - b; });
  return indices;
}
// Refresh the helper text under the GPU list to reflect the current selection.
function satUpdateGPUSelectionNote() {
  const noteEl = document.getElementById('sat-gpu-selection-note');
  if (!noteEl) return;
  const picked = satSelectedGPUIndices();
  noteEl.textContent = picked.length
    ? 'Selected NVIDIA GPUs: ' + picked.join(', ') + '.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
}
// Render the NVIDIA GPU checkbox list into #sat-gpu-list. Each GPU gets a
// pre-checked checkbox whose value is the GPU index; toggling any box
// refreshes the selection note. With no GPUs, a muted placeholder is shown.
// NOTE(review): gpu.index/gpu.name are interpolated into HTML without
// escaping — presumably safe because values come from the local GPU
// inventory API, but confirm.
function satRenderGPUList(gpus) {
const root = document.getElementById('sat-gpu-list');
if (!root) return;
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
satUpdateGPUSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
// Only append the memory suffix when a positive size is reported.
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="sat-gpu-row">'
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
satUpdateGPUSelectionNote();
}
// Check every NVIDIA GPU checkbox, then refresh the selection note.
function satSelectAllGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  boxes.forEach(function(el) { el.checked = true; });
  satUpdateGPUSelectionNote();
}
// Uncheck every NVIDIA GPU checkbox, then refresh the selection note.
function satSelectNoGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  boxes.forEach(function(el) { el.checked = false; });
  satUpdateGPUSelectionNote();
}
// Fetch the NVIDIA GPU inventory and render the checkbox list; on failure
// show the error message in place of the list and refresh the note.
function satLoadGPUs() {
  loadSatNvidiaGPUs()
    .then(satRenderGPUList)
    .catch(function(err) {
      const listEl = document.getElementById('sat-gpu-list');
      if (listEl) {
        listEl.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
      }
      satUpdateGPUSelectionNote();
    });
}
function satGPUDisplayName(gpu) {
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
@@ -1137,6 +1213,36 @@ function enqueueSATTarget(target, overrides) {
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
.then(r => r.json());
}
// Stream the live log of a queued task into the shared SAT terminal panel.
// taskId        — ID returned by the enqueue endpoint.
// title         — label appended to the #sat-title heading.
// resetTerminal — when true, clear the terminal before appending output.
// Uses the module-level satES EventSource singleton so at most one stream is
// open; any existing stream is closed first. Always resolves (never rejects)
// with {ok, error} when the task finishes or the stream drops.
function streamSATTask(taskId, title, resetTerminal) {
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
const term = document.getElementById('sat-terminal');
if (resetTerminal) {
term.textContent = '';
}
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
return new Promise(function(resolve) {
satES = new EventSource('/api/tasks/' + taskId + '/stream');
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// The "done" event carries an error string on failure, empty data on success.
satES.addEventListener('done', function(e) {
satES.close();
satES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve({ok: !e.data, error: e.data || ''});
});
// A dropped connection resolves as a failure instead of hanging forever.
satES.onerror = function() {
if (satES) {
satES.close();
satES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve({ok: false, error: 'stream disconnected'});
};
});
}
function selectedAMDValidateTargets() {
const targets = [];
const gpu = document.getElementById('sat-amd-target');
@@ -1151,24 +1257,23 @@ function runSAT(target) {
return runSATWithOverrides(target, null);
}
function runSATWithOverrides(target, overrides) {
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const title = (overrides && overrides.display_name) || target;
const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' test...\n';
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
term.textContent = 'Enqueuing ' + title + ' test...\n';
return enqueueSATTarget(target, overrides)
.then(d => {
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
.then(d => streamSATTask(d.task_id, title, false));
}
function expandSATTarget(target) {
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
return Promise.resolve([{target: target}]);
}
return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => ({
const selected = satSelectedGPUIndices();
if (!selected.length) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
}
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
target: target,
overrides: {
gpu_indices: [Number(gpu.index)],
@@ -1179,65 +1284,61 @@ function expandSATTarget(target) {
}
function runNvidiaValidateSet(target) {
return loadSatNvidiaGPUs().then(gpus => {
if (!gpus.length) return;
if (gpus.length === 1) {
const gpu = gpus[0];
const selected = satSelectedGPUIndices();
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
if (!picked.length) {
throw new Error('Select at least one NVIDIA GPU.');
}
if (picked.length === 1) {
const gpu = picked[0];
return runSATWithOverrides(target, {
gpu_indices: [Number(gpu.index)],
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
});
}
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n';
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
const labelBase = satLabels()[target] || ('Validate ' + target);
const enqueueNext = (idx) => {
if (idx >= gpus.length) return;
const gpu = gpus[idx];
const runNext = (idx) => {
if (idx >= picked.length) return Promise.resolve();
const gpu = picked[idx];
const gpuLabel = satGPUDisplayName(gpu);
enqueueSATTarget(target, {
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
return enqueueSATTarget(target, {
gpu_indices: [Number(gpu.index)],
display_name: labelBase + ' (' + gpuLabel + ')'
}).then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n';
if (idx === gpus.length - 1) {
satES = new EventSource('/api/tasks/' + d.task_id + '/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
}).then(function() {
return runNext(idx + 1);
});
};
enqueueNext(0);
return runNext(0);
});
}
function runAMDValidateSet() {
const targets = selectedAMDValidateTargets();
if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]);
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— amd';
const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing AMD validate set...\n';
term.textContent = 'Running AMD validate set one by one...\n';
const labels = satLabels();
const enqueueNext = (idx) => {
if (idx >= targets.length) return;
const runNext = (idx) => {
if (idx >= targets.length) return Promise.resolve();
const target = targets[idx];
enqueueSATTarget(target)
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
return enqueueSATTarget(target)
.then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n';
if (idx === targets.length - 1) {
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
return streamSATTask(d.task_id, labels[target], false);
}).then(function() {
return runNext(idx + 1);
});
};
enqueueNext(0);
return runNext(0);
}
function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
@@ -1259,17 +1360,17 @@ function runAllSAT() {
status.textContent = 'No tasks selected.';
return;
}
const enqueueNext = (idx) => {
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; }
const runNext = (idx) => {
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
const item = expanded[idx];
enqueueSATTarget(item.target, item.overrides)
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
return enqueueSATTarget(item.target, item.overrides)
.then(() => {
enqueued++;
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...';
enqueueNext(idx + 1);
return runNext(idx + 1);
});
};
enqueueNext(0);
return runNext(0);
}).catch(err => {
status.textContent = 'Error: ' + err.message;
});
@@ -1282,6 +1383,7 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
});
satLoadGPUs();
function disableSATAMDOptions(reason) {
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
const cb = document.getElementById(id);
@@ -1470,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ─────────────────────────────────────────────────────────────────
func renderBenchmark() string {
// benchmarkHistoryColumn identifies one GPU column in the saved-benchmark
// history table (one column per distinct GPU name+index pair).
type benchmarkHistoryColumn struct {
key string // stable lookup key, built by benchmarkHistoryColumnKey ("name|index")
label string // human-readable header, built by benchmarkHistoryColumnLabel
name string // trimmed GPU model name (used for column sorting)
index int // GPU index as reported in the benchmark result
}
// benchmarkHistoryCell holds one composite score; present distinguishes a
// real zero score from a missing entry for that run/column.
type benchmarkHistoryCell struct {
score float64
present bool
}
// benchmarkHistoryRun is one saved benchmark run: when it was generated,
// its preformatted local timestamp, and a cell per GPU column key.
type benchmarkHistoryRun struct {
generatedAt time.Time
displayTime string
cells map[string]benchmarkHistoryCell
}
func renderBenchmark(opts HandlerOptions) string {
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="grid2">
@@ -1519,6 +1639,8 @@ func renderBenchmark() string {
</div>
</div>
` + renderBenchmarkResultsCard(opts.ExportDir) + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
@@ -1534,6 +1656,12 @@ func renderBenchmark() string {
<script>
let benchmarkES = null;
function benchmarkTaskIDs(payload) {
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
if (payload && payload.task_id) return [payload.task_id];
return [];
}
function benchmarkSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
@@ -1633,17 +1761,37 @@ function runNvidiaBenchmark() {
return payload;
});
}).then(function(d) {
status.textContent = 'Task ' + d.task_id + ' queued.';
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
status.textContent = e.data ? 'Failed.' : 'Completed.';
});
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No benchmark task was queued.');
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
return;
}
const taskId = taskIds[idx];
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures);
});
benchmarkES.onerror = function() {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures + 1);
};
};
streamNext(0, 0);
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
@@ -1655,6 +1803,129 @@ benchmarkLoadGPUs();
</script>`
}
// renderBenchmarkResultsCard renders the saved-benchmark history card for
// the given export directory.
func renderBenchmarkResultsCard(exportDir string) string {
	const (
		cardTitle   = "Benchmark Results"
		description = "Composite score by saved benchmark run and GPU."
		emptyMsg    = "No saved benchmark runs yet."
	)
	columns, runs := loadBenchmarkHistory(exportDir)
	return renderBenchmarkResultsCardFromRuns(cardTitle, description, emptyMsg, columns, runs)
}
// renderBenchmarkResultsCardFromRuns renders a card containing a table of
// benchmark runs (one row per run) against GPU columns. With no runs, a
// card holding only emptyMessage is returned. Title, description and column
// labels are HTML-escaped; missing cells render as a muted dash.
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
	if len(runs) == 0 {
		return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
	}
	var out strings.Builder
	out.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
	if strings.TrimSpace(description) != "" {
		out.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
	}
	// Header row: fixed Test/Time columns followed by one column per GPU.
	out.WriteString(`<div style="overflow-x:auto">`)
	out.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
	for _, col := range columns {
		out.WriteString(`<th>` + html.EscapeString(col.label) + `</th>`)
	}
	out.WriteString(`</tr></thead><tbody>`)
	for runIdx := range runs {
		run := &runs[runIdx]
		out.WriteString(`<tr>`)
		out.WriteString(`<td>#` + strconv.Itoa(runIdx+1) + `</td>`)
		out.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
		for _, col := range columns {
			if cell, ok := run.cells[col.key]; ok && cell.present {
				out.WriteString(`<td>` + fmt.Sprintf("%.2f", cell.score) + `</td>`)
			} else {
				out.WriteString(`<td style="color:var(--muted)">-</td>`)
			}
		}
		out.WriteString(`</tr>`)
	}
	out.WriteString(`</tbody></table></div></div></div>`)
	return out.String()
}
// loadBenchmarkHistory locates saved gpu-benchmark-*/result.json files under
// the export directory (or the app default when exportDir is blank) and
// parses them into history columns and runs. Returns nil, nil when nothing
// is found or the glob fails.
func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
	dir := app.DefaultBenchmarkBaseDir
	if strings.TrimSpace(exportDir) != "" {
		dir = filepath.Join(exportDir, "bee-benchmark")
	}
	pattern := filepath.Join(dir, "gpu-benchmark-*", "result.json")
	paths, err := filepath.Glob(pattern)
	if err != nil || len(paths) == 0 {
		return nil, nil
	}
	sort.Strings(paths) // deterministic parse order
	return loadBenchmarkHistoryFromPaths(paths)
}
// loadBenchmarkHistoryFromPaths parses each result.json path into a
// benchmark history run and derives the union of GPU columns seen across
// all runs. Unreadable or unparsable files are skipped silently (best
// effort). Columns are sorted by lowercased name, then index, then key;
// runs are sorted newest-first by generation time.
func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
columnByKey := make(map[string]benchmarkHistoryColumn)
runs := make([]benchmarkHistoryRun, 0, len(paths))
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue // skip unreadable file
}
var result platform.NvidiaBenchmarkResult
if err := json.Unmarshal(raw, &result); err != nil {
continue // skip malformed JSON
}
run := benchmarkHistoryRun{
generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
cells: make(map[string]benchmarkHistoryCell),
}
for _, gpu := range result.GPUs {
// The key is stable per name+index, so later runs overwrite the
// column metadata for the same GPU (last one wins).
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
columnByKey[key] = benchmarkHistoryColumn{
key: key,
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
name: strings.TrimSpace(gpu.Name),
index: gpu.Index,
}
run.cells[key] = benchmarkHistoryCell{
score: gpu.Scores.CompositeScore,
present: true,
}
}
runs = append(runs, run)
}
columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
for _, col := range columnByKey {
columns = append(columns, col)
}
// Stable column order: name (case-insensitive), then index, then key.
sort.Slice(columns, func(i, j int) bool {
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
if leftName != rightName {
return leftName < rightName
}
if columns[i].index != columns[j].index {
return columns[i].index < columns[j].index
}
return columns[i].key < columns[j].key
})
// Most recent run first.
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
return columns, runs
}
// benchmarkHistoryColumnKey builds the stable column lookup key for a GPU:
// the trimmed model name and the index joined by "|".
func benchmarkHistoryColumnKey(name string, index int) string {
	parts := []string{strings.TrimSpace(name), strconv.Itoa(index)}
	return strings.Join(parts, "|")
}
// benchmarkHistoryColumnLabel builds the human-readable table header for a
// GPU column: "<name> / GPU <index>", or just "GPU <index>" when the name
// is blank after trimming.
func benchmarkHistoryColumnLabel(name string, index int) string {
	trimmed := strings.TrimSpace(name)
	if trimmed == "" {
		return fmt.Sprintf("GPU %d", index)
	}
	return fmt.Sprintf("%s / GPU %d", trimmed, index)
}
// ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string {
@@ -1774,6 +2045,12 @@ func renderBurn() string {
<script>
let biES = null;
// Normalize an enqueue-response payload into a list of task IDs.
// Prefers the multi-task task_ids array; falls back to the legacy single
// task_id field; returns [] when neither is usable.
function burnTaskIDs(payload) {
  if (!payload) return [];
  const ids = payload.task_ids;
  if (Array.isArray(ids) && ids.length) return ids;
  return payload.task_id ? [payload.task_id] : [];
}
function burnProfile() {
const selected = document.querySelector('input[name="burn-profile"]:checked');
return selected ? selected.value : 'smoke';
@@ -1874,6 +2151,52 @@ function streamTask(taskId, label) {
term.scrollTop = term.scrollHeight;
});
}
// Back-compat wrapper: stream a single burn task by delegating to the
// multi-task variant with a one-element ID list.
function streamBurnTask(taskId, label, resetTerminal) {
return streamBurnTaskSet([taskId], label, resetTerminal);
}
// Sequentially stream the logs of a list of burn task IDs into #bi-terminal.
// label names the task set in the panel title (alongside the current burn
// profile); resetTerminal clears the terminal first. Uses the module-level
// biES EventSource singleton, closing any previous stream. Resolves with
// {ok, error} after the last task's stream ends; a failed or disconnected
// stream counts toward failures but does not stop the remaining tasks.
function streamBurnTaskSet(taskIds, label, resetTerminal) {
if (biES) { biES.close(); biES = null; }
document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
const term = document.getElementById('bi-terminal');
if (resetTerminal) {
term.textContent = '';
}
if (!Array.isArray(taskIds) || !taskIds.length) {
term.textContent += 'ERROR: no tasks queued.\n';
return Promise.resolve({ok:false, error:'no tasks queued'});
}
// Recursively stream task idx, carrying the failure count forward.
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
}
const taskId = taskIds[idx];
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
return new Promise(function(resolve) {
biES = new EventSource('/api/tasks/' + taskId + '/stream');
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// "done" carries an error string on failure, empty data on success.
biES.addEventListener('done', function(e) {
biES.close();
biES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve(failures + (e.data ? 1 : 0));
});
// A dropped connection counts as one failure and moves on.
biES.onerror = function() {
if (biES) {
biES.close();
biES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve(failures + 1);
};
}).then(function(nextFailures) {
return streamNext(idx + 1, nextFailures);
});
};
return streamNext(0, 0);
}
function runBurnTaskSet(tasks, statusElId) {
const enabled = tasks.filter(function(t) {
@@ -1886,19 +2209,33 @@ function runBurnTaskSet(tasks, statusElId) {
if (status) status.textContent = 'No tasks selected.';
return;
}
enabled.forEach(function(t) {
enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
const term = document.getElementById('bi-terminal');
document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
term.textContent = '';
const runNext = function(idx) {
if (idx >= enabled.length) {
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
return Promise.resolve();
}
const t = enabled[idx];
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
.then(function(d) {
if (status) status.textContent = enabled.length + ' task(s) queued.';
streamTask(d.task_id, t.label);
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
})
.then(function() {
return runNext(idx + 1);
})
.catch(function(err) {
if (status) status.textContent = 'Error: ' + err.message;
const term = document.getElementById('bi-terminal');
document.getElementById('bi-output').style.display = 'block';
term.textContent += 'ERROR: ' + err.message + '\n';
return Promise.reject(err);
});
});
};
return runNext(0);
}
function runPlatformStress() {
@@ -2107,9 +2444,12 @@ func renderServicesInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:8px" class="card">
<div class="card-head">Output</div>
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div>
<div id="svc-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="svc-out-status" style="font-size:12px"></span>
</div>
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
function loadServices() {
@@ -2125,9 +2465,9 @@ function loadServices() {
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
'</td>' +
'<td style="white-space:nowrap">' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'start\')">Start</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'stop\')">Stop</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'restart\')">Restart</button>' +
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
'</td></tr>';
}).join('');
document.getElementById('svc-table').innerHTML =
@@ -2138,16 +2478,45 @@ function toggleBody(id) {
const el = document.getElementById(id);
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
}
function svcAction(name, action) {
function svcAction(btn, name, action) {
var label = btn.textContent;
btn.disabled = true;
btn.textContent = '...';
var out = document.getElementById('svc-out');
var term = document.getElementById('svc-terminal');
var statusEl = document.getElementById('svc-out-status');
var labelEl = document.getElementById('svc-out-label');
out.style.display = 'block';
labelEl.textContent = action + ' ' + name;
term.textContent = 'Running...';
statusEl.textContent = '';
statusEl.style.color = '';
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
.then(r=>r.json()).then(d => {
document.getElementById('svc-out').style.display='block';
document.getElementById('svc-terminal').textContent = d.output || d.error || action+' '+name;
setTimeout(loadServices, 1000);
term.textContent = d.output || d.error || '(no output)';
term.scrollTop = term.scrollHeight;
if (d.status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
btn.textContent = label;
btn.disabled = false;
setTimeout(loadServices, 800);
}).catch(e => {
term.textContent = 'Request failed: ' + e;
statusEl.textContent = '✗ error';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
btn.textContent = label;
btn.disabled = false;
});
}
function restartGPUDrivers() {
svcAction('bee-nvidia', 'restart');
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
svcAction(btn, 'bee-nvidia', 'restart');
}
loadServices();
</script>`

View File

@@ -1,6 +1,7 @@
package webui
import (
"encoding/json"
"net/http"
"net/http/httptest"
"os"
@@ -601,8 +602,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
if !strings.Contains(body, `Restart GPU Drivers`) {
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
}
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
if !strings.Contains(body, `restartGPUDrivers()`) {
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
}
if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body)
@@ -636,6 +637,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
}
}
// TestBenchmarkPageRendersSavedResultsTable verifies that /benchmark renders a
// results table from a saved result.json under ExportDir: card title,
// description, per-GPU column headers, run index, the localized timestamp, and
// both composite scores must appear in the response body.
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
	exportDir := filepath.Join(t.TempDir(), "export")
	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	// Persist one benchmark run with two GPUs so both columns render.
	saved := platform.NvidiaBenchmarkResult{
		GeneratedAt:      time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
		BenchmarkProfile: "standard",
		OverallStatus:    "OK",
		GPUs: []platform.BenchmarkGPUResult{
			{Index: 0, Name: "NVIDIA H100 PCIe", Scores: platform.BenchmarkScorecard{CompositeScore: 1176.25}},
			{Index: 1, Name: "NVIDIA H100 PCIe", Scores: platform.BenchmarkScorecard{CompositeScore: 1168.50}},
		},
	}
	raw, err := json.Marshal(saved)
	if err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
		t.Fatal(err)
	}
	rec := httptest.NewRecorder()
	NewHandler(HandlerOptions{ExportDir: exportDir}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	body := rec.Body.String()
	// The page shows times in the server's local zone.
	needles := []string{
		`Benchmark Results`,
		`Composite score by saved benchmark run and GPU.`,
		`NVIDIA H100 PCIe / GPU 0`,
		`NVIDIA H100 PCIe / GPU 1`,
		`#1`,
		saved.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
		`1176.25`,
		`1168.50`,
	}
	for _, needle := range needles {
		if !strings.Contains(body, needle) {
			t.Fatalf("benchmark page missing %q: %s", needle, body)
		}
	}
}
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
@@ -649,6 +710,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
`nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
`NVIDIA GPU Selection`,
`id="sat-gpu-list"`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)

View File

@@ -97,32 +97,73 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
body.WriteString(`</div></div>`)
}
if task.Status == TaskRunning || task.Status == TaskPending {
if task.Status == TaskRunning {
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
body.WriteString(`</div></div>`)
}
if task.Status == TaskRunning || task.Status == TaskPending {
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
body.WriteString(`</div></div>`)
body.WriteString(`<script>
function cancelTaskDetail(id) {
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){ window.location.reload(); });
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
var term = document.getElementById('task-live-log');
if (term) {
term.textContent += '\nCancel requested.\n';
term.scrollTop = term.scrollHeight;
}
});
}
// Incrementally sync the chart cards under #task-live-charts with the list
// returned by /api/tasks/:id/charts. Existing <img> nodes are kept in place
// (only their card titles are refreshed) so the periodic cache-busting
// reload of data-base-src does not make the page flicker; new charts get a
// freshly built card, and cards whose chart file is no longer reported are
// removed.
function renderTaskLiveCharts(taskId, charts) {
const host = document.getElementById('task-live-charts');
if (!host) return;
// No samples yet: show the placeholder instead of an empty card list.
if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...';
return;
}
// Record every chart file seen this round so stale cards can be pruned below.
const seen = {};
charts.forEach(function(chart) {
seen[chart.file] = true;
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
if (img) {
// Chart already rendered: only update the card title, keep the <img>.
const card = img.closest('.card');
if (card) {
const title = card.querySelector('.card-head');
if (title) title.textContent = chart.title;
}
return;
}
// New chart: build a card with head + body, then the image inside the body.
const card = document.createElement('div');
card.className = 'card';
card.style.margin = '0';
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
// Title is set via textContent (not innerHTML) so it is not parsed as markup.
card.querySelector('.card-head').textContent = chart.title;
const body = card.querySelector('.card-body');
img = document.createElement('img');
img.setAttribute('data-task-chart', '1');
img.setAttribute('data-chart-file', chart.file);
// data-base-src is the stable URL; src adds ?t= for cache busting.
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
img.style.width = '100%';
img.style.display = 'block';
img.style.borderRadius = '6px';
img.alt = chart.title;
body.appendChild(img);
host.appendChild(card);
});
// Remove cards for charts that disappeared from the server's list.
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
const file = img.getAttribute('data-chart-file') || '';
if (seen[file]) return;
const card = img.closest('.card');
if (card) card.remove();
});
}
function loadTaskLiveCharts(taskId) {
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
const host = document.getElementById('task-live-charts');
if (!host) return;
if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...';
return;
}
host.innerHTML = charts.map(function(chart) {
return '<div class="card" style="margin:0">' +
'<div class="card-head">' + chart.title + '</div>' +
'<div class="card-body" style="padding:12px">' +
'<img data-task-chart="1" data-base-src="/api/tasks/' + taskId + '/chart/' + chart.file + '" src="/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now() + '" style="width:100%;display:block;border-radius:6px" alt="' + chart.title + '">' +
'</div></div>';
}).join('');
renderTaskLiveCharts(taskId, charts);
}).catch(function(){
const host = document.getElementById('task-live-charts');
if (host) host.innerHTML = 'Task charts are unavailable.';
@@ -138,12 +179,31 @@ function refreshTaskLiveCharts() {
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log');
var _taskChartTimer = null;
var _taskChartsFrozen = false;
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
_taskDetailES.onerror = function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); };
_taskDetailES.addEventListener('done', function(e){
if (_taskChartTimer) clearInterval(_taskChartTimer);
_taskDetailES.close();
_taskDetailES = null;
_taskChartsFrozen = true;
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
refreshTaskLiveCharts();
});
_taskDetailES.onerror = function(){
if (_taskChartTimer) clearInterval(_taskChartTimer);
if (_taskDetailES) {
_taskDetailES.close();
_taskDetailES = null;
}
};
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
_taskChartTimer = setInterval(function(){ refreshTaskLiveCharts(); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); }, 2000);
_taskChartTimer = setInterval(function(){
if (_taskChartsFrozen) return;
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
refreshTaskLiveCharts();
}, 2000);
</script>`)
}

View File

@@ -230,6 +230,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
b.WriteString(`</div></div></div>`)
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
b.WriteString(benchmarkCard)
}
if len(report.Charts) > 0 {
for _, chart := range report.Charts {
@@ -247,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
return b.String()
}
// renderTaskBenchmarkResultsCard builds the "Benchmark Results" card for a
// finished nvidia-benchmark task. It returns the empty string when there is
// nothing to render: the task is not a benchmark task, no result path can be
// derived from the log text, or the result file yields no runs.
func renderTaskBenchmarkResultsCard(target, logText string) string {
	if strings.TrimSpace(target) != "nvidia-benchmark" {
		return ""
	}
	path := taskBenchmarkResultPath(logText)
	if strings.TrimSpace(path) == "" {
		return ""
	}
	columns, runs := loadBenchmarkHistoryFromPaths([]string{path})
	if len(runs) > 0 {
		return renderBenchmarkResultsCardFromRuns(
			"Benchmark Results",
			"Composite score for this benchmark task.",
			"No benchmark results were saved for this task.",
			columns,
			runs,
		)
	}
	return ""
}
// taskBenchmarkResultPath derives the result.json location for a benchmark
// task from its log text: the archive path reported in the log, minus its
// ".tar.gz" suffix, is the run directory that holds result.json. Returns ""
// when the log names no .tar.gz archive.
func taskBenchmarkResultPath(logText string) string {
	archive := taskArchivePathFromLog(logText)
	if archive == "" || !strings.HasSuffix(archive, ".tar.gz") {
		return ""
	}
	runDir := strings.TrimSuffix(archive, ".tar.gz")
	return filepath.Join(runDir, "result.json")
}
// taskArchivePathFromLog scans the task log from the end and returns the most
// recently reported archive path: a line of the form "Archive: <path>" (or
// "Archive: Archive written to <path>") whose path ends in ".tar.gz".
// Returns "" when no such line exists.
func taskArchivePathFromLog(logText string) string {
	lines := strings.Split(logText, "\n")
	// Walk backwards so the latest "Archive:" line wins.
	for i := len(lines); i > 0; i-- {
		candidate := strings.TrimSpace(lines[i-1])
		if !strings.HasPrefix(candidate, "Archive:") {
			continue
		}
		path := strings.TrimSpace(strings.TrimPrefix(candidate, "Archive:"))
		// Some log writers emit the longer "Archive written to <path>" form.
		if strings.HasPrefix(path, "Archive written to ") {
			path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
		}
		if strings.HasSuffix(path, ".tar.gz") {
			return path
		}
	}
	return ""
}
func renderTaskStatusBadge(status string) string {
className := map[string]string{
TaskRunning: "badge-ok",

View File

@@ -423,13 +423,14 @@ func (q *taskQueue) worker() {
setCPUGovernor("performance")
defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
var batch []*Task
for {
q.mu.Lock()
t := q.nextPending()
if t == nil {
break
q.prune()
q.persistLocked()
q.mu.Unlock()
return
}
now := time.Now()
t.Status = TaskRunning
@@ -438,29 +439,14 @@ func (q *taskQueue) worker() {
t.ErrMsg = ""
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked()
}
q.mu.Unlock()
q.mu.Unlock()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel
wg.Add(1)
goRecoverOnce("task "+t.Target, func() {
defer wg.Done()
defer taskCancel()
q.executeTask(t, j, taskCtx)
})
}
wg.Wait()
q.executeTask(t, j, taskCtx)
taskCancel()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
@@ -1163,7 +1149,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
if strings.TrimSpace(root) == "" || t == nil {
return ""
}
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
prefix := taskFolderNumberPrefix(t.ID)
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
}
// taskFolderNumberPrefix maps a task ID to the short numeric prefix used for
// its artifacts folder name: "TASK-007" becomes "007". IDs that do not match
// the TASK-NNN shape fall back to the sanitized ID, or "000" when sanitizing
// leaves nothing usable.
func taskFolderNumberPrefix(taskID string) string {
	taskID = strings.TrimSpace(taskID)
	if num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-")); num != taskID && len(num) == 3 {
		// Only accept exactly three ASCII digits after the prefix.
		digitsOnly := true
		for _, r := range num {
			if r < '0' || r > '9' {
				digitsOnly = false
				break
			}
		}
		if digitsOnly {
			return num
		}
	}
	if fallback := sanitizeTaskFolderPart(taskID); fallback != "" {
		return fallback
	}
	return "000"
}
func ensureTaskReportPaths(t *Task) {

View File

@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
}
}
// TestNewJobIDUsesTASKPrefixAndZeroPadding verifies that newJobID yields
// sequential zero-padded IDs starting at "TASK-000".
//
// newJobID depends on package-level state (globalQueue.tasks and jobCounter),
// so the test snapshots both, resets them, and restores them in t.Cleanup to
// stay isolated from other tests sharing the same globals.
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
// Detach the current task list under the queue lock before resetting.
globalQueue.mu.Lock()
origTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
origCounter := jobCounter.Load()
jobCounter.Store(0)
t.Cleanup(func() {
// Restore the task list and counter exactly as they were.
globalQueue.mu.Lock()
globalQueue.tasks = origTasks
globalQueue.mu.Unlock()
jobCounter.Store(origCounter)
})
// First ID after a reset must be TASK-000 (zero-padded, zero-based).
if got := newJobID("ignored"); got != "TASK-000" {
t.Fatalf("id=%q want TASK-000", got)
}
// The counter must advance by one per call.
if got := newJobID("ignored"); got != "TASK-001" {
t.Fatalf("id=%q want TASK-001", got)
}
}
// TestTaskArtifactsDirStartsWithTaskNumber verifies that the artifacts folder
// name begins with the bare task number ("007_"), not the full "TASK-007" ID.
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
	task := &Task{ID: "TASK-007", Name: "NVIDIA Benchmark"}
	dir := taskArtifactsDir(t.TempDir(), task, TaskDone)
	if base := filepath.Base(dir); !strings.HasPrefix(base, "007_") {
		t.Fatalf("artifacts dir=%q want prefix 007_", base)
	}
}
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
dir := t.TempDir()
logPath := filepath.Join(dir, "task.log")
@@ -325,6 +359,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
}
}
// TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask verifies that
// writeTaskReportArtifacts embeds a "Benchmark Results" card in report.html
// for an nvidia-benchmark task whose log references a saved benchmark archive.
//
// Setup: redirect the report metrics DB path into a temp dir, write a
// result.json into a benchmark run directory, then build a done task whose
// log contains an "Archive:" line pointing at the matching .tar.gz — the
// report renderer derives the result.json path from that line.
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
dir := t.TempDir()
metricsPath := filepath.Join(dir, "metrics.db")
// Swap the package-level metrics DB path; restored via Cleanup.
prevMetricsPath := taskReportMetricsDBPath
taskReportMetricsDBPath = metricsPath
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
// The run directory name must match the archive name minus ".tar.gz".
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
t.Fatal(err)
}
// One GPU with a known composite score; these values are asserted below.
result := platform.NvidiaBenchmarkResult{
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
BenchmarkProfile: "standard",
OverallStatus: "OK",
GPUs: []platform.BenchmarkGPUResult{
{
Index: 0,
Name: "NVIDIA H100 PCIe",
Scores: platform.BenchmarkScorecard{
CompositeScore: 1176.25,
},
},
},
}
raw, err := json.Marshal(result)
if err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
t.Fatal(err)
}
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
t.Fatal(err)
}
// Target "nvidia-benchmark" is what makes the report include the card.
task := &Task{
ID: "task-bench",
Name: "NVIDIA Benchmark",
Target: "nvidia-benchmark",
Status: TaskDone,
CreatedAt: time.Now().UTC().Add(-time.Minute),
ArtifactsDir: artifactsDir,
}
ensureTaskReportPaths(task)
// The "Archive:" log line is the only link from the task to result.json.
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
t.Fatal(err)
}
if err := writeTaskReportArtifacts(task); err != nil {
t.Fatalf("writeTaskReportArtifacts: %v", err)
}
body, err := os.ReadFile(task.ReportHTMLPath)
if err != nil {
t.Fatalf("ReadFile(report.html): %v", err)
}
html := string(body)
// Card title, description, GPU column header, and score must all render.
for _, needle := range []string{
`Benchmark Results`,
`Composite score for this benchmark task.`,
`NVIDIA H100 PCIe / GPU 0`,
`1176.25`,
} {
if !strings.Contains(html, needle) {
t.Fatalf("report missing %q: %s", needle, html)
}
}
}
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
var lines []string
prev := taskSerialWriteLine

View File

@@ -15,29 +15,21 @@ menuentry "EASY-BEE" {
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (graphics/KMS)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (load to RAM)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (NVIDIA GSP=off)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (fail-safe)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@
menuentry "EASY-BEE — fail-safe" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@
}
}
if [ "${grub_platform}" = "efi" ]; then

View File

@@ -44,23 +44,27 @@ else:
img = Image.new('RGB', (W, H), (0, 0, 0))
draw = ImageDraw.Draw(img)
# Measure logo block
# Measure logo block line by line to avoid font ascender offset
lines = LOGO.split('\n')
bbox = draw.textbbox((0, 0), LOGO, font=font_logo)
text_w = bbox[2] - bbox[0]
text_h = bbox[3] - bbox[1]
logo_lines = lines[:6]
sub_line = lines[6] if len(lines) > 6 else ''
x = (W - text_w) // 2
y = (H - text_h) // 2
line_h = SIZE + 2
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
# Draw logo lines: first 6 in amber, last line (subtitle) dimmer
logo_lines = lines[:6]
sub_line = lines[6] if len(lines) > 6 else ''
# Width: measure the widest logo line
max_w = 0
for line in logo_lines:
bb = draw.textbbox((0, 0), line, font=font_logo)
max_w = max(max_w, bb[2] - bb[0])
x = (W - max_w) // 2
y = (H - block_h) // 2
cy = y
for line in logo_lines:
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
cy += SIZE + 2
cy += line_h
cy += 8
if sub_line:
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))

View File

@@ -65,6 +65,10 @@ python3-pil
xorg
xterm
chromium
mousepad
pcmanfm
ristretto
mupdf
xserver-xorg-video-fbdev
xserver-xorg-video-vesa
lightdm

View File

@@ -62,6 +62,8 @@ done
echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
fi
fi
echo "starting gpu ${id} size=${gpu_size_mb}MB"
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}"
done

View File

@@ -152,14 +152,19 @@ done
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export CUDA_VISIBLE_DEVICES="${FINAL}"
JOHN_DEVICES=""
local_id=1
for id in $(echo "${FINAL}" | tr ',' ' '); do
opencl_id=$((id + 1))
opencl_id="${local_id}"
if [ -z "${JOHN_DEVICES}" ]; then
JOHN_DEVICES="${opencl_id}"
else
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
fi
local_id=$((local_id + 1))
done
echo "loader=john"

View File

@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
deadline=$(( $(date +%s) + SECONDS ))
round=0

View File

@@ -50,11 +50,93 @@ load_module() {
log "WARN: not found: $ko"
return 1
fi
if insmod "$ko" "$@"; then
if timeout 90 insmod "$ko" "$@"; then
log "loaded: $mod $*"
return 0
fi
log "WARN: failed to load: $mod"
log "WARN: failed to load: $mod (exit $?)"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1
}
# nvidia_is_functional: returns success once /proc/devices lists the
# "nvidiactl" character device, i.e. the nvidia module finished initializing.
# Used as the readiness probe while insmod runs in the background.
nvidia_is_functional() {
grep -q ' nvidiactl$' /proc/devices 2>/dev/null
}
# load_module_with_gsp_fallback: load nvidia.ko, working around GSP firmware
# init hangs. On some converted SXM->PCIe cards GSP enters an infinite
# crash/reload loop and insmod never returns, so insmod runs in the background
# and readiness is detected by polling /proc/devices for nvidiactl instead of
# waiting for insmod to exit. Records the outcome in /run/bee-nvidia-mode
# (gsp-on / gsp-off / gsp-stuck) for the audit/badge layer.
# Returns 0 on a functional driver, 1 otherwise.
load_module_with_gsp_fallback() {
    ko="$NVIDIA_KO_DIR/nvidia.ko"
    if [ ! -f "$ko" ]; then
        log "ERROR: not found: $ko"
        return 1
    fi
    log "loading nvidia (GSP enabled, timeout 90s)"
    insmod "$ko" &
    _insmod_pid=$!
    _waited=0
    while [ $_waited -lt 90 ]; do
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP enabled, ${_waited}s)"
            echo "gsp-on" > /run/bee-nvidia-mode
            return 0
        fi
        # insmod may exit (success or failure) before the device registers.
        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
            wait "$_insmod_pid"
            _rc=$?
            if [ $_rc -ne 0 ]; then
                log "nvidia load failed (exit $_rc)"
                dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
                return 1
            fi
            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
            sleep 2
            if nvidia_is_functional; then
                log "loaded: nvidia (GSP enabled, ${_waited}s)"
                # Bug fix: this success path previously skipped the mode
                # marker, leaving the sidebar/audit layer without gsp-on.
                echo "gsp-on" > /run/bee-nvidia-mode
                return 0
            fi
            log "insmod exited 0 but nvidiactl missing — treating as failure"
            return 1
        fi
        sleep 1
        _waited=$((_waited + 1))
    done
    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
    log "nvidia GSP init timed out after 90s"
    kill "$_insmod_pid" 2>/dev/null || true
    wait "$_insmod_pid" 2>/dev/null || true
    # Attempt to unload the partially-initialized module
    if ! rmmod nvidia 2>/dev/null; then
        # Module is stuck in the kernel — cannot reload with different params.
        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
        echo "gsp-stuck" > /run/bee-nvidia-mode
        return 1
    fi
    sleep 2
    log "retrying with NVreg_EnableGpuFirmware=0"
    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP disabled)"
            echo "gsp-off" > /run/bee-nvidia-mode
            return 0
        fi
        log "insmod gsp-off exited 0 but nvidiactl missing"
        return 1
    fi
    log "nvidia load failed (GSP=off)"
    dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
    return 1
}
@@ -70,7 +152,7 @@ load_host_module() {
case "$nvidia_mode" in
normal|full)
if ! load_module nvidia; then
if ! load_module_with_gsp_fallback; then
exit 1
fi
# nvidia-modeset on some server kernels needs ACPI video helper symbols
@@ -127,6 +209,18 @@ fi
ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Keep persistence mode enabled across the session so dcgmi / stress tools do
# not fail with deployment warnings on otherwise healthy GPUs.
if command -v nvidia-smi >/dev/null 2>&1; then
if nvidia-smi -pm 1 >/dev/null 2>&1; then
log "enabled NVIDIA persistence mode"
else
log "WARN: failed to enable NVIDIA persistence mode"
fi
else
log "WARN: nvidia-smi not found — cannot enable persistence mode"
fi
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can