Mirror task lifecycle to serial console

Make memory stress size follow available RAM
Add task reports and streamline GPU charts
2026-04-05 18:34:06 +03:00 · 2026-04-05 18:33:26 +03:00 · 2026-04-05 18:13:58 +03:00 · 2026-04-05 17:50:13 +03:00 · 2026-04-05 16:24:45 +03:00 · 2026-04-05 13:43:43 +03:00
50 changed files with 6689 additions and 785 deletions
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -1,9 +1,10 @@
 LISTEN ?= :8080
 AUDIT_PATH ?=
+EXPORT_DIR ?= $(CURDIR)/.tmp/export
 VERSION ?= $(shell sh ./scripts/resolve-version.sh)
 GO_LDFLAGS := -X main.Version=$(VERSION)

-RUN_ARGS := web --listen $(LISTEN)
+RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
 ifneq ($(AUDIT_PATH),)
 RUN_ARGS += --audit-path $(AUDIT_PATH)
 endif
@@ -11,6 +12,7 @@ endif
 .PHONY: run build test

 run:
+	mkdir -p $(EXPORT_DIR)
 	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)

 build:
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -7,6 +7,8 @@ import (
 	"io"
 	"log/slog"
 	"os"
+	"runtime/debug"
+	"strconv"
 	"strings"

 	"bee/audit/internal/app"
@@ -29,10 +31,19 @@ func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }

-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
+	defer func() {
+		if rec := recover(); rec != nil {
+			slog.Error("fatal panic",
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			exitCode = 1
+		}
+	}()

 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -58,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
+	case "benchmark":
+		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -74,8 +87,9 @@ func printRootUsage(w io.Writer) {
  bee preflight --output stdout|file:<path>
  bee export  --target <device>
  bee support-bundle --output stdout|file:<path>
-  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
+  bee web     --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
+  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -94,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
+	case "benchmark":
+		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -280,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
 	fs := flag.NewFlagSet("web", flag.ContinueOnError)
 	fs.SetOutput(stderr)
 	listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
-	auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
+	auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
 	exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
 	title := fs.String("title", "Bee Hardware Audit", "page title")
 	fs.Usage = func() {
@@ -383,3 +399,102 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
+
+// runBenchmark implements `bee benchmark <target> [flags]`.
+//
+// Only the "nvidia" target is accepted. The exit code is 0 on success or when
+// help was requested, 2 for usage errors (missing or unknown target, flag
+// parse failure, stray positional arguments, malformed GPU index lists) and 1
+// when the benchmark run itself returns an error.
+func runBenchmark(args []string, stdout, stderr io.Writer) int {
+	// Single source for the synopsis printed by every usage/help path below.
+	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
+
+	if len(args) == 0 {
+		fmt.Fprintln(stderr, usage)
+		return 2
+	}
+	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
+		fmt.Fprintln(stdout, usage)
+		return 0
+	}
+	target := args[0]
+	if target != "nvidia" {
+		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
+		fmt.Fprintln(stderr, usage)
+		return 2
+	}
+
+	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
+	devices := fs.String("devices", "", "comma-separated GPU indices to include")
+	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
+	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
+	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
+	if err := fs.Parse(args[1:]); err != nil {
+		// flag.ErrHelp means -h/-help was given after the target; not a failure.
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
+		return 2
+	}
+
+	includeIndices, err := parseBenchmarkIndexCSV(*devices)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
+		return 2
+	}
+	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
+		return 2
+	}
+
+	application := app.New(platform.New())
+	// Progress lines go to the injected stderr writer, like the usage and
+	// validation messages above, instead of bypassing it via os.Stderr.
+	logLine := func(s string) { fmt.Fprintln(stderr, s) }
+	// The base dir is passed empty; the callee decides what that means.
+	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
+		Profile:           *profile,
+		SizeMB:            *sizeMB,
+		GPUIndices:        includeIndices,
+		ExcludeGPUIndices: excludeIndices,
+		RunNCCL:           !*skipNCCL,
+	}, logLine)
+	if err != nil {
+		slog.Error("run benchmark", "target", target, "err", err)
+		return 1
+	}
+	slog.Info("benchmark archive written", "target", target, "path", archive)
+	return 0
+}
+
+// parseBenchmarkIndexCSV parses a comma-separated list of GPU indices.
+//
+// Surrounding whitespace and empty elements are ignored; a blank input yields
+// a nil slice. Any element that is not a non-negative integer is an error.
+func parseBenchmarkIndexCSV(raw string) ([]int, error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil, nil
+	}
+	var indices []int
+	for _, part := range strings.Split(raw, ",") {
+		part = strings.TrimSpace(part)
+		if part == "" {
+			continue
+		}
+		value, err := strconv.Atoi(part)
+		if err != nil || value < 0 {
+			return nil, fmt.Errorf("bad gpu index %q", part)
+		}
+		indices = append(indices, value)
+	}
+	return indices, nil
+}
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,17 +19,18 @@ import (
 )

 var (
-	DefaultExportDir       = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultExportDir        = "/appdata/bee/export"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
+	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )

 type App struct {
@@ -114,6 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 type satRunner interface {
 	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -195,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -223,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -532,10 +533,69 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
 	return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
 }
 
+// RunNvidiaTargetedStressValidatePack forwards to the SAT runner's method of
+// the same name, substituting DefaultSATBaseDir when baseDir is blank.
+func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
 func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
 }
 
+// RunNvidiaBenchmark is RunNvidiaBenchmarkCtx with context.Background().
+func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+// RunNvidiaBenchmarkCtx forwards to the SAT runner's RunNvidiaBenchmark,
+// substituting DefaultBenchmarkBaseDir when baseDir is blank.
+func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBenchmarkBaseDir
+	}
+	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
+}
+
+// RunNvidiaOfficialComputePack forwards to the SAT runner's method of the
+// same name, substituting DefaultSATBaseDir when baseDir is blank.
+func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaTargetedPowerPack forwards to the SAT runner's method of the same
+// name, substituting DefaultSATBaseDir when baseDir is blank.
+func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaPulseTestPack forwards to the SAT runner's method of the same
+// name, substituting DefaultSATBaseDir when baseDir is blank.
+func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
+}
+
+// RunNvidiaBandwidthPack forwards to the SAT runner's method of the same
+// name, substituting DefaultSATBaseDir when baseDir is blank.
+func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
+}
+
 func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -886,6 +946,12 @@ func latestSATSummaries() []string {
 		prefix string
 	}{
 		{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
+		{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
+		{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
+		{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
+		{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
+		{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
+		{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
 		{label: "Memory SAT", prefix: "memory-"},
 		{label: "Storage SAT", prefix: "storage-"},
 		{label: "CPU SAT", prefix: "cpu-"},
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -120,15 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }

 type fakeSAT struct {
-	runNvidiaFn       func(string) (string, error)
-	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
-	runMemoryFn       func(string) (string, error)
-	runStorageFn      func(string) (string, error)
-	runCPUFn          func(string, int) (string, error)
-	detectVendorFn    func() string
-	listAMDGPUsFn     func() ([]platform.AMDGPUInfo, error)
-	runAMDPackFn      func(string) (string, error)
-	listNvidiaGPUsFn  func() ([]platform.NvidiaGPU, error)
+	runNvidiaFn               func(string) (string, error)
+	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
+	runNvidiaComputeFn        func(string, int, []int) (string, error)
+	runNvidiaPowerFn          func(string, int, []int) (string, error)
+	runNvidiaPulseFn          func(string, int, []int) (string, error)
+	runNvidiaBandwidthFn      func(string, []int) (string, error)
+	runNvidiaTargetedStressFn func(string, int, []int) (string, error)
+	runMemoryFn               func(string) (string, error)
+	runStorageFn              func(string) (string, error)
+	runCPUFn                  func(string, int) (string, error)
+	detectVendorFn            func() string
+	listAMDGPUsFn             func() ([]platform.AMDGPUInfo, error)
+	runAMDPackFn              func(string) (string, error)
+	listNvidiaGPUsFn          func() ([]platform.NvidiaGPU, error)
 }

 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +145,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
 	return f.runNvidiaFn(baseDir)
 }

+func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaBenchmarkFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaBenchmarkFn(baseDir, opts)
+}
+
+func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaTargetedStressFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
+}
+
+func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaComputeFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
+}
+
+func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaPowerFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
+}
+
+func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaPulseFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
+}
+
+func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+	if f.runNvidiaBandwidthFn == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
+}
+
 func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
 	if f.runNvidiaStressFn != nil {
 		return f.runNvidiaStressFn(baseDir, opts)
@@ -754,6 +802,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 		}
 	}

+	for _, want := range []string{
+		"/system/ip-link.txt",
+		"/system/ip-link-stats.txt",
+		"/system/ethtool-info.txt",
+		"/system/ethtool-link.txt",
+		"/system/ethtool-module.txt",
+		"/system/mstflint-query.txt",
+	} {
+		var found bool
+		for _, name := range names {
+			if contains(name, want) {
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Fatalf("support bundle missing %s, names=%v", want, names)
+		}
+	}
+
 	var foundRaw bool
 	for _, name := range names {
 		if contains(name, "/export/bee-sat/memory-run/verbose.log") {
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,60 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// atomicWriteFile replaces the file at path with data by writing a sibling
+// temp file and renaming it over path.
+//
+// The parent directory is created if needed. The data is written to
+// "<path>.tmp", synced, closed and then renamed onto path; afterwards the
+// parent directory is synced on a best-effort basis. On any failure after the
+// temp file was opened, the temp file is removed and a wrapped error returned.
+// perm is the mode passed to os.OpenFile when creating the temp file.
+func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
+	parent := filepath.Dir(path)
+	if err := os.MkdirAll(parent, 0755); err != nil {
+		return fmt.Errorf("mkdir %s: %w", parent, err)
+	}
+
+	tmpPath := path + ".tmp"
+	tmp, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
+	if err != nil {
+		return fmt.Errorf("open temp %s: %w", tmpPath, err)
+	}
+
+	// Until the whole sequence has completed, every return path must close
+	// the handle and discard the temp file.
+	done := false
+	defer func() {
+		_ = tmp.Close() // already closed on the success path; error ignored
+		if !done {
+			_ = os.Remove(tmpPath)
+		}
+	}()
+
+	if _, err := tmp.Write(data); err != nil {
+		return fmt.Errorf("write temp %s: %w", tmpPath, err)
+	}
+	if err := tmp.Sync(); err != nil {
+		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
+	}
+	if err := tmp.Close(); err != nil {
+		return fmt.Errorf("close temp %s: %w", tmpPath, err)
+	}
+	if err := os.Rename(tmpPath, path); err != nil {
+		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
+	}
+
+	// Best effort: sync the parent directory after the rename; errors ignored.
+	if dirHandle, err := os.Open(parent); err == nil {
+		_ = dirHandle.Sync()
+		_ = dirHandle.Close()
+	}
+
+	done = true
+	return nil
+}
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,76 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+// TestAtomicWriteFileReplacesTargetWithoutLeavingTmp seeds an existing file,
+// overwrites it through atomicWriteFile and checks that the new content is in
+// place and that no "<path>.tmp" sibling remains.
+func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
+	target := filepath.Join(t.TempDir(), "bee-audit.json")
+	if err := os.WriteFile(target, []byte("old\n"), 0644); err != nil {
+		t.Fatalf("seed file: %v", err)
+	}
+
+	if err := atomicWriteFile(target, []byte("new\n"), 0644); err != nil {
+		t.Fatalf("atomicWriteFile: %v", err)
+	}
+
+	content, err := os.ReadFile(target)
+	if err != nil {
+		t.Fatalf("read final: %v", err)
+	}
+	if got, want := string(content), "new\n"; got != want {
+		t.Fatalf("final content=%q want %q", got, want)
+	}
+	if _, err := os.Stat(target + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+}
+
+// TestRunRuntimePreflightWritesAtomically runs RunRuntimePreflight against a
+// "file:" output with a fake runtime collector and checks the returned path,
+// the absence of a leftover "<path>.tmp" and that the file decodes as JSON
+// with status "OK".
+func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
+	target := filepath.Join(t.TempDir(), "runtime-health.json")
+	collect := func(exportDir string) (schema.RuntimeHealth, error) {
+		health := schema.RuntimeHealth{
+			Status:      "OK",
+			ExportDir:   exportDir,
+			DriverReady: true,
+			CUDAReady:   true,
+		}
+		return health, nil
+	}
+	a := &App{runtime: fakeRuntime{collectFn: collect}}
+
+	written, err := a.RunRuntimePreflight("file:" + target)
+	if err != nil {
+		t.Fatalf("RunRuntimePreflight: %v", err)
+	}
+	if written != target {
+		t.Fatalf("path=%q want %q", written, target)
+	}
+	if _, err := os.Stat(target + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+
+	content, err := os.ReadFile(target)
+	if err != nil {
+		t.Fatalf("read runtime file: %v", err)
+	}
+	var decoded schema.RuntimeHealth
+	if err := json.Unmarshal(content, &decoded); err != nil {
+		t.Fatalf("json unmarshal: %v", err)
+	}
+	if decoded.Status != "OK" {
+		t.Fatalf("status=%q want OK", decoded.Status)
+	}
+}
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -21,12 +21,12 @@ type ComponentStatusDB struct {

 // ComponentStatusRecord holds the current and historical health of one hardware component.
 type ComponentStatusRecord struct {
-	ComponentKey  string                  `json:"component_key"`
-	Status        string                  `json:"status"` // "OK", "Warning", "Critical", "Unknown"
-	LastCheckedAt time.Time               `json:"last_checked_at"`
-	LastChangedAt time.Time               `json:"last_changed_at"`
-	ErrorSummary  string                  `json:"error_summary,omitempty"`
-	History       []ComponentStatusEntry  `json:"history"`
+	ComponentKey  string                 `json:"component_key"`
+	Status        string                 `json:"status"` // "OK", "Warning", "Critical", "Unknown"
+	LastCheckedAt time.Time              `json:"last_checked_at"`
+	LastChangedAt time.Time              `json:"last_changed_at"`
+	ErrorSummary  string                 `json:"error_summary,omitempty"`
+	History       []ComponentStatusEntry `json:"history"`
 }

 // ComponentStatusEntry is one observation written to a component's history.
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {

 	// Map SAT target to component keys.
 	switch target {
-	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
+	case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
+		"amd-stress", "amd-mem", "amd-bandwidth":
 		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
 	case "memory", "memory-stress", "sat-stress":
 		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
+	"bee-selfheal.service",
+	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }

@@ -32,6 +34,8 @@ var supportBundleCommands = []struct {
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
 	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
+	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
+	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
@@ -47,6 +51,83 @@ for d in /sys/bus/pci/devices/*/; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
+`}},
+	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -i "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
+if ! command -v ethtool >/dev/null 2>&1; then
+  echo "ethtool not found"
+  exit 0
+fi
+found=0
+for path in /sys/class/net/*; do
+  [ -e "$path" ] || continue
+  iface=$(basename "$path")
+  [ "$iface" = "lo" ] && continue
+  found=1
+  echo "=== $iface ==="
+  ethtool -m "$iface" 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no interfaces found"
+fi
+`}},
+	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
+if ! command -v mstflint >/dev/null 2>&1; then
+  echo "mstflint not found"
+  exit 0
+fi
+found=0
+for path in /sys/bus/pci/devices/*; do
+  [ -e "$path/vendor" ] || continue
+  vendor=$(cat "$path/vendor" 2>/dev/null)
+  [ "$vendor" = "0x15b3" ] || continue
+  bdf=$(basename "$path")
+  found=1
+  echo "=== $bdf ==="
+  mstflint -d "$bdf" q 2>&1 || true
+  echo
+done
+if [ "$found" -eq 0 ]; then
+  echo "no Mellanox/NVIDIA networking devices found"
+fi
 `}},
 }

--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"time"
 )

 const mellanoxVendorID = 0x15b3
+const nicProbeTimeout = 2 * time.Second

 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}

 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}

 	netIfacesByBDF = listNetIfacesByBDF
+	readNetCarrierFile = func(iface string) (string, error) {
+		path := filepath.Join("/sys/class/net", iface, "carrier")
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			return "", err
+		}
+		return strings.TrimSpace(string(raw)), nil
+	}
 )

 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,19 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
+
+// commandOutputWithTimeout runs name with args and returns its standard
+// output, cancelling the command's context once timeout has elapsed.
+func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	cmd := exec.CommandContext(ctx, name, args...)
+	return cmd.Output()
+}
+
+// interfaceHasCarrier reports whether readNetCarrierFile yields "1" for
+// iface. A read error counts as no carrier.
+func interfaceHasCarrier(iface string) bool {
+	state, err := readNetCarrierFile(iface)
+	return err == nil && strings.TrimSpace(state) == "1"
+}
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (

 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}

-		if out, err := ethtoolModuleQuery(iface); err == nil {
-			if injectSFPDOMTelemetry(&devs[i], out) {
-				enriched++
-				continue
+		if interfaceHasCarrier(iface) {
+			if out, err := ethtoolModuleQuery(iface); err == nil {
+				if injectSFPDOMTelemetry(&devs[i], out) {
+					enriched++
+					continue
+				}
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
+	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
+		readNetCarrierFile = origCarrier
 	})

 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
+	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }

@@ -101,6 +104,38 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }
 
+// TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier stubs the
+// carrier file as "0" and fails if ethtoolModuleQuery is invoked, while still
+// expecting the MAC address to be collected.
+func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
+	const mac = "aa:bb:cc:dd:ee:ff"
+	savedIfaces, savedMAC := netIfacesByBDF, readNetAddressFile
+	savedInfo, savedModule := ethtoolInfoQuery, ethtoolModuleQuery
+	savedCarrier := readNetCarrierFile
+	t.Cleanup(func() {
+		netIfacesByBDF, readNetAddressFile = savedIfaces, savedMAC
+		ethtoolInfoQuery, ethtoolModuleQuery = savedInfo, savedModule
+		readNetCarrierFile = savedCarrier
+	})
+
+	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
+	readNetAddressFile = func(string) (string, error) { return mac, nil }
+	readNetCarrierFile = func(string) (string, error) { return "0", nil }
+	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
+	ethtoolModuleQuery = func(string) (string, error) {
+		t.Fatal("ethtool -m should not be called without carrier")
+		return "", nil
+	}
+
+	class, bdf := "EthernetController", "0000:18:00.0"
+	devs := []schema.HardwarePCIeDevice{{DeviceClass: &class, BDF: &bdf}}
+
+	got := enrichPCIeWithNICTelemetry(devs)[0].MacAddresses
+	if len(got) != 1 || got[0] != mac {
+		t.Fatalf("mac_addresses=%v", got)
+	}
+}
+
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,155 @@
+package platform
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// renderBenchmarkReport formats result as the human-readable text report:
+// run header, optional findings and warnings, a per-GPU scorecard, an optional
+// interconnect section, and fixed methodology and raw-file listings.
+func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
+	fmt.Fprintf(&b, "===========================\n\n")
+	// The layout hard-codes a "UTC" suffix, so convert explicitly; otherwise a
+	// local-zone timestamp would be printed with a misleading UTC label.
+	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
+	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
+	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
+	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
+
+	if len(result.Findings) > 0 {
+		fmt.Fprintf(&b, "Executive Summary\n")
+		fmt.Fprintf(&b, "-----------------\n")
+		for _, finding := range result.Findings {
+			fmt.Fprintf(&b, "- %s\n", finding)
+		}
+		b.WriteString("\n")
+	}
+
+	if len(result.Warnings) > 0 {
+		fmt.Fprintf(&b, "Warnings\n")
+		fmt.Fprintf(&b, "--------\n")
+		for _, warning := range result.Warnings {
+			fmt.Fprintf(&b, "- %s\n", warning)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Per GPU Scorecard\n")
+	fmt.Fprintf(&b, "-----------------\n")
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
+		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
+		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
+		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
+		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
+		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
+		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		if gpu.Scores.InterconnectScore > 0 {
+			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+		}
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
+		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
+		if len(gpu.PrecisionResults) > 0 {
+			fmt.Fprintf(&b, "  Precision results:\n")
+			for _, precision := range gpu.PrecisionResults {
+				if precision.Supported {
+					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
+					continue
+				}
+				// Skip the parenthesised reason when none was recorded, instead of printing "()".
+				if precision.Notes != "" {
+					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
+				} else {
+					fmt.Fprintf(&b, "    - %s: unsupported\n", precision.Name)
+				}
+			}
+		}
+		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
+			gpu.Throttle.SWPowerCapUS,
+			gpu.Throttle.SWThermalSlowdownUS,
+			gpu.Throttle.SyncBoostUS,
+			gpu.Throttle.HWThermalSlowdownUS,
+			gpu.Throttle.HWPowerBrakeSlowdownUS,
+		)
+		if len(gpu.Notes) > 0 {
+			fmt.Fprintf(&b, "  Notes:\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "    - %s\n", note)
+			}
+		}
+		b.WriteString("\n")
+	}
+
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "Interconnect\n")
+		fmt.Fprintf(&b, "------------\n")
+		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		if result.Interconnect.Supported {
+			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
+			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+		}
+		for _, note := range result.Interconnect.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Methodology\n")
+	fmt.Fprintf(&b, "-----------\n")
+	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
+	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
+
+	fmt.Fprintf(&b, "Raw Files\n")
+	fmt.Fprintf(&b, "---------\n")
+	fmt.Fprintf(&b, "- result.json\n")
+	fmt.Fprintf(&b, "- report.txt\n")
+	fmt.Fprintf(&b, "- summary.txt\n")
+	fmt.Fprintf(&b, "- verbose.log\n")
+	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+	}
+	return b.String()
+}
+
+// renderBenchmarkSummary formats result as flat key=value lines: run time,
+// profile, overall and normalization status, per-GPU status and composite
+// score, the best composite score, and interconnect status when present.
+func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	// The key is named run_at_utc, so normalize the timestamp to UTC first.
+	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
+	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
+	var best float64
+	for i, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
+		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
+		if i == 0 || gpu.Scores.CompositeScore > best {
+			best = gpu.Scores.CompositeScore
+		}
+	}
+	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
+		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
+	}
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,147 @@
+package platform
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestResolveBenchmarkProfile(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name    string
+		profile string
+		want    benchmarkProfileSpec
+	}{
+		{
+			name:    "default",
+			profile: "",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+		},
+		{
+			name:    "stability",
+			profile: "stability",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+		},
+		{
+			name:    "overnight",
+			profile: "overnight",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got := resolveBenchmarkProfile(tc.profile)
+			if got != tc.want {
+				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
+	t.Parallel()
+
+	opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
+		Profile: "stability",
+		RunNCCL: false,
+	})
+	if opts.Profile != NvidiaBenchmarkProfileStability {
+		t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
+	}
+	if opts.RunNCCL {
+		t.Fatalf("RunNCCL should stay false when explicitly disabled")
+	}
+}
+
+func TestParseBenchmarkBurnLog(t *testing.T) {
+	t.Parallel()
+
+	raw := strings.Join([]string{
+		"loader=bee-gpu-burn",
+		"[gpu 0] device=NVIDIA H100",
+		"[gpu 0] compute_capability=9.0",
+		"[gpu 0] backend=cublasLt",
+		"[gpu 0] duration_s=10",
+		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
+		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] fp16_tensor_iterations=200",
+		"[gpu 0] fp8_e4m3_iterations=50",
+		"[gpu 0] status=OK",
+	}, "\n")
+
+	got := parseBenchmarkBurnLog(raw)
+	if got.Backend != "cublasLt" {
+		t.Fatalf("backend=%q want cublasLt", got.Backend)
+	}
+	if got.ComputeCapability != "9.0" {
+		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
+	}
+	if len(got.Profiles) != 2 {
+		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	}
+	if got.Profiles[0].TeraOpsPerSec <= 0 {
+		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
+	}
+	if got.Profiles[1].Category != "fp8" {
+		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
+	}
+}
+
+func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
+	t.Parallel()
+
+	result := NvidiaBenchmarkResult{
+		BenchmarkVersion:   benchmarkVersion,
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "PARTIAL",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "partial",
+		},
+		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
+		GPUs: []BenchmarkGPUResult{
+			{
+				Index:  0,
+				Name:   "NVIDIA H100",
+				Status: "OK",
+				Steady: BenchmarkTelemetrySummary{
+					AvgPowerW:           680,
+					AvgTempC:            79,
+					AvgGraphicsClockMHz: 1725,
+					P95PowerW:           700,
+					P95TempC:            82,
+					P95GraphicsClockMHz: 1800,
+				},
+				Scores: BenchmarkScorecard{
+					ComputeScore:        1200,
+					PowerSustainScore:   96,
+					ThermalSustainScore: 88,
+					StabilityScore:      92,
+					CompositeScore:      1176,
+				},
+				PrecisionResults: []BenchmarkPrecisionResult{
+					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
+				},
+				Throttle: BenchmarkThrottleCounters{
+					SWPowerCapUS: 1000000,
+				},
+				DegradationReasons: []string{"power_capped"},
+			},
+		},
+	}
+
+	report := renderBenchmarkReport(result)
+	for _, needle := range []string{
+		"Executive Summary",
+		"GPU 0 spent measurable time under SW power cap.",
+		"Composite score: 1176.00",
+		"fp16_tensor: 700.00 TOPS",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,156 @@
+package platform
+
+import "time"
+
+// Benchmark profile names, as used in NvidiaBenchmarkOptions.Profile and
+// NvidiaBenchmarkResult.BenchmarkProfile.
+const (
+	NvidiaBenchmarkProfileStandard  = "standard"
+	NvidiaBenchmarkProfileStability = "stability"
+	NvidiaBenchmarkProfileOvernight = "overnight"
+)
+
+// NvidiaBenchmarkOptions holds the caller-selected settings for an NVIDIA
+// benchmark run: profile name, a size in MB, GPU inclusion/exclusion lists, and
+// whether to run the NCCL phase.
+type NvidiaBenchmarkOptions struct {
+	Profile           string
+	SizeMB            int
+	GPUIndices        []int
+	ExcludeGPUIndices []int
+	RunNCCL           bool
+}
+
+// NvidiaBenchmarkResult is the top-level result of a benchmark run, including
+// per-GPU results and the optional interconnect result, with its JSON layout.
+type NvidiaBenchmarkResult struct {
+	BenchmarkVersion   string                       `json:"benchmark_version"`
+	GeneratedAt        time.Time                    `json:"generated_at"`
+	Hostname           string                       `json:"hostname,omitempty"`
+	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	OverallStatus      string                       `json:"overall_status"`
+	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	Findings           []string                     `json:"findings,omitempty"`
+	Warnings           []string                     `json:"warnings,omitempty"`
+	Normalization      BenchmarkNormalization       `json:"normalization"`
+	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+}
+
+// BenchmarkNormalization records the overall normalization status, free-form
+// notes, and per-GPU normalization details.
+type BenchmarkNormalization struct {
+	Status string                      `json:"status"`
+	Notes  []string                    `json:"notes,omitempty"`
+	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
+}
+
+// BenchmarkNormalizationGPU records persistence mode and GPU/memory clock lock
+// values and statuses for a single GPU index.
+type BenchmarkNormalizationGPU struct {
+	Index                 int      `json:"index"`
+	PersistenceMode       string   `json:"persistence_mode,omitempty"`
+	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
+	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
+	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
+	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
+	Notes                 []string `json:"notes,omitempty"`
+}
+
+// BenchmarkGPUResult is the per-GPU benchmark record: identity, limits, telemetry
+// summaries for the baseline, steady and cooldown phases, throttle counters,
+// precision results, scores, and notes.
+type BenchmarkGPUResult struct {
+	Index                  int                        `json:"index"`
+	UUID                   string                     `json:"uuid,omitempty"`
+	Name                   string                     `json:"name,omitempty"`
+	BusID                  string                     `json:"bus_id,omitempty"`
+	VBIOS                  string                     `json:"vbios,omitempty"`
+	ComputeCapability      string                     `json:"compute_capability,omitempty"`
+	Backend                string                     `json:"backend,omitempty"`
+	Status                 string                     `json:"status"`
+	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
+	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
+	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
+	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
+	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
+	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
+	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
+	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
+	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
+	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
+	Scores                 BenchmarkScorecard         `json:"scores"`
+	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
+	Notes                  []string                   `json:"notes,omitempty"`
+}
+
+// BenchmarkTelemetrySummary aggregates telemetry over one phase: duration,
+// sample count, Avg and P95 values, and CV/drift percentages.
+type BenchmarkTelemetrySummary struct {
+	DurationSec         float64 `json:"duration_sec"`
+	Samples             int     `json:"samples"`
+	AvgTempC            float64 `json:"avg_temp_c"`
+	P95TempC            float64 `json:"p95_temp_c"`
+	AvgPowerW           float64 `json:"avg_power_w"`
+	P95PowerW           float64 `json:"p95_power_w"`
+	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
+	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
+	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
+	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
+	AvgUsagePct         float64 `json:"avg_usage_pct"`
+	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
+	ClockCVPct          float64 `json:"clock_cv_pct"`
+	PowerCVPct          float64 `json:"power_cv_pct"`
+	TempCVPct           float64 `json:"temp_cv_pct"`
+	ClockDriftPct       float64 `json:"clock_drift_pct"`
+}
+
+// BenchmarkThrottleCounters holds per-reason throttle counters; the US suffix
+// and the text report label these values as microseconds.
+type BenchmarkThrottleCounters struct {
+	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
+	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
+	SyncBoostUS            uint64 `json:"sync_boost_us"`
+	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
+	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
+}
+
+// BenchmarkPrecisionResult describes one precision profile: whether it is
+// supported and, if so, its lanes, M/N/K values, iterations and TOPS.
+type BenchmarkPrecisionResult struct {
+	Name          string  `json:"name"`
+	Category      string  `json:"category"`
+	Supported     bool    `json:"supported"`
+	Lanes         int     `json:"lanes,omitempty"`
+	M             uint64  `json:"m,omitempty"`
+	N             uint64  `json:"n,omitempty"`
+	K             uint64  `json:"k,omitempty"`
+	Iterations    uint64  `json:"iterations,omitempty"`
+	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
+	Notes         string  `json:"notes,omitempty"`
+}
+
+// BenchmarkScorecard holds the individual per-GPU scores and the composite score.
+type BenchmarkScorecard struct {
+	ComputeScore        float64 `json:"compute_score"`
+	PowerSustainScore   float64 `json:"power_sustain_score"`
+	ThermalSustainScore float64 `json:"thermal_sustain_score"`
+	StabilityScore      float64 `json:"stability_score"`
+	InterconnectScore   float64 `json:"interconnect_score"`
+	CompositeScore      float64 `json:"composite_score"`
+}
+
+// BenchmarkInterconnectResult holds the interconnect test outcome: status,
+// attempted/supported flags, GPUs used, and average/maximum algbw and busbw
+// figures in GB/s.
+type BenchmarkInterconnectResult struct {
+	Status             string   `json:"status"`
+	Attempted          bool     `json:"attempted"`
+	Supported          bool     `json:"supported"`
+	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
+	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
+	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
+	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
+	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
+	Notes              []string `json:"notes,omitempty"`
+}
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
 	MemUsagePct float64 `json:"mem_usage_pct"`
 	PowerW      float64 `json:"power_w"`
 	ClockMHz    float64 `json:"clock_mhz"`
+	MemClockMHz float64 `json:"mem_clock_mhz"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 6 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			MemUsagePct: parseGPUFloat(parts[3]),
 			PowerW:      parseGPUFloat(parts[4]),
 			ClockMHz:    parseGPUFloat(parts[5]),
+			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
+			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line

 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -120,10 +120,50 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
 	}

+	log("Verifying live medium now served from RAM...")
+	status := s.LiveBootSource()
+	if err := verifyInstallToRAMStatus(status); err != nil {
+		return err
+	}
+	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
 	log("Done. Installation media can be safely disconnected.")
 	return nil
 }

+// verifyInstallToRAMStatus returns nil when status reports the live medium as
+// in RAM, and otherwise an error naming where it is still mounted from.
+func verifyInstallToRAMStatus(status LiveBootSource) error {
+	if status.InRAM {
+		return nil
+	}
+	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
+}
+
+// describeLiveBootSource returns a short human-readable label for status. The
+// source text is status.Device, falling back to status.Source and then to
+// "unknown source"; kinds usb, cdrom and disk prefix it, and "ram" yields "RAM".
+func describeLiveBootSource(status LiveBootSource) string {
+	source := strings.TrimSpace(status.Device)
+	if source == "" {
+		source = strings.TrimSpace(status.Source)
+	}
+	if source == "" {
+		source = "unknown source"
+	}
+	switch strings.TrimSpace(status.Kind) {
+	case "ram":
+		return "RAM"
+	case "usb":
+		return "USB (" + source + ")"
+	case "cdrom":
+		return "CD-ROM (" + source + ")"
+	case "disk":
+		return "disk (" + source + ")"
+	default:
+		return source
+	}
+}
+
 func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
 	in, err := os.Open(src)
 	if err != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -3,6 +3,8 @@ package platform
 import "testing"

 func TestInferLiveBootKind(t *testing.T) {
+	t.Parallel()
+
 	tests := []struct {
 		name       string
 		fsType     string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
 		{name: "unknown", source: "overlay", want: "unknown"},
 	}
 	for _, tc := range tests {
+		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
 			if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
 		})
 	}
 }
+
+func TestVerifyInstallToRAMStatus(t *testing.T) {
+	t.Parallel()
+
+	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
+		t.Fatalf("expected success for RAM-backed status, got %v", err)
+	}
+	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
+	if err == nil {
+		t.Fatal("expected verification failure when media is still on USB")
+	}
+	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
+		t.Fatalf("error=%q", got)
+	}
+}
+
+func TestDescribeLiveBootSource(t *testing.T) {
+	t.Parallel()
+
+	if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
+		t.Fatalf("got %q want RAM", got)
+	}
+	if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
+		t.Fatalf("got %q want /run/live/medium", got)
+	}
+}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	case "nvidia":
 		tools = append(tools, s.CheckTools([]string{
 			"nvidia-smi",
+			"dcgmi",
+			"nv-hostengine",
 			"nvidia-bug-report.sh",
 			"bee-gpu-burn",
 			"bee-john-gpu-stress",
 			"bee-nccl-gpu-stress",
 			"all_reduce_perf",
 		})...)
+		tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
 		if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,19 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 	return tools
 }

+// resolvedToolStatus looks up each candidate binary name in PATH, in order, and
+// reports the first one found under the given display name. When none resolve,
+// the returned status carries only the name.
+func resolvedToolStatus(display string, candidates ...string) ToolStatus {
+	for _, candidate := range candidates {
+		path, err := exec.LookPath(candidate)
+		if err == nil {
+			return ToolStatus{Name: display, Path: path, OK: true}
+		}
+	}
+	return ToolStatus{Name: display}
+}
+
 func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
 	lsmodText := commandText("lsmod")

--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -12,19 +12,20 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
-	"syscall"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"syscall"
 	"time"
 )

 var (
-	satExecCommand = exec.Command
-	satLookPath    = exec.LookPath
-	satGlob        = filepath.Glob
-	satStat        = os.Stat
+	satExecCommand  = exec.Command
+	satLookPath     = exec.LookPath
+	satGlob         = filepath.Glob
+	satStat         = os.Stat
+	satFreeMemBytes = freeMemBytes

 	rocmSMIExecutableGlobs = []string{
 		"/opt/rocm/bin/rocm-smi",
@@ -38,6 +39,12 @@ var (
 		"/opt/rocm/bin/rvs",
 		"/opt/rocm-*/bin/rvs",
 	}
+	dcgmProfTesterCandidates = []string{
+		"dcgmproftester",
+		"dcgmproftester13",
+		"dcgmproftester12",
+		"dcgmproftester11",
+	}
 )

 // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -76,15 +83,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {

 // NvidiaGPU holds basic GPU info from nvidia-smi.
 type NvidiaGPU struct {
-	Index    int
-	Name     string
-	MemoryMB int
+	Index    int    `json:"index"`
+	Name     string `json:"name"`
+	MemoryMB int    `json:"memory_mb"`
 }

 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
-	Index int
-	Name  string
+	Index int    `json:"index"`
+	Name  string `json:"name"`
 }

 // DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
@@ -256,6 +263,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 			MemoryMB: memMB,
 		})
 	}
+	sort.Slice(gpus, func(i, j int) bool {
+		return gpus[i].Index < gpus[j].Index
+	})
 	return gpus, nil
 }

@@ -277,6 +287,90 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
 	}, logFunc)
 }

+// RunNvidiaOfficialComputePack runs the "gpu-nvidia-compute" pack: nvidia-smi
+// and dcgmi version capture, then dcgmproftester (-t 1004) for durationSec
+// (300 when not positive) restricted to the resolved GPUs via
+// CUDA_VISIBLE_DEVICES, followed by a final nvidia-smi query.
+func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
+		{
+			name:       "03-dcgmproftester.log",
+			cmd:        profCmd,
+			env:        nvidiaVisibleDevicesEnv(selected),
+			collectGPU: true,
+			gpuIndices: selected,
+		},
+		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}, logFunc)
+}
+
+// RunNvidiaTargetedPowerPack runs the "gpu-nvidia-targeted-power" pack, wrapping
+// the dcgmi targeted_power diagnostic between nvidia-smi snapshots.
+func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{
+			name:       "02-dcgmi-targeted-power.log",
+			cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
+			collectGPU: true,
+			gpuIndices: selected,
+		},
+		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}, logFunc)
+}
+
+// RunNvidiaPulseTestPack runs the "gpu-nvidia-pulse" pack, wrapping the dcgmi
+// pulse_test diagnostic between nvidia-smi snapshots.
+func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{
+			name:       "02-dcgmi-pulse-test.log",
+			cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
+			collectGPU: true,
+			gpuIndices: selected,
+		},
+		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}, logFunc)
+}
+
+// RunNvidiaBandwidthPack runs the "gpu-nvidia-bandwidth" pack, wrapping the dcgmi
+// nvbandwidth diagnostic (no test_duration parameter) between nvidia-smi snapshots.
+func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{
+			name:       "02-dcgmi-nvbandwidth.log",
+			cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
+			collectGPU: true,
+			gpuIndices: selected,
+		},
+		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}, logFunc)
+}
+
 func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
 	return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
 }
@@ -293,6 +377,25 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }

+// RunNvidiaTargetedStressValidatePack runs the "gpu-nvidia-targeted-stress" pack,
+// wrapping the dcgmi targeted_stress diagnostic between nvidia-smi snapshots.
+func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
+	selected, err := resolveDCGMGPUIndices(gpuIndices)
+	if err != nil {
+		return "", err
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{
+			name:       "02-dcgmi-targeted-stress.log",
+			cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
+			collectGPU: true,
+			gpuIndices: selected,
+		},
+		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+	}, logFunc)
+}
+
 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	if len(gpuIndices) > 0 {
 		return dedupeSortedIndices(gpuIndices), nil
@@ -307,6 +408,29 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	return all, nil
 }

+// memoryStressSizeArg returns the size argument for the memory stress job.
+// BEE_VM_STRESS_SIZE_MB, when positive, wins as "<n>M". Otherwise the value is
+// two thirds of satFreeMemBytes() in MB, rounded down to a multiple of 256 once
+// it reaches 256, or "80%" when no usable figure is available.
+func memoryStressSizeArg() string {
+	if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
+		return fmt.Sprintf("%dM", mb)
+	}
+	availBytes := satFreeMemBytes()
+	if availBytes <= 0 {
+		return "80%"
+	}
+	availMB := availBytes / (1024 * 1024)
+	targetMB := (availMB * 2) / 3
+	if targetMB >= 256 {
+		targetMB = (targetMB / 256) * 256
+	}
+	if targetMB <= 0 {
+		return "80%"
+	}
+	return fmt.Sprintf("%dM", targetMB)
+}
+
 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
 	sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
 	passes := envInt("BEE_MEMTESTER_PASSES", 1)
@@ -322,11 +442,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
 	if seconds <= 0 {
 		seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
 	}
-	// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
-	sizeArg := "80%"
-	if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
-		sizeArg = fmt.Sprintf("%dM", mb)
-	}
+	// Base the default on current MemAvailable and keep headroom for the OS and
+	// concurrent stressors so mixed burn runs do not trip the OOM killer.
+	sizeArg := memoryStressSizeArg()
 	return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
 		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
 		{name: "02-stress-ng-vm.log", cmd: []string{
@@ -473,6 +591,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
 	}
 }

+func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
+	argv := []string{"dcgmi", "diag", "-r", name} // base command: run the named diagnostic
+	if durationSec > 0 {
+		argv = append(argv, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
+	}
+	if len(gpuIndices) == 0 {
+		return argv // no explicit selection: omit -i entirely
+	}
+	return append(argv, "-i", joinIndexList(gpuIndices))
+}
+
+func normalizeNvidiaBurnDuration(durationSec int) int {
+	if durationSec > 0 {
+		return durationSec
+	}
+	return 300 // substituted for zero or negative input
+}
+
+func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
+	if len(gpuIndices) > 0 {
+		return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
+	}
+	return nil // no selection: no environment entry is produced
+}
+
 func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
 	if ctx == nil {
 		ctx = context.Background()
@@ -642,6 +785,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
 	}
 	if strings.Contains(text, "unsupported") ||
 		strings.Contains(text, "not supported") ||
+		strings.Contains(text, "not found in path") ||
 		strings.Contains(text, "invalid opcode") ||
 		strings.Contains(text, "unknown command") ||
 		strings.Contains(text, "not implemented") ||
@@ -748,6 +892,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
 	return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
 }

+func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
+	for _, name := range dcgmProfTesterCandidates { // first candidate that resolves wins
+		if bin, lookErr := satLookPath(name); lookErr == nil {
+			return append([]string{bin}, args...), nil
+		}
+	}
+	return nil, errors.New("dcgmproftester not found in PATH")
+}
+
 func ensureAMDRuntimeReady() error {
 	if _, err := os.Stat("/dev/kfd"); err == nil {
 		return nil
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -195,6 +195,53 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	}
 }

+func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
+	saved := satLookPath
+	t.Cleanup(func() { satLookPath = saved })
+	// Only the versioned binary name resolves; every other lookup reports not found.
+	satLookPath = func(file string) (string, error) {
+		if file == "dcgmproftester13" {
+			return "/usr/bin/dcgmproftester13", nil
+		}
+		return "", exec.ErrNotFound
+	}
+
+	got, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
+	if err != nil {
+		t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
+	}
+	// One resolved path plus the three arguments passed through.
+	if n := len(got); n != 4 {
+		t.Fatalf("cmd len=%d want 4 (%v)", n, got)
+	}
+	if first := got[0]; first != "/usr/bin/dcgmproftester13" {
+		t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", first)
+	}
+}
+
+func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
+	want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
+	got := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1}) // "-i" is expected in the given order
+	if len(got) != len(want) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(got), len(want), got)
+	}
+	for i, expected := range want {
+		if got[i] != expected {
+			t.Fatalf("cmd[%d]=%q want %q", i, got[i], expected)
+		}
+	}
+}
+
+func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
+	got := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) // expect exactly one KEY=VALUE entry
+	if n := len(got); n != 1 {
+		t.Fatalf("env len=%d want 1 (%v)", n, got)
+	}
+	if entry := got[0]; entry != "CUDA_VISIBLE_DEVICES=0,2,4" {
+		t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", entry)
+	}
+}
+
 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
 	t.Parallel()

@@ -229,6 +276,37 @@ func TestEnvIntFallback(t *testing.T) {
 	}
 }

+func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
+	t.Setenv("BEE_VM_STRESS_SIZE_MB", "") // neutralize an ambient override; assumes envInt treats "" as unset (TODO confirm)
+	saved := satFreeMemBytes
+	satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
+	t.Cleanup(func() { satFreeMemBytes = saved })
+	if got := memoryStressSizeArg(); got != "65536M" { // two thirds of 96 GiB, in MiB
+		t.Fatalf("sizeArg=%q want 65536M", got)
+	}
+}
+
+func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
+	// The explicit size must win even though 96 GiB of free memory is reported.
+	t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
+	saved := satFreeMemBytes
+	t.Cleanup(func() { satFreeMemBytes = saved })
+	satFreeMemBytes = func() int64 { return 96 << 30 }
+	if got := memoryStressSizeArg(); got != "4096M" {
+		t.Fatalf("sizeArg=%q want 4096M", got)
+	}
+}
+
+func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
+	t.Setenv("BEE_VM_STRESS_SIZE_MB", "") // neutralize an ambient override; assumes envInt treats "" as unset (TODO confirm)
+	saved := satFreeMemBytes
+	satFreeMemBytes = func() int64 { return 0 }
+	t.Cleanup(func() { satFreeMemBytes = saved })
+	if got := memoryStressSizeArg(); got != "80%" { // zero free bytes means "unknown"
+		t.Fatalf("sizeArg=%q want 80%%", got)
+	}
+}
+
 func TestClassifySATResult(t *testing.T) {
 	tests := []struct {
 		name   string
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -10,17 +10,30 @@ import (
 func (s *System) ListBeeServices() ([]string, error) {
 	seen := map[string]bool{}
 	var out []string
-	for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
+	for _, pattern := range []string{
+		"/etc/systemd/system/bee-*.service",
+		"/lib/systemd/system/bee-*.service",
+		"/etc/systemd/system/bee-*.timer",
+		"/lib/systemd/system/bee-*.timer",
+	} {
 		matches, err := filepath.Glob(pattern)
 		if err != nil {
 			return nil, err
 		}
 		for _, match := range matches {
-			name := strings.TrimSuffix(filepath.Base(match), ".service")
+			base := filepath.Base(match)
+			name := base
+			if strings.HasSuffix(base, ".service") {
+				name = strings.TrimSuffix(base, ".service")
+			}
 			// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
 			if strings.HasSuffix(name, "@") {
 				continue
 			}
+			// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
+			if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
+				continue
+			}
 			if !seen[name] {
 				seen[name] = true
 				out = append(out, name)
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
 }

 type RemovableTarget struct {
-	Device     string
-	FSType     string
-	Size       string
-	Label      string
-	Model      string
-	Mountpoint string
+	Device     string `json:"device"`
+	FSType     string `json:"fs_type"`
+	Size       string `json:"size"`
+	Label      string `json:"label"`
+	Model      string `json:"model"`
+	Mountpoint string `json:"mountpoint"`
 }

 type ToolStatus struct {
--- a/audit/internal/platform/types_test.go
+++ b/audit/internal/platform/types_test.go
@@ -0,0 +1,31 @@
+package platform
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+)
+
+func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
+	t.Parallel()
+	// Mountpoint stays empty on purpose: its tag has no omitempty, so the key must still be emitted.
+	data, err := json.Marshal(RemovableTarget{
+		Device: "/dev/sdb1",
+		FSType: "exfat",
+		Size:   "1.8T",
+		Label:  "USB",
+		Model:  "Flash",
+	})
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	raw := string(data)
+	for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`, `"mountpoint"`} {
+		if !strings.Contains(raw, key) {
+			t.Fatalf("json missing key %s: %s", key, raw)
+		}
+	}
+	if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) || strings.Contains(raw, `"Mountpoint"`) {
+		t.Fatalf("json still contains Go field names: %s", raw)
+	}
+}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {

 	scanDone := make(chan error, 1)
 	go func() {
+		defer func() {
+			if rec := recover(); rec != nil {
+				scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
+			}
+		}()
 		scanner := bufio.NewScanner(pr)
 		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 		for scanner.Scan() {
@@ -227,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 	}
 }

+// handleAPIBenchmarkNvidiaRun decodes an optional JSON body, queues an
+// "nvidia-benchmark" task built from it, and replies with the new task ID under
+// both "task_id" and "job_id". It answers 503 without an app and 400 on a bad body.
+func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	var req struct {
+		Profile           string `json:"profile"`
+		SizeMB            int    `json:"size_mb"`
+		GPUIndices        []int  `json:"gpu_indices"`
+		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
+		RunNCCL           *bool  `json:"run_nccl"`
+		DisplayName       string `json:"display_name"`
+	}
+	if r.Body != nil {
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil && !errors.Is(err, io.EOF) {
+			writeError(w, http.StatusBadRequest, "invalid request body")
+			return
+		}
+	}
+
+	// NCCL runs unless the request explicitly sets run_nccl to false.
+	runNCCL := req.RunNCCL == nil || *req.RunNCCL
+	task := &Task{
+		ID:        newJobID("benchmark-nvidia"),
+		Name:      taskDisplayName("nvidia-benchmark", "", ""),
+		Target:    "nvidia-benchmark",
+		Priority:  15,
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+		params: taskParams{
+			GPUIndices:        req.GPUIndices,
+			ExcludeGPUIndices: req.ExcludeGPUIndices,
+			SizeMB:            req.SizeMB,
+			BenchmarkProfile:  req.Profile,
+			RunNCCL:           runNCCL,
+			DisplayName:       req.DisplayName,
+		},
+	}
+	if strings.TrimSpace(req.DisplayName) != "" {
+		task.Name = req.DisplayName
+	}
+	globalQueue.enqueue(task)
+	writeJSON(w, map[string]string{"task_id": task.ID, "job_id": task.ID})
+}
+
 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
 	id := r.URL.Query().Get("job_id")
 	if id == "" {
@@ -486,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques

 // ── GPU presence ──────────────────────────────────────────────────────────────

+func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	list, err := h.opts.App.ListNvidiaGPUs()
+	switch {
+	case err != nil:
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	case list == nil: // send an empty slice rather than nil (presumably so the JSON is [] and not null)
+		list = []platform.NvidiaGPU{}
+	}
+	writeJSON(w, list)
+}
+
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -511,14 +580,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
 	_, amdErr := os.Stat("/dev/kfd")
 	nvidiaUp := nvidiaErr == nil
 	amdUp := amdErr == nil
+	_, dcgmErr := exec.LookPath("dcgmi")
+	_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
+	_, johnErr := exec.LookPath("bee-john-gpu-stress")
+	_, beeBurnErr := exec.LookPath("bee-gpu-burn")
+	_, nvBandwidthErr := exec.LookPath("nvbandwidth")
+	profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
 	writeJSON(w, []toolEntry{
-		{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
-		{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
-		{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
+		{ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
+		{ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
+		{ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
+		{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
+		{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
+		{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
+		{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
 		{ID: "rvs", Available: amdUp, Vendor: "amd"},
 	})
 }

+func lookPathAny(names ...string) error {
+	for i := range names { // nil as soon as any candidate resolves
+		if _, lookErr := exec.LookPath(names[i]); lookErr == nil {
+			return nil
+		}
+	}
+	return exec.ErrNotFound
+}
+
 // ── System ────────────────────────────────────────────────────────────────────

 func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -557,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)

 var standardTools = []string{
 	"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
-	"nvidia-smi", "memtester", "stress-ng", "nvtop",
+	"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
 	"mstflint", "qrencode",
 }

--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }

+func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
+	// Swap in an empty task list and put the previous one back afterwards.
+	globalQueue.mu.Lock()
+	saved := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		defer globalQueue.mu.Unlock()
+		globalQueue.tasks = saved
+	})
+
+	const payload = `{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(payload))
+	(&handler{opts: HandlerOptions{App: &app.App{}}}).handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if n := len(globalQueue.tasks); n != 1 {
+		t.Fatalf("tasks=%d want 1", n)
+	}
+	queued := globalQueue.tasks[0]
+	if queued.Target != "nvidia-benchmark" {
+		t.Fatalf("target=%q want nvidia-benchmark", queued.Target)
+	}
+	if got := queued.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
+		t.Fatalf("gpu indices=%v want [1 3]", got)
+	}
+	if queued.params.RunNCCL {
+		t.Fatal("RunNCCL should reflect explicit false from request")
+	}
+}

 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -0,0 +1,773 @@
+package webui
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+type chartTimelineSegment struct {
+	Start  time.Time // beginning of the span
+	End    time.Time // end of the span
+	Active bool      // true for spans built from task run intervals, false for the idle gaps between them
+}
+
+type chartScale struct {
+	Min   float64   // lowest axis value; set to the first tick where scales are built in this file
+	Max   float64   // highest axis value; set to the last tick where scales are built in this file
+	Ticks []float64 // tick values drawn along the axis
+}
+
+type chartLayout struct {
+	Width      int // overall SVG width
+	Height     int // overall SVG height
+	PlotLeft   int // x of the plot area's left edge
+	PlotRight  int // x of the plot area's right edge
+	PlotTop    int // y of the plot area's top edge
+	PlotBottom int // y of the plot area's bottom edge
+}
+
+type metricChartSeries struct {
+	Name      string    // label shown in the legend
+	AxisTitle string    // title printed above the series' own axis in the GPU overview chart
+	Color     string    // color used for the series' line, markers and legend swatch
+	Values    []float64 // one value per chart point
+}
+
+var metricChartPalette = []string{ // series colors; renderMetricChartSVG cycles through them by series index
+	"#5794f2",
+	"#73bf69",
+	"#f2cc0c",
+	"#ff9830",
+	"#f2495c",
+	"#b877d9",
+	"#56d2f7",
+	"#8ab8ff",
+	"#9adf8f",
+	"#ffbe5c",
+}
+
+var gpuLabelCache struct {
+	mu       sync.Mutex     // NOTE(review): presumably guards the two fields below — confirm at the use sites
+	loadedAt time.Time      // NOTE(review): presumably when byIndex was last filled — confirm
+	byIndex  map[int]string // NOTE(review): presumably GPU index -> display label — confirm
+}
+
+// renderMetricChartSVG draws a single-axis SVG line chart with one polyline per
+// entry in names, an optional timeline overlay and a legend. Empty input still
+// renders a one-point chart; the returned error is always nil.
+func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{time.Time{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	// Zero-fill empty datasets on a private copy so the caller's slice is left untouched.
+	datasets = append([][]float64(nil), datasets...)
+	for i := range datasets {
+		if len(datasets[i]) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	statsLabel := chartStatsLabel(datasets)
+	legendItems := make([]metricChartSeries, 0, len(names))
+	for i, name := range names {
+		values := make([]float64, pointCount)
+		if i < len(datasets) {
+			copy(values, coalesceDataset(datasets[i], pointCount))
+		}
+		color := metricChartPalette[i%len(metricChartPalette)]
+		legendItems = append(legendItems, metricChartSeries{Name: name, Color: color, Values: values})
+	}
+
+	scale := singleAxisChartScale(datasets, yMin, yMax)
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for _, item := range legendItems {
+		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
+	}
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// renderGPUOverviewChartSVG renders temperature, power and core clock for GPU
+// idx on one chart. The bool result is false, with a nil SVG, when none of the
+// three datasets exists for that GPU.
+func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
+	tempC := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
+	powerW := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
+	clockMHz := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+	if tempC == nil && powerW == nil && clockMHz == nil {
+		return nil, false, nil
+	}
+	labels := sampleTimeLabels(samples)
+	times := sampleTimes(samples)
+	n := len(labels)
+	heading := gpuDisplayLabel(idx) + " Overview"
+	series := []metricChartSeries{
+		{Name: "Temp C", AxisTitle: "Temp C", Color: "#f05a5a", Values: coalesceDataset(tempC, n)},
+		{Name: "Power W", AxisTitle: "Power W", Color: "#ffb357", Values: coalesceDataset(powerW, n)},
+		{Name: "Core Clock MHz", AxisTitle: "Core MHz", Color: "#73bf69", Values: coalesceDataset(clockMHz, n)},
+	}
+	svg, err := drawGPUOverviewChartSVG(heading, labels, times, series, timeline)
+	if err != nil {
+		return nil, false, err
+	}
+	return svg, true, nil
+}
+
+func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
+	if len(series) != 3 {
+		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
+	}
+	const (
+		width      = 1400
+		height     = 840
+		plotLeft   = 180
+		plotRight  = 1220
+		plotTop    = 96
+		plotBottom = 660
+	)
+	const (
+		leftOuterAxis  = 72
+		leftInnerAxis  = 132
+		rightInnerAxis = 1268
+	)
+	layout := chartLayout{
+		Width:      width,
+		Height:     height,
+		PlotLeft:   plotLeft,
+		PlotRight:  plotRight,
+		PlotTop:    plotTop,
+		PlotBottom: plotBottom,
+	}
+	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount = 1
+		labels = []string{""}
+		times = []time.Time{time.Time{}}
+	}
+	if len(labels) < pointCount {
+		padded := make([]string, pointCount)
+		copy(padded, labels)
+		labels = padded
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	for i := range series {
+		if len(series[i].Values) == 0 {
+			series[i].Values = make([]float64, pointCount)
+		}
+	}
+	// Each series gets its own scale, spanning the first to the last tick from chartNiceTicks.
+	scales := make([]chartScale, len(series))
+	for i := range series {
+		min, max := chartSeriesBounds(series[i].Values)
+		ticks := chartNiceTicks(min, max, 8)
+		scales[i] = chartScale{
+			Min:   ticks[0],
+			Max:   ticks[len(ticks)-1],
+			Ticks: ticks,
+		}
+	}
+	start, end := chartTimeBounds(times)
+	// Shared frame, grids and timeline overlays; the horizontal grid follows the first series' scale.
+	var b strings.Builder
+	writeSVGOpen(&b, width, height)
+	writeChartFrame(&b, title, "", width, height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scales[0])
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	// One colored axis per series: the first two sit left of the plot with labels on their left, the third sits right of it.
+	for i, axisLineX := range axisX {
+		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
+			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
+		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
+			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
+		for _, tick := range scales[i].Ticks {
+			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
+			label := sanitizeChartText(chartYAxisNumber(tick))
+			if i < 2 {
+				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
+					axisLineX, y, axisLineX+6, y, series[i].Color)
+				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
+					axisLineX-8, y, series[i].Color, label)
+				continue
+			}
+			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
+				axisLineX, y, axisLineX-6, y, series[i].Color)
+			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
+				axisLineX+8, y, series[i].Color, label)
+		}
+	}
+	// X labels, then each series' polyline drawn against its own scale, then the legend.
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for i := range series {
+		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
+	}
+	writeLegend(&b, layout, series)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// metricsTimelineSegments splits the time range covered by samples into
+// task-active and idle segments, using a snapshot of the global task queue.
+// It returns nil when there are no samples or either bound is the zero time.
+func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
+	start, end := chartTimeBounds(sampleTimes(samples))
+	if len(samples) == 0 || start.IsZero() || end.IsZero() {
+		return nil
+	}
+	tasks := snapshotTaskHistory()
+	return chartTimelineSegmentsForRange(start, end, now, tasks)
+}
+
+func snapshotTaskHistory() []Task {
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	snapshot := make([]Task, 0, len(globalQueue.tasks))
+	for _, queued := range globalQueue.tasks {
+		snapshot = append(snapshot, *queued) // value copy taken while the queue lock is held
+	}
+	return snapshot
+}
+
+func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
+	if start.IsZero() || end.IsZero() {
+		return nil
+	}
+	if end.Before(start) { // tolerate reversed bounds
+		start, end = end, start
+	}
+	type interval struct {
+		start time.Time
+		end   time.Time
+	}
+	active := make([]interval, 0, len(tasks))
+	for _, task := range tasks {
+		if task.StartedAt == nil { // never started: contributes no active time
+			continue
+		}
+		intervalStart := task.StartedAt.UTC()
+		intervalEnd := now.UTC() // a task without DoneAt is treated as running until now
+		if task.DoneAt != nil {
+			intervalEnd = task.DoneAt.UTC()
+		}
+		if !intervalEnd.After(intervalStart) { // drop empty or inverted intervals
+			continue
+		}
+		if intervalEnd.Before(start) || intervalStart.After(end) { // entirely outside the chart range
+			continue
+		}
+		if intervalStart.Before(start) { // clamp to the chart range
+			intervalStart = start
+		}
+		if intervalEnd.After(end) {
+			intervalEnd = end
+		}
+		active = append(active, interval{start: intervalStart, end: intervalEnd})
+	}
+	sort.Slice(active, func(i, j int) bool { // order by start, then end, so overlaps merge in one pass
+		if active[i].start.Equal(active[j].start) {
+			return active[i].end.Before(active[j].end)
+		}
+		return active[i].start.Before(active[j].start)
+	})
+	merged := make([]interval, 0, len(active))
+	for _, span := range active {
+		if len(merged) == 0 {
+			merged = append(merged, span)
+			continue
+		}
+		last := &merged[len(merged)-1]
+		if !span.start.After(last.end) { // overlaps or touches the previous span: extend it
+			if span.end.After(last.end) {
+				last.end = span.end
+			}
+			continue
+		}
+		merged = append(merged, span)
+	}
+	// Emit alternating idle/active segments that together cover start..end.
+	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
+	cursor := start
+	for _, span := range merged {
+		if span.start.After(cursor) {
+			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
+		}
+		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
+		cursor = span.end
+	}
+	if cursor.Before(end) {
+		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
+	}
+	if len(segments) == 0 { // e.g. start equals end and no task span was kept
+		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
+	}
+	return segments
+}
+
+func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
+	stamps := make([]time.Time, len(samples)) // one timestamp per sample, in the same order
+	for i := range samples {
+		stamps[i] = samples[i].Timestamp
+	}
+	return stamps
+}
+
+// singleAxisChartScale builds the Y scale: explicit yMin/yMax win, and any
+// bound left nil is taken from the data via chartSeriesBounds.
+func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
+	var lo, hi float64
+	if yMin == nil || yMax == nil {
+		lo, hi = chartSeriesBounds(flattenDatasets(datasets))
+	}
+	if yMin != nil {
+		lo = *yMin
+	}
+	if yMax != nil {
+		hi = *yMax
+	}
+	ticks := chartNiceTicks(lo, hi, 8)
+	return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
+}
+
+func flattenDatasets(datasets [][]float64) []float64 {
+	size := 0
+	for i := range datasets {
+		size += len(datasets[i])
+	}
+	flat := make([]float64, 0, size) // sized up front so the appends below never reallocate
+	for i := range datasets {
+		flat = append(flat, datasets[i]...)
+	}
+	return flat
+}
+
+// singleAxisChartLayout returns the fixed 1400-wide layout for a chart of the
+// given height, reserving room below the plot for a legend of up to four
+// columns when chartLegendVisible allows one.
+func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
+	legendHeight := 0
+	if chartLegendVisible(seriesCount) && seriesCount > 0 {
+		cols := 4
+		if seriesCount < cols {
+			cols = seriesCount
+		}
+		rows := (seriesCount + cols - 1) / cols
+		legendHeight = rows*24 + 24
+	}
+	return chartLayout{
+		Width:      1400,
+		Height:     canvasHeight,
+		PlotLeft:   96,
+		PlotRight:  1352,
+		PlotTop:    72,
+		PlotBottom: canvasHeight - 60 - legendHeight,
+	}
+}
+
+// chartTimeBounds returns the earliest and latest of times, both in UTC, or
+// two zero times for an empty slice.
+func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
+	if len(times) == 0 {
+		return time.Time{}, time.Time{}
+	}
+	earliest, latest := times[0].UTC(), times[0].UTC()
+	for i := 1; i < len(times); i++ {
+		switch t := times[i].UTC(); {
+		case t.Before(earliest):
+			earliest = t
+		case t.After(latest):
+			latest = t
+		}
+	}
+	return earliest, latest
+}
+
+// synthesizeChartTimes returns count timestamps one minute apart: times itself
+// when it already has count entries, else a run from times[0] or one ending now.
+func synthesizeChartTimes(times []time.Time, count int) []time.Time {
+	if count <= 0 {
+		return nil
+	}
+	if len(times) == count {
+		return times
+	}
+	var base time.Time
+	if len(times) == 1 {
+		base = times[0]
+	} else {
+		base = time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
+	}
+	out := make([]time.Time, count)
+	for i := range out {
+		out[i] = base.Add(time.Duration(i) * time.Minute)
+	}
+	return out
+}
+
+func writeSVGOpen(b *strings.Builder, width, height int) { // opens the <svg> root; the viewBox matches the given pixel size
+	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
+}
+
+func writeSVGClose(b *strings.Builder) { // closes the <svg> root opened by writeSVGOpen
+	b.WriteString("</svg>\n")
+}
+
+func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
+	centerX := width / 2 // title and subtitle are both anchored at the horizontal center
+	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
+	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n", centerX, sanitizeChartText(title))
+	if strings.TrimSpace(subtitle) == "" {
+		return
+	}
+	fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n", centerX, sanitizeChartText(subtitle))
+}
+
+func writePlotBorder(b *strings.Builder, layout chartLayout) {
+	plotW, plotH := layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop // unfilled outline of the plot area
+	fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n", layout.PlotLeft, layout.PlotTop, plotW, plotH)
+}
+
+// writeHorizontalGrid draws one grid line across the plot for every scale tick.
+func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
+	b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
+	for _, tickValue := range scale.Ticks {
+		lineY := chartYForValue(tickValue, scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n", layout.PlotLeft, lineY, layout.PlotRight, lineY)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeVerticalGrid draws a vertical grid line at each index chosen by
+// gpuChartLabelIndices, positioned by that point's time within the bounds of times.
+func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
+	if pointCount <= 0 {
+		return
+	}
+	first, last := chartTimeBounds(times)
+	b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
+	for _, idx := range gpuChartLabelIndices(pointCount, target) {
+		lineX := chartXForTime(chartPointTime(times, idx), first, last, layout.PlotLeft, layout.PlotRight)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n", lineX, layout.PlotTop, lineX, layout.PlotBottom)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeSingleAxisY draws the left axis line plus, for every scale tick, a tick
+// mark and an end-anchored numeric label to its left.
+func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
+	axisX := layout.PlotLeft
+	fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n", axisX, layout.PlotTop, axisX, layout.PlotBottom)
+	for _, tick := range scale.Ticks {
+		tickY := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n", axisX, tickY, axisX-6, tickY)
+		fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n", axisX-10, tickY, sanitizeChartText(chartYAxisNumber(tick)))
+	}
+}
+
+// writeXAxisLabels prints the selected point labels under the plot, then the "Time" caption.
+func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
+	pointCount := len(labels)
+	if n := len(times); n > pointCount {
+		pointCount = n
+	}
+	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
+	for _, idx := range gpuChartLabelIndices(pointCount, target) {
+		var text string // stays empty when labels is shorter than times
+		if idx < len(labels) {
+			text = labels[idx]
+		}
+		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
+		fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(text))
+	}
+	b.WriteString(`</g>` + "\n")
+	fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n", (layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
+}
+
+// writeSeriesPolyline draws values as one polyline, placing point i at the
+// time of point i. A single value is drawn as a dot; longer series get a
+// marker and a small triangle at their maximum (the last one on ties).
+func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
+	if len(values) == 0 {
+		return
+	}
+	xAt := func(i int) float64 {
+		return chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
+	}
+	yAt := func(v float64) float64 { return chartYForValue(v, scale, layout.PlotTop, layout.PlotBottom) }
+	var coords strings.Builder
+	for i, v := range values {
+		if i > 0 {
+			coords.WriteByte(' ')
+		}
+		coords.WriteString(strconv.FormatFloat(xAt(i), 'f', 1, 64))
+		coords.WriteByte(',')
+		coords.WriteString(strconv.FormatFloat(yAt(v), 'f', 1, 64))
+	}
+	fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n", coords.String(), color)
+	if len(values) == 1 {
+		fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", xAt(0), yAt(values[0]), color)
+		return
+	}
+	// ">=" keeps the latest index among equal maxima.
+	peak := 0
+	for i := 1; i < len(values); i++ {
+		if values[i] >= values[peak] {
+			peak = i
+		}
+	}
+	x, y := xAt(peak), yAt(values[peak])
+	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
+	// Triangle whose tip sits 10 units from the marker and whose base sits 18 units from it.
+	fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n", x, y-10, x-5, y-18, x+5, y-18, color)
+}
+
+// writeLegend renders the series legend beneath the plot as a grid of at most
+// four columns; each entry is a colour swatch followed by the escaped name.
+func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
+	if !chartLegendVisible(len(series)) || len(series) == 0 {
+		return
+	}
+	cols := len(series)
+	if cols > 4 {
+		cols = 4
+	}
+	cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
+	top := layout.PlotBottom + 74
+	for i := range series {
+		// Entries fill the grid left to right, then wrap onto a new 24px row.
+		swatchX := float64(layout.PlotLeft) + cellWidth*float64(i%cols) + 8
+		rowY := float64(top + (i/cols)*24)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
+			swatchX, rowY, swatchX+28, rowY, series[i].Color)
+		fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
+			swatchX+38, rowY+4, sanitizeChartText(series[i].Name))
+	}
+}
+
+// writeTimelineIdleSpans shades every non-active, positive-length segment with a translucent band.
+func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	plotHeight := layout.PlotBottom - layout.PlotTop
+	b.WriteString(`<g data-role="timeline-overlay">` + "\n")
+	for _, seg := range segments {
+		if !seg.Active && seg.End.After(seg.Start) {
+			left := chartXForTime(seg.Start, start, end, layout.PlotLeft, layout.PlotRight)
+			right := chartXForTime(seg.End, start, end, layout.PlotLeft, layout.PlotRight)
+			fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n", left, layout.PlotTop, math.Max(1, right-left), plotHeight)
+		}
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeTimelineBoundaries draws a vertical rule at each interior segment edge, at most one per pixel column.
+func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	drawn := make(map[int]bool)
+	b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
+	mark := func(ts time.Time) {
+		col := int(math.Round(chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)))
+		if !drawn[col] {
+			drawn[col] = true
+			fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", col, layout.PlotTop, col, layout.PlotBottom)
+		}
+	}
+	for i, seg := range segments {
+		if i > 0 {
+			mark(seg.Start)
+		}
+		if i < len(segments)-1 {
+			mark(seg.End)
+		}
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// chartXForTime maps ts, clamped to [start, end], linearly onto [left, right]; an empty window yields the midpoint.
+func chartXForTime(ts, start, end time.Time, left, right int) float64 {
+	if !end.After(start) {
+		return float64(left+right) / 2
+	}
+	switch {
+	case ts.Before(start):
+		ts = start
+	case ts.After(end):
+		ts = end
+	}
+	return float64(left) + float64(ts.Sub(start))/float64(end.Sub(start))*float64(right-left)
+}
+
+func chartPointTime(times []time.Time, idx int) time.Time { // timestamp to plot sample idx at, in UTC
+	switch {
+	case idx >= 0 && idx < len(times) && !times[idx].IsZero(): // a real timestamp exists for this sample
+		return times[idx].UTC()
+	case len(times) > 0 && !times[0].IsZero(): // extrapolate from the first sample, one minute per index
+		return times[0].UTC().Add(time.Duration(idx) * time.Minute)
+	}
+	return time.Now().UTC().Add(time.Duration(idx) * time.Minute) // no usable timestamps: anchor on the current time
+}
+
+func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 { // maps value to a Y pixel coordinate
+	if scale.Max <= scale.Min { // degenerate scale: place the point at the vertical middle of the plot
+		return float64(plotTop+plotBottom) / 2
+	}
+	return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop) // scale.Min lands on plotBottom, scale.Max on plotTop; no clamping here
+}
+
+// chartSeriesBounds returns a padded (low, high) axis range for values: (0, 1)
+// for empty or all-zero input, otherwise the data extremes widened as below.
+func chartSeriesBounds(values []float64) (float64, float64) {
+	if len(values) == 0 {
+		return 0, 1
+	}
+	lo, hi := values[0], values[0]
+	for _, v := range values[1:] {
+		switch {
+		case v < lo:
+			lo = v
+		case v > hi:
+			hi = v
+		}
+	}
+	if lo == hi { // flat series: open the range by 10% of its magnitude
+		if hi == 0 {
+			return 0, 1
+		}
+		spread := math.Abs(hi) * 0.1
+		if spread == 0 {
+			spread = 1
+		}
+		lo, hi = lo-spread, hi+spread
+	}
+	if lo > 0 { // all-positive range: add 20% headroom, floor at zero
+		headroom := (hi - lo) * 0.2
+		if headroom == 0 {
+			headroom = hi * 0.1
+		}
+		lo, hi = lo-headroom, hi+headroom
+		if lo < 0 {
+			lo = 0
+		}
+	}
+	return lo, hi
+}
+
+// chartNiceTicks returns ticks on a 1/2/5×10^k grid that spans [min, max], aiming for roughly target ticks.
+func chartNiceTicks(min, max float64, target int) []float64 {
+	if min == max {
+		max = min + 1
+	}
+	span := max - min
+	step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
+	for _, factor := range []float64{1, 2, 5, 10} {
+		if span/(factor*step) <= float64(target)*1.5 {
+			step = factor * step
+			break
+		}
+	}
+	low, high := math.Floor(min/step)*step, math.Ceil(max/step)*step
+	var ticks []float64
+	for i := 0.0; low+i*step <= high+step*0.001; i++ { // index-based: `value += step` never advances once step < ulp(value)
+		ticks = append(ticks, math.Round((low+i*step)*1e9)/1e9)
+	}
+	return ticks
+}
+
+func valueClamp(value float64, scale chartScale) float64 { // limits value to [scale.Min, scale.Max]
+	switch {
+	case value < scale.Min:
+		return scale.Min
+	case value > scale.Max:
+		return scale.Max
+	}
+	return value // already in range (NaN also falls through: both comparisons are false)
+}
+
+// chartStatsLabel formats the minimum, average and maximum reported by
+// globalStats for datasets, or returns "" when all three values are <= 0
+// (nothing worth summarising).
+func chartStatsLabel(datasets [][]float64) string {
+	low, mean, high := globalStats(datasets)
+	if low <= 0 && mean <= 0 && high <= 0 {
+		return ""
+	}
+	return fmt.Sprintf("min %s   avg %s   max %s",
+		chartLegendNumber(low), chartLegendNumber(mean), chartLegendNumber(high))
+}
+
+func gpuDisplayLabel(idx int) string { // human-readable label for GPU idx
+	if name := gpuModelNameByIndex(idx); name != "" { // model name known: append it after an em dash
+		return fmt.Sprintf("GPU %d — %s", idx, name)
+	}
+	return fmt.Sprintf("GPU %d", idx)
+}
+
+// gpuModelNameByIndex returns the trimmed model name cached for GPU idx ("" if unknown), reloading the cache when it is unset or over 30s old.
+func gpuModelNameByIndex(idx int) string {
+	gpuLabelCache.mu.Lock()
+	// Deferred so a panic inside the loader cannot leave the cache locked for every later caller.
+	defer gpuLabelCache.mu.Unlock()
+	if now := time.Now(); now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
+		gpuLabelCache.loadedAt = now
+		gpuLabelCache.byIndex = loadGPUModelNames()
+	}
+	return strings.TrimSpace(gpuLabelCache.byIndex[idx])
+}
+
+// loadGPUModelNames lists NVIDIA GPUs through the platform layer and returns
+// their non-blank, trimmed names keyed by index; a listing error yields an empty map.
+func loadGPUModelNames() map[int]string {
+	names := make(map[int]string)
+	gpus, err := platform.New().ListNvidiaGPUs()
+	if err != nil {
+		return names
+	}
+	for _, gpu := range gpus {
+		if trimmed := strings.TrimSpace(gpu.Name); trimmed != "" {
+			names[gpu.Index] = trimmed
+		}
+	}
+	return names
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -9,13 +9,14 @@ import (

 // jobState holds the output lines and completion status of an async job.
 type jobState struct {
-	lines   []string
-	done    bool
-	err     string
-	mu      sync.Mutex
-	subs    []chan string
-	cancel  func() // optional cancel function; nil if job is not cancellable
-	logPath string
+	lines        []string
+	done         bool
+	err          string
+	mu           sync.Mutex
+	subs         []chan string
+	cancel       func() // optional cancel function; nil if job is not cancellable
+	logPath      string
+	serialPrefix string
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
 	if j.logPath != "" {
 		appendJobLog(j.logPath, line)
 	}
+	if j.serialPrefix != "" {
+		taskSerialWriteLine(j.serialPrefix + line)
+	}
 	for _, ch := range j.subs {
 		select {
 		case ch <- line:
@@ -84,12 +88,12 @@ func (m *jobManager) create(id string) *jobState {
 	j := &jobState{}
 	m.jobs[id] = j
 	// Schedule cleanup after 30 minutes
-	go func() {
+	goRecoverOnce("job cleanup", func() {
 		time.Sleep(30 * time.Minute)
 		m.mu.Lock()
 		delete(m.jobs, id)
 		m.mu.Unlock()
-	}()
+	})
 	return j
 }

@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
 	return j, ok
 }

-func newTaskJobState(logPath string) *jobState {
+func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	j := &jobState{logPath: logPath}
+	if len(serialPrefix) > 0 {
+		j.serialPrefix = serialPrefix[0]
+	}
 	if logPath == "" {
 		return j
 	}
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -17,10 +17,10 @@ import (
 // It supports multiple concurrent SAT tasks: a shared event window is open
 // while any SAT task is running, and flushed when all tasks complete.
 type kmsgWatcher struct {
-	mu           sync.Mutex
-	activeCount  int        // number of in-flight SAT tasks
-	window       *kmsgWindow
-	statusDB     *app.ComponentStatusDB
+	mu          sync.Mutex
+	activeCount int // number of in-flight SAT tasks
+	window      *kmsgWindow
+	statusDB    *app.ComponentStatusDB
 }

 type kmsgWindow struct {
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {

 // start launches the background kmsg reading goroutine.
 func (w *kmsgWatcher) start() {
-	go w.run()
+	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
 }

 func (w *kmsgWatcher) run() {
-	f, err := os.Open("/dev/kmsg")
-	if err != nil {
-		slog.Warn("kmsg watcher unavailable", "err", err)
-		return
-	}
-	defer f.Close()
-
-	// Best-effort seek to end so we only capture events from now forward.
-	_, _ = f.Seek(0, io.SeekEnd)
-
-	scanner := bufio.NewScanner(f)
-	scanner.Buffer(make([]byte, 64*1024), 64*1024)
-	for scanner.Scan() {
-		line := scanner.Text()
-		evt, ok := parseKmsgLine(line)
-		if !ok {
+	for {
+		f, err := os.Open("/dev/kmsg")
+		if err != nil {
+			slog.Warn("kmsg watcher unavailable", "err", err)
+			time.Sleep(30 * time.Second)
 			continue
 		}
-		w.mu.Lock()
-		if w.window != nil {
-			w.recordEvent(evt)
+		// Best-effort seek to end so we only capture events from now forward.
+		_, _ = f.Seek(0, io.SeekEnd)
+
+		scanner := bufio.NewScanner(f)
+		scanner.Buffer(make([]byte, 64*1024), 64*1024)
+		for scanner.Scan() {
+			line := scanner.Text()
+			evt, ok := parseKmsgLine(line)
+			if !ok {
+				continue
+			}
+			w.mu.Lock()
+			if w.window != nil {
+				w.recordEvent(evt)
+			}
+			w.mu.Unlock()
 		}
-		w.mu.Unlock()
-	}
-	if err := scanner.Err(); err != nil {
-		slog.Warn("kmsg watcher stopped", "err", err)
+		if err := scanner.Err(); err != nil {
+			slog.Warn("kmsg watcher stopped", "err", err)
+		}
+		_ = f.Close()
+		time.Sleep(2 * time.Second)
 	}
 }

@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
 	if window == nil || len(window.events) == 0 {
 		return
 	}
-	go w.flushWindow(window)
+	goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
 }

 func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
@@ -229,7 +232,8 @@ func truncate(s string, max int) string {
 // isSATTarget returns true for task targets that run hardware acceptance tests.
 func isSATTarget(target string) bool {
 	switch target {
-	case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
+	case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
+		"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
 		"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
 		"platform-stress":
 		return true
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strconv"
+	"strings"
 	"time"

 	"bee/audit/internal/platform"
@@ -21,6 +22,13 @@ type MetricsDB struct {
 	db *sql.DB
 }

+func (m *MetricsDB) Close() error { // Close closes the underlying database handle.
+	if m == nil || m.db == nil { // nothing was opened: report success
+		return nil
+	}
+	return m.db.Close()
+}
+
 // openMetricsDB opens (or creates) the metrics database at the given path.
 func openMetricsDB(path string) (*MetricsDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
@@ -54,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
+  clock_mhz     REAL,
+  mem_clock_mhz REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -70,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
  PRIMARY KEY (ts, name)
 );
 `)
+	if err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+}
+
+// ensureMetricsColumn appends column to table unless PRAGMA table_info already lists it (case-insensitive).
+func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
+	rows, err := db.Query("PRAGMA table_info(" + table + ")")
+	if err != nil {
+		return err
+	}
+	defer rows.Close()
+	var (
+		cid, notNull, pk int
+		name, ctype      string
+		dflt             sql.NullString
+	)
+	for rows.Next() {
+		if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
+			return err
+		}
+		if strings.EqualFold(name, column) {
+			return nil
+		}
+	}
+	if err = rows.Err(); err == nil {
+		_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
+	}
 	return err
 }

@@ -91,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	}
 	for _, g := range s.GPUs {
 		_, err = tx.Exec(
-			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
-			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
+			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
 		)
 		if err != nil {
 			return err
@@ -129,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
 	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
 }

+// LoadBetween returns the samples recorded between start and end (inclusive,
+// compared at whole-second resolution) in chronological order. The bounds may
+// be passed in either order; a nil receiver or a zero bound yields no samples.
+func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	if m == nil || start.IsZero() || end.IsZero() {
+		return nil, nil
+	}
+	from, to := start.Unix(), end.Unix()
+	if end.Before(start) {
+		from, to = to, from
+	}
+	return m.loadSamples(
+		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
+		from, to,
+	)
+}
+
 // loadSamples reconstructs LiveMetricSample rows from the normalized tables.
 func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
 	rows, err := m.db.Query(query, args...)
@@ -163,7 +222,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	}
 	gpuData := map[gpuKey]platform.GPUMetricRow{}
 	gRows, err := m.db.Query(
-		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
 		minTS, maxTS,
 	)
 	if err == nil {
@@ -171,7 +230,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 		for gRows.Next() {
 			var ts int64
 			var g platform.GPUMetricRow
-			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
 				gpuData[gpuKey{ts, g.GPUIndex}] = g
 			}
 		}
@@ -283,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	rows, err := m.db.Query(`
 		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
-		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
+		       g.clock_mhz, g.mem_clock_mhz
 		FROM sys_metrics s
 		LEFT JOIN gpu_metrics g ON g.ts = s.ts
 		ORDER BY s.ts, g.gpu_index
@@ -294,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	defer rows.Close()

 	cw := csv.NewWriter(w)
-	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
 	for rows.Next() {
 		var ts int64
 		var cpu, mem, pwr float64
 		var gpuIdx sql.NullInt64
-		var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
-		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
 			continue
 		}
 		row := []string{
@@ -316,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
 			)
 		} else {
-			row = append(row, "", "", "", "", "")
+			row = append(row, "", "", "", "", "", "", "")
 		}
 		_ = cw.Write(row)
 	}
@@ -326,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	return cw.Error()
 }

-// Close closes the database.
-func (m *MetricsDB) Close() { _ = m.db.Close() }
-
 func nullFloat(v float64) sql.NullFloat64 {
 	return sql.NullFloat64{Float64: v, Valid: true}
 }
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -1,11 +1,13 @@
 package webui

 import (
+	"database/sql"
 	"path/filepath"
 	"testing"
 	"time"

 	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
 )

 func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
 		}
 	}
 }
+
+func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) { // a database created without the GPU clock columns must gain them on open
+	path := filepath.Join(t.TempDir(), "metrics.db")
+	raw, err := sql.Open("sqlite", path) // build the legacy file directly, bypassing openMetricsDB
+	if err != nil {
+		t.Fatalf("sql.Open: %v", err)
+	}
+	_, err = raw.Exec(`
+CREATE TABLE gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`)
+	if err != nil {
+		t.Fatalf("create legacy schema: %v", err)
+	}
+	_ = raw.Close() // release the raw handle before reopening through openMetricsDB
+
+	db, err := openMetricsDB(path) // opening is expected to add clock_mhz and mem_clock_mhz
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	now := time.Unix(1_700_000_100, 0).UTC()
+	err = db.Write(platform.LiveMetricSample{ // would fail if the new columns were missing
+		Timestamp: now,
+		GPUs: []platform.GPUMetricRow{
+			{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
+		},
+	})
+	if err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	samples, err := db.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(samples) != 1 || len(samples[0].GPUs) != 1 {
+		t.Fatalf("samples=%+v", samples)
+	}
+	if got := samples[0].GPUs[0].ClockMHz; got != 1410 { // clock values must round-trip through the migrated table
+		t.Fatalf("ClockMHz=%v want 1410", got)
+	}
+	if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
+		t.Fatalf("MemClockMHz=%v want 2600", got)
+	}
+}
+
+func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) { // LoadBetween must return only samples inside the inclusive window
+	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer db.Close()
+
+	base := time.Unix(1_700_000_000, 0).UTC()
+	for i := 0; i < 5; i++ { // five samples, one minute apart: base+0m .. base+4m
+		if err := db.Write(platform.LiveMetricSample{
+			Timestamp:  base.Add(time.Duration(i) * time.Minute),
+			CPULoadPct: float64(i),
+		}); err != nil {
+			t.Fatalf("Write(%d): %v", i, err)
+		}
+	}
+
+	got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
+	if err != nil {
+		t.Fatalf("LoadBetween: %v", err)
+	}
+	if len(got) != 3 { // both endpoints are included: +1m, +2m, +3m
+		t.Fatalf("LoadBetween len=%d want 3", len(got))
+	}
+	if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
+		t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
+	}
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/serial_console.go
+++ b/audit/internal/webui/serial_console.go
@@ -0,0 +1,41 @@
+package webui
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"time"
+)
+
+var taskSerialWriteLine = writeTaskSerialLine // package-level indirection for the serial writer; presumably lets tests substitute it — confirm
+
+// writeTaskSerialLine timestamps line (UTC) and writes it to the first console device that accepts it.
+func writeTaskSerialLine(line string) {
+	if line = strings.TrimSpace(line); line == "" {
+		return
+	}
+	payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
+	for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
+		if f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0); err == nil {
+			_, err = f.WriteString(payload)
+			_ = f.Close()
+			if err == nil { // on a failed write, fall through to the next device
+				return
+			}
+		}
+	}
+}
+
+func taskSerialPrefix(t *Task) string { // tag identifying t at the start of a serial output line
+	if t == nil { // no task context: generic tag
+		return "[task] "
+	}
+	return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
+}
+
+// taskSerialEvent writes event, trimmed and tagged with t's serial prefix, via taskSerialWriteLine; a nil task is ignored.
+func taskSerialEvent(t *Task, event string) {
+	if t != nil {
+		taskSerialWriteLine(taskSerialPrefix(t) + strings.TrimSpace(event))
+	}
+}
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -1,15 +1,19 @@
 package webui

 import (
+	"bufio"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"html"
+	"io"
 	"log/slog"
 	"mime"
+	"net"
 	"net/http"
 	"os"
 	"path/filepath"
+	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -18,7 +22,6 @@ import (
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 	"bee/audit/internal/runtimeenv"
-	gocharts "github.com/go-analyze/charts"
 	"reanimator/chart/viewer"
 	"reanimator/chart/web"
 )
@@ -234,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {

 	// SAT
 	mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
+	mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
+	mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
+	mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
+	mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
+	mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
+	mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
 	mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
 	mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
 	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -247,6 +256,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
+	mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)

 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -255,6 +265,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
 	mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
 	mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
+	mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)

 	// Services
 	mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
@@ -283,6 +294,7 @@ func NewHandler(opts HandlerOptions) http.Handler {

 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
+	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)

 	// System
@@ -309,11 +321,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("GET /", h.handlePage)

 	h.mux = mux
-	return mux
+	return recoverMiddleware(mux)
 }

 func (h *handler) startMetricsCollector() {
-	go func() {
+	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
 		for range ticker.C {
@@ -324,7 +336,7 @@ func (h *handler) startMetricsCollector() {
 			h.feedRings(sample)
 			h.setLatestMetric(sample)
 		}
-	}()
+	})
 }

 func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -345,7 +357,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {

 // ListenAndServe starts the HTTP server.
 func ListenAndServe(addr string, opts HandlerOptions) error {
-	return http.ListenAndServe(addr, NewHandler(opts))
+	srv := &http.Server{
+		Addr:              addr,
+		Handler:           NewHandler(opts),
+		ReadHeaderTimeout: 5 * time.Second,
+		ReadTimeout:       30 * time.Second,
+		IdleTimeout:       2 * time.Minute,
+	}
+	return srv.ListenAndServe()
+}
+
+type trackingResponseWriter struct { // http.ResponseWriter wrapper that records whether a response has been started
+	http.ResponseWriter
+	wroteHeader bool // set when WriteHeader, Write or Flush is called
+}
+
+func (w *trackingResponseWriter) WriteHeader(statusCode int) { // marks the response as started, then delegates
+	w.wroteHeader = true
+	w.ResponseWriter.WriteHeader(statusCode)
+}
+
+func (w *trackingResponseWriter) Write(p []byte) (int, error) { // marks the response as started, then delegates
+	w.wroteHeader = true
+	return w.ResponseWriter.Write(p)
+}
+
+func (w *trackingResponseWriter) Flush() { // marks the response as started; flushes only if the wrapped writer is an http.Flusher
+	w.wroteHeader = true
+	if f, ok := w.ResponseWriter.(http.Flusher); ok {
+		f.Flush()
+	}
+}
+
+// Hijack forwards to the wrapped writer's http.Hijacker, or returns an error when it has none.
+func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
+	if h, ok := w.ResponseWriter.(http.Hijacker); ok {
+		return h.Hijack()
+	}
+	return nil, nil, fmt.Errorf("hijacking not supported")
+}
+
+// Push forwards to the wrapped writer's http.Pusher, or reports http.ErrNotSupported when it has none.
+func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
+	if p, ok := w.ResponseWriter.(http.Pusher); ok {
+		return p.Push(target, opts)
+	}
+	return http.ErrNotSupported
+}
+
+// ReadFrom streams r into the response, using the wrapped writer's
+// io.ReaderFrom fast path when it has one and io.Copy otherwise.
+func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
+	w.wroteHeader = true // both paths below start the response, so record it up front
+	if rf, ok := w.ResponseWriter.(io.ReaderFrom); ok {
+		return rf.ReadFrom(r)
+	}
+	return io.Copy(w.ResponseWriter, r)
+}
+
+// recoverMiddleware logs panics raised by next and, if no response has been
+// started yet, answers 500; http.ErrAbortHandler is passed through untouched.
+func recoverMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		tw := &trackingResponseWriter{ResponseWriter: w}
+		defer func() {
+			if rec := recover(); rec == http.ErrAbortHandler {
+				panic(rec) // deliberate abort: let net/http drop the connection without a logged stack
+			} else if rec != nil {
+				slog.Error("http handler panic", "method", r.Method, "path", r.URL.Path,
+					"panic", fmt.Sprint(rec), "stack", string(debug.Stack()))
+				if !tw.wroteHeader {
+					http.Error(tw, "internal server error", http.StatusInternalServerError)
+				}
+			}
+		}()
+		next.ServeHTTP(tw, r)
+	})
 }

 // ── Infrastructure handlers ──────────────────────────────────────────────────
@@ -475,13 +561,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
+	samples, err := h.metricsDB.LoadAll()
+	if err != nil || len(samples) == 0 {
+		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+		return
+	}
+	timeline := metricsTimelineSegments(samples, time.Now())
+	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
+		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		if err != nil {
+			http.Error(w, err.Error(), http.StatusInternalServerError)
+			return
+		}
+		if !ok {
+			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+			return
+		}
+		w.Header().Set("Content-Type", "image/svg+xml")
+		w.Header().Set("Cache-Control", "no-store")
+		_, _ = w.Write(buf)
+		return
+	}
+	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}

-	buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
+	buf, err := renderMetricChartSVG(
+		title,
+		labels,
+		sampleTimes(samples),
+		datasets,
+		names,
+		yMin,
+		yMax,
+		chartCanvasHeightForPath(path, len(names)),
+		timeline,
+	)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -491,14 +608,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

-func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
-	samples, err := h.metricsDB.LoadAll()
-	if err != nil || len(samples) == 0 {
-		return nil, nil, nil, "", nil, nil, false
-	}
-	return chartDataFromSamples(path, samples)
-}
-
 func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
 	var datasets [][]float64
 	var names []string
@@ -578,18 +687,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		yMin = floatPtr(0)
 		yMax = autoMax120(datasets...)

+	case path == "gpu-all-clock":
+		title = "GPU Core Clock"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+		yMin, yMax = autoBounds120(datasets...)
+
+	case path == "gpu-all-memclock":
+		title = "GPU Memory Clock"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
+		yMin, yMax = autoBounds120(datasets...)
+
 	case strings.HasPrefix(path, "gpu/"):
-		rest := strings.TrimPrefix(path, "gpu/")
-		sub := ""
-		if i := strings.LastIndex(rest, "-"); i > 0 {
-			sub = rest[i+1:]
-			rest = rest[:i]
+		idx, sub, ok := parseGPUChartPath(path)
+		if !ok {
+			return nil, nil, nil, "", nil, nil, false
 		}
-		idx := 0
-		fmt.Sscanf(rest, "%d", &idx)
 		switch sub {
 		case "load":
-			title = fmt.Sprintf("GPU %d Load", idx)
+			title = gpuDisplayLabel(idx) + " Load"
 			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
 			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
 			if util == nil && mem == nil {
@@ -600,7 +715,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			yMin = floatPtr(0)
 			yMax = floatPtr(100)
 		case "temp":
-			title = fmt.Sprintf("GPU %d Temperature", idx)
+			title = gpuDisplayLabel(idx) + " Temperature"
 			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
 			if temp == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -609,8 +724,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			names = []string{"Temp °C"}
 			yMin = floatPtr(0)
 			yMax = autoMax120(temp)
+		case "clock":
+			title = gpuDisplayLabel(idx) + " Core Clock"
+			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+			if clock == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{clock}
+			names = []string{"Core Clock MHz"}
+			yMin, yMax = autoBounds120(clock)
+		case "memclock":
+			title = gpuDisplayLabel(idx) + " Memory Clock"
+			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
+			if clock == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{clock}
+			names = []string{"Memory Clock MHz"}
+			yMin, yMax = autoBounds120(clock)
 		default:
-			title = fmt.Sprintf("GPU %d Power", idx)
+			title = gpuDisplayLabel(idx) + " Power"
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
 			if power == nil {
 				return nil, nil, nil, "", nil, nil, false
@@ -627,6 +760,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
 }

+// parseGPUChartPath splits a per-GPU chart path of the form "gpu/<index>" or
+// "gpu/<index>-<sub>" into the GPU index and the chart sub-type. ok is false when
+// the "gpu/" prefix is missing or the index is not a plain run of decimal digits.
+func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
+	rest, found := strings.CutPrefix(path, "gpu/")
+	if !found {
+		return 0, "", false
+	}
+	if i := strings.LastIndex(rest, "-"); i > 0 {
+		rest, sub = rest[:i], rest[i+1:]
+	}
+	if rest == "" || strings.Trim(rest, "0123456789") != "" {
+		return 0, "", false // Sscanf alone would accept "3abc" or "-1"
+	}
+	if _, err := fmt.Sscanf(rest, "%d", &idx); err != nil {
+		return 0, "", false
+	}
+	return idx, sub, true
+}
+
 func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
 	labels := make([]string, len(samples))
 	if len(samples) == 0 {
@@ -719,7 +872,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
 			continue
 		}
 		datasets = append(datasets, ds)
-		names = append(names, fmt.Sprintf("GPU %d", idx))
+		names = append(names, gpuDisplayLabel(idx))
 	}
 	return datasets, names
 }
@@ -852,64 +1005,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
 	return floatPtr(low), floatPtr(high)
 }

-// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
-func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
-	n := len(labels)
-	if n == 0 {
-		n = 1
-		labels = []string{""}
+// gpuChartLabelIndices picks roughly target evenly spaced indices out of
+// [0, total), always ending with total-1. A target of zero or less selects
+// every index instead of dividing by zero.
+func gpuChartLabelIndices(total, target int) []int {
+	if total <= 0 {
+		return nil
 	}
-	for i := range datasets {
-		if len(datasets[i]) == 0 {
-			datasets[i] = make([]float64, n)
-		}
+	if total == 1 {
+		return []int{0}
 	}
-	// Append global min/avg/max to title.
-	mn, avg, mx := globalStats(datasets)
-	if mx > 0 {
-		title = fmt.Sprintf("%s    ↓%s  ~%s  ↑%s",
-			title,
-			chartLegendNumber(mn),
-			chartLegendNumber(avg),
-			chartLegendNumber(mx),
-		)
+	step := 1
+	if target > 0 && total > target {
+		step = total / target
 	}
-	title = sanitizeChartText(title)
-	names = sanitizeChartTexts(names)
-	sparse := sanitizeChartTexts(sparseLabels(labels, 6))
+	var indices []int
+	for i := 0; i < total-1; i += step {
+		indices = append(indices, i)
+	}
+	return append(indices, total-1)
+}

-	opt := gocharts.NewLineChartOptionWithData(datasets)
-	opt.Title = gocharts.TitleOption{Text: title}
-	opt.XAxis.Labels = sparse
-	opt.Legend = gocharts.LegendOption{SeriesNames: names}
-	if chartLegendVisible(len(names)) {
-		opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
-		opt.Legend.OverlayChart = gocharts.Ptr(false)
-	} else {
-		opt.Legend.Show = gocharts.Ptr(false)
-	}
-	opt.Symbol = gocharts.SymbolNone
-	// Right padding: reserve space for the MarkLine label (library recommendation).
-	opt.Padding = gocharts.NewBox(20, 20, 80, 20)
-	if yMin != nil || yMax != nil {
-		opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
+// chartCanvasHeightForPath returns chartCanvasHeight(seriesCount), doubled for GPU chart paths.
+func chartCanvasHeightForPath(path string, seriesCount int) int {
+	if !isGPUChartPath(path) {
+		return chartCanvasHeight(seriesCount)
 	}
+	return 2 * chartCanvasHeight(seriesCount)
+}

-	// Add a single peak mark line on the series that holds the global maximum.
-	peakIdx, _ := globalPeakSeries(datasets)
-	if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
-		opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
-	}
-
-	p := gocharts.NewPainter(gocharts.PainterOptions{
-		OutputFormat: gocharts.ChartOutputSVG,
-		Width:        1400,
-		Height:       chartCanvasHeight(len(names)),
-	}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
-	if err := p.LineChart(opt); err != nil {
-		return nil, err
-	}
-	return p.Bytes()
+func isGPUChartPath(path string) bool { // true for per-GPU ("gpu/…") and aggregate ("gpu-all-…") chart paths
+	return strings.HasPrefix(path, "gpu/") || strings.HasPrefix(path, "gpu-all-")
 }

 func chartLegendVisible(seriesCount int) bool {
@@ -923,30 +1049,6 @@ func chartCanvasHeight(seriesCount int) int {
 	return 288
 }

-func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
-	return gocharts.YAxisOption{
-		Min:            yMin,
-		Max:            yMax,
-		LabelCount:     11,
-		ValueFormatter: chartYAxisNumber,
-	}
-}
-
-// globalPeakSeries returns the index of the series containing the global maximum
-// value across all datasets, and that maximum value.
-func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
-	idx = -1
-	for i, ds := range datasets {
-		for _, v := range ds {
-			if v > peak {
-				peak = v
-				idx = i
-			}
-		}
-	}
-	return idx, peak
-}
-
 // globalStats returns min, average, and max across all values in all datasets.
 func globalStats(datasets [][]float64) (mn, avg, mx float64) {
 	var sum float64
@@ -986,21 +1088,6 @@ func sanitizeChartText(s string) string {
 	}, s))
 }

-func sanitizeChartTexts(in []string) []string {
-	out := make([]string, len(in))
-	for i, s := range in {
-		out[i] = sanitizeChartText(s)
-	}
-	return out
-}
-
-func safeIdx(s []float64, i int) float64 {
-	if i < len(s) {
-		return s[i]
-	}
-	return 0
-}
-
 func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
 	var datasets [][]float64
 	var names []string
@@ -1087,20 +1174,6 @@ func chartYAxisNumber(v float64) string {
 	return out
 }

-func sparseLabels(labels []string, n int) []string {
-	out := make([]string, len(labels))
-	step := len(labels) / n
-	if step < 1 {
-		step = 1
-	}
-	for i, l := range labels {
-		if i%step == 0 {
-			out[i] = l
-		}
-	}
-	return out
-}
-
 func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
 	if h.metricsDB == nil {
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
@@ -1116,6 +1189,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque

 func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Cache-Control", "no-store")
+	if strings.TrimSpace(h.opts.AuditPath) == "" {
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte("ready"))
+		return
+	}
 	if _, err := os.Stat(h.opts.AuditPath); err != nil {
 		w.WriteHeader(http.StatusServiceUnavailable)
 		_, _ = w.Write([]byte("starting"))
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -34,6 +34,49 @@ func TestChartLegendNumber(t *testing.T) {
 	}
 }

+// TestRecoverMiddlewareReturns500OnPanic checks that a panicking handler wrapped
+// in recoverMiddleware produces a 500 response with an "internal server error" body.
+func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
+	panicking := http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		panic("boom")
+	})
+	rec := httptest.NewRecorder()
+	recoverMiddleware(panicking).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/panic", nil))
+
+	if got, want := rec.Code, http.StatusInternalServerError; got != want {
+		t.Fatalf("status=%d want %d", got, want)
+	}
+	if body := rec.Body.String(); !strings.Contains(body, "internal server error") {
+		t.Fatalf("body=%q", body)
+	}
+}
+
+// TestRecoverMiddlewarePreservesStreamingInterfaces checks that a handler wrapped
+// in recoverMiddleware can still start and write a server-sent event stream.
+func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
+	stream := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		if !sseStart(w) {
+			return
+		}
+		if !sseWrite(w, "tick", "ok") {
+			t.Fatal("expected sse write to succeed")
+		}
+	})
+	rec := httptest.NewRecorder()
+	recoverMiddleware(stream).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/stream", nil))
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if ct := rec.Header().Get("Content-Type"); ct != "text/event-stream" {
+		t.Fatalf("content-type=%q", ct)
+	}
+	body := rec.Body.String()
+	if !(strings.Contains(body, "event: tick\n") && strings.Contains(body, "data: ok\n\n")) {
+		t.Fatalf("body=%q", body)
+	}
+}
+
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -136,6 +179,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 	}
 }

+// TestChartDataFromSamplesIncludesGPUClockCharts checks that the "gpu-all-clock"
+// chart is titled "GPU Core Clock" and carries one ClockMHz series per GPU index.
+func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
+	older := platform.LiveMetricSample{
+		Timestamp: time.Now().Add(-2 * time.Minute),
+		GPUs: []platform.GPUMetricRow{
+			{GPUIndex: 0, ClockMHz: 1400},
+			{GPUIndex: 3, ClockMHz: 1500},
+		},
+	}
+	newer := platform.LiveMetricSample{
+		Timestamp: time.Now().Add(-1 * time.Minute),
+		GPUs: []platform.GPUMetricRow{
+			{GPUIndex: 0, ClockMHz: 1410},
+			{GPUIndex: 3, ClockMHz: 1510},
+		},
+	}
+	samples := []platform.LiveMetricSample{older, newer}
+
+	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	switch {
+	case !ok:
+		t.Fatal("gpu-all-clock returned ok=false")
+	case title != "GPU Core Clock":
+		t.Fatalf("title=%q", title)
+	case len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3":
+		t.Fatalf("names=%v", names)
+	}
+	if got := datasets[1][1]; got != 1510 {
+		t.Fatalf("GPU 3 core clock=%v want 1510", got)
+	}
+}
+
 func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
 	want := []float64{0, 480, 480, 480, 510, 510}
@@ -157,6 +233,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
 	if !strings.Contains(body, "el.dataset.loading === '1'") {
 		t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
 	}
+	if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
+		t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
+	}
+	if !strings.Contains(body, `id="gpu-chart-toggle"`) {
+		t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
+	}
+	if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
+		t.Fatalf("metrics page should include GPU core clock chart: %s", body)
+	}
+	if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
+		t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
+	}
+	if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
+		t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
+	}
 }

 func TestChartLegendVisible(t *testing.T) {
@@ -199,6 +290,124 @@ func TestChartCanvasHeight(t *testing.T) {
 	}
 }

+// TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps checks that overlapping
+// task windows collapse into single active segments with idle segments filling the gaps.
+func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	end := start.Add(10 * time.Minute)
+	window := func(fromMin, toMin int) Task {
+		s := start.Add(time.Duration(fromMin) * time.Minute)
+		e := start.Add(time.Duration(toMin) * time.Minute)
+		return Task{
+			Name:      "task",
+			Status:    TaskDone,
+			StartedAt: &s,
+			DoneAt:    &e,
+		}
+	}
+	segments := chartTimelineSegmentsForRange(start, end, end, []Task{window(1, 3), window(2, 5), window(7, 8)})
+	if len(segments) != 5 {
+		t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
+	}
+	want := []struct {
+		active   bool
+		from, to int
+	}{{false, 0, 1}, {true, 1, 5}, {false, 5, 7}, {true, 7, 8}, {false, 8, 10}}
+	for i, seg := range segments {
+		if seg.Active != want[i].active {
+			t.Fatalf("segment[%d].Active=%v want %v", i, seg.Active, want[i].active)
+		}
+		if got := int(seg.Start.Sub(start).Minutes()); got != want[i].from {
+			t.Fatalf("segment[%d] start=%d want %d", i, got, want[i].from)
+		}
+		if got := int(seg.End.Sub(start).Minutes()); got != want[i].to {
+			t.Fatalf("segment[%d] end=%d want %d", i, got, want[i].to)
+		}
+	}
+}
+
+// TestRenderMetricChartSVGIncludesTimelineOverlay checks that the rendered SVG
+// contains the timeline overlay, the idle-segment opacity and the chart title.
+func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	svg, err := renderMetricChartSVG(
+		"System Power",
+		[]string{"12:00", "12:01", "12:02"},
+		[]time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)},
+		[][]float64{{300, 320, 310}},
+		[]string{"Power W"},
+		floatPtr(0),
+		floatPtr(400),
+		360,
+		[]chartTimelineSegment{
+			{Start: start, End: start.Add(time.Minute), Active: false},
+			{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
+		},
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+	body := string(svg)
+	for _, check := range []struct{ needle, what string }{
+		{`data-role="timeline-overlay"`, "timeline overlay"},
+		{`opacity="0.10"`, "idle overlay opacity"},
+		{`System Power`, "chart title"},
+	} {
+		if !strings.Contains(body, check.needle) {
+			t.Fatalf("svg missing %s: %s", check.what, body)
+		}
+	}
+}
+
+// TestHandleMetricsChartSVGRendersCustomSVG checks that the server-power chart
+// endpoint renders stored samples as SVG that includes the task timeline overlay.
+func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
+	dir := t.TempDir()
+	db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = db.db.Close() })
+
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	samples := []platform.LiveMetricSample{
+		{Timestamp: start, PowerW: 300},
+		{Timestamp: start.Add(time.Minute), PowerW: 320},
+		{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
+	}
+	for i := range samples {
+		if err := db.Write(samples[i]); err != nil {
+			t.Fatalf("write sample %d: %v", i, err)
+		}
+	}
+
+	taskStart, taskEnd := start.Add(30 * time.Second), start.Add(90 * time.Second)
+	globalQueue.mu.Lock()
+	savedTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &taskStart, DoneAt: &taskEnd}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = savedTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
+	rec := httptest.NewRecorder()
+	h.handleMetricsChartSVG(rec, httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil))
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `data-role="timeline-overlay"`) {
+		t.Fatalf("custom svg response missing timeline overlay: %s", body)
+	}
+	if !strings.Contains(body, `stroke-linecap="round"`) {
+		t.Fatalf("custom svg response missing custom polyline styling: %s", body)
+	}
+}
+
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -212,21 +421,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	}
 }

-func TestChartYAxisOption(t *testing.T) {
-	min := floatPtr(0)
-	max := floatPtr(100)
-	opt := chartYAxisOption(min, max)
-	if opt.Min != min || opt.Max != max {
-		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
-	}
-	if opt.LabelCount != 11 {
-		t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
-	}
-	if got := opt.ValueFormatter(1000); got != "1к" {
-		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
-	}
-}
-
 func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
 	r1 := newMetricsRing(4)
 	r2 := newMetricsRing(4)
@@ -335,7 +529,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `Run Audit`) {
+	if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
 		t.Fatalf("dashboard missing run audit button: %s", body)
 	}
 	if strings.Contains(body, `No audit data`) {
@@ -343,6 +537,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
 	}
 }

+// TestReadyIsOKWhenAuditPathIsUnset checks that /api/ready answers 200 "ready" when no audit path is set.
+func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
+	rec := httptest.NewRecorder()
+	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if body := rec.Body.String(); strings.TrimSpace(body) != "ready" {
+		t.Fatalf("body=%q want ready", body)
+	}
+}
+
 func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
@@ -365,7 +571,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	}
 }

-func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
+func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
 	handler := NewHandler(HandlerOptions{})
 	rec := httptest.NewRecorder()
 	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
@@ -373,8 +579,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `id="task-log-overlay"`) {
-		t.Fatalf("tasks page missing log modal overlay: %s", body)
+	if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
+		t.Fatalf("tasks page missing task report hint: %s", body)
 	}
 	if !strings.Contains(body, `_taskPageSize = 50`) {
 		t.Fatalf("tasks page missing pagination size config: %s", body)
@@ -409,37 +615,111 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 	}
 }

-func TestTasksPageRendersScrollableLogModal(t *testing.T) {
-	dir := t.TempDir()
-	path := filepath.Join(dir, "audit.json")
-	exportDir := filepath.Join(dir, "export")
-	if err := os.MkdirAll(exportDir, 0755); err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
-		t.Fatal(err)
-	}
-
-	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
-		AuditPath: path,
-		ExportDir: exportDir,
-	})
-
+// TestBenchmarkPageRendersGPUSelectionControls checks that /benchmark renders its link, GPU list and NVIDIA endpoints.
+func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 	rec := httptest.NewRecorder()
-	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
+	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
-	if !strings.Contains(body, `height:calc(100vh - 32px)`) {
-		t.Fatalf("tasks page missing bounded log modal height: %s", body)
+	for _, want := range []string{
+		`href="/benchmark"`,
+		`id="benchmark-gpu-list"`,
+		`/api/gpu/nvidia`,
+		`/api/benchmark/nvidia/run`,
+		`benchmark-run-nccl`,
+	} {
+		if !strings.Contains(body, want) {
+			t.Fatalf("benchmark page missing %q: %s", want, body)
+		}
 	}
-	if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
-		t.Fatalf("tasks page missing log modal overflow guard: %s", body)
+}
+
+// TestValidatePageRendersNvidiaTargetedStressCard checks that /validate renders the NVIDIA targeted stress card.
+func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
+	rec := httptest.NewRecorder()
+	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
 	}
-	if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
-		t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
+	body := rec.Body.String()
+	for _, want := range []string{
+		`NVIDIA GPU Targeted Stress`,
+		`nvidia-targeted-stress`,
+		`controlled NVIDIA DCGM load`,
+		`<code>dcgmi diag targeted_stress</code>`,
+	} {
+		if !strings.Contains(body, want) {
+			t.Fatalf("validate page missing %q: %s", want, body)
+		}
+	}
+}
+
+// TestBurnPageRendersGoalBasedNVIDIACards checks that /burn renders the NVIDIA load cards and GPU list.
+func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
+	rec := httptest.NewRecorder()
+	NewHandler(HandlerOptions{}).ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, want := range []string{
+		`NVIDIA Max Compute Load`,
+		`dcgmproftester`,
+		`targeted_stress remain in <a href="/validate">Validate</a>`,
+		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`id="burn-gpu-list"`,
+	} {
+		if !strings.Contains(body, want) {
+			t.Fatalf("burn page missing %q: %s", want, body)
+		}
+	}
+}
+
+// TestTaskDetailPageRendersSavedReport checks that /tasks/{id} embeds the task's
+// saved report.html fragment and shows the link back to the task list.
+func TestTaskDetailPageRendersSavedReport(t *testing.T) {
+	exportDir := filepath.Join(t.TempDir(), "export")
+	reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
+	if err := os.MkdirAll(reportDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	reportPath := filepath.Join(reportDir, "report.html")
+	if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	globalQueue.mu.Lock()
+	savedTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:             "task-1",
+		Name:           "CPU SAT",
+		Target:         "cpu",
+		Status:         TaskDone,
+		CreatedAt:      time.Now(),
+		ArtifactsDir:   reportDir,
+		ReportHTMLPath: reportPath,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = savedTasks
+		globalQueue.mu.Unlock()
+	})
+
+	srv := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
+	rec := httptest.NewRecorder()
+	srv.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `saved report`) {
+		t.Fatalf("task detail page missing saved report: %s", body)
+	}
+	if !strings.Contains(body, `Back to Tasks`) {
+		t.Fatalf("task detail page missing back link: %s", body)
 	}
 }

@@ -564,3 +844,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
 		t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
 	}
 }
+
+// TestDashboardRendersRuntimeHealthTable checks that the dashboard renders the Runtime Health
+// table from runtime-health.json and component-status.json found in the export directory.
+func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "audit.json")
+	exportDir := filepath.Join(dir, "export")
+	if err := os.MkdirAll(exportDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	mustWrite := func(file, content string) {
+		if err := os.WriteFile(file, []byte(content), 0644); err != nil {
+			t.Fatal(err)
+		}
+	}
+	mustWrite(path, `{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`)
+	health := `{
+  "status":"PARTIAL",
+  "checked_at":"2026-03-16T10:00:00Z",
+  "export_dir":"/tmp/export",
+  "driver_ready":true,
+  "cuda_ready":false,
+  "network_status":"PARTIAL",
+  "issues":[
+    {"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
+    {"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
+  ],
+  "tools":[
+    {"name":"dmidecode","ok":true},
+    {"name":"nvidia-smi","ok":false}
+  ],
+  "services":[
+    {"name":"bee-web","status":"active"},
+    {"name":"bee-nvidia","status":"inactive"}
+  ]
+}`
+	mustWrite(filepath.Join(exportDir, "runtime-health.json"), health)
+	componentStatus := `[
+  {
+    "component_key":"cpu:all",
+    "status":"Warning",
+    "error_summary":"cpu SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
+  },
+  {
+    "component_key":"memory:all",
+    "status":"OK",
+    "history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
+  },
+  {
+    "component_key":"storage:nvme0n1",
+    "status":"Critical",
+    "error_summary":"storage SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
+  },
+  {
+    "component_key":"pcie:gpu:nvidia",
+    "status":"Warning",
+    "error_summary":"nvidia SAT: FAILED",
+    "history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
+  }
+]`
+	mustWrite(filepath.Join(exportDir, "component-status.json"), componentStatus)
+	srv := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
+	rec := httptest.NewRecorder()
+	srv.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	for _, want := range []string{
+		`Runtime Health`,
+		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
+		`Export Directory`,
+		`Network`,
+		`NVIDIA/AMD Driver`,
+		`CUDA / ROCm`,
+		`Required Utilities`,
+		`Bee Services`,
+		`<td>CPU</td>`,
+		`<td>Memory</td>`,
+		`<td>Storage</td>`,
+		`<td>GPU</td>`,
+		`CUDA runtime is not ready for GPU SAT.`,
+		`Missing: nvidia-smi`,
+		`bee-nvidia=inactive`,
+		`cpu SAT: FAILED`,
+		`storage SAT: FAILED`,
+		`sat:nvidia`,
+	} {
+		if !strings.Contains(body, want) {
+			t.Fatalf("dashboard missing %q: %s", want, body)
+		}
+	}
+}
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,42 @@
+package webui
+
+import (
+	"fmt"
+	"log/slog"
+	"runtime/debug"
+	"time"
+)
+
+// goRecoverLoop starts a goroutine that runs fn and, each time fn panics, runs
+// it again after sleeping restartDelay (when positive). The goroutine exits as
+// soon as fn returns without panicking; panics are logged by runRecoverable.
+func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
+	go func() {
+		for runRecoverable(name, fn) {
+			if restartDelay > 0 {
+				time.Sleep(restartDelay)
+			}
+		}
+	}()
+}
+
+// goRecoverOnce runs fn once in a new goroutine; a panic in fn is recovered
+// and logged by runRecoverable, and fn is not restarted.
+func goRecoverOnce(name string, fn func()) {
+	go runRecoverable(name, fn)
+}
+
+// runRecoverable calls fn and reports whether it panicked. A panic is recovered
+// and logged together with the component name and the goroutine stack.
+func runRecoverable(name string, fn func()) (panicked bool) {
+	defer func() {
+		rec := recover()
+		if rec == nil {
+			return
+		}
+		panicked = true
+		slog.Error("recovered panic", "component", name, "panic", fmt.Sprint(rec), "stack", string(debug.Stack()))
+	}()
+	fn()
+	return false
+}
--- a/audit/internal/webui/task_page.go
+++ b/audit/internal/webui/task_page.go
@@ -0,0 +1,86 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+)
+
+// handleTaskPage serves the HTML detail page for the task named by the "id"
+// path value, or replies 404 when the queue has no such task.
+func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
+	task, found := globalQueue.findByID(r.PathValue("id"))
+	if !found {
+		http.NotFound(w, r)
+		return
+	}
+	page := renderTaskDetailPage(h.opts, *task)
+	w.Header().Set("Cache-Control", "no-store")
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	_, _ = w.Write([]byte(page))
+}
+
+// renderTaskDetailPage builds the full HTML page for one task: the saved report (or a
+// fallback summary card) plus a live log stream while the task is pending or running.
+func renderTaskDetailPage(opts HandlerOptions, task Task) string {
+	title := task.Name
+	if strings.TrimSpace(title) == "" {
+		title = task.ID
+	}
+	var body strings.Builder
+	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
+	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
+	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
+	body.WriteString(`</div>`)
+	if report := loadTaskReportFragment(task); report != "" {
+		body.WriteString(report)
+	} else {
+		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
+		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
+		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
+		if strings.TrimSpace(task.ErrMsg) != "" {
+			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
+		}
+		body.WriteString(`</div></div>`)
+	}
+
+	if task.Status == TaskRunning || task.Status == TaskPending {
+		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
+		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
+		body.WriteString(`</div></div>`)
+		// The ID sits inside a JS string used as a URL: path-escape it, HTML entities are not decoded in <script>.
+		body.WriteString(`<script>
+var _taskDetailES = new EventSource('/api/tasks/` + url.PathEscape(task.ID) + `/stream');
+var _taskDetailTerm = document.getElementById('task-live-log');
+_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
+_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
+_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
+_taskDetailES.onerror = function(){ _taskDetailES.close(); };
+</script>`)
+	}
+	return layoutHead(opts.Title+" — "+title) + layoutNav("tasks", opts.BuildLabel) +
+		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
+		body.String() +
+		`</div></div></body></html>`
+}
+
+// loadTaskReportFragment returns the task's saved report file verbatim, or ""
+// when no report path is set, the file cannot be read, or it is empty.
+func loadTaskReportFragment(task Task) string {
+	if strings.TrimSpace(task.ReportHTMLPath) != "" {
+		if raw, err := os.ReadFile(task.ReportHTMLPath); err == nil {
+			return string(raw)
+		}
+	}
+	return ""
+}
+
+func taskArtifactDownloadLink(task Task, absPath string) string { // builds the /export/file link for absPath; task is unused here
+	if strings.TrimSpace(absPath) == "" {
+		return "" // no artifact path, no link
+	}
+	return fmt.Sprintf(`/export/file?path=%s`, url.QueryEscape(absPath)) // escaped so ' ', '&', '#', '+' and '%' stay one value
+}
--- a/audit/internal/webui/task_report.go
+++ b/audit/internal/webui/task_report.go
@@ -0,0 +1,286 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+// taskReportMetricsDBPath is the metrics database that task reports load
+// samples from. It is a variable so tests can point it at a temporary file.
+var taskReportMetricsDBPath = metricsDBPath
+
+// taskReport is the per-task summary that writeTaskReportArtifacts serialises
+// to the task's report JSON file and renders into the HTML report fragment.
+type taskReport struct {
+	ID          string            `json:"id"`
+	Name        string            `json:"name"`
+	Target      string            `json:"target"`
+	Status      string            `json:"status"`
+	CreatedAt   time.Time         `json:"created_at"`
+	StartedAt   *time.Time        `json:"started_at,omitempty"`
+	DoneAt      *time.Time        `json:"done_at,omitempty"`
+	// DurationSec is the elapsed run time in whole seconds.
+	DurationSec int               `json:"duration_sec,omitempty"`
+	Error       string            `json:"error,omitempty"`
+	// LogFile is the base name of the task log, not its full path.
+	LogFile     string            `json:"log_file,omitempty"`
+	// Charts lists the chart files that were written next to the report.
+	Charts      []taskReportChart `json:"charts,omitempty"`
+	// GeneratedAt is when the report itself was produced (UTC).
+	GeneratedAt time.Time         `json:"generated_at"`
+}
+
+// taskReportChart describes one chart saved alongside a task report: its
+// display title and the file name of the SVG inside the artifacts directory.
+type taskReportChart struct {
+	Title string `json:"title"`
+	File  string `json:"file"`
+}
+
+// taskChartSpec pairs a chart path (the identifier handed to the chart data
+// builder) with the file name the rendered SVG is written to.
+type taskChartSpec struct {
+	Path string
+	File string
+}
+
+// taskDashboardChartSpecs is the fixed, ordered set of charts attempted for
+// every task report: server-level metrics first, then the all-GPU charts.
+// Per-GPU overview charts are added separately by writeTaskCharts.
+var taskDashboardChartSpecs = []taskChartSpec{
+	{Path: "server-load", File: "server-load.svg"},
+	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
+	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
+	{Path: "server-power", File: "server-power.svg"},
+	{Path: "server-fans", File: "server-fans.svg"},
+	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
+	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
+	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
+	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
+	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
+}
+
+func writeTaskReportArtifacts(t *Task) error {
+	if t == nil {
+		return nil
+	}
+	ensureTaskReportPaths(t)
+	if strings.TrimSpace(t.ArtifactsDir) == "" {
+		return nil
+	}
+	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
+		return err
+	}
+
+	start, end := taskTimeWindow(t)
+	samples, _ := loadTaskMetricSamples(start, end)
+	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
+
+	logText := ""
+	if data, err := os.ReadFile(t.LogPath); err == nil {
+		logText = string(data)
+	}
+
+	report := taskReport{
+		ID:          t.ID,
+		Name:        t.Name,
+		Target:      t.Target,
+		Status:      t.Status,
+		CreatedAt:   t.CreatedAt,
+		StartedAt:   t.StartedAt,
+		DoneAt:      t.DoneAt,
+		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
+		Error:       t.ErrMsg,
+		LogFile:     filepath.Base(t.LogPath),
+		Charts:      charts,
+		GeneratedAt: time.Now().UTC(),
+	}
+	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
+		return err
+	}
+	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
+}
+
+// reportDoneTime returns the task's completion time, falling back to the
+// current time when t is nil or has no non-zero DoneAt.
+func reportDoneTime(t *Task) time.Time {
+	if t == nil || t.DoneAt == nil || t.DoneAt.IsZero() {
+		return time.Now()
+	}
+	return *t.DoneAt
+}
+
+// taskTimeWindow returns the UTC interval [start, end] covered by a task.
+//
+// start is StartedAt when set, otherwise CreatedAt; end is DoneAt when set,
+// otherwise the current time. The end is clamped so it never precedes start.
+// A nil task yields a zero-length window at the current time.
+func taskTimeWindow(t *Task) (time.Time, time.Time) {
+	if t == nil {
+		instant := time.Now().UTC()
+		return instant, instant
+	}
+
+	from := t.CreatedAt.UTC()
+	if s := t.StartedAt; s != nil && !s.IsZero() {
+		from = s.UTC()
+	}
+
+	to := time.Now().UTC()
+	if d := t.DoneAt; d != nil && !d.IsZero() {
+		to = d.UTC()
+	}
+
+	if to.Before(from) {
+		return from, from
+	}
+	return from, to
+}
+
+// loadTaskMetricSamples opens the task-report metrics database and returns
+// the samples recorded between start and end. The database is closed before
+// returning.
+func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
+	store, openErr := openMetricsDB(taskReportMetricsDBPath)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer store.Close()
+
+	return store.LoadBetween(start, end)
+}
+
+// writeTaskCharts renders the dashboard charts and one overview chart per GPU
+// for the given samples, writing each SVG into dir.
+//
+// It returns the charts that were written successfully (in render order) and
+// a map from file name to SVG markup for inlining. Charts that fail to render
+// or to write are skipped silently. With no samples both results are nil.
+func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
+	if len(samples) == 0 {
+		return nil, nil
+	}
+
+	var written []taskReportChart
+	svgByFile := make(map[string]string)
+
+	// save writes one SVG to dir and records its markup; it reports whether
+	// the file was written.
+	save := func(file string, svg []byte) bool {
+		if err := os.WriteFile(filepath.Join(dir, file), svg, 0644); err != nil {
+			return false
+		}
+		svgByFile[file] = string(svg)
+		return true
+	}
+
+	// The whole task window is treated as a single active timeline segment.
+	window := []chartTimelineSegment{{Start: start, End: end, Active: true}}
+
+	for _, spec := range taskDashboardChartSpecs {
+		title, svg, ok := renderTaskChartSVG(spec.Path, samples, window)
+		if ok && len(svg) > 0 && save(spec.File, svg) {
+			written = append(written, taskReportChart{Title: title, File: spec.File})
+		}
+	}
+
+	for _, gpu := range taskGPUIndices(samples) {
+		file := fmt.Sprintf("gpu-%d-overview.svg", gpu)
+		svg, ok, err := renderGPUOverviewChartSVG(gpu, samples, window)
+		if err == nil && ok && len(svg) > 0 && save(file, svg) {
+			written = append(written, taskReportChart{Title: gpuDisplayLabel(gpu) + " Overview", File: file})
+		}
+	}
+	return written, svgByFile
+}
+
+// renderTaskChartSVG renders the chart identified by path from samples.
+// It returns the chart title, the SVG bytes and true on success; when no
+// chart data is available for path or rendering fails it returns "", nil, false.
+func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
+	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
+	if !ok {
+		return "", nil, false
+	}
+
+	times := sampleTimes(samples)
+	height := chartCanvasHeightForPath(path, len(names))
+	svg, err := renderMetricChartSVG(title, labels, times, datasets, names, yMin, yMax, height, timeline)
+	if err != nil {
+		return "", nil, false
+	}
+	return title, svg, true
+}
+
+// taskGPUIndices returns the distinct GPU indices that appear in samples,
+// sorted in ascending order. The result is nil when no GPU is present.
+func taskGPUIndices(samples []platform.LiveMetricSample) []int {
+	known := make(map[int]struct{})
+	var indices []int
+	for i := range samples {
+		for _, gpu := range samples[i].GPUs {
+			if _, dup := known[gpu.GPUIndex]; !dup {
+				known[gpu.GPUIndex] = struct{}{}
+				indices = append(indices, gpu.GPUIndex)
+			}
+		}
+	}
+	sort.Ints(indices)
+	return indices
+}
+
+// writeJSONFile marshals v as two-space-indented JSON and writes it to path
+// with mode 0644. It returns the marshalling or write error, if any.
+func writeJSONFile(path string, v any) error {
+	encoded, err := json.MarshalIndent(v, "", "  ")
+	if err == nil {
+		err = os.WriteFile(path, encoded, 0644)
+	}
+	return err
+}
+
+// renderTaskReportFragment builds the HTML fragment for a task report: a
+// summary card, the charts (or an informational alert when there are none)
+// and the task log.
+//
+// charts maps a chart file name to its SVG markup, which is embedded without
+// escaping. Task name, target, error text, chart titles and the log text are
+// HTML-escaped.
+func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
+	var out strings.Builder
+
+	// Summary card: task identity on the left, status and error on the right.
+	out.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
+	out.WriteString(`<div class="grid2">`)
+	out.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
+	out.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
+	out.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
+	if errText := report.Error; strings.TrimSpace(errText) != "" {
+		out.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(errText) + `</div>`)
+	}
+	out.WriteString(`</div></div>`)
+
+	// Timing line; the start falls back to the creation time.
+	startedText := formatTaskTime(report.StartedAt, report.CreatedAt)
+	finishedText := formatTaskTime(report.DoneAt, time.Time{})
+	durationText := formatTaskDuration(report.DurationSec)
+	out.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
+	out.WriteString(`Started: ` + startedText + ` | Finished: ` + finishedText + ` | Duration: ` + durationText)
+	out.WriteString(`</div></div></div>`)
+
+	// Charts grid, or a notice when nothing was captured.
+	if len(report.Charts) == 0 {
+		out.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
+	} else {
+		out.WriteString(`<div class="grid2">`)
+		for i := range report.Charts {
+			entry := report.Charts[i]
+			out.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(entry.Title) + `</div><div class="card-body" style="padding:12px">`)
+			out.WriteString(charts[entry.File])
+			out.WriteString(`</div></div>`)
+		}
+		out.WriteString(`</div>`)
+	}
+
+	// Log card.
+	trimmedLog := strings.TrimSpace(logText)
+	out.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
+	out.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(trimmedLog) + `</div>`)
+	out.WriteString(`</div></div>`)
+	return out.String()
+}
+
+func renderTaskStatusBadge(status string) string {
+	className := map[string]string{
+		TaskRunning:   "badge-ok",
+		TaskPending:   "badge-unknown",
+		TaskDone:      "badge-ok",
+		TaskFailed:    "badge-err",
+		TaskCancelled: "badge-unknown",
+	}[status]
+	if className == "" {
+		className = "badge-unknown"
+	}
+	label := strings.TrimSpace(status)
+	if label == "" {
+		label = "unknown"
+	}
+	return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
+}
+
+func formatTaskTime(ts *time.Time, fallback time.Time) string {
+	if ts != nil && !ts.IsZero() {
+		return ts.Local().Format("2006-01-02 15:04:05")
+	}
+	if !fallback.IsZero() {
+		return fallback.Local().Format("2006-01-02 15:04:05")
+	}
+	return "n/a"
+}
+
+// formatTaskDuration renders a duration in seconds as "Ns", "Nm SSs" or
+// "Nh MMm SSs", picking the shortest form that fits. Zero and negative
+// values are rendered as "n/a".
+func formatTaskDuration(sec int) string {
+	if sec <= 0 {
+		return "n/a"
+	}
+	hours, minutes, seconds := sec/3600, (sec%3600)/60, sec%60
+	switch {
+	case hours > 0:
+		return fmt.Sprintf("%dh %02dm %02ds", hours, minutes, seconds)
+	case minutes > 0:
+		return fmt.Sprintf("%dm %02ds", minutes, seconds)
+	default:
+		return fmt.Sprintf("%ds", seconds)
+	}
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -4,10 +4,12 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -28,22 +30,29 @@ const (

 // taskNames maps target → human-readable name for validate (SAT) runs.
 var taskNames = map[string]string{
-	"nvidia":          "NVIDIA SAT",
-	"nvidia-stress":   "NVIDIA GPU Stress",
-	"memory":          "Memory SAT",
-	"storage":         "Storage SAT",
-	"cpu":             "CPU SAT",
-	"amd":             "AMD GPU SAT",
-	"amd-mem":         "AMD GPU MEM Integrity",
-	"amd-bandwidth":   "AMD GPU MEM Bandwidth",
-	"amd-stress":      "AMD GPU Burn-in",
-	"memory-stress":   "Memory Burn-in",
-	"sat-stress":      "SAT Stress (stressapptest)",
-	"platform-stress": "Platform Thermal Cycling",
-	"audit":           "Audit",
-	"support-bundle":  "Support Bundle",
-	"install":         "Install to Disk",
-	"install-to-ram":  "Install to RAM",
+	"nvidia":                 "NVIDIA SAT",
+	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
+	"nvidia-benchmark":       "NVIDIA Benchmark",
+	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
+	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
+	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
+	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
+	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
+	"nvidia-stress":          "NVIDIA GPU Stress",
+	"memory":                 "Memory SAT",
+	"storage":                "Storage SAT",
+	"cpu":                    "CPU SAT",
+	"amd":                    "AMD GPU SAT",
+	"amd-mem":                "AMD GPU MEM Integrity",
+	"amd-bandwidth":          "AMD GPU MEM Bandwidth",
+	"amd-stress":             "AMD GPU Burn-in",
+	"memory-stress":          "Memory Burn-in",
+	"sat-stress":             "SAT Stress (stressapptest)",
+	"platform-stress":        "Platform Thermal Cycling",
+	"audit":                  "Audit",
+	"support-bundle":         "Support Bundle",
+	"install":                "Install to Disk",
+	"install-to-ram":         "Install to RAM",
 }

 // burnNames maps target → human-readable name when a burn profile is set.
@@ -83,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {

 // Task represents one unit of work in the queue.
 type Task struct {
-	ID         string     `json:"id"`
-	Name       string     `json:"name"`
-	Target     string     `json:"target"`
-	Priority   int        `json:"priority"`
-	Status     string     `json:"status"`
-	CreatedAt  time.Time  `json:"created_at"`
-	StartedAt  *time.Time `json:"started_at,omitempty"`
-	DoneAt     *time.Time `json:"done_at,omitempty"`
-	ElapsedSec int        `json:"elapsed_sec,omitempty"`
-	ErrMsg     string     `json:"error,omitempty"`
-	LogPath    string     `json:"log_path,omitempty"`
+	ID             string     `json:"id"`
+	Name           string     `json:"name"`
+	Target         string     `json:"target"`
+	Priority       int        `json:"priority"`
+	Status         string     `json:"status"`
+	CreatedAt      time.Time  `json:"created_at"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
+	ElapsedSec     int        `json:"elapsed_sec,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
+	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
+	ReportJSONPath string     `json:"report_json_path,omitempty"`
+	ReportHTMLPath string     `json:"report_html_path,omitempty"`

 	// runtime fields (not serialised)
 	job    *jobState
@@ -106,67 +118,81 @@ type taskParams struct {
 	DiagLevel          int      `json:"diag_level,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
+	SizeMB             int      `json:"size_mb,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
+	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
 }

 type persistedTask struct {
-	ID        string     `json:"id"`
-	Name      string     `json:"name"`
-	Target    string     `json:"target"`
-	Priority  int        `json:"priority"`
-	Status    string     `json:"status"`
-	CreatedAt time.Time  `json:"created_at"`
-	StartedAt *time.Time `json:"started_at,omitempty"`
-	DoneAt    *time.Time `json:"done_at,omitempty"`
-	ErrMsg    string     `json:"error,omitempty"`
-	LogPath   string     `json:"log_path,omitempty"`
-	Params    taskParams `json:"params,omitempty"`
+	ID             string     `json:"id"`
+	Name           string     `json:"name"`
+	Target         string     `json:"target"`
+	Priority       int        `json:"priority"`
+	Status         string     `json:"status"`
+	CreatedAt      time.Time  `json:"created_at"`
+	StartedAt      *time.Time `json:"started_at,omitempty"`
+	DoneAt         *time.Time `json:"done_at,omitempty"`
+	ErrMsg         string     `json:"error,omitempty"`
+	LogPath        string     `json:"log_path,omitempty"`
+	ArtifactsDir   string     `json:"artifacts_dir,omitempty"`
+	ReportJSONPath string     `json:"report_json_path,omitempty"`
+	ReportHTMLPath string     `json:"report_html_path,omitempty"`
+	Params         taskParams `json:"params,omitempty"`
 }

 type burnPreset struct {
-	NvidiaDiag  int
 	DurationSec int
 }

 func resolveBurnPreset(profile string) burnPreset {
 	switch profile {
 	case "overnight":
-		return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
+		return burnPreset{DurationSec: 8 * 60 * 60}
 	case "acceptance":
-		return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
+		return burnPreset{DurationSec: 60 * 60}
 	default:
-		return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
+		return burnPreset{DurationSec: 5 * 60}
 	}
 }

 func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
+	acceptanceCycles := []platform.PlatformStressCycle{
+		{LoadSec: 85, IdleSec: 5},
+		{LoadSec: 80, IdleSec: 10},
+		{LoadSec: 55, IdleSec: 5},
+		{LoadSec: 60, IdleSec: 0},
+		{LoadSec: 100, IdleSec: 10},
+		{LoadSec: 145, IdleSec: 15},
+		{LoadSec: 190, IdleSec: 20},
+		{LoadSec: 235, IdleSec: 25},
+		{LoadSec: 280, IdleSec: 30},
+		{LoadSec: 325, IdleSec: 35},
+		{LoadSec: 370, IdleSec: 40},
+		{LoadSec: 415, IdleSec: 45},
+		{LoadSec: 460, IdleSec: 50},
+		{LoadSec: 510, IdleSec: 0},
+	}
+
 	switch profile {
 	case "overnight":
-		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
-			{LoadSec: 600, IdleSec: 120},
-			{LoadSec: 600, IdleSec: 60},
-			{LoadSec: 600, IdleSec: 30},
-			{LoadSec: 600, IdleSec: 120},
-			{LoadSec: 600, IdleSec: 60},
-			{LoadSec: 600, IdleSec: 30},
-			{LoadSec: 600, IdleSec: 120},
-			{LoadSec: 600, IdleSec: 60},
-		}}
+		cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
+		for range 8 {
+			cycles = append(cycles, acceptanceCycles...)
+		}
+		return platform.PlatformStressOptions{Cycles: cycles}
 	case "acceptance":
-		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
-			{LoadSec: 300, IdleSec: 60},
-			{LoadSec: 300, IdleSec: 30},
-			{LoadSec: 300, IdleSec: 60},
-			{LoadSec: 300, IdleSec: 30},
-		}}
+		return platform.PlatformStressOptions{Cycles: acceptanceCycles}
 	default: // smoke
 		return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
-			{LoadSec: 90, IdleSec: 60},
-			{LoadSec: 90, IdleSec: 30},
+			{LoadSec: 85, IdleSec: 5},
+			{LoadSec: 80, IdleSec: 10},
+			{LoadSec: 55, IdleSec: 5},
+			{LoadSec: 60, IdleSec: 0},
 		}}
 	}
 }
@@ -232,6 +258,7 @@ func (q *taskQueue) enqueue(t *Task) {
 	q.prune()
 	q.persistLocked()
 	q.mu.Unlock()
+	taskSerialEvent(t, "queued")
 	select {
 	case q.trigger <- struct{}{}:
 	default:
@@ -377,7 +404,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
-		go q.worker()
+		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
 	q.mu.Unlock()
@@ -392,78 +419,115 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 func (q *taskQueue) worker() {
 	for {
 		<-q.trigger
-		setCPUGovernor("performance")
+		func() {
+			setCPUGovernor("performance")
+			defer setCPUGovernor("powersave")

-		// Drain all pending tasks and start them in parallel.
-		q.mu.Lock()
-		var batch []*Task
-		for {
-			t := q.nextPending()
-			if t == nil {
-				break
+			// Drain all pending tasks and start them in parallel.
+			q.mu.Lock()
+			var batch []*Task
+			for {
+				t := q.nextPending()
+				if t == nil {
+					break
+				}
+				now := time.Now()
+				t.Status = TaskRunning
+				t.StartedAt = &now
+				t.DoneAt = nil
+				t.ErrMsg = ""
+				j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
+				t.job = j
+				batch = append(batch, t)
 			}
-			now := time.Now()
-			t.Status = TaskRunning
-			t.StartedAt = &now
-			t.DoneAt = nil
-			t.ErrMsg = ""
-			j := newTaskJobState(t.LogPath)
-			t.job = j
-			batch = append(batch, t)
-		}
-		if len(batch) > 0 {
-			q.persistLocked()
-		}
-		q.mu.Unlock()
+			if len(batch) > 0 {
+				q.persistLocked()
+			}
+			q.mu.Unlock()

-		var wg sync.WaitGroup
-		for _, t := range batch {
-			t := t
-			j := t.job
-			taskCtx, taskCancel := context.WithCancel(context.Background())
-			j.cancel = taskCancel
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-
-				if q.kmsgWatcher != nil && isSATTarget(t.Target) {
-					q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
-				}
-
-				q.runTask(t, j, taskCtx)
-
-				if q.kmsgWatcher != nil {
-					q.kmsgWatcher.NotifyTaskFinished(t.ID)
-				}
+			var wg sync.WaitGroup
+			for _, t := range batch {
+				t := t
+				j := t.job
+				taskCtx, taskCancel := context.WithCancel(context.Background())
+				j.cancel = taskCancel
+				wg.Add(1)
+				goRecoverOnce("task "+t.Target, func() {
+					defer wg.Done()
+					defer taskCancel()
+					q.executeTask(t, j, taskCtx)
+				})
+			}
+			wg.Wait()

+			if len(batch) > 0 {
 				q.mu.Lock()
-				now2 := time.Now()
-				t.DoneAt = &now2
-				if t.Status == TaskRunning {
-					if j.err != "" {
-						t.Status = TaskFailed
-						t.ErrMsg = j.err
-					} else {
-						t.Status = TaskDone
-					}
-				}
+				q.prune()
 				q.persistLocked()
 				q.mu.Unlock()
-			}()
-		}
-		wg.Wait()
+			}
+		}()

-		if len(batch) > 0 {
-			q.mu.Lock()
-			q.prune()
-			q.persistLocked()
-			q.mu.Unlock()
-		}
-
-		setCPUGovernor("powersave")
 	}
 }

+// executeTask runs one task through runTask and guarantees that its
+// bookkeeping happens even when the task panics.
+//
+// The deferred calls run in reverse order of registration: the panic is
+// recovered and recorded on the job first, then the kmsg watcher is told the
+// task finished (only if it was told it started), and finally
+// finalizeTaskRun updates the task status and writes the report.
+func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
+	startedKmsgWatch := false
+	// Registered first so that it runs last, after the panic (if any) has
+	// already been turned into a job error.
+	defer q.finalizeTaskRun(t, j)
+	defer func() {
+		if startedKmsgWatch && q.kmsgWatcher != nil {
+			q.kmsgWatcher.NotifyTaskFinished(t.ID)
+		}
+	}()
+	defer func() {
+		// Convert a panic into a failed job: log it with a stack trace,
+		// append it to the task log and finish the job with the message.
+		if rec := recover(); rec != nil {
+			msg := fmt.Sprintf("task panic: %v", rec)
+			slog.Error("task panic",
+				"task_id", t.ID,
+				"target", t.Target,
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			j.append("ERROR: " + msg)
+			j.finish(msg)
+		}
+	}()
+
+	// Only SAT targets are reported to the kmsg watcher.
+	if q.kmsgWatcher != nil && isSATTarget(t.Target) {
+		q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
+		startedKmsgWatch = true
+	}
+
+	q.runTask(t, j, ctx)
+}
+
+// finalizeTaskRun records the outcome of a finished task run.
+//
+// Under the queue lock it stamps DoneAt, resolves a still-running status to
+// failed or done from the job error, moves the artifact paths to their final
+// location and persists the queue. After releasing the lock it writes the
+// report artifacts and emits the serial "finished" event.
+func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
+	q.mu.Lock()
+	now := time.Now()
+	t.DoneAt = &now
+	// Only a task that is still marked running is resolved here; a status
+	// that was changed in the meantime (for example to cancelled) is kept.
+	if t.Status == TaskRunning {
+		if j.err != "" {
+			t.Status = TaskFailed
+			t.ErrMsg = j.err
+		} else {
+			t.Status = TaskDone
+			t.ErrMsg = ""
+		}
+	}
+	q.finalizeTaskArtifactPathsLocked(t)
+	q.persistLocked()
+	q.mu.Unlock()
+
+	// Report generation is best-effort: a failure is noted in the task log
+	// and does not change the task status.
+	if err := writeTaskReportArtifacts(t); err != nil {
+		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
+	}
+	if t.ErrMsg != "" {
+		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
+		return
+	}
+	taskSerialEvent(t, "finished with status="+t.Status)
+}
+
 // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
 // Silently ignores errors (e.g. when cpufreq is not available).
 func setCPUGovernor(governor string) {
@@ -502,9 +566,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			break
 		}
 		diagLevel := t.params.DiagLevel
-		if t.params.BurnProfile != "" && diagLevel <= 0 {
-			diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
-		}
 		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
 			result, e := a.RunNvidiaAcceptancePackWithOptions(
 				ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -517,6 +578,78 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		} else {
 			archive, err = a.RunNvidiaAcceptancePack("", j.append)
 		}
+	case "nvidia-targeted-stress":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if dur <= 0 {
+			dur = 300
+		}
+		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-benchmark":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			SizeMB:            t.params.SizeMB,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RunNCCL:           t.params.RunNCCL,
+		}, j.append)
+	case "nvidia-compute":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-targeted-power":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-pulse":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
+	case "nvidia-bandwidth":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
+	case "nvidia-interconnect":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		dur := t.params.Duration
+		if t.params.BurnProfile != "" && dur <= 0 {
+			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
+		}
+		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
+			DurationSec: dur,
+			Loader:      platform.NvidiaStressLoaderNCCL,
+			GPUIndices:  t.params.GPUIndices,
+		}, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -731,6 +864,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
+		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	case TaskRunning:
 		if t.job != nil {
@@ -740,6 +874,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
 		now := time.Now()
 		t.DoneAt = &now
 		globalQueue.persistLocked()
+		taskSerialEvent(t, "finished with status="+t.Status)
 		writeJSON(w, map[string]string{"status": "cancelled"})
 	default:
 		writeError(w, http.StatusConflict, "task is not running or pending")
@@ -780,6 +915,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		case TaskRunning:
 			if t.job != nil {
@@ -787,6 +923,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			n++
 		}
 	}
@@ -805,6 +942,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 		case TaskPending:
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		case TaskRunning:
 			if t.job != nil {
@@ -812,6 +950,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
 			}
 			t.Status = TaskCancelled
 			t.DoneAt = &now
+			taskSerialEvent(t, "finished with status="+t.Status)
 			cancelled++
 		}
 	}
@@ -875,10 +1014,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
 }

 func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
-	if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
+	if q.logsDir == "" || t.ID == "" {
 		return
 	}
-	t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
+	q.ensureTaskArtifactPathsLocked(t)
 }

 func (q *taskQueue) loadLocked() {
@@ -895,17 +1034,20 @@ func (q *taskQueue) loadLocked() {
 	}
 	for _, pt := range persisted {
 		t := &Task{
-			ID:        pt.ID,
-			Name:      pt.Name,
-			Target:    pt.Target,
-			Priority:  pt.Priority,
-			Status:    pt.Status,
-			CreatedAt: pt.CreatedAt,
-			StartedAt: pt.StartedAt,
-			DoneAt:    pt.DoneAt,
-			ErrMsg:    pt.ErrMsg,
-			LogPath:   pt.LogPath,
-			params:    pt.Params,
+			ID:             pt.ID,
+			Name:           pt.Name,
+			Target:         pt.Target,
+			Priority:       pt.Priority,
+			Status:         pt.Status,
+			CreatedAt:      pt.CreatedAt,
+			StartedAt:      pt.StartedAt,
+			DoneAt:         pt.DoneAt,
+			ErrMsg:         pt.ErrMsg,
+			LogPath:        pt.LogPath,
+			ArtifactsDir:   pt.ArtifactsDir,
+			ReportJSONPath: pt.ReportJSONPath,
+			ReportHTMLPath: pt.ReportHTMLPath,
+			params:         pt.Params,
 		}
 		q.assignTaskLogPathLocked(t)
 		if t.Status == TaskRunning {
@@ -936,17 +1078,20 @@ func (q *taskQueue) persistLocked() {
 	state := make([]persistedTask, 0, len(q.tasks))
 	for _, t := range q.tasks {
 		state = append(state, persistedTask{
-			ID:        t.ID,
-			Name:      t.Name,
-			Target:    t.Target,
-			Priority:  t.Priority,
-			Status:    t.Status,
-			CreatedAt: t.CreatedAt,
-			StartedAt: t.StartedAt,
-			DoneAt:    t.DoneAt,
-			ErrMsg:    t.ErrMsg,
-			LogPath:   t.LogPath,
-			Params:    t.params,
+			ID:             t.ID,
+			Name:           t.Name,
+			Target:         t.Target,
+			Priority:       t.Priority,
+			Status:         t.Status,
+			CreatedAt:      t.CreatedAt,
+			StartedAt:      t.StartedAt,
+			DoneAt:         t.DoneAt,
+			ErrMsg:         t.ErrMsg,
+			LogPath:        t.LogPath,
+			ArtifactsDir:   t.ArtifactsDir,
+			ReportJSONPath: t.ReportJSONPath,
+			ReportHTMLPath: t.ReportHTMLPath,
+			Params:         t.params,
 		})
 	}
 	data, err := json.MarshalIndent(state, "", "  ")
@@ -977,3 +1122,88 @@ func taskElapsedSec(t *Task, now time.Time) int {
 	}
 	return int(end.Sub(start).Round(time.Second) / time.Second)
 }
+
+// taskFolderStatus normalises a task status for use in an artifacts folder
+// name. The status is lower-cased and trimmed; running, done, failed and
+// cancelled are returned as-is, and anything else maps to the pending status.
+func taskFolderStatus(status string) string {
+	normalized := strings.TrimSpace(strings.ToLower(status))
+	for _, known := range []string{TaskRunning, TaskDone, TaskFailed, TaskCancelled} {
+		if normalized == known {
+			return normalized
+		}
+	}
+	return TaskPending
+}
+
+// sanitizeTaskFolderPart turns s into a lower-case slug that is safe to use
+// in a folder name: only a-z and 0-9 are kept, every run of other characters
+// becomes a single '-', and no dash appears at either end. When nothing
+// usable remains the result is "task".
+func sanitizeTaskFolderPart(s string) string {
+	var slug strings.Builder
+	// gap is true while a run of non-alphanumerics is waiting to be emitted
+	// as one dash; it is only written once another kept character follows,
+	// so leading and trailing separators never reach the output.
+	gap := false
+	for _, r := range strings.TrimSpace(strings.ToLower(s)) {
+		switch {
+		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9'):
+			if gap && slug.Len() > 0 {
+				slug.WriteByte('-')
+			}
+			gap = false
+			slug.WriteRune(r)
+		default:
+			gap = true
+		}
+	}
+	if slug.Len() == 0 {
+		return "task"
+	}
+	return slug.String()
+}
+
+// taskArtifactsDir returns the artifacts directory for t under root, named
+// "<id>_<sanitised name>_<folder status>". It returns "" when root is blank
+// or t is nil.
+func taskArtifactsDir(root string, t *Task, status string) string {
+	if t == nil || strings.TrimSpace(root) == "" {
+		return ""
+	}
+	folder := t.ID + "_" + sanitizeTaskFolderPart(t.Name) + "_" + taskFolderStatus(status)
+	return filepath.Join(root, folder)
+}
+
+// ensureTaskReportPaths points the task's report files at its artifacts
+// directory. The log path is only (re)assigned when it is empty or already
+// named "task.log", so a log that lives elsewhere under another name is left
+// alone. Nothing happens for a nil task or a blank artifacts directory.
+func ensureTaskReportPaths(t *Task) {
+	if t == nil {
+		return
+	}
+	dir := t.ArtifactsDir
+	if strings.TrimSpace(dir) == "" {
+		return
+	}
+
+	usesDefaultLog := t.LogPath == "" || filepath.Base(t.LogPath) == "task.log"
+	if usesDefaultLog {
+		t.LogPath = filepath.Join(dir, "task.log")
+	}
+	t.ReportJSONPath = filepath.Join(dir, "report.json")
+	t.ReportHTMLPath = filepath.Join(dir, "report.html")
+}
+
+// ensureTaskArtifactPathsLocked makes sure t has an artifacts directory
+// (derived from its current status when unset), creates that directory and
+// fills in the log and report paths. It does nothing when the task is nil or
+// the queue's logs directory or the task ID is blank.
+func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
+	if t == nil {
+		return
+	}
+	if strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
+		return
+	}
+
+	if strings.TrimSpace(t.ArtifactsDir) == "" {
+		t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
+	}
+	if dir := t.ArtifactsDir; dir != "" {
+		// Directory creation is best-effort; the error is dropped here.
+		_ = os.MkdirAll(dir, 0755)
+	}
+	ensureTaskReportPaths(t)
+}
+
+// finalizeTaskArtifactPathsLocked moves the task's artifacts directory to the
+// name that reflects its current status and refreshes the log and report
+// paths accordingly.
+//
+// If the destination already exists no rename is attempted and the task is
+// simply pointed at it. If the rename fails, the task keeps its existing
+// artifacts directory so that its log and report paths continue to refer to
+// files that actually exist; the failure is logged. Nothing happens when the
+// task is nil or the queue's logs directory or the task ID is blank.
+func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
+	if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
+		return
+	}
+	q.ensureTaskArtifactPathsLocked(t)
+	dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
+	if dstDir == "" {
+		return
+	}
+	if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
+		if _, statErr := os.Stat(dstDir); statErr != nil {
+			if err := os.Rename(t.ArtifactsDir, dstDir); err != nil {
+				// Paths were already set for the current directory by
+				// ensureTaskArtifactPathsLocked; keep them rather than
+				// pointing at a directory that was never created.
+				slog.Warn("task artifacts rename failed",
+					"task_id", t.ID,
+					"from", t.ArtifactsDir,
+					"to", dstDir,
+					"error", err,
+				)
+				return
+			}
+		}
+		t.ArtifactsDir = dstDir
+	}
+	ensureTaskReportPaths(t)
+}
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -2,6 +2,7 @@ package webui

 import (
 	"context"
+	"encoding/json"
 	"net/http"
 	"net/http/httptest"
 	"os"
@@ -12,6 +13,7 @@ import (
 	"time"

 	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
 )

 func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
@@ -248,15 +250,133 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
 	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
 }

+// TestFinalizeTaskRunCreatesReportFolderAndArtifacts checks that finalizing a
+// running task marks it done, leaves it with a "_done" artifacts directory,
+// and writes report.json and report.html, with the JSON report carrying the
+// task ID, status and at least one chart.
+func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
+	dir := t.TempDir()
+	metricsPath := filepath.Join(dir, "metrics.db")
+	// Redirect the package-level metrics DB path to the temp dir for this test.
+	prevMetricsPath := taskReportMetricsDBPath
+	taskReportMetricsDBPath = metricsPath
+	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
+
+	db, err := openMetricsDB(metricsPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	// Seed one sample 45s in the past, i.e. inside the task's run window
+	// (the task below started 90s ago).
+	base := time.Now().UTC().Add(-45 * time.Second)
+	if err := db.Write(platform.LiveMetricSample{
+		Timestamp:  base,
+		CPULoadPct: 42,
+		MemLoadPct: 35,
+		PowerW:     510,
+	}); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+	_ = db.Close()
+
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+
+	started := time.Now().UTC().Add(-90 * time.Second)
+	task := &Task{
+		ID:        "task-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: started.Add(-10 * time.Second),
+		StartedAt: &started,
+	}
+	q.assignTaskLogPathLocked(task)
+	appendJobLog(task.LogPath, "line-1")
+
+	// Finish the job with an empty error string, then finalize the run.
+	job := newTaskJobState(task.LogPath)
+	job.finish("")
+	q.finalizeTaskRun(task, job)
+
+	if task.Status != TaskDone {
+		t.Fatalf("status=%q want %q", task.Status, TaskDone)
+	}
+	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
+		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
+	}
+	if _, err := os.Stat(task.ReportJSONPath); err != nil {
+		t.Fatalf("report json: %v", err)
+	}
+	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
+		t.Fatalf("report html: %v", err)
+	}
+	// Decode the JSON report and check its identity fields and charts.
+	var report taskReport
+	data, err := os.ReadFile(task.ReportJSONPath)
+	if err != nil {
+		t.Fatalf("ReadFile(report.json): %v", err)
+	}
+	if err := json.Unmarshal(data, &report); err != nil {
+		t.Fatalf("Unmarshal(report.json): %v", err)
+	}
+	if report.ID != task.ID || report.Status != TaskDone {
+		t.Fatalf("report=%+v", report)
+	}
+	if len(report.Charts) == 0 {
+		t.Fatalf("expected charts in report, got none")
+	}
+}
+
+// TestTaskLifecycleMirrorsToSerialConsole checks that enqueueing a task,
+// appending job output and finalizing the run all emit lines through
+// taskSerialWriteLine.
+func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
+	// Capture serial output in memory by swapping the package-level writer.
+	var lines []string
+	prev := taskSerialWriteLine
+	taskSerialWriteLine = func(line string) { lines = append(lines, line) }
+	t.Cleanup(func() { taskSerialWriteLine = prev })
+
+	dir := t.TempDir()
+	q := &taskQueue{
+		statePath: filepath.Join(dir, "tasks-state.json"),
+		logsDir:   filepath.Join(dir, "tasks"),
+		trigger:   make(chan struct{}, 1),
+	}
+	task := &Task{
+		ID:        "task-serial-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskPending,
+		CreatedAt: time.Now().UTC(),
+	}
+
+	q.enqueue(task)
+	// Move the task to running by hand; no worker is started in this test.
+	started := time.Now().UTC()
+	task.Status = TaskRunning
+	task.StartedAt = &started
+	job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
+	job.append("Starting CPU SAT...")
+	job.append("CPU stress duration: 60s")
+	job.finish("")
+	q.finalizeTaskRun(task, job)
+
+	// Each lifecycle stage must appear somewhere in the captured output.
+	joined := strings.Join(lines, "\n")
+	for _, needle := range []string{
+		"queued",
+		"Starting CPU SAT...",
+		"CPU stress duration: 60s",
+		"finished with status=done",
+	} {
+		if !strings.Contains(joined, needle) {
+			t.Fatalf("serial mirror missing %q in %q", needle, joined)
+		}
+	}
+}
+
 func TestResolveBurnPreset(t *testing.T) {
 	tests := []struct {
 		profile string
 		want    burnPreset
 	}{
-		{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
-		{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
-		{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
-		{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
+		{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
+		{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
+		{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
+		{profile: "", want: burnPreset{DurationSec: 5 * 60}},
 	}
 	for _, tc := range tests {
 		if got := resolveBurnPreset(tc.profile); got != tc.want {
@@ -467,3 +587,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
 		t.Fatalf("unexpected error: %q", j.err)
 	}
 }
+
+// TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow checks that a panic
+// inside the CPU acceptance runner does not escape executeTask: the task ends
+// up failed with DoneAt set, the panic text lands in both the task and job
+// errors, and the kmsg watcher has no active window left.
+func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
+	dir := t.TempDir()
+	q := &taskQueue{
+		opts:        &HandlerOptions{App: &app.App{}},
+		statePath:   filepath.Join(dir, "tasks-state.json"),
+		logsDir:     filepath.Join(dir, "tasks"),
+		kmsgWatcher: newKmsgWatcher(nil),
+	}
+	tk := &Task{
+		ID:        "cpu-panic-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	j := &jobState{}
+
+	// Replace the CPU runner with one that panics; restore it on exit.
+	orig := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
+		panic("boom")
+	}
+	defer func() { runCPUAcceptancePackCtx = orig }()
+
+	q.executeTask(tk, j, context.Background())
+
+	if tk.Status != TaskFailed {
+		t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
+	}
+	if tk.DoneAt == nil {
+		t.Fatal("expected done_at to be set")
+	}
+	if !strings.Contains(tk.ErrMsg, "task panic: boom") {
+		t.Fatalf("task error=%q", tk.ErrMsg)
+	}
+	if !strings.Contains(j.err, "task panic: boom") {
+		t.Fatalf("job error=%q", j.err)
+	}
+	// Read the watcher state under its mutex before asserting on it.
+	q.kmsgWatcher.mu.Lock()
+	activeCount := q.kmsgWatcher.activeCount
+	window := q.kmsgWatcher.window
+	q.kmsgWatcher.mu.Unlock()
+	if activeCount != 0 {
+		t.Fatalf("activeCount=%d want 0", activeCount)
+	}
+	if window != nil {
+		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
+	}
+}
--- a/2
+++ b/2
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -302,6 +302,12 @@ memtest_fail() {
    return 0
 }

+# nvidia_runtime_fail MESSAGE
+# Prints "ERROR: MESSAGE" to stderr and aborts the build with exit status 1.
+nvidia_runtime_fail() {
+    msg="$1"
+    echo "ERROR: ${msg}" >&2
+    exit 1
+}
+
 iso_memtest_present() {
    iso_path="$1"
    iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
    echo "=== memtest validation OK ==="
 }

+validate_iso_nvidia_runtime() {
+    iso_path="$1"
+    [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
+
+    echo "=== validating NVIDIA runtime in ISO ==="
+
+    [ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
+    require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
+    command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
+
+    squashfs_tmp="$(mktemp)"
+    squashfs_list="$(mktemp)"
+    iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
+        rm -f "$squashfs_tmp" "$squashfs_list"
+        nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
+    }
+    unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
+        rm -f "$squashfs_tmp" "$squashfs_list"
+        nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
+    }
+
+    grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
+        rm -f "$squashfs_tmp" "$squashfs_list"
+        nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
+    }
+    grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
+        rm -f "$squashfs_tmp" "$squashfs_list"
+        nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
+    }
+    grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
+        rm -f "$squashfs_tmp" "$squashfs_list"
+        nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
+    }
+
+    rm -f "$squashfs_tmp" "$squashfs_list"
+    echo "=== NVIDIA runtime validation OK ==="
+}
+
 append_memtest_grub_entry() {
    grub_cfg="$1"
    [ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
        fi
    fi
    validate_iso_memtest "$ISO_RAW"
+    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
    echo "=== done (${BEE_GPU_VENDOR}) ==="
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
+systemctl enable bee-selfheal.timer
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal   2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/config/package-lists/bee-nvidia.list.chroot
+++ b/iso/builder/config/package-lists/bee-nvidia.list.chroot
@@ -1,6 +1,10 @@
-# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
-# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
-# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
+# NVIDIA DCGM (Data Center GPU Manager).
+# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
+# NVIDIA max-compute recipe. The smoketest/runtime contract treats
+# dcgmproftester as required in the LiveCD.
+# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
+# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
+# explicitly.
 datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
 datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -52,6 +52,31 @@ else
    fail "nvidia-smi: NOT FOUND"
 fi

+# DCGM tools: each must resolve on PATH (with /usr/local/bin prepended),
+# otherwise the check is recorded as a failure.
+if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
+    ok "dcgmi found: $p"
+else
+    fail "dcgmi: NOT FOUND"
+fi
+
+if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
+    ok "nv-hostengine found: $p"
+else
+    fail "nv-hostengine: NOT FOUND"
+fi
+
+# dcgmproftester may be installed under a suffixed name; the first name in
+# this list that resolves wins.
+DCGM_PROFTESTER=""
+for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
+    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
+        DCGM_PROFTESTER="$p"
+        break
+    fi
+done
+if [ -n "$DCGM_PROFTESTER" ]; then
+    ok "dcgmproftester found: $DCGM_PROFTESTER"
+else
+    fail "dcgmproftester: NOT FOUND"
+fi
+
 for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
    if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
        ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
    fi
 done

+# nvbandwidth: a missing binary is reported with warn rather than fail.
+if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
+    ok "nvbandwidth found: $p"
+else
+    warn "nvbandwidth: NOT FOUND"
+fi
+
 echo ""
 echo "-- NVIDIA modules --"
 KO_DIR="/usr/local/lib/nvidia"
@@ -171,6 +202,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
    fi
 done

+# The self-heal timer must be active; an inactive timer is a failure.
+if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
+    ok "timer active: bee-selfheal.timer"
+else
+    fail "timer NOT active: bee-selfheal.timer"
+fi
+
 echo ""
 echo "-- runtime health --"
 if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
--- a/iso/overlay/etc/systemd/system/bee-audit.service
+++ b/iso/overlay/etc/systemd/system/bee-audit.service
@@ -1,7 +1,6 @@
 [Unit]
 Description=Bee: hardware audit
 After=bee-preflight.service bee-network.service bee-nvidia.service
-Before=bee-web.service

 [Service]
 Type=oneshot
--- a/iso/overlay/etc/systemd/system/bee-selfheal.service
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.service
@@ -0,0 +1,9 @@
+# One-shot unit that runs /usr/local/bin/bee-selfheal through bee-log-run,
+# passing /appdata/bee/export/bee-selfheal.log as the first argument.
+# It has no [Install] section; bee-selfheal.timer names it in Unit=.
+[Unit]
+Description=Bee: periodic runtime self-heal
+After=bee-web.service bee-audit.service bee-preflight.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
+StandardOutput=journal
+StandardError=journal
--- a/iso/overlay/etc/systemd/system/bee-selfheal.timer
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer
@@ -0,0 +1,11 @@
+# Timer for bee-selfheal.service: first run 45s after boot, then 60s after
+# each activation of the service, with 15s scheduling accuracy.
+[Unit]
+Description=Bee: run self-heal checks periodically
+
+[Timer]
+OnBootSec=45sec
+OnUnitActiveSec=60sec
+AccuracySec=15sec
+Unit=bee-selfheal.service
+
+[Install]
+WantedBy=timers.target
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,12 +1,12 @@
 [Unit]
 Description=Bee: hardware audit web viewer
-After=bee-audit.service
+StartLimitIntervalSec=0

 [Service]
 Type=simple
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
 Restart=always
-RestartSec=2
+RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -0,0 +1,99 @@
+#!/bin/bash
+# bee-selfheal — periodic best-effort recovery for critical live ISO services.
+
+set -u
+
+LOG_PREFIX="bee-selfheal"
+EXPORT_DIR="/appdata/bee/export"
+AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
+RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
+LOCK_DIR="/run/bee-selfheal.lock"
+
+# log MESSAGE... — print the arguments to stdout prefixed with "[bee-selfheal]".
+log() {
+    echo "[${LOG_PREFIX}] $*"
+}
+
+# have_nvidia_gpu — succeeds when `lspci -nn` output contains "10de:"
+# (case-insensitive), which is taken here to mean an NVIDIA device.
+have_nvidia_gpu() {
+    lspci -nn 2>/dev/null | grep -qi '10de:'
+}
+
+# service_active UNIT — succeeds when `systemctl is-active` reports UNIT active.
+service_active() {
+    systemctl is-active --quiet "$1" 2>/dev/null
+}
+
+# restart_service UNIT — restart UNIT with systemctl and log the outcome.
+# Returns 0 on success and 1 when the restart failed.
+restart_service() {
+    local svc="$1"
+    if ! systemctl restart "$svc" >/dev/null 2>&1; then
+        log "WARN: failed to restart ${svc}"
+        return 1
+    fi
+    log "restarted ${svc}"
+    return 0
+}
+
+# file_ready PATH — succeeds when PATH exists and is non-empty.
+# NOTE(review): nothing in this script calls it; artifact_state does its own
+# -s check.
+file_ready() {
+    [ -s "$1" ]
+}
+
+# artifact_state PATH — print the state of an exported artifact:
+#   ready        PATH exists and is non-empty
+#   interrupted  PATH is not ready but PATH.tmp exists
+#   missing      neither of the above
+artifact_state() {
+    local path="$1"
+    local state="missing"
+    if [ -s "${path}" ]; then
+        state="ready"
+    elif [ -e "${path}.tmp" ]; then
+        state="interrupted"
+    fi
+    echo "${state}"
+}
+
+# web_healthy — probe http://127.0.0.1:80/healthz over bash's /dev/tcp and
+# succeed only when the response contains a line that is exactly "ok".
+# The probe is capped at 5 seconds when `timeout` is available: without a cap,
+# a server that accepts the connection but never answers would block grep,
+# and with it this self-heal run, indefinitely. A timed-out probe counts as
+# unhealthy.
+web_healthy() {
+    local probe='exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3'
+    if command -v timeout >/dev/null 2>&1; then
+        timeout 5 bash -c "${probe}" >/dev/null 2>&1
+    else
+        # No timeout binary: fall back to the unbounded probe.
+        bash -c "${probe}" >/dev/null 2>&1
+    fi
+}
+
+mkdir -p "${EXPORT_DIR}" /run
+
+# Single-instance guard: creating the lock directory fails if another run
+# already holds it. The EXIT trap removes it again.
+if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
+    log "another self-heal run is already active"
+    exit 0
+fi
+trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
+
+log "start"
+
+# 1. NVIDIA GPU present on the PCI bus but no /dev/nvidia0 device node:
+#    restart bee-nvidia.service.
+if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
+    log "NVIDIA GPU detected but /dev/nvidia0 is missing"
+    restart_service bee-nvidia.service || true
+fi
+
+# 2. runtime-health.json missing, empty or left as a .tmp file:
+#    restart bee-preflight.service.
+runtime_state="$(artifact_state "${RUNTIME_JSON}")"
+if [ "${runtime_state}" != "ready" ]; then
+    if [ "${runtime_state}" = "interrupted" ]; then
+        log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
+    else
+        log "runtime-health.json missing or empty"
+    fi
+    restart_service bee-preflight.service || true
+fi
+
+# 3. bee-audit.json missing, empty or left as a .tmp file:
+#    restart bee-audit.service.
+audit_state="$(artifact_state "${AUDIT_JSON}")"
+if [ "${audit_state}" != "ready" ]; then
+    if [ "${audit_state}" = "interrupted" ]; then
+        log "bee-audit.json.tmp exists — interrupted audit write detected"
+    else
+        log "bee-audit.json missing or empty"
+    fi
+    restart_service bee-audit.service || true
+fi
+
+# 4. Web UI: restart when the unit is not active, or when it is active but
+#    the health probe fails.
+if ! service_active bee-web.service; then
+    log "bee-web.service is not active"
+    restart_service bee-web.service || true
+elif ! web_healthy; then
+    log "bee-web health check failed"
+    restart_service bee-web.service || true
+fi
+
+log "done"
Author	SHA1	Message	Date
Michael Chus	f1621efee4	Mirror task lifecycle to serial console	2026-04-05 18:34:06 +03:00
Michael Chus	4461249cc3	Make memory stress size follow available RAM	2026-04-05 18:33:26 +03:00
Michael Chus	e609fbbc26	Add task reports and streamline GPU charts	2026-04-05 18:13:58 +03:00
Michael Chus	cc2b49ea41	Improve validate GPU runs and web UI feedback	2026-04-05 17:50:13 +03:00
Michael Chus	33e0a5bef2	Refine validate UI and runtime health table	2026-04-05 16:24:45 +03:00
Michael Chus	38e79143eb	Refine burn UI and NVIDIA stress flows	2026-04-05 13:43:43 +03:00
Michael Chus	25af2df23a	Unify metrics charts on custom SVG renderer	2026-04-05 12:17:50 +03:00
Michael Chus	20abff7f90	WIP: checkpoint current tree	2026-04-05 12:05:00 +03:00
Michael Chus	a14ec8631c	Persist GPU chart mode and expand GPU charts	2026-04-05 11:52:32 +03:00
Michael Chus	f58c7e58d3	Fix webui streaming recovery regressions	2026-04-05 10:39:09 +03:00
Michael Chus	bf47c8dbd2	Add NVIDIA benchmark reporting flow	2026-04-05 10:30:56 +03:00
Michael Chus	143b7dca5d	Add stability hardening and self-heal recovery	2026-04-05 10:29:37 +03:00
Michael Chus	9826d437a5	Add GPU clock charts and grouped GPU metrics view	2026-04-05 09:57:38 +03:00
Mikhail Chusavitin	f3c14cd893	Harden NIC probing for empty SFP ports	2026-04-04 15:23:15 +03:00
Mikhail Chusavitin	728270dc8e	Unblock bee-web startup and expand support bundle diagnostics	2026-04-04 15:18:43 +03:00