Unify metrics charts on custom SVG renderer

WIP: checkpoint current tree
Persist GPU chart mode and expand GPU charts
2026-04-05 12:17:50 +03:00 · 2026-04-05 12:05:00 +03:00 · 2026-04-05 11:52:32 +03:00 · 2026-04-05 10:39:09 +03:00 · 2026-04-05 10:30:56 +03:00 · 2026-04-05 10:29:37 +03:00
37 changed files with 3969 additions and 372 deletions
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -7,6 +7,8 @@ import (
 	"io"
 	"log/slog"
 	"os"
+	"runtime/debug"
+	"strconv"
 	"strings"

 	"bee/audit/internal/app"
@@ -29,10 +31,19 @@ func main() {
 	os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
 }

-func run(args []string, stdout, stderr io.Writer) int {
+func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level: slog.LevelInfo,
 	})))
+	defer func() {
+		if rec := recover(); rec != nil {
+			slog.Error("fatal panic",
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			exitCode = 1
+		}
+	}()

 	if len(args) == 0 {
 		printRootUsage(stderr)
@@ -58,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
+	case "benchmark":
+		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -76,6 +89,7 @@ func printRootUsage(w io.Writer) {
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
+  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -94,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
+	case "benchmark":
+		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -383,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
+
+func runBenchmark(args []string, stdout, stderr io.Writer) int {
+	if len(args) == 0 {
+		fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 2
+	}
+	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
+		fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 0
+	}
+	target := args[0]
+	if target != "nvidia" {
+		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
+		fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
+		return 2
+	}
+
+	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
+	devices := fs.String("devices", "", "comma-separated GPU indices to include")
+	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
+	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
+	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
+	if err := fs.Parse(args[1:]); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
+		return 2
+	}
+
+	includeIndices, err := parseBenchmarkIndexCSV(*devices)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
+		return 2
+	}
+	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
+		return 2
+	}
+
+	application := app.New(platform.New())
+	logLine := func(s string) { fmt.Fprintln(stderr, s) } // progress goes to the injected writer, like every other diagnostic here
+	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
+		Profile:           *profile,
+		SizeMB:            *sizeMB,
+		GPUIndices:        includeIndices,
+		ExcludeGPUIndices: excludeIndices,
+		RunNCCL:           !*skipNCCL,
+	}, logLine)
+	if err != nil {
+		slog.Error("run benchmark", "target", target, "err", err)
+		return 1
+	}
+	slog.Info("benchmark archive written", "target", target, "path", archive)
+	return 0
+}
+
+func parseBenchmarkIndexCSV(raw string) ([]int, error) {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return nil, nil
+	}
+	var indices []int
+	for _, part := range strings.Split(raw, ",") {
+		part = strings.TrimSpace(part)
+		if part == "" {
+			continue
+		}
+		value, err := strconv.Atoi(part)
+		if err != nil || value < 0 {
+			return nil, fmt.Errorf("bad gpu index %q", part)
+		}
+		indices = append(indices, value)
+	}
+	return indices, nil
+}
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,17 +19,18 @@ import (
 )

 var (
-	DefaultExportDir       = "/appdata/bee/export"
-	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir     = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir      = DefaultExportDir + "/bee-sat"
+	DefaultExportDir        = "/appdata/bee/export"
+	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
+	DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
 )

 type App struct {
@@ -114,6 +115,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 type satRunner interface {
 	RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
+	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -195,10 +197,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -223,10 +222,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
 		return "stdout", err
 	case strings.HasPrefix(output, "file:"):
 		path := strings.TrimPrefix(output, "file:")
-		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-			return "", err
-		}
-		if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
+		if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
 			return "", err
 		}
 		return path, nil
@@ -536,6 +532,17 @@ func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOpti
 	return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
 }

+func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
+}
+
+func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBenchmarkBaseDir
+	}
+	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
+}
+
 func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -120,15 +120,16 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 }

 type fakeSAT struct {
-	runNvidiaFn       func(string) (string, error)
-	runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
-	runMemoryFn       func(string) (string, error)
-	runStorageFn      func(string) (string, error)
-	runCPUFn          func(string, int) (string, error)
-	detectVendorFn    func() string
-	listAMDGPUsFn     func() ([]platform.AMDGPUInfo, error)
-	runAMDPackFn      func(string) (string, error)
-	listNvidiaGPUsFn  func() ([]platform.NvidiaGPU, error)
+	runNvidiaFn          func(string) (string, error)
+	runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaStressFn    func(string, platform.NvidiaStressOptions) (string, error)
+	runMemoryFn          func(string) (string, error)
+	runStorageFn         func(string) (string, error)
+	runCPUFn             func(string, int) (string, error)
+	detectVendorFn       func() string
+	listAMDGPUsFn        func() ([]platform.AMDGPUInfo, error)
+	runAMDPackFn         func(string) (string, error)
+	listNvidiaGPUsFn     func() ([]platform.NvidiaGPU, error)
 }

 func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +140,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
 	return f.runNvidiaFn(baseDir)
 }

+func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
+	if f.runNvidiaBenchmarkFn != nil {
+		return f.runNvidiaBenchmarkFn(baseDir, opts)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
 func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
 	if f.runNvidiaStressFn != nil {
 		return f.runNvidiaStressFn(baseDir, opts)
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -0,0 +1,48 @@
+package app
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
+	}
+
+	tmpPath := path + ".tmp"
+	f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
+	if err != nil {
+		return fmt.Errorf("open temp %s: %w", tmpPath, err)
+	}
+
+	success := false
+	defer func() {
+		_ = f.Close()
+		if !success {
+			_ = os.Remove(tmpPath)
+		}
+	}()
+
+	if _, err := f.Write(data); err != nil {
+		return fmt.Errorf("write temp %s: %w", tmpPath, err)
+	}
+	if err := f.Sync(); err != nil {
+		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
+	}
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("close temp %s: %w", tmpPath, err)
+	}
+	if err := os.Rename(tmpPath, path); err != nil {
+		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
+	}
+
+	if dir, err := os.Open(filepath.Dir(path)); err == nil {
+		_ = dir.Sync()
+		_ = dir.Close()
+	}
+
+	success = true
+	return nil
+}
--- a/audit/internal/app/atomic_write_test.go
+++ b/audit/internal/app/atomic_write_test.go
@@ -0,0 +1,71 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "bee-audit.json")
+	if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
+		t.Fatalf("seed file: %v", err)
+	}
+
+	if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
+		t.Fatalf("atomicWriteFile: %v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read final: %v", err)
+	}
+	if string(raw) != "new\n" {
+		t.Fatalf("final content=%q want %q", string(raw), "new\n")
+	}
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+}
+
+func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "runtime-health.json")
+	a := &App{
+		runtime: fakeRuntime{
+			collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
+				return schema.RuntimeHealth{
+					Status:      "OK",
+					ExportDir:   exportDir,
+					DriverReady: true,
+					CUDAReady:   true,
+				}, nil
+			},
+		},
+	}
+
+	got, err := a.RunRuntimePreflight("file:" + path)
+	if err != nil {
+		t.Fatalf("RunRuntimePreflight: %v", err)
+	}
+	if got != path {
+		t.Fatalf("path=%q want %q", got, path)
+	}
+	if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
+		t.Fatalf("tmp file should be absent after success, err=%v", err)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read runtime file: %v", err)
+	}
+	var health schema.RuntimeHealth
+	if err := json.Unmarshal(raw, &health); err != nil {
+		t.Fatalf("json unmarshal: %v", err)
+	}
+	if health.Status != "OK" {
+		t.Fatalf("status=%q want OK", health.Status)
+	}
+}
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
 	"bee-network.service",
 	"bee-nvidia.service",
 	"bee-preflight.service",
+	"bee-selfheal.service",
+	"bee-selfheal.timer",
 	"bee-sshsetup.service",
 }

--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -0,0 +1,141 @@
+package platform
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
+	fmt.Fprintf(&b, "===========================\n\n")
+	fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC")) // layout hard-codes the "UTC" label, so normalize the zone first
+	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
+	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
+	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
+
+	if len(result.Findings) > 0 {
+		fmt.Fprintf(&b, "Executive Summary\n")
+		fmt.Fprintf(&b, "-----------------\n")
+		for _, finding := range result.Findings {
+			fmt.Fprintf(&b, "- %s\n", finding)
+		}
+		b.WriteString("\n")
+	}
+
+	if len(result.Warnings) > 0 {
+		fmt.Fprintf(&b, "Warnings\n")
+		fmt.Fprintf(&b, "--------\n")
+		for _, warning := range result.Warnings {
+			fmt.Fprintf(&b, "- %s\n", warning)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Per GPU Scorecard\n")
+	fmt.Fprintf(&b, "-----------------\n")
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "GPU %d  %s\n", gpu.Index, gpu.Name)
+		fmt.Fprintf(&b, "  Status: %s\n", gpu.Status)
+		fmt.Fprintf(&b, "  Composite score: %.2f\n", gpu.Scores.CompositeScore)
+		fmt.Fprintf(&b, "  Compute score: %.2f\n", gpu.Scores.ComputeScore)
+		fmt.Fprintf(&b, "  Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
+		fmt.Fprintf(&b, "  Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
+		fmt.Fprintf(&b, "  Stability: %.1f\n", gpu.Scores.StabilityScore)
+		if gpu.Scores.InterconnectScore > 0 {
+			fmt.Fprintf(&b, "  Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
+		}
+		if len(gpu.DegradationReasons) > 0 {
+			fmt.Fprintf(&b, "  Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
+		}
+		fmt.Fprintf(&b, "  Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
+		fmt.Fprintf(&b, "  P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
+		if len(gpu.PrecisionResults) > 0 {
+			fmt.Fprintf(&b, "  Precision results:\n")
+			for _, precision := range gpu.PrecisionResults {
+				if precision.Supported {
+					fmt.Fprintf(&b, "    - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
+				} else {
+					fmt.Fprintf(&b, "    - %s: unsupported (%s)\n", precision.Name, precision.Notes)
+				}
+			}
+		}
+		fmt.Fprintf(&b, "  Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
+			gpu.Throttle.SWPowerCapUS,
+			gpu.Throttle.SWThermalSlowdownUS,
+			gpu.Throttle.SyncBoostUS,
+			gpu.Throttle.HWThermalSlowdownUS,
+			gpu.Throttle.HWPowerBrakeSlowdownUS,
+		)
+		if len(gpu.Notes) > 0 {
+			fmt.Fprintf(&b, "  Notes:\n")
+			for _, note := range gpu.Notes {
+				fmt.Fprintf(&b, "    - %s\n", note)
+			}
+		}
+		b.WriteString("\n")
+	}
+
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "Interconnect\n")
+		fmt.Fprintf(&b, "------------\n")
+		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
+		if result.Interconnect.Supported {
+			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
+			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
+		}
+		for _, note := range result.Interconnect.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+
+	fmt.Fprintf(&b, "Methodology\n")
+	fmt.Fprintf(&b, "-----------\n")
+	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
+	fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
+	fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
+
+	fmt.Fprintf(&b, "Raw Files\n")
+	fmt.Fprintf(&b, "---------\n")
+	fmt.Fprintf(&b, "- result.json\n")
+	fmt.Fprintf(&b, "- report.txt\n")
+	fmt.Fprintf(&b, "- summary.txt\n")
+	fmt.Fprintf(&b, "- verbose.log\n")
+	fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady.log\n")
+	fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
+	fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
+	}
+	return b.String()
+}
+
+func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339)) // key promises UTC; convert so the value never carries a local offset
+	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
+	fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
+	var best float64
+	for i, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
+		fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
+		if i == 0 || gpu.Scores.CompositeScore > best { // seed from the first GPU so an all-negative set still reports its true maximum
+			best = gpu.Scores.CompositeScore
+		}
+	}
+	fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
+	if result.Interconnect != nil {
+		fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
+		fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
+	}
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -0,0 +1,147 @@
+package platform
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestResolveBenchmarkProfile(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name    string
+		profile string
+		want    benchmarkProfileSpec
+	}{
+		{
+			name:    "default",
+			profile: "",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
+		},
+		{
+			name:    "stability",
+			profile: "stability",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
+		},
+		{
+			name:    "overnight",
+			profile: "overnight",
+			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			got := resolveBenchmarkProfile(tc.profile)
+			if got != tc.want {
+				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
+	t.Parallel()
+
+	opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
+		Profile: "stability",
+		RunNCCL: false,
+	})
+	if opts.Profile != NvidiaBenchmarkProfileStability {
+		t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
+	}
+	if opts.RunNCCL {
+		t.Fatalf("RunNCCL should stay false when explicitly disabled")
+	}
+}
+
+func TestParseBenchmarkBurnLog(t *testing.T) {
+	t.Parallel()
+
+	raw := strings.Join([]string{
+		"loader=bee-gpu-burn",
+		"[gpu 0] device=NVIDIA H100",
+		"[gpu 0] compute_capability=9.0",
+		"[gpu 0] backend=cublasLt",
+		"[gpu 0] duration_s=10",
+		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
+		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
+		"[gpu 0] fp16_tensor_iterations=200",
+		"[gpu 0] fp8_e4m3_iterations=50",
+		"[gpu 0] status=OK",
+	}, "\n")
+
+	got := parseBenchmarkBurnLog(raw)
+	if got.Backend != "cublasLt" {
+		t.Fatalf("backend=%q want cublasLt", got.Backend)
+	}
+	if got.ComputeCapability != "9.0" {
+		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
+	}
+	if len(got.Profiles) != 2 {
+		t.Fatalf("profiles=%d want 2", len(got.Profiles))
+	}
+	if got.Profiles[0].TeraOpsPerSec <= 0 {
+		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
+	}
+	if got.Profiles[1].Category != "fp8" {
+		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
+	}
+}
+
+func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
+	t.Parallel()
+
+	result := NvidiaBenchmarkResult{
+		BenchmarkVersion:   benchmarkVersion,
+		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
+		OverallStatus:      "PARTIAL",
+		SelectedGPUIndices: []int{0},
+		Normalization: BenchmarkNormalization{
+			Status: "partial",
+		},
+		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
+		GPUs: []BenchmarkGPUResult{
+			{
+				Index:  0,
+				Name:   "NVIDIA H100",
+				Status: "OK",
+				Steady: BenchmarkTelemetrySummary{
+					AvgPowerW:           680,
+					AvgTempC:            79,
+					AvgGraphicsClockMHz: 1725,
+					P95PowerW:           700,
+					P95TempC:            82,
+					P95GraphicsClockMHz: 1800,
+				},
+				Scores: BenchmarkScorecard{
+					ComputeScore:        1200,
+					PowerSustainScore:   96,
+					ThermalSustainScore: 88,
+					StabilityScore:      92,
+					CompositeScore:      1176,
+				},
+				PrecisionResults: []BenchmarkPrecisionResult{
+					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
+				},
+				Throttle: BenchmarkThrottleCounters{
+					SWPowerCapUS: 1000000,
+				},
+				DegradationReasons: []string{"power_capped"},
+			},
+		},
+	}
+
+	report := renderBenchmarkReport(result)
+	for _, needle := range []string{
+		"Executive Summary",
+		"GPU 0 spent measurable time under SW power cap.",
+		"Composite score: 1176.00",
+		"fp16_tensor: 700.00 TOPS",
+	} {
+		if !strings.Contains(report, needle) {
+			t.Fatalf("report missing %q\n%s", needle, report)
+		}
+	}
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -0,0 +1,132 @@
+package platform
+
+import "time"
+
+const (
+	NvidiaBenchmarkProfileStandard  = "standard"
+	NvidiaBenchmarkProfileStability = "stability"
+	NvidiaBenchmarkProfileOvernight = "overnight"
+)
+
+type NvidiaBenchmarkOptions struct {
+	Profile           string
+	SizeMB            int
+	GPUIndices        []int
+	ExcludeGPUIndices []int
+	RunNCCL           bool
+}
+
+type NvidiaBenchmarkResult struct {
+	BenchmarkVersion   string                       `json:"benchmark_version"`
+	GeneratedAt        time.Time                    `json:"generated_at"`
+	Hostname           string                       `json:"hostname,omitempty"`
+	BenchmarkProfile   string                       `json:"benchmark_profile"`
+	OverallStatus      string                       `json:"overall_status"`
+	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
+	Findings           []string                     `json:"findings,omitempty"`
+	Warnings           []string                     `json:"warnings,omitempty"`
+	Normalization      BenchmarkNormalization       `json:"normalization"`
+	GPUs               []BenchmarkGPUResult         `json:"gpus"`
+	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
+}
+
+type BenchmarkNormalization struct {
+	Status string                      `json:"status"`
+	Notes  []string                    `json:"notes,omitempty"`
+	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
+}
+
+type BenchmarkNormalizationGPU struct {
+	Index                 int      `json:"index"`
+	PersistenceMode       string   `json:"persistence_mode,omitempty"`
+	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
+	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
+	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
+	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
+	Notes                 []string `json:"notes,omitempty"`
+}
+
+type BenchmarkGPUResult struct {
+	Index                  int                        `json:"index"`
+	UUID                   string                     `json:"uuid,omitempty"`
+	Name                   string                     `json:"name,omitempty"`
+	BusID                  string                     `json:"bus_id,omitempty"`
+	VBIOS                  string                     `json:"vbios,omitempty"`
+	ComputeCapability      string                     `json:"compute_capability,omitempty"`
+	Backend                string                     `json:"backend,omitempty"`
+	Status                 string                     `json:"status"`
+	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
+	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
+	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
+	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
+	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
+	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
+	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
+	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
+	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
+	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
+	Scores                 BenchmarkScorecard         `json:"scores"`
+	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
+	Notes                  []string                   `json:"notes,omitempty"`
+}
+
+type BenchmarkTelemetrySummary struct {
+	DurationSec         float64 `json:"duration_sec"`
+	Samples             int     `json:"samples"`
+	AvgTempC            float64 `json:"avg_temp_c"`
+	P95TempC            float64 `json:"p95_temp_c"`
+	AvgPowerW           float64 `json:"avg_power_w"`
+	P95PowerW           float64 `json:"p95_power_w"`
+	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
+	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
+	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
+	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
+	AvgUsagePct         float64 `json:"avg_usage_pct"`
+	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
+	ClockCVPct          float64 `json:"clock_cv_pct"`
+	PowerCVPct          float64 `json:"power_cv_pct"`
+	TempCVPct           float64 `json:"temp_cv_pct"`
+	ClockDriftPct       float64 `json:"clock_drift_pct"`
+}
+
+type BenchmarkThrottleCounters struct {
+	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
+	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
+	SyncBoostUS            uint64 `json:"sync_boost_us"`
+	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
+	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
+}
+
+type BenchmarkPrecisionResult struct {
+	Name          string  `json:"name"`
+	Category      string  `json:"category"`
+	Supported     bool    `json:"supported"`
+	Lanes         int     `json:"lanes,omitempty"`
+	M             uint64  `json:"m,omitempty"`
+	N             uint64  `json:"n,omitempty"`
+	K             uint64  `json:"k,omitempty"`
+	Iterations    uint64  `json:"iterations,omitempty"`
+	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
+	Notes         string  `json:"notes,omitempty"`
+}
+
+type BenchmarkScorecard struct {
+	ComputeScore        float64 `json:"compute_score"`
+	PowerSustainScore   float64 `json:"power_sustain_score"`
+	ThermalSustainScore float64 `json:"thermal_sustain_score"`
+	StabilityScore      float64 `json:"stability_score"`
+	InterconnectScore   float64 `json:"interconnect_score"`
+	CompositeScore      float64 `json:"composite_score"`
+}
+
+type BenchmarkInterconnectResult struct {
+	Status             string   `json:"status"`
+	Attempted          bool     `json:"attempted"`
+	Supported          bool     `json:"supported"`
+	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
+	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
+	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
+	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
+	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
+	Notes              []string `json:"notes,omitempty"`
+}
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
 	MemUsagePct float64 `json:"mem_usage_pct"`
 	PowerW      float64 `json:"power_w"`
 	ClockMHz    float64 `json:"clock_mhz"`
+	MemClockMHz float64 `json:"mem_clock_mhz"`
 }

 // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
 func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	args := []string{
-		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
+		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
 		"--format=csv,noheader,nounits",
 	}
 	if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			continue
 		}
 		parts := strings.Split(line, ", ")
-		if len(parts) < 6 {
+		if len(parts) < 7 {
 			continue
 		}
 		idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 			MemUsagePct: parseGPUFloat(parts[3]),
 			PowerW:      parseGPUFloat(parts[4]),
 			ClockMHz:    parseGPUFloat(parts[5]),
+			MemClockMHz: parseGPUFloat(parts[6]),
 		})
 	}
 	return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
-	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
+	b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
 	for _, r := range rows {
-		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
-			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
+		fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
+			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
 	}
 	return os.WriteFile(path, b.Bytes(), 0644)
 }
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	const PW = plotX2 - plotX1
 	const PH = plotY2 - plotY1
 	// Outer axes
-	const tempAxisX = 60  // temp axis line
+	const tempAxisX = 60   // temp axis line
 	const clockAxisX = 900 // clock axis line

 	colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
 		log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
 	}

+	log("Verifying live medium now served from RAM...")
+	status := s.LiveBootSource()
+	if err := verifyInstallToRAMStatus(status); err != nil {
+		return err
+	}
+	log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
 	log("Done. Installation media can be safely disconnected.")
 	return nil
 }

+func verifyInstallToRAMStatus(status LiveBootSource) error {
+	if status.InRAM {
+		return nil
+	}
+	return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
+}
+
+// describeLiveBootSource renders status for log and error messages. Kind "ram"
+// reads "RAM"; usb, cdrom and disk get a label followed by the device (or the
+// source when no device is set) in parentheses; other kinds yield that origin.
+func describeLiveBootSource(status LiveBootSource) string {
+	origin := "unknown source"
+	for _, raw := range []string{status.Device, status.Source} {
+		if trimmed := strings.TrimSpace(raw); trimmed != "" {
+			origin = trimmed
+			break
+		}
+	}
+	kind := strings.TrimSpace(status.Kind)
+	if kind == "ram" {
+		return "RAM"
+	}
+	labels := map[string]string{"usb": "USB", "cdrom": "CD-ROM", "disk": "disk"}
+	if label, known := labels[kind]; known {
+		return label + " (" + origin + ")"
+	}
+	return origin
+}
+
 func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
 	in, err := os.Open(src)
 	if err != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -3,6 +3,8 @@ package platform
 import "testing"

 func TestInferLiveBootKind(t *testing.T) {
+	t.Parallel()
+
 	tests := []struct {
 		name       string
 		fsType     string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
 		{name: "unknown", source: "overlay", want: "unknown"},
 	}
 	for _, tc := range tests {
+		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
 			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
 			if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
 		})
 	}
 }
+
+func TestVerifyInstallToRAMStatus(t *testing.T) {
+	t.Parallel()
+
+	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
+		t.Fatalf("expected success for RAM-backed status, got %v", err)
+	}
+	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
+	if err == nil {
+		t.Fatal("expected verification failure when media is still on USB")
+	}
+	if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
+		t.Fatalf("error=%q", got)
+	}
+}
+
+func TestDescribeLiveBootSource(t *testing.T) {
+	t.Parallel()
+
+	if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
+		t.Fatalf("got %q want RAM", got)
+	}
+	if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
+		t.Fatalf("got %q want /run/live/medium", got)
+	}
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -12,11 +12,11 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
-	"syscall"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"syscall"
 	"time"
 )

@@ -76,15 +76,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {

 // NvidiaGPU holds basic GPU info from nvidia-smi.
 type NvidiaGPU struct {
-	Index    int
-	Name     string
-	MemoryMB int
+	Index    int    `json:"index"`
+	Name     string `json:"name"`
+	MemoryMB int    `json:"memory_mb"`
 }

 // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
 type AMDGPUInfo struct {
-	Index int
-	Name  string
+	Index int    `json:"index"`
+	Name  string `json:"name"`
 }

 // DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -10,17 +10,30 @@ import (
 func (s *System) ListBeeServices() ([]string, error) {
 	seen := map[string]bool{}
 	var out []string
-	for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
+	for _, pattern := range []string{
+		"/etc/systemd/system/bee-*.service",
+		"/lib/systemd/system/bee-*.service",
+		"/etc/systemd/system/bee-*.timer",
+		"/lib/systemd/system/bee-*.timer",
+	} {
 		matches, err := filepath.Glob(pattern)
 		if err != nil {
 			return nil, err
 		}
 		for _, match := range matches {
-			name := strings.TrimSuffix(filepath.Base(match), ".service")
+			base := filepath.Base(match)
+			name := base
+			if strings.HasSuffix(base, ".service") {
+				name = strings.TrimSuffix(base, ".service")
+			}
 			// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
 			if strings.HasSuffix(name, "@") {
 				continue
 			}
+			// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
+			if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
+				continue
+			}
 			if !seen[name] {
 				seen[name] = true
 				out = append(out, name)
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
 }

 type RemovableTarget struct {
-	Device     string
-	FSType     string
-	Size       string
-	Label      string
-	Model      string
-	Mountpoint string
+	Device     string `json:"device"`
+	FSType     string `json:"fs_type"`
+	Size       string `json:"size"`
+	Label      string `json:"label"`
+	Model      string `json:"model"`
+	Mountpoint string `json:"mountpoint"`
 }

 type ToolStatus struct {
--- a/audit/internal/platform/types_test.go
+++ b/audit/internal/platform/types_test.go
@@ -0,0 +1,31 @@
+package platform
+
+import (
+	"encoding/json"
+	"strings"
+	"testing"
+)
+
+func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
+	t.Parallel()
+
+	data, err := json.Marshal(RemovableTarget{
+		Device: "/dev/sdb1",
+		FSType: "exfat",
+		Size:   "1.8T",
+		Label:  "USB",
+		Model:  "Flash",
+	})
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	raw := string(data)
+	for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
+		if !strings.Contains(raw, key) {
+			t.Fatalf("json missing key %s: %s", key, raw)
+		}
+	}
+	if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
+		t.Fatalf("json still contains Go field names: %s", raw)
+	}
+}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {

 	scanDone := make(chan error, 1)
 	go func() {
+		defer func() {
+			if rec := recover(); rec != nil {
+				scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
+			}
+		}()
 		scanner := bufio.NewScanner(pr)
 		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
 		for scanner.Scan() {
@@ -227,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 	}
 }

+// handleAPIBenchmarkNvidiaRun queues an "nvidia-benchmark" task from the optional
+// JSON body (run_nccl defaults to true) and replies with the new task's ID.
+func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+
+	var body struct {
+		Profile           string `json:"profile"`
+		SizeMB            int    `json:"size_mb"`
+		GPUIndices        []int  `json:"gpu_indices"`
+		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
+		RunNCCL           *bool  `json:"run_nccl"`
+		DisplayName       string `json:"display_name"`
+	}
+	if r.Body != nil {
+		if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
+			writeError(w, http.StatusBadRequest, "invalid request body")
+			return
+		}
+	}
+
+	// Trim once so the blank check, the task name and the stored parameter agree.
+	displayName := strings.TrimSpace(body.DisplayName)
+	t := &Task{
+		ID:        newJobID("benchmark-nvidia"),
+		Name:      displayName,
+		Target:    "nvidia-benchmark",
+		Priority:  15,
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+		params: taskParams{
+			GPUIndices:        body.GPUIndices,
+			ExcludeGPUIndices: body.ExcludeGPUIndices,
+			SizeMB:            body.SizeMB,
+			BenchmarkProfile:  body.Profile,
+			RunNCCL:           body.RunNCCL == nil || *body.RunNCCL,
+			DisplayName:       displayName,
+		},
+	}
+	if t.Name == "" {
+		t.Name = taskDisplayName("nvidia-benchmark", "", "")
+	}
+	globalQueue.enqueue(t)
+	writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
+}
+
 func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
 	id := r.URL.Query().Get("job_id")
 	if id == "" {
@@ -486,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques

 // ── GPU presence ──────────────────────────────────────────────────────────────

+func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	gpus, err := h.opts.App.ListNvidiaGPUs()
+	if err != nil {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	if gpus == nil {
+		gpus = []platform.NvidiaGPU{}
+	}
+	writeJSON(w, gpus)
+}
+
 func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -511,8 +580,10 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
 	_, amdErr := os.Stat("/dev/kfd")
 	nvidiaUp := nvidiaErr == nil
 	amdUp := amdErr == nil
+	_, dcgmErr := exec.LookPath("dcgmi")
 	writeJSON(w, []toolEntry{
 		{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
+		{ID: "dcgm", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
 		{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
 		{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
 		{ID: "rvs", Available: amdUp, Vendor: "amd"},
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
 	}
 }

+func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
+	globalQueue.mu.Lock()
+	originalTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = originalTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{App: &app.App{}}}
+	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
+	rec := httptest.NewRecorder()
+
+	h.handleAPIBenchmarkNvidiaRun(rec, req)
+
+	if rec.Code != 200 {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
+	}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-benchmark" {
+		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
+	}
+	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
+		t.Fatalf("gpu indices=%v want [1 3]", got)
+	}
+	if task.params.RunNCCL {
+		t.Fatal("RunNCCL should reflect explicit false from request")
+	}
+}

 func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
 	h := &handler{}
--- a/audit/internal/webui/charts_svg.go
+++ b/audit/internal/webui/charts_svg.go
@@ -0,0 +1,713 @@
+package webui
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/platform"
+)
+
+type chartTimelineSegment struct {
+	Start  time.Time
+	End    time.Time
+	Active bool
+}
+
+type chartScale struct {
+	Min   float64
+	Max   float64
+	Ticks []float64
+}
+
+type chartLayout struct {
+	Width      int
+	Height     int
+	PlotLeft   int
+	PlotRight  int
+	PlotTop    int
+	PlotBottom int
+}
+
+type metricChartSeries struct {
+	Name      string
+	AxisTitle string
+	Color     string
+	Values    []float64
+}
+
+var metricChartPalette = []string{
+	"#5794f2",
+	"#73bf69",
+	"#f2cc0c",
+	"#ff9830",
+	"#f2495c",
+	"#b877d9",
+	"#56d2f7",
+	"#8ab8ff",
+	"#9adf8f",
+	"#ffbe5c",
+}
+
+// renderMetricChartSVG draws every named series as a line on one shared Y axis
+// and returns the SVG document; the returned error is always nil. labels and
+// times are padded to a common point count, and empty entries of datasets are
+// replaced in place by zero-filled slices. Series colors cycle through
+// metricChartPalette, and non-nil yMin/yMax override the data-derived axis
+// bounds. When the third value reported by globalStats is positive, all three
+// of its figures are appended to the title.
+func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount, labels, times = 1, []string{""}, []time.Time{{}}
+	}
+	if len(labels) < pointCount {
+		labels = append(make([]string, 0, pointCount), labels...)[:pointCount]
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+	// Empty datasets become all-zero series of pointCount samples.
+	for i, ds := range datasets {
+		if len(ds) == 0 {
+			datasets[i] = make([]float64, pointCount)
+		}
+	}
+
+	// Annotate the title with the figures reported by globalStats.
+	if lowest, mean, highest := globalStats(datasets); highest > 0 {
+		title = fmt.Sprintf("%s    ↓%s  ~%s  ↑%s", title,
+			chartLegendNumber(lowest), chartLegendNumber(mean), chartLegendNumber(highest))
+	}
+
+	legendItems := []metricChartSeries{}
+	for i, name := range names {
+		item := metricChartSeries{
+			Name:   name,
+			Color:  metricChartPalette[i%len(metricChartPalette)],
+			Values: make([]float64, pointCount),
+		}
+		if i < len(datasets) {
+			copy(item.Values, coalesceDataset(datasets[i], pointCount))
+		}
+		legendItems = append(legendItems, item)
+	}
+
+	scale := singleAxisChartScale(datasets, yMin, yMax)
+	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
+	start, end := chartTimeBounds(times)
+
+	// Emitted in order: frame, idle shading, grids, boundaries, axes, series, legend.
+	var b strings.Builder
+	writeSVGOpen(&b, layout.Width, layout.Height)
+	writeChartFrame(&b, title, layout.Width, layout.Height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	writeHorizontalGrid(&b, layout, scale)
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+	writeSingleAxisY(&b, layout, scale)
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for i := range legendItems {
+		writeSeriesPolyline(&b, layout, times, start, end, legendItems[i].Values, scale, legendItems[i].Color)
+	}
+	writeLegend(&b, layout, legendItems)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// renderGPUOverviewChartSVG charts temperature, power, core clock and memory
+// clock of GPU idx on one four-axis chart. It reports (nil, false, nil) when
+// gpuDatasetByIndex returns nil for all four metrics.
+func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
+	metric := func(pick func(platform.GPUMetricRow) float64) []float64 {
+		return gpuDatasetByIndex(samples, idx, pick)
+	}
+	temp := metric(func(g platform.GPUMetricRow) float64 { return g.TempC })
+	power := metric(func(g platform.GPUMetricRow) float64 { return g.PowerW })
+	coreClock := metric(func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+	memClock := metric(func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
+	if temp == nil && power == nil && coreClock == nil && memClock == nil {
+		return nil, false, nil
+	}
+	labels := sampleTimeLabels(samples)
+	series := []metricChartSeries{
+		{Name: "Temp C", AxisTitle: "Temp C", Color: "#f05a5a", Values: coalesceDataset(temp, len(labels))},
+		{Name: "Power W", AxisTitle: "Power W", Color: "#ffb357", Values: coalesceDataset(power, len(labels))},
+		{Name: "Core Clock MHz", AxisTitle: "Core MHz", Color: "#73bf69", Values: coalesceDataset(coreClock, len(labels))},
+		{Name: "Memory Clock MHz", AxisTitle: "Memory MHz", Color: "#5794f2", Values: coalesceDataset(memClock, len(labels))},
+	}
+	svg, err := drawGPUOverviewChartSVG(fmt.Sprintf("GPU %d Overview", idx), labels, sampleTimes(samples), series, timeline)
+	if err != nil {
+		return nil, false, err
+	}
+	return svg, true, nil
+}
+
+// drawGPUOverviewChartSVG renders exactly four series on a fixed 1400x840 canvas.
+// Every series gets its own Y scale, derived from chartSeriesBounds and
+// chartNiceTicks, and its own colored axis: series 0 and 1 to the left of the
+// plot, series 2 and 3 to the right. labels and times are padded to a common
+// point count and empty Values are replaced in place by zero-filled slices.
+// The horizontal grid follows the scale of series 0. An error is returned only
+// when len(series) is not 4.
+func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
+	if len(series) != 4 {
+		return nil, fmt.Errorf("gpu overview requires 4 series, got %d", len(series))
+	}
+	const (
+		width      = 1400
+		height     = 840
+		plotLeft   = 180
+		plotRight  = 1220
+		plotTop    = 96
+		plotBottom = 660
+
+		// X positions of the four value axes, from left to right.
+		leftOuterAxis  = 72
+		leftInnerAxis  = 132
+		rightInnerAxis = 1268
+		rightOuterAxis = 1328
+	)
+	layout := chartLayout{
+		Width:      width,
+		Height:     height,
+		PlotLeft:   plotLeft,
+		PlotRight:  plotRight,
+		PlotTop:    plotTop,
+		PlotBottom: plotBottom,
+	}
+	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis, rightOuterAxis}
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	if pointCount == 0 {
+		pointCount, labels, times = 1, []string{""}, []time.Time{{}}
+	}
+	if len(labels) < pointCount {
+		labels = append(make([]string, 0, pointCount), labels...)[:pointCount]
+	}
+	if len(times) < pointCount {
+		times = synthesizeChartTimes(times, pointCount)
+	}
+
+	// One independent scale per series; a series without values is charted
+	// as pointCount zeros.
+	scales := make([]chartScale, len(series))
+	for i := range series {
+		if len(series[i].Values) == 0 {
+			series[i].Values = make([]float64, pointCount)
+		}
+		lo, hi := chartSeriesBounds(series[i].Values)
+		ticks := chartNiceTicks(lo, hi, 8)
+		scales[i] = chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
+	}
+	start, end := chartTimeBounds(times)
+
+	var b strings.Builder
+	writeSVGOpen(&b, width, height)
+	writeChartFrame(&b, title, width, height)
+	writeTimelineIdleSpans(&b, layout, start, end, timeline)
+	writeVerticalGrid(&b, layout, times, pointCount, 8)
+	// Only one horizontal grid is drawn; it uses the first series' ticks.
+	writeHorizontalGrid(&b, layout, scales[0])
+	writeTimelineBoundaries(&b, layout, start, end, timeline)
+	writePlotBorder(&b, layout)
+
+	for i, x := range axisX {
+		s, sc := series[i], scales[i]
+		// Left-hand axes (i < 2) tick towards the plot and label to their
+		// left; right-hand axes mirror both.
+		tickDX, labelDX := 6, -8
+		labelFmt := `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>` + "\n"
+		if i >= 2 {
+			tickDX, labelDX = -6, 8
+			labelFmt = `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>` + "\n"
+		}
+		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
+			x, layout.PlotTop, x, layout.PlotBottom, s.Color)
+		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
+			x, 64, s.Color, sanitizeChartText(s.AxisTitle))
+		for _, tick := range sc.Ticks {
+			y := chartYForValue(valueClamp(tick, sc), sc, layout.PlotTop, layout.PlotBottom)
+			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
+				x, y, x+tickDX, y, s.Color)
+			fmt.Fprintf(&b, labelFmt, x+labelDX, y, s.Color, sanitizeChartText(chartYAxisNumber(tick)))
+		}
+	}
+
+	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
+	for i := range series {
+		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
+	}
+	writeLegend(&b, layout, series)
+	writeSVGClose(&b)
+	return []byte(b.String()), nil
+}
+
+// metricsTimelineSegments maps the task history onto the time span covered by samples.
+func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
+	if len(samples) == 0 {
+		return nil
+	}
+	first, last := chartTimeBounds(sampleTimes(samples))
+	if first.IsZero() || last.IsZero() {
+		return nil
+	}
+	return chartTimelineSegmentsForRange(first, last, now, snapshotTaskHistory())
+}
+
+func snapshotTaskHistory() []Task {
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	out := make([]Task, len(globalQueue.tasks))
+	for i, t := range globalQueue.tasks {
+		out[i] = *t
+	}
+	return out
+}
+
+// chartTimelineSegmentsForRange partitions [start, end] into alternating idle
+// and active segments, where active means at least one task was running. Tasks
+// without StartedAt are ignored, unfinished tasks run until now, overlapping or
+// touching task intervals are merged, and intervals are clipped to the range.
+// It returns nil when start or end is zero; swapped bounds are reordered.
+func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
+	if start.IsZero() || end.IsZero() {
+		return nil
+	}
+	if end.Before(start) {
+		start, end = end, start
+	}
+	type interval struct {
+		start time.Time
+		end   time.Time
+	}
+	active := make([]interval, 0, len(tasks))
+	for _, task := range tasks {
+		if task.StartedAt == nil {
+			continue
+		}
+		span := interval{start: task.StartedAt.UTC(), end: now.UTC()}
+		if task.DoneAt != nil {
+			span.end = task.DoneAt.UTC()
+		}
+		if span.start.Before(start) {
+			span.start = start
+		}
+		if span.end.After(end) {
+			span.end = end
+		}
+		// Checked after clipping so that inverted intervals, intervals outside
+		// the range and intervals that merely touch a range edge are all
+		// dropped instead of yielding zero-width active segments.
+		if !span.end.After(span.start) {
+			continue
+		}
+		active = append(active, span)
+	}
+	sort.Slice(active, func(i, j int) bool {
+		if active[i].start.Equal(active[j].start) {
+			return active[i].end.Before(active[j].end)
+		}
+		return active[i].start.Before(active[j].start)
+	})
+	// Merge overlapping or touching intervals; active is sorted by start.
+	merged := make([]interval, 0, len(active))
+	for _, span := range active {
+		if n := len(merged); n > 0 && !span.start.After(merged[n-1].end) {
+			if span.end.After(merged[n-1].end) {
+				merged[n-1].end = span.end
+			}
+			continue
+		}
+		merged = append(merged, span)
+	}
+
+	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
+	cursor := start
+	for _, span := range merged {
+		if span.start.After(cursor) {
+			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
+		}
+		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
+		cursor = span.end
+	}
+	if cursor.Before(end) {
+		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
+	}
+	if len(segments) == 0 {
+		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
+	}
+	return segments
+}
+
+func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
+	times := make([]time.Time, 0, len(samples))
+	for _, sample := range samples {
+		times = append(times, sample.Timestamp)
+	}
+	return times
+}
+
+// singleAxisChartScale builds the tick scale for one shared Y axis. Bounds come
+// from chartSeriesBounds over all datasets unless yMin/yMax pin them.
+func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
+	var lo, hi float64
+	if yMin == nil || yMax == nil {
+		lo, hi = chartSeriesBounds(flattenDatasets(datasets))
+	}
+	if yMin != nil {
+		lo = *yMin
+	}
+	if yMax != nil {
+		hi = *yMax
+	}
+	ticks := chartNiceTicks(lo, hi, 8)
+	return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
+}
+
+func flattenDatasets(datasets [][]float64) []float64 {
+	total := 0
+	for _, ds := range datasets {
+		total += len(ds)
+	}
+	out := make([]float64, 0, total)
+	for _, ds := range datasets {
+		out = append(out, ds...)
+	}
+	return out
+}
+
+// singleAxisChartLayout sizes a 1400-wide chart of the given height. When
+// chartLegendVisible allows a legend, room is reserved under the plot for rows
+// of up to four entries: 24 per row plus 24 of padding.
+func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
+	legendHeight := 0
+	if chartLegendVisible(seriesCount) && seriesCount > 0 {
+		perRow := 4
+		if seriesCount < perRow {
+			perRow = seriesCount
+		}
+		rows := (seriesCount + perRow - 1) / perRow
+		legendHeight = rows*24 + 24
+	}
+	return chartLayout{
+		Width:      1400,
+		Height:     canvasHeight,
+		PlotLeft:   96,
+		PlotRight:  1352,
+		PlotTop:    72,
+		PlotBottom: canvasHeight - 60 - legendHeight,
+	}
+}
+
+// chartTimeBounds returns the earliest and latest of times, both converted to
+// UTC. Empty input yields two zero times.
+func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
+	var earliest, latest time.Time
+	for i, ts := range times {
+		utc := ts.UTC()
+		// The first element seeds both bounds; later ones only widen them.
+		// Zero timestamps are compared like any other value.
+		if i == 0 || utc.Before(earliest) {
+			earliest = utc
+		}
+		if i == 0 || utc.After(latest) {
+			latest = utc
+		}
+	}
+	return earliest, latest
+}
+
+// synthesizeChartTimes returns exactly count timestamps. Known timestamps are
+// kept and the tail is extrapolated in one-minute steps after the last known
+// one; with no timestamps the series ends at the current UTC time.
+func synthesizeChartTimes(times []time.Time, count int) []time.Time {
+	if count <= 0 {
+		return nil
+	}
+	if len(times) == count {
+		return times
+	}
+	out := make([]time.Time, count)
+	known := copy(out, times)
+	if known == 0 {
+		out[0] = time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
+		known = 1
+	}
+	for i := known; i < count; i++ {
+		out[i] = out[i-1].Add(time.Minute)
+	}
+	return out
+}
+
+func writeSVGOpen(b *strings.Builder, width, height int) {
+	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
+}
+
+func writeSVGClose(b *strings.Builder) {
+	b.WriteString("</svg>\n")
+}
+
+func writeChartFrame(b *strings.Builder, title string, width, height int) {
+	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
+	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
+		width/2, sanitizeChartText(title))
+}
+
+func writePlotBorder(b *strings.Builder, layout chartLayout) {
+	fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
+		layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
+}
+
+func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
+	b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
+	for _, tick := range scale.Ticks {
+		y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
+			layout.PlotLeft, y, layout.PlotRight, y)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeVerticalGrid draws a light vertical line at every index chosen by gpuChartLabelIndices.
+func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
+	if pointCount <= 0 {
+		return
+	}
+	first, last := chartTimeBounds(times)
+	b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
+	for _, i := range gpuChartLabelIndices(pointCount, target) {
+		x := chartXForTime(chartPointTime(times, i), first, last, layout.PlotLeft, layout.PlotRight)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
+	fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
+		layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
+	for _, tick := range scale.Ticks {
+		y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
+		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
+			layout.PlotLeft, y, layout.PlotLeft-6, y)
+		fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
+			layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
+	}
+}
+
+func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
+	pointCount := len(labels)
+	if len(times) > pointCount {
+		pointCount = len(times)
+	}
+	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
+	for _, idx := range gpuChartLabelIndices(pointCount, target) {
+		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
+		label := ""
+		if idx < len(labels) {
+			label = labels[idx]
+		}
+		fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
+	}
+	b.WriteString(`</g>` + "\n")
+	fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
+		(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
+}
+
+// writeSeriesPolyline plots values against times as one polyline in color, with
+// coordinates formatted to one decimal place. A series of exactly one value is
+// additionally marked with a filled circle.
+func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
+	if len(values) == 0 {
+		return
+	}
+	coords := make([]string, len(values))
+	for i, v := range values {
+		px := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
+		py := chartYForValue(v, scale, layout.PlotTop, layout.PlotBottom)
+		coords[i] = strconv.FormatFloat(px, 'f', 1, 64) + "," + strconv.FormatFloat(py, 'f', 1, 64)
+	}
+	fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
+		strings.Join(coords, " "), color)
+	// Only a lone point gets the extra marker.
+	if len(values) != 1 {
+		return
+	}
+	px := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
+	py := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
+	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", px, py, color)
+}
+
+// writeLegend prints one colored swatch and name per series beneath the plot,
+// up to four per row, when chartLegendVisible allows it.
+func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
+	n := len(series)
+	if !chartLegendVisible(n) || n == 0 {
+		return
+	}
+	perRow := 4
+	if n < perRow {
+		perRow = n
+	}
+	colWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(perRow)
+	for i := range series {
+		swatchX := float64(layout.PlotLeft) + colWidth*float64(i%perRow) + 8
+		swatchY := float64(layout.PlotBottom + 74 + (i/perRow)*24)
+		fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
+			swatchX, swatchY, swatchX+28, swatchY, series[i].Color)
+		fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
+			swatchX+38, swatchY+4, sanitizeChartText(series[i].Name))
+	}
+}
+
+// writeTimelineIdleSpans overlays a translucent full-height rectangle on each
+// segment that is not Active and whose End lies after its Start.
+func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	b.WriteString(`<g data-role="timeline-overlay">` + "\n")
+	for _, idle := range segments {
+		if !idle.Active && idle.End.After(idle.Start) {
+			from := chartXForTime(idle.Start, start, end, layout.PlotLeft, layout.PlotRight)
+			to := chartXForTime(idle.End, start, end, layout.PlotLeft, layout.PlotRight)
+			fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n", from, layout.PlotTop, math.Max(1, to-from), layout.PlotBottom-layout.PlotTop)
+		}
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// writeTimelineBoundaries rules off interior segment edges, drawing each rounded x column at most once.
+func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
+	if len(segments) == 0 {
+		return
+	}
+	drawn := map[int]bool{}
+	rule := func(edge time.Time) {
+		col := int(math.Round(chartXForTime(edge, start, end, layout.PlotLeft, layout.PlotRight)))
+		if !drawn[col] {
+			drawn[col] = true
+			fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", col, layout.PlotTop, col, layout.PlotBottom)
+		}
+	}
+	b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
+	for i, segment := range segments {
+		if i > 0 {
+			rule(segment.Start)
+		}
+		if i < len(segments)-1 {
+			rule(segment.End)
+		}
+	}
+	b.WriteString(`</g>` + "\n")
+}
+
+// chartXForTime maps ts linearly from [start, end] onto [left, right], clamping
+// ts into that range; if end is not after start it returns the midpoint.
+func chartXForTime(ts, start, end time.Time, left, right int) float64 {
+	switch {
+	case !end.After(start):
+		return float64(left+right) / 2
+	case ts.Before(start):
+		ts = start
+	case ts.After(end):
+		ts = end
+	}
+	return float64(left) + float64(ts.Sub(start))/float64(end.Sub(start))*float64(right-left)
+}
+
+func chartPointTime(times []time.Time, idx int) time.Time { // UTC timestamp at which sample idx is plotted
+	if 0 <= idx && idx < len(times) && !times[idx].IsZero() {
+		return times[idx].UTC()
+	}
+	if len(times) == 0 || times[0].IsZero() {
+		return time.Now().UTC().Add(time.Duration(idx) * time.Minute) // no usable first sample: anchor on the current time
+	}
+	return times[0].UTC().Add(time.Duration(idx) * time.Minute) // idx minutes after the first sample
+}
+
+func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 { // y coordinate for value between plotTop and plotBottom
+	if scale.Max <= scale.Min {
+		return float64(plotTop+plotBottom) / 2 // empty or inverted scale: fall back to the vertical midpoint
+	}
+	return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop) // scale.Min lands on plotBottom, scale.Max on plotTop
+}
+
+// chartSeriesBounds returns padded (low, high) axis bounds for values; empty
+// or all-zero input yields (0, 1) and a positive low bound never pads below 0.
+func chartSeriesBounds(values []float64) (float64, float64) {
+	if len(values) == 0 {
+		return 0, 1
+	}
+	lo, hi := values[0], values[0]
+	for _, v := range values[1:] {
+		switch {
+		case v < lo:
+			lo = v
+		case v > hi:
+			hi = v
+		}
+	}
+	if lo == hi {
+		if hi == 0 {
+			return 0, 1
+		}
+		margin := math.Abs(hi) * 0.1
+		if margin == 0 {
+			margin = 1
+		}
+		lo, hi = lo-margin, hi+margin
+	}
+	if lo > 0 {
+		margin := (hi - lo) * 0.2
+		if margin == 0 {
+			margin = hi * 0.1
+		}
+		lo, hi = lo-margin, hi+margin
+		if lo < 0 {
+			lo = 0
+		}
+	}
+	return lo, hi
+}
+
+// chartNiceTicks returns ticks on a 1/2/5×10^k step covering [min, max], aiming for about target ticks.
+func chartNiceTicks(min, max float64, target int) []float64 {
+	if min == max {
+		max = min + 1
+	}
+	span, want := max-min, math.Max(1, float64(target)) // target < 1 would make the step Inf/NaN
+	step := math.Pow(10, math.Floor(math.Log10(span/want)))
+	for _, factor := range []float64{1, 2, 5, 10} {
+		if span/(factor*step) <= want*1.5 {
+			step = factor * step
+			break
+		}
+	}
+	first, last := math.Floor(min/step), math.Ceil(max/step)
+	var ticks []float64
+	for i := 0.0; i <= last-first; i++ { // indexed, so it ends even when step is below the float resolution of min
+		ticks = append(ticks, math.Round((first+i)*step*1e9)/1e9)
+	}
+	return ticks
+}
+
+// valueClamp returns scale.Min when value is below it, scale.Max when above it, and value otherwise.
+func valueClamp(value float64, scale chartScale) float64 {
+	switch {
+	case value < scale.Min:
+		return scale.Min
+	case value > scale.Max:
+		return scale.Max
+	}
+	return value
+}
--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
 	j := &jobState{}
 	m.jobs[id] = j
 	// Schedule cleanup after 30 minutes
-	go func() {
+	goRecoverOnce("job cleanup", func() {
 		time.Sleep(30 * time.Minute)
 		m.mu.Lock()
 		delete(m.jobs, id)
 		m.mu.Unlock()
-	}()
+	})
 	return j
 }

--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -17,10 +17,10 @@ import (
 // It supports multiple concurrent SAT tasks: a shared event window is open
 // while any SAT task is running, and flushed when all tasks complete.
 type kmsgWatcher struct {
-	mu           sync.Mutex
-	activeCount  int        // number of in-flight SAT tasks
-	window       *kmsgWindow
-	statusDB     *app.ComponentStatusDB
+	mu          sync.Mutex
+	activeCount int // number of in-flight SAT tasks
+	window      *kmsgWindow
+	statusDB    *app.ComponentStatusDB
 }

 type kmsgWindow struct {
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {

 // start launches the background kmsg reading goroutine.
 func (w *kmsgWatcher) start() {
-	go w.run()
+	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
 }

 func (w *kmsgWatcher) run() {
-	f, err := os.Open("/dev/kmsg")
-	if err != nil {
-		slog.Warn("kmsg watcher unavailable", "err", err)
-		return
-	}
-	defer f.Close()
-
-	// Best-effort seek to end so we only capture events from now forward.
-	_, _ = f.Seek(0, io.SeekEnd)
-
-	scanner := bufio.NewScanner(f)
-	scanner.Buffer(make([]byte, 64*1024), 64*1024)
-	for scanner.Scan() {
-		line := scanner.Text()
-		evt, ok := parseKmsgLine(line)
-		if !ok {
+	for { // reopen /dev/kmsg indefinitely; this loop has no exit path
+		f, err := os.Open("/dev/kmsg")
+		if err != nil {
+			slog.Warn("kmsg watcher unavailable", "err", err)
+			time.Sleep(30 * time.Second) // back off before retrying the open
 			continue
 		}
-		w.mu.Lock()
-		if w.window != nil {
-			w.recordEvent(evt)
+		// Best-effort seek to end so we only capture events from now forward.
+		_, _ = f.Seek(0, io.SeekEnd)
+
+		scanner := bufio.NewScanner(f)
+		scanner.Buffer(make([]byte, 64*1024), 64*1024)
+		for scanner.Scan() {
+			line := scanner.Text()
+			evt, ok := parseKmsgLine(line)
+			if !ok {
+				continue
+			}
+			w.mu.Lock()
+			if w.window != nil {
+				w.recordEvent(evt)
+			}
+			w.mu.Unlock()
 		}
-		w.mu.Unlock()
-	}
-	if err := scanner.Err(); err != nil {
-		slog.Warn("kmsg watcher stopped", "err", err)
+		if err := scanner.Err(); err != nil {
+			slog.Warn("kmsg watcher stopped", "err", err)
+		}
+		_ = f.Close() // NOTE(review): no longer deferred — a panic in the scan loop skips this; confirm goRecoverLoop restarts cannot leak descriptors
+		time.Sleep(2 * time.Second)
 	}
 }

@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
 	if window == nil || len(window.events) == 0 {
 		return
 	}
-	go w.flushWindow(window)
+	goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
 }

 func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
@@ -229,7 +232,7 @@ func truncate(s string, max int) string {
 // isSATTarget returns true for task targets that run hardware acceptance tests.
 func isSATTarget(target string) bool {
 	switch target {
-	case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
+	case "nvidia", "nvidia-benchmark", "nvidia-stress", "memory", "memory-stress", "storage",
 		"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
 		"platform-stress":
 		return true
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"sort"
 	"strconv"
+	"strings"
 	"time"

 	"bee/audit/internal/platform"
@@ -54,6 +55,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
  usage_pct     REAL,
  mem_usage_pct REAL,
  power_w       REAL,
+  clock_mhz     REAL,
+  mem_clock_mhz REAL,
  PRIMARY KEY (ts, gpu_index)
 );
 CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -70,6 +73,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
  PRIMARY KEY (ts, name)
 );
 `)
+	if err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+}
+
+// ensureMetricsColumn adds column to table with definition unless table_info already lists it (case-insensitively).
+func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
+	info, err := db.Query("PRAGMA table_info(" + table + ")")
+	if err != nil {
+		return err
+	}
+	defer info.Close()
+
+	var cid, notNull, pk int
+	var name, ctype string
+	var dflt sql.NullString
+	for info.Next() {
+		if err := info.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
+			return err
+		}
+		if strings.EqualFold(name, column) {
+			return nil
+		}
+	}
+	if err := info.Err(); err != nil {
+		return err
+	}
+	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
 	return err
 }

@@ -91,8 +126,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	}
 	for _, g := range s.GPUs {
 		_, err = tx.Exec(
-			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
-			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
+			`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
+			ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
 		)
 		if err != nil {
 			return err
@@ -163,7 +198,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	}
 	gpuData := map[gpuKey]platform.GPUMetricRow{}
 	gRows, err := m.db.Query(
-		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
+		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
 		minTS, maxTS,
 	)
 	if err == nil {
@@ -171,7 +206,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 		for gRows.Next() {
 			var ts int64
 			var g platform.GPUMetricRow
-			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
+			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
 				gpuData[gpuKey{ts, g.GPUIndex}] = g
 			}
 		}
@@ -283,7 +318,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	rows, err := m.db.Query(`
 		SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
-		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
+		       g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
+		       g.clock_mhz, g.mem_clock_mhz
 		FROM sys_metrics s
 		LEFT JOIN gpu_metrics g ON g.ts = s.ts
 		ORDER BY s.ts, g.gpu_index
@@ -294,13 +330,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 	defer rows.Close()

 	cw := csv.NewWriter(w)
-	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
+	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
 	for rows.Next() {
 		var ts int64
 		var cpu, mem, pwr float64
 		var gpuIdx sql.NullInt64
-		var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
-		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
+		var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
+		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
 			continue
 		}
 		row := []string{
@@ -316,9 +352,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
 				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
 				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
+				strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
 			)
 		} else {
-			row = append(row, "", "", "", "", "")
+			row = append(row, "", "", "", "", "", "", "")
 		}
 		_ = cw.Write(row)
 	}
--- a/audit/internal/webui/metricsdb_test.go
+++ b/audit/internal/webui/metricsdb_test.go
@@ -1,11 +1,13 @@
 package webui

 import (
+	"database/sql"
 	"path/filepath"
 	"testing"
 	"time"

 	"bee/audit/internal/platform"
+	_ "modernc.org/sqlite"
 )

 func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
@@ -67,3 +69,77 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
 		}
 	}
 }
+
+// TestMetricsDBMigratesLegacyGPUSchema opens a database created without the GPU
+// clock columns and checks that clock values written afterwards load back.
+func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
+	dbPath := filepath.Join(t.TempDir(), "metrics.db")
+	legacy, err := sql.Open("sqlite", dbPath)
+	if err != nil {
+		t.Fatalf("sql.Open: %v", err)
+	}
+	if _, err = legacy.Exec(`
+CREATE TABLE gpu_metrics (
+  ts            INTEGER NOT NULL,
+  gpu_index     INTEGER NOT NULL,
+  temp_c        REAL,
+  usage_pct     REAL,
+  mem_usage_pct REAL,
+  power_w       REAL,
+  PRIMARY KEY (ts, gpu_index)
+);
+CREATE TABLE sys_metrics (
+  ts           INTEGER NOT NULL,
+  cpu_load_pct REAL,
+  mem_load_pct REAL,
+  power_w      REAL,
+  PRIMARY KEY (ts)
+);
+CREATE TABLE fan_metrics (
+  ts   INTEGER NOT NULL,
+  name TEXT NOT NULL,
+  rpm  REAL,
+  PRIMARY KEY (ts, name)
+);
+CREATE TABLE temp_metrics (
+  ts      INTEGER NOT NULL,
+  name    TEXT NOT NULL,
+  grp     TEXT NOT NULL,
+  celsius REAL,
+  PRIMARY KEY (ts, name)
+);
+`); err != nil {
+		t.Fatalf("create legacy schema: %v", err)
+	}
+	_ = legacy.Close()
+
+	mdb, err := openMetricsDB(dbPath)
+	if err != nil {
+		t.Fatalf("openMetricsDB: %v", err)
+	}
+	defer mdb.Close()
+
+	// The insert names both clock columns, so it exercises the migrated schema.
+	sample := platform.LiveMetricSample{
+		Timestamp: time.Unix(1_700_000_100, 0).UTC(),
+		GPUs:      []platform.GPUMetricRow{{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600}},
+	}
+	if err := mdb.Write(sample); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	loaded, err := mdb.LoadAll()
+	if err != nil {
+		t.Fatalf("LoadAll: %v", err)
+	}
+	if len(loaded) != 1 || len(loaded[0].GPUs) != 1 {
+		t.Fatalf("samples=%+v", loaded)
+	}
+	gpu := loaded[0].GPUs[0]
+	if gpu.ClockMHz != 1410 {
+		t.Fatalf("ClockMHz=%v want 1410", gpu.ClockMHz)
+	}
+	if gpu.MemClockMHz != 2600 {
+		t.Fatalf("MemClockMHz=%v want 2600", gpu.MemClockMHz)
+	}
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -91,6 +91,7 @@ func layoutNav(active string, buildLabel string) string {
 		{"audit", "Audit", "/audit", ""},
 		{"validate", "Validate", "/validate", ""},
 		{"burn", "Burn", "/burn", ""},
+		{"benchmark", "Benchmark", "/benchmark", ""},
 		{"tasks", "Tasks", "/tasks", ""},
 		{"tools", "Tools", "/tools", ""},
 	}
@@ -140,6 +141,10 @@ func renderPage(page string, opts HandlerOptions) string {
 		pageID = "burn"
 		title = "Burn"
 		body = renderBurn()
+	case "benchmark":
+		pageID = "benchmark"
+		title = "Benchmark"
+		body = renderBenchmark()
 	case "tasks":
 		pageID = "tasks"
 		title = "Tasks"
@@ -464,14 +469,14 @@ func renderMetrics() string {
 <div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Load</div>
  <div class="card-body" style="padding:8px">
-    <img id="chart-server-load" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
+    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
  </div>
 </div>

 <div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — CPU</div>
  <div class="card-body" style="padding:8px">
-    <img id="chart-server-temp-cpu" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
+    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
  </div>
 </div>

@@ -479,57 +484,99 @@ func renderMetrics() string {
 <div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — Ambient Sensors</div>
  <div class="card-body" style="padding:8px">
-    <img id="chart-server-temp-ambient" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
+    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
  </div>
 </div>

 <div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Power</div>
  <div class="card-body" style="padding:8px">
-    <img id="chart-server-power" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
+    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
  </div>
 </div>

 <div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
  <div class="card-head">Server — Fan RPM</div>
  <div class="card-body" style="padding:8px">
-    <img id="chart-server-fans" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
+    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
  </div>
 </div>

-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">GPU — Compute Load</div>
-  <div class="card-body" style="padding:8px">
-    <img id="chart-gpu-all-load" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
+<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
+  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
+    <div>
+      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
+      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
+    </div>
+    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
+      <input id="gpu-chart-toggle" type="checkbox">
+      <span>One chart per GPU</span>
+    </label>
  </div>
-</div>
-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">GPU — Memory Load</div>
-  <div class="card-body" style="padding:8px">
-    <img id="chart-gpu-all-memload" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
+
+  <div id="gpu-metrics-by-metric">
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Compute Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Memory Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Core Clock</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Memory Clock</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-memclock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memclock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory clock">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Power</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Temperature</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
+      </div>
+    </div>
  </div>
-</div>
-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">GPU — Power</div>
-  <div class="card-body" style="padding:8px">
-    <img id="chart-gpu-all-power" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
-  </div>
-</div>
-<div class="card" style="margin-bottom:16px">
-  <div class="card-head">GPU — Temperature</div>
-  <div class="card-body" style="padding:8px">
-    <img id="chart-gpu-all-temp" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
-  </div>
-</div>
+
+  <div id="gpu-metrics-by-gpu" style="display:none"></div>
+</section>

 <script>
-const chartIds = [
-  'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
-  'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
-];
+let gpuChartKey = '';
+const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
+
+function loadGPUChartModePreference() {
+  try {
+    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
+  } catch (_) {
+    return false;
+  }
+}
+
+function saveGPUChartModePreference(perGPU) {
+  try {
+    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
+  } catch (_) {}
+}

 function refreshChartImage(el) {
  if (!el || el.dataset.loading === '1') return;
+  if (el.offsetParent === null) return;
  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  const probe = new Image();
@@ -546,14 +593,83 @@ function refreshChartImage(el) {
 }

 function refreshCharts() {
-  chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
+  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
 }
-setInterval(refreshCharts, 3000);

-fetch('/api/metrics/latest').then(r => r.json()).then(d => {
+function gpuIndices(rows) {
+  const seen = {};
+  const out = [];
+  (rows || []).forEach(function(row) {
+    const idx = Number(row.index);
+    if (!Number.isFinite(idx) || seen[idx]) return;
+    seen[idx] = true;
+    out.push(idx);
+  });
+  return out.sort(function(a, b) { return a - b; });
+}
+
+function renderGPUOverviewCards(indices) {
+  const host = document.getElementById('gpu-metrics-by-gpu');
+  if (!host) return;
+  host.innerHTML = indices.map(function(idx) {
+    return '<div class="card" style="margin-bottom:16px">' +
+      '<div class="card-head">GPU ' + idx + ' — Overview</div>' +
+      '<div class="card-body" style="padding:8px">' +
+      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + idx + ' overview">' +
+      '</div></div>';
+  }).join('');
+}
+
+function applyGPUChartMode() {
+  const perMetric = document.getElementById('gpu-metrics-by-metric');
+  const perGPU = document.getElementById('gpu-metrics-by-gpu');
+  const toggle = document.getElementById('gpu-chart-toggle');
+  const gpuModePerGPU = !!(toggle && toggle.checked);
+  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
+  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
+}
+
+function syncMetricsLayout(d) {
  const fanCard = document.getElementById('card-server-fans');
  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
-}).catch(() => {});
+  const section = document.getElementById('gpu-metrics-section');
+  const summary = document.getElementById('gpu-metrics-summary');
+  const indices = gpuIndices(d.gpus);
+  if (section) section.style.display = indices.length > 0 ? '' : 'none';
+  if (summary) {
+    summary.textContent = indices.length > 0
+      ? ('Detected GPUs: ' + indices.map(function(idx) { return 'GPU ' + idx; }).join(', '))
+      : 'No GPUs detected in live metrics.';
+  }
+  const nextKey = indices.join(',');
+  if (nextKey !== gpuChartKey) {
+    renderGPUOverviewCards(indices);
+    gpuChartKey = nextKey;
+  }
+  applyGPUChartMode();
+}
+
+function loadMetricsLayout() {
+  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
+}
+
+const gpuChartToggle = document.getElementById('gpu-chart-toggle');
+if (gpuChartToggle) {
+  gpuChartToggle.checked = loadGPUChartModePreference();
+}
+applyGPUChartMode();
+
+if (gpuChartToggle) {
+  gpuChartToggle.addEventListener('change', function() {
+    saveGPUChartModePreference(!!gpuChartToggle.checked);
+    applyGPUChartMode();
+    refreshCharts();
+  });
+}
+
+loadMetricsLayout();
+setInterval(refreshCharts, 3000);
+setInterval(loadMetricsLayout, 5000);
 </script>`
 }

@@ -670,6 +786,193 @@ func renderSATCard(id, label, extra string) string {
 		label, extra, id, id)
 }

+// ── Benchmark ─────────────────────────────────────────────────────────────────
+
+func renderBenchmark() string { // static Benchmark page body; its inline script escapes API-supplied text before writing innerHTML
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="grid2">
+  <div class="card">
+    <div class="card-head">NVIDIA Benchmark</div>
+    <div class="card-body">
+      <div class="form-row">
+        <label>Profile</label>
+        <select id="benchmark-profile">
+          <option value="standard" selected>Standard — about 15 minutes</option>
+          <option value="stability">Stability — 1 to 2 hours</option>
+          <option value="overnight">Overnight — 8 hours</option>
+        </select>
+      </div>
+      <div class="form-row">
+        <label>GPU Selection</label>
+        <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
+        </div>
+        <div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+          <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+        </div>
+      </div>
+      <label class="benchmark-cb-row">
+        <input type="checkbox" id="benchmark-run-nccl" checked>
+        <span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
+      </label>
+      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
+      <button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
+      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-head">Method</div>
+    <div class="card-body">
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
+      <table>
+        <tr><th>Profile</th><th>Purpose</th></tr>
+        <tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
+        <tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
+        <tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
+      </table>
+    </div>
+  </div>
+</div>
+
+<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
+  <div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
+.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+</style>
+
+<script>
+let benchmarkES = null;
+function benchmarkEscapeHTML(value) { const holder = document.createElement('div'); holder.textContent = String(value); return holder.innerHTML; }
+
+function benchmarkSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+
+function benchmarkUpdateSelectionNote() {
+  const selected = benchmarkSelectedGPUIndices();
+  const btn = document.getElementById('benchmark-run-btn');
+  const note = document.getElementById('benchmark-selection-note');
+  const nccl = document.getElementById('benchmark-run-nccl');
+  if (!selected.length) {
+    btn.disabled = true;
+    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
+    return;
+  }
+  btn.disabled = false;
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
+  if (nccl && nccl.checked && selected.length < 2) {
+    note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
+  } else if (nccl && nccl.checked) {
+    note.textContent += ' NCCL interconnect will use only these GPUs.';
+  }
+}
+
+function benchmarkRenderGPUList(gpus) {
+  const root = document.getElementById('benchmark-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    benchmarkUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="benchmark-gpu-row">'
+      + '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + parseInt(gpu.index, 10) + '" checked onchange="benchmarkUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + parseInt(gpu.index, 10) + '</strong> — ' + benchmarkEscapeHTML(gpu.name + mem) + '</span></label>';
+  }).join('');
+  benchmarkUpdateSelectionNote();
+}
+
+function benchmarkLoadGPUs() {
+  const status = document.getElementById('benchmark-run-status');
+  status.textContent = '';
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    benchmarkRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + benchmarkEscapeHTML(err.message) + '</p>';
+    benchmarkUpdateSelectionNote();
+  });
+}
+
+function benchmarkSelectAll() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  benchmarkUpdateSelectionNote();
+}
+
+function benchmarkSelectNone() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  benchmarkUpdateSelectionNote();
+}
+
+function runNvidiaBenchmark() {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  if (!selected.length) {
+    status.textContent = 'Select at least one GPU.';
+    return;
+  }
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  const body = {
+    profile: document.getElementById('benchmark-profile').value || 'standard',
+    gpu_indices: selected,
+    run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
+    display_name: 'NVIDIA Benchmark'
+  };
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
+  const term = document.getElementById('benchmark-terminal');
+  term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
+  status.textContent = 'Queueing...';
+  fetch('/api/benchmark/nvidia/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    status.textContent = 'Task ' + d.task_id + ' queued.';
+    term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
+    benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
+    benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    benchmarkES.addEventListener('done', function(e) {
+      benchmarkES.close();
+      benchmarkES = null;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      status.textContent = e.data ? 'Failed.' : 'Completed.';
+    });
+  }).catch(function(err) {
+    status.textContent = 'Error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+
+document.getElementById('benchmark-run-nccl').addEventListener('change', benchmarkUpdateSelectionNote);
+benchmarkLoadGPUs();
+</script>`
+}
+
 // ── Burn ──────────────────────────────────────────────────────────────────────

 func renderBurn() string {
@@ -694,11 +997,12 @@ func renderBurn() string {
 <div class="card">
  <div class="card-head">GPU Stress</div>
  <div class="card-body">
-    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">NVIDIA tools run on all discovered GPUs. DCGM is the official NVIDIA diagnostic path. NCCL exercises multi-GPU fabric and is not a full compute burn.</p>
    <div id="gpu-tools-list">
      <label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
+      <label class="cb-row"><input type="checkbox" id="burn-gpu-dcgm" value="dcgm" disabled><span>DCGM Diagnostics (Official NVIDIA) <span class="cb-note" id="note-dcgm"></span></span></label>
      <label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
-      <label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
+      <label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf (Interconnect) <span class="cb-note" id="note-nccl"></span></span></label>
      <label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
    </div>
    <button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">&#9654; Run GPU Stress</button>
@@ -770,17 +1074,18 @@ function streamTask(taskId, label) {
 }

 function runGPUStress() {
-  const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
-  const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
-  const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
-  let last = null;
-  ids.filter(id => {
-    const el = document.getElementById(id);
+  const tasks = [
+    {id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
+    {id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
+    {id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
+    {id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
+    {id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
+  ];
+  tasks.filter(t => {
+    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
-  }).forEach(id => {
-    const target = targetMap[id];
-    const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
-    enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
+  }).forEach(t => {
+    enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); });
  });
 }

@@ -817,13 +1122,15 @@ function runAll() {
  const done = () => { count++; status.textContent = count + ' tasks queued.'; };

  // GPU tests
-  const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
-  const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
-  const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
-  gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
-    const target = gpuTargetMap[id];
-    const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
-    enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
+  const gpuTasks = [
+    {id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
+    {id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
+    {id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
+    {id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
+    {id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
+  ];
+  gpuTasks.filter(t => { const el = document.getElementById(t.id); return el && el.checked && !el.disabled; }).forEach(t => {
+    enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); done(); });
  });

  // Compute tests
@@ -844,17 +1151,19 @@ function runAll() {

 // Load GPU tool availability
 fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
-  const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
-  const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
+  const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','dcgm':'burn-gpu-dcgm','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
+  const noteMap = {'bee-gpu-burn':'note-bee','dcgm':'note-dcgm','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
  tools.forEach(t => {
    const cb = document.getElementById(nvidiaMap[t.id]);
    const note = document.getElementById(noteMap[t.id]);
    if (!cb) return;
    if (t.available) {
      cb.disabled = false;
-      if (t.id === 'bee-gpu-burn') cb.checked = true;
+      if (t.id === 'bee-gpu-burn' || t.id === 'dcgm') cb.checked = true;
    } else {
-      const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
+      let reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
+      if (t.id === 'dcgm' && t.vendor === 'nvidia') reason = 'dcgmi not available or NVIDIA driver not running';
+      if (t.id === 'nccl' && t.vendor === 'nvidia') reason = 'NCCL interconnect tool unavailable or NVIDIA driver not running';
      if (note) note.textContent = '— ' + reason;
    }
  });
@@ -1014,7 +1323,8 @@ func renderNetwork() string {
 // ── Services ──────────────────────────────────────────────────────────────────

 func renderServicesInline() string {
-	return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
+<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
 <div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
 <div id="svc-out" style="display:none;margin-top:8px" class="card">
  <div class="card-head">Output</div>
@@ -1040,7 +1350,7 @@ function loadServices() {
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
-      '<table><tr><th>Service</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
+      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  });
 }
 function toggleBody(id) {
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -1,15 +1,19 @@
 package webui

 import (
+	"bufio"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"html"
+	"io"
 	"log/slog"
 	"mime"
+	"net"
 	"net/http"
 	"os"
 	"path/filepath"
+	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -18,7 +22,6 @@ import (
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 	"bee/audit/internal/runtimeenv"
-	gocharts "github.com/go-analyze/charts"
 	"reanimator/chart/viewer"
 	"reanimator/chart/web"
 )
@@ -247,6 +250,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
 	mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
+	mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)

 	// Tasks
 	mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -283,6 +287,7 @@ func NewHandler(opts HandlerOptions) http.Handler {

 	// GPU presence / tools
 	mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
+	mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
 	mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)

 	// System
@@ -309,11 +314,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("GET /", h.handlePage)

 	h.mux = mux
-	return mux
+	return recoverMiddleware(mux)
 }

 func (h *handler) startMetricsCollector() {
-	go func() {
+	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
 		for range ticker.C {
@@ -324,7 +329,7 @@ func (h *handler) startMetricsCollector() {
 			h.feedRings(sample)
 			h.setLatestMetric(sample)
 		}
-	}()
+	})
 }

 func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -345,7 +350,111 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
 
 // ListenAndServe starts the HTTP server.
 func ListenAndServe(addr string, opts HandlerOptions) error {
-	return http.ListenAndServe(addr, NewHandler(opts))
+	srv := &http.Server{
+		Addr:              addr,
+		Handler:           NewHandler(opts),
+		ReadHeaderTimeout: 5 * time.Second,
+		ReadTimeout:       30 * time.Second,
+		IdleTimeout:       2 * time.Minute,
+		// NOTE(review): no WriteTimeout is set; presumably so long-lived streaming
+		// responses are not cut off — confirm.
+	}
+	return srv.ListenAndServe()
+}
+
+// trackingResponseWriter wraps an http.ResponseWriter and records whether the
+// response has been started, so recoverMiddleware can tell whether a 500 can
+// still be sent after a handler panic.
+type trackingResponseWriter struct {
+	http.ResponseWriter
+	wroteHeader bool // true once the response is started or the connection is hijacked
+}
+
+// WriteHeader marks the response as started and forwards the status code.
+func (w *trackingResponseWriter) WriteHeader(statusCode int) {
+	w.wroteHeader = true
+	w.ResponseWriter.WriteHeader(statusCode)
+}
+
+// Write marks the response as started and forwards the payload.
+func (w *trackingResponseWriter) Write(p []byte) (int, error) {
+	w.wroteHeader = true
+	return w.ResponseWriter.Write(p)
+}
+
+// Flush marks the response as started and flushes the wrapped writer when it
+// implements http.Flusher.
+func (w *trackingResponseWriter) Flush() {
+	w.wroteHeader = true
+	if f, ok := w.ResponseWriter.(http.Flusher); ok {
+		f.Flush()
+	}
+}
+
+// Hijack takes over the connection when the wrapped writer implements
+// http.Hijacker. A successful hijack counts as a started response: net/http no
+// longer owns the connection, so recoverMiddleware must not write a 500 to it.
+func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
+	h, ok := w.ResponseWriter.(http.Hijacker)
+	if !ok {
+		return nil, nil, fmt.Errorf("hijacking not supported")
+	}
+	conn, rw, err := h.Hijack()
+	if err == nil {
+		w.wroteHeader = true
+	}
+	return conn, rw, err
+}
+
+// Push forwards an HTTP/2 server push to the wrapped writer, or returns
+// http.ErrNotSupported when it does not implement http.Pusher.
+func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
+	p, ok := w.ResponseWriter.(http.Pusher)
+	if !ok {
+		return http.ErrNotSupported
+	}
+	return p.Push(target, opts)
+}
+
+// ReadFrom copies r into the response, using the wrapped writer's io.ReaderFrom
+// when it has one and io.Copy otherwise. Both paths write to the client, so the
+// response is marked as started before either runs.
+func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
+	w.wroteHeader = true
+	if rf, ok := w.ResponseWriter.(io.ReaderFrom); ok {
+		return rf.ReadFrom(r)
+	}
+	return io.Copy(w.ResponseWriter, r)
+}
+
+// recoverMiddleware wraps next so that a handler panic is logged with its stack
+// trace and answered with a plain 500 when no part of the response has been
+// sent yet. http.ErrAbortHandler is re-panicked so net/http can abort the
+// response as documented.
+func recoverMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		tw := &trackingResponseWriter{ResponseWriter: w}
+		defer func() {
+			rec := recover()
+			if rec == nil {
+				return
+			}
+			if rec == http.ErrAbortHandler {
+				// Deliberate abort: let the server tear the connection down quietly.
+				panic(rec)
+			}
+			slog.Error("http handler panic",
+				"method", r.Method,
+				"path", r.URL.Path,
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			if !tw.wroteHeader {
+				http.Error(tw, "internal server error", http.StatusInternalServerError)
+			}
+		}()
+		next.ServeHTTP(tw, r)
+	})
 }

 // ── Infrastructure handlers ──────────────────────────────────────────────────
@@ -475,13 +554,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
 		return
 	}
-	datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
+	samples, err := h.metricsDB.LoadAll()
+	if err != nil || len(samples) == 0 {
+		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+		return
+	}
+	timeline := metricsTimelineSegments(samples, time.Now())
+	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
+		buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
+		if err != nil {
+			http.Error(w, err.Error(), http.StatusInternalServerError)
+			return
+		}
+		if !ok {
+			http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
+			return
+		}
+		w.Header().Set("Content-Type", "image/svg+xml")
+		w.Header().Set("Cache-Control", "no-store")
+		_, _ = w.Write(buf)
+		return
+	}
+	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
 	if !ok {
 		http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
 		return
 	}

-	buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
+	buf, err := renderMetricChartSVG(
+		title,
+		labels,
+		sampleTimes(samples),
+		datasets,
+		names,
+		yMin,
+		yMax,
+		chartCanvasHeightForPath(path, len(names)),
+		timeline,
+	)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -491,14 +601,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
 	_, _ = w.Write(buf)
 }

-func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
-	samples, err := h.metricsDB.LoadAll()
-	if err != nil || len(samples) == 0 {
-		return nil, nil, nil, "", nil, nil, false
-	}
-	return chartDataFromSamples(path, samples)
-}
-
 func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
 	var datasets [][]float64
 	var names []string
@@ -578,15 +680,21 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 		yMin = floatPtr(0)
 		yMax = autoMax120(datasets...)

+	case path == "gpu-all-clock":
+		title = "GPU Core Clock"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+		yMin, yMax = autoBounds120(datasets...)
+
+	case path == "gpu-all-memclock":
+		title = "GPU Memory Clock"
+		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
+		yMin, yMax = autoBounds120(datasets...)
+
 	case strings.HasPrefix(path, "gpu/"):
-		rest := strings.TrimPrefix(path, "gpu/")
-		sub := ""
-		if i := strings.LastIndex(rest, "-"); i > 0 {
-			sub = rest[i+1:]
-			rest = rest[:i]
+		idx, sub, ok := parseGPUChartPath(path)
+		if !ok {
+			return nil, nil, nil, "", nil, nil, false
 		}
-		idx := 0
-		fmt.Sscanf(rest, "%d", &idx)
 		switch sub {
 		case "load":
 			title = fmt.Sprintf("GPU %d Load", idx)
@@ -609,6 +717,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 			names = []string{"Temp °C"}
 			yMin = floatPtr(0)
 			yMax = autoMax120(temp)
+		case "clock":
+			title = fmt.Sprintf("GPU %d Core Clock", idx)
+			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
+			if clock == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{clock}
+			names = []string{"Core Clock MHz"}
+			yMin, yMax = autoBounds120(clock)
+		case "memclock":
+			title = fmt.Sprintf("GPU %d Memory Clock", idx)
+			clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
+			if clock == nil {
+				return nil, nil, nil, "", nil, nil, false
+			}
+			datasets = [][]float64{clock}
+			names = []string{"Memory Clock MHz"}
+			yMin, yMax = autoBounds120(clock)
 		default:
 			title = fmt.Sprintf("GPU %d Power", idx)
 			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
@@ -627,6 +753,33 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
 	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
 }
 
+// parseGPUChartPath splits a per-GPU chart path of the form "gpu/<index>" or
+// "gpu/<index>-<sub>" into the GPU index and the chart kind. sub is empty when
+// the path has no "-<sub>" suffix. ok is false when the prefix is missing or
+// <index> is not a plain decimal number.
+func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
+	rest, found := strings.CutPrefix(path, "gpu/")
+	if !found || rest == "" {
+		return 0, "", false
+	}
+	if i := strings.LastIndex(rest, "-"); i > 0 {
+		sub = rest[i+1:]
+		rest = rest[:i]
+	}
+	// Sscanf stops at the first non-digit and also accepts a sign, so "1x" or
+	// "-1" would otherwise slip through; require digits only.
+	for _, r := range rest {
+		if r < '0' || r > '9' {
+			return 0, "", false
+		}
+	}
+	n, err := fmt.Sscanf(rest, "%d", &idx)
+	if err != nil || n != 1 {
+		return 0, "", false
+	}
+	return idx, sub, true
+}
+
 func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
 	labels := make([]string, len(samples))
 	if len(samples) == 0 {
@@ -852,64 +998,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
 	return floatPtr(low), floatPtr(high)
 }

-// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
-func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
-	n := len(labels)
-	if n == 0 {
-		n = 1
-		labels = []string{""}
+func gpuChartLabelIndices(total, target int) []int {
+	if total <= 0 {
+		return nil
 	}
-	for i := range datasets {
-		if len(datasets[i]) == 0 {
-			datasets[i] = make([]float64, n)
-		}
+	if total == 1 {
+		return []int{0}
 	}
-	// Append global min/avg/max to title.
-	mn, avg, mx := globalStats(datasets)
-	if mx > 0 {
-		title = fmt.Sprintf("%s    ↓%s  ~%s  ↑%s",
-			title,
-			chartLegendNumber(mn),
-			chartLegendNumber(avg),
-			chartLegendNumber(mx),
-		)
+	step := 1 // default stride; also covers target <= 0, which must not divide by zero
+	if target > 0 && total > target {
+		step = total / target
 	}
-	title = sanitizeChartText(title)
-	names = sanitizeChartTexts(names)
-	sparse := sanitizeChartTexts(sparseLabels(labels, 6))
+	var indices []int
+	for i := 0; i < total; i += step {
+		indices = append(indices, i)
+	}
+	if indices[len(indices)-1] != total-1 {
+		indices = append(indices, total-1)
+	}
+	return indices
+}

-	opt := gocharts.NewLineChartOptionWithData(datasets)
-	opt.Title = gocharts.TitleOption{Text: title}
-	opt.XAxis.Labels = sparse
-	opt.Legend = gocharts.LegendOption{SeriesNames: names}
-	if chartLegendVisible(len(names)) {
-		opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
-		opt.Legend.OverlayChart = gocharts.Ptr(false)
-	} else {
-		opt.Legend.Show = gocharts.Ptr(false)
-	}
-	opt.Symbol = gocharts.SymbolNone
-	// Right padding: reserve space for the MarkLine label (library recommendation).
-	opt.Padding = gocharts.NewBox(20, 20, 80, 20)
-	if yMin != nil || yMax != nil {
-		opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
+func chartCanvasHeightForPath(path string, seriesCount int) int {
+	height := chartCanvasHeight(seriesCount)
+	if isGPUChartPath(path) {
+		return height * 2
 	}
+	return height
+}

-	// Add a single peak mark line on the series that holds the global maximum.
-	peakIdx, _ := globalPeakSeries(datasets)
-	if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
-		opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
-	}
-
-	p := gocharts.NewPainter(gocharts.PainterOptions{
-		OutputFormat: gocharts.ChartOutputSVG,
-		Width:        1400,
-		Height:       chartCanvasHeight(len(names)),
-	}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
-	if err := p.LineChart(opt); err != nil {
-		return nil, err
-	}
-	return p.Bytes()
+func isGPUChartPath(path string) bool {
+	return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
 }

 func chartLegendVisible(seriesCount int) bool {
@@ -923,30 +1042,6 @@ func chartCanvasHeight(seriesCount int) int {
 	return 288
 }

-func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
-	return gocharts.YAxisOption{
-		Min:            yMin,
-		Max:            yMax,
-		LabelCount:     11,
-		ValueFormatter: chartYAxisNumber,
-	}
-}
-
-// globalPeakSeries returns the index of the series containing the global maximum
-// value across all datasets, and that maximum value.
-func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
-	idx = -1
-	for i, ds := range datasets {
-		for _, v := range ds {
-			if v > peak {
-				peak = v
-				idx = i
-			}
-		}
-	}
-	return idx, peak
-}
-
 // globalStats returns min, average, and max across all values in all datasets.
 func globalStats(datasets [][]float64) (mn, avg, mx float64) {
 	var sum float64
@@ -986,21 +1081,6 @@ func sanitizeChartText(s string) string {
 	}, s))
 }

-func sanitizeChartTexts(in []string) []string {
-	out := make([]string, len(in))
-	for i, s := range in {
-		out[i] = sanitizeChartText(s)
-	}
-	return out
-}
-
-func safeIdx(s []float64, i int) float64 {
-	if i < len(s) {
-		return s[i]
-	}
-	return 0
-}
-
 func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
 	var datasets [][]float64
 	var names []string
@@ -1087,20 +1167,6 @@ func chartYAxisNumber(v float64) string {
 	return out
 }

-func sparseLabels(labels []string, n int) []string {
-	out := make([]string, len(labels))
-	step := len(labels) / n
-	if step < 1 {
-		step = 1
-	}
-	for i, l := range labels {
-		if i%step == 0 {
-			out[i] = l
-		}
-	}
-	return out
-}
-
 func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
 	if h.metricsDB == nil {
 		http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -34,6 +34,49 @@ func TestChartLegendNumber(t *testing.T) {
 	}
 }

+func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
+	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		panic("boom")
+	}))
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/panic", nil)
+
+	handler.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusInternalServerError {
+		t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
+	}
+	if !strings.Contains(rec.Body.String(), "internal server error") {
+		t.Fatalf("body=%q", rec.Body.String())
+	}
+}
+
+func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
+	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if !sseStart(w) {
+			return
+		}
+		if !sseWrite(w, "tick", "ok") {
+			t.Fatal("expected sse write to succeed")
+		}
+	}))
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/stream", nil)
+
+	handler.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
+		t.Fatalf("content-type=%q", got)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
+		t.Fatalf("body=%q", body)
+	}
+}
+
 func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
 	samples := []platform.LiveMetricSample{
 		{
@@ -136,6 +179,53 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
 	}
 }

+func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
+	samples := []platform.LiveMetricSample{
+		{
+			Timestamp: time.Now().Add(-2 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, ClockMHz: 1400, MemClockMHz: 2600},
+				{GPUIndex: 3, ClockMHz: 1500, MemClockMHz: 2800},
+			},
+		},
+		{
+			Timestamp: time.Now().Add(-1 * time.Minute),
+			GPUs: []platform.GPUMetricRow{
+				{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2610},
+				{GPUIndex: 3, ClockMHz: 1510, MemClockMHz: 2810},
+			},
+		},
+	}
+
+	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
+	if !ok {
+		t.Fatal("gpu-all-clock returned ok=false")
+	}
+	if title != "GPU Core Clock" {
+		t.Fatalf("title=%q", title)
+	}
+	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
+		t.Fatalf("names=%v", names)
+	}
+	if got := datasets[1][1]; got != 1510 {
+		t.Fatalf("GPU 3 core clock=%v want 1510", got)
+	}
+
+	datasets, names, _, title, _, _, ok = chartDataFromSamples("gpu-all-memclock", samples)
+	if !ok {
+		t.Fatal("gpu-all-memclock returned ok=false")
+	}
+	if title != "GPU Memory Clock" {
+		t.Fatalf("title=%q", title)
+	}
+	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
+		t.Fatalf("names=%v", names)
+	}
+	if got := datasets[0][0]; got != 2600 {
+		t.Fatalf("GPU 0 memory clock=%v want 2600", got)
+	}
+}
+
 func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
 	want := []float64{0, 480, 480, 480, 510, 510}
@@ -157,6 +247,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
 	if !strings.Contains(body, "el.dataset.loading === '1'") {
 		t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
 	}
+	if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
+		t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
+	}
+	if !strings.Contains(body, `id="gpu-chart-toggle"`) {
+		t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
+	}
+	if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
+		t.Fatalf("metrics page should include GPU core clock chart: %s", body)
+	}
+	if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
+		t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
+	}
+	if !strings.Contains(body, `renderGPUOverviewCards(indices)`) {
+		t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
+	}
 }

 func TestChartLegendVisible(t *testing.T) {
@@ -199,6 +304,124 @@ func TestChartCanvasHeight(t *testing.T) {
 	}
 }

+func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	end := start.Add(10 * time.Minute)
+	taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
+		s := start.Add(offsetStart)
+		e := start.Add(offsetEnd)
+		return Task{
+			Name:      "task",
+			Status:    TaskDone,
+			StartedAt: &s,
+			DoneAt:    &e,
+		}
+	}
+	segments := chartTimelineSegmentsForRange(start, end, end, []Task{
+		taskWindow(1*time.Minute, 3*time.Minute),
+		taskWindow(2*time.Minute, 5*time.Minute),
+		taskWindow(7*time.Minute, 8*time.Minute),
+	})
+	if len(segments) != 5 {
+		t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
+	}
+	wantActive := []bool{false, true, false, true, false}
+	wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
+	for i, segment := range segments {
+		if segment.Active != wantActive[i] {
+			t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
+		}
+		if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
+			t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
+		}
+		if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
+			t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
+		}
+	}
+}
+
+func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	labels := []string{"12:00", "12:01", "12:02"}
+	times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
+	svg, err := renderMetricChartSVG(
+		"System Power",
+		labels,
+		times,
+		[][]float64{{300, 320, 310}},
+		[]string{"Power W"},
+		floatPtr(0),
+		floatPtr(400),
+		360,
+		[]chartTimelineSegment{
+			{Start: start, End: start.Add(time.Minute), Active: false},
+			{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
+		},
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+	body := string(svg)
+	if !strings.Contains(body, `data-role="timeline-overlay"`) {
+		t.Fatalf("svg missing timeline overlay: %s", body)
+	}
+	if !strings.Contains(body, `opacity="0.10"`) {
+		t.Fatalf("svg missing idle overlay opacity: %s", body)
+	}
+	if !strings.Contains(body, `System Power`) {
+		t.Fatalf("svg missing chart title: %s", body)
+	}
+}
+
+func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
+	dir := t.TempDir()
+	db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	t.Cleanup(func() { _ = db.db.Close() })
+
+	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	for i, sample := range []platform.LiveMetricSample{
+		{Timestamp: start, PowerW: 300},
+		{Timestamp: start.Add(time.Minute), PowerW: 320},
+		{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
+	} {
+		if err := db.Write(sample); err != nil {
+			t.Fatalf("write sample %d: %v", i, err)
+		}
+	}
+
+	globalQueue.mu.Lock()
+	prevTasks := globalQueue.tasks
+	s := start.Add(30 * time.Second)
+	e := start.Add(90 * time.Second)
+	globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = prevTasks
+		globalQueue.mu.Unlock()
+	})
+
+	h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
+
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
+	h.handleMetricsChartSVG(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `data-role="timeline-overlay"`) {
+		t.Fatalf("custom svg response missing timeline overlay: %s", body)
+	}
+	if !strings.Contains(body, `stroke-linecap="round"`) {
+		t.Fatalf("custom svg response missing custom polyline styling: %s", body)
+	}
+}
+
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -212,21 +435,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	}
 }

-func TestChartYAxisOption(t *testing.T) {
-	min := floatPtr(0)
-	max := floatPtr(100)
-	opt := chartYAxisOption(min, max)
-	if opt.Min != min || opt.Max != max {
-		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
-	}
-	if opt.LabelCount != 11 {
-		t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
-	}
-	if got := opt.ValueFormatter(1000); got != "1к" {
-		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
-	}
-}
-
 func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
 	r1 := newMetricsRing(4)
 	r2 := newMetricsRing(4)
@@ -409,6 +617,47 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 	}
 }

+func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{
+		`href="/benchmark"`,
+		`id="benchmark-gpu-list"`,
+		`/api/gpu/nvidia`,
+		`/api/benchmark/nvidia/run`,
+		`benchmark-run-nccl`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("benchmark page missing %q: %s", needle, body)
+		}
+	}
+}
+
+func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	for _, needle := range []string{
+		`DCGM Diagnostics (Official NVIDIA)`,
+		`NCCL all_reduce_perf (Interconnect)`,
+		`DCGM is the official NVIDIA diagnostic path`,
+		`burn-gpu-dcgm`,
+	} {
+		if !strings.Contains(body, needle) {
+			t.Fatalf("burn page missing %q: %s", needle, body)
+		}
+	}
+}
+
 func TestTasksPageRendersScrollableLogModal(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -0,0 +1,49 @@
+package webui
+
+import (
+	"fmt"
+	"log/slog"
+	"runtime/debug"
+	"time"
+)
+
+// goRecoverLoop starts a goroutine that runs fn and, each time fn panics,
+// logs the panic, waits restartDelay (when positive) and runs fn again.
+// The goroutine ends as soon as fn returns without panicking.
+func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
+	go func() {
+		for runRecoverable(name, fn) {
+			if restartDelay > 0 {
+				time.Sleep(restartDelay)
+			}
+		}
+	}()
+}
+
+// goRecoverOnce runs fn once in a new goroutine. A panic in fn is logged
+// by runRecoverable and not retried.
+func goRecoverOnce(name string, fn func()) {
+	go func() {
+		runRecoverable(name, fn)
+	}()
+}
+
+// runRecoverable calls fn and reports whether it panicked. A panic is
+// recovered and logged through slog with the component name, the panic
+// value and the current stack; it does not propagate to the caller.
+func runRecoverable(name string, fn func()) (panicked bool) {
+	defer func() {
+		rec := recover()
+		if rec == nil {
+			return
+		}
+		panicked = true
+		slog.Error("recovered panic",
+			"component", name,
+			"panic", fmt.Sprint(rec),
+			"stack", string(debug.Stack()),
+		)
+	}()
+	fn()
+	return false
+}
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -4,10 +4,12 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"log/slog"
 	"net/http"
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime/debug"
 	"sort"
 	"strings"
 	"sync"
@@ -28,22 +30,23 @@ const (

 // taskNames maps target → human-readable name for validate (SAT) runs.
 var taskNames = map[string]string{
-	"nvidia":          "NVIDIA SAT",
-	"nvidia-stress":   "NVIDIA GPU Stress",
-	"memory":          "Memory SAT",
-	"storage":         "Storage SAT",
-	"cpu":             "CPU SAT",
-	"amd":             "AMD GPU SAT",
-	"amd-mem":         "AMD GPU MEM Integrity",
-	"amd-bandwidth":   "AMD GPU MEM Bandwidth",
-	"amd-stress":      "AMD GPU Burn-in",
-	"memory-stress":   "Memory Burn-in",
-	"sat-stress":      "SAT Stress (stressapptest)",
-	"platform-stress": "Platform Thermal Cycling",
-	"audit":           "Audit",
-	"support-bundle":  "Support Bundle",
-	"install":         "Install to Disk",
-	"install-to-ram":  "Install to RAM",
+	"nvidia":           "NVIDIA SAT",
+	"nvidia-benchmark": "NVIDIA Benchmark",
+	"nvidia-stress":    "NVIDIA GPU Stress",
+	"memory":           "Memory SAT",
+	"storage":          "Storage SAT",
+	"cpu":              "CPU SAT",
+	"amd":              "AMD GPU SAT",
+	"amd-mem":          "AMD GPU MEM Integrity",
+	"amd-bandwidth":    "AMD GPU MEM Bandwidth",
+	"amd-stress":       "AMD GPU Burn-in",
+	"memory-stress":    "Memory Burn-in",
+	"sat-stress":       "SAT Stress (stressapptest)",
+	"platform-stress":  "Platform Thermal Cycling",
+	"audit":            "Audit",
+	"support-bundle":   "Support Bundle",
+	"install":          "Install to Disk",
+	"install-to-ram":   "Install to RAM",
 }

 // burnNames maps target → human-readable name when a burn profile is set.
@@ -106,8 +109,11 @@ type taskParams struct {
 	DiagLevel          int      `json:"diag_level,omitempty"`
 	GPUIndices         []int    `json:"gpu_indices,omitempty"`
 	ExcludeGPUIndices  []int    `json:"exclude_gpu_indices,omitempty"`
+	SizeMB             int      `json:"size_mb,omitempty"`
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
+	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -377,7 +383,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 	if !q.started {
 		q.loadLocked()
 		q.started = true
-		go q.worker()
+		goRecoverLoop("task worker", 2*time.Second, q.worker)
 	}
 	hasPending := q.nextPending() != nil
 	q.mu.Unlock()
@@ -392,78 +398,106 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
 func (q *taskQueue) worker() {
 	for {
 		<-q.trigger
-		setCPUGovernor("performance")
+		func() {
+			setCPUGovernor("performance")
+			defer setCPUGovernor("powersave")

-		// Drain all pending tasks and start them in parallel.
-		q.mu.Lock()
-		var batch []*Task
-		for {
-			t := q.nextPending()
-			if t == nil {
-				break
+			// Drain all pending tasks and start them in parallel.
+			q.mu.Lock()
+			var batch []*Task
+			for {
+				t := q.nextPending()
+				if t == nil {
+					break
+				}
+				now := time.Now()
+				t.Status = TaskRunning
+				t.StartedAt = &now
+				t.DoneAt = nil
+				t.ErrMsg = ""
+				j := newTaskJobState(t.LogPath)
+				t.job = j
+				batch = append(batch, t)
 			}
-			now := time.Now()
-			t.Status = TaskRunning
-			t.StartedAt = &now
-			t.DoneAt = nil
-			t.ErrMsg = ""
-			j := newTaskJobState(t.LogPath)
-			t.job = j
-			batch = append(batch, t)
-		}
-		if len(batch) > 0 {
-			q.persistLocked()
-		}
-		q.mu.Unlock()
+			if len(batch) > 0 {
+				q.persistLocked()
+			}
+			q.mu.Unlock()

-		var wg sync.WaitGroup
-		for _, t := range batch {
-			t := t
-			j := t.job
-			taskCtx, taskCancel := context.WithCancel(context.Background())
-			j.cancel = taskCancel
-			wg.Add(1)
-			go func() {
-				defer wg.Done()
-
-				if q.kmsgWatcher != nil && isSATTarget(t.Target) {
-					q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
-				}
-
-				q.runTask(t, j, taskCtx)
-
-				if q.kmsgWatcher != nil {
-					q.kmsgWatcher.NotifyTaskFinished(t.ID)
-				}
+			var wg sync.WaitGroup
+			for _, t := range batch {
+				t := t
+				j := t.job
+				taskCtx, taskCancel := context.WithCancel(context.Background())
+				j.cancel = taskCancel
+				wg.Add(1)
+				goRecoverOnce("task "+t.Target, func() {
+					defer wg.Done()
+					defer taskCancel()
+					q.executeTask(t, j, taskCtx)
+				})
+			}
+			wg.Wait()

+			if len(batch) > 0 {
 				q.mu.Lock()
-				now2 := time.Now()
-				t.DoneAt = &now2
-				if t.Status == TaskRunning {
-					if j.err != "" {
-						t.Status = TaskFailed
-						t.ErrMsg = j.err
-					} else {
-						t.Status = TaskDone
-					}
-				}
+				q.prune()
 				q.persistLocked()
 				q.mu.Unlock()
-			}()
-		}
-		wg.Wait()
+			}
+		}()

-		if len(batch) > 0 {
-			q.mu.Lock()
-			q.prune()
-			q.persistLocked()
-			q.mu.Unlock()
-		}
-
-		setCPUGovernor("powersave")
 	}
 }

+// executeTask runs t via runTask; its defers record a panic as the job error, close the kmsg watch, then finalize t.
+func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
+	defer q.finalizeTaskRun(t, j)
+	kmsgOpen := false
+	defer func() {
+		if kmsgOpen && q.kmsgWatcher != nil {
+			q.kmsgWatcher.NotifyTaskFinished(t.ID)
+		}
+	}()
+	defer func() {
+		if rec := recover(); rec != nil {
+			reason := fmt.Sprintf("task panic: %v", rec)
+			slog.Error("task panic",
+				"task_id", t.ID,
+				"target", t.Target,
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			j.append("ERROR: " + reason)
+			j.finish(reason)
+		}
+	}()
+
+	if watcher := q.kmsgWatcher; watcher != nil && isSATTarget(t.Target) {
+		watcher.NotifyTaskStarted(t.ID, t.Target)
+		kmsgOpen = true
+	}
+	q.runTask(t, j, ctx)
+}
+
+// finalizeTaskRun stamps t.DoneAt and, if t is still running, marks it failed (with j.err) or done, then persists.
+func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	finishedAt := time.Now()
+	t.DoneAt = &finishedAt
+	if t.Status == TaskRunning {
+		// An empty j.err means success, so copying it also clears any stale message.
+		t.ErrMsg = j.err
+		t.Status = TaskDone
+		if j.err != "" {
+			t.Status = TaskFailed
+		}
+	}
+	q.persistLocked()
+}
+
 // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
 // Silently ignores errors (e.g. when cpufreq is not available).
 func setCPUGovernor(governor string) {
@@ -517,6 +551,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		} else {
 			archive, err = a.RunNvidiaAcceptancePack("", j.append)
 		}
+	case "nvidia-benchmark":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
+			Profile:           t.params.BenchmarkProfile,
+			SizeMB:            t.params.SizeMB,
+			GPUIndices:        t.params.GPUIndices,
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
+			RunNCCL:           t.params.RunNCCL,
+		}, j.append)
 	case "nvidia-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -467,3 +467,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
 		t.Fatalf("unexpected error: %q", j.err)
 	}
 }
+
+// TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow makes the CPU pack hook panic and checks executeTask's cleanup.
+func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
+	tmp := t.TempDir()
+	queue := &taskQueue{
+		opts:        &HandlerOptions{App: &app.App{}},
+		statePath:   filepath.Join(tmp, "tasks-state.json"),
+		logsDir:     filepath.Join(tmp, "tasks"),
+		kmsgWatcher: newKmsgWatcher(nil),
+	}
+	task := &Task{
+		ID:        "cpu-panic-1",
+		Name:      "CPU SAT",
+		Target:    "cpu",
+		Status:    TaskRunning,
+		CreatedAt: time.Now(),
+	}
+	job := &jobState{}
+	saved := runCPUAcceptancePackCtx
+	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
+		panic("boom")
+	}
+	defer func() { runCPUAcceptancePackCtx = saved }()
+
+	queue.executeTask(task, job, context.Background())
+
+	if task.Status != TaskFailed {
+		t.Fatalf("status=%q want %q", task.Status, TaskFailed)
+	}
+	if task.DoneAt == nil {
+		t.Fatal("expected done_at to be set")
+	}
+	if !strings.Contains(task.ErrMsg, "task panic: boom") {
+		t.Fatalf("task error=%q", task.ErrMsg)
+	}
+	if !strings.Contains(job.err, "task panic: boom") {
+		t.Fatalf("job error=%q", job.err)
+	}
+	watcher := queue.kmsgWatcher
+	watcher.mu.Lock()
+	activeCount, window := watcher.activeCount, watcher.window
+	watcher.mu.Unlock()
+	if activeCount != 0 {
+		t.Fatalf("activeCount=%d want 0", activeCount)
+	}
+	if window != nil {
+		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
+	}
+}
--- a/2
+++ b/2
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
 systemctl enable bee-audit.service
 systemctl enable bee-web.service
 systemctl enable bee-sshsetup.service
+systemctl enable bee-selfheal.timer
 systemctl enable ssh.service
 systemctl enable lightdm.service 2>/dev/null || true
 systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup   2>/dev/null || true
 chmod +x /usr/local/bin/bee-smoketest  2>/dev/null || true
 chmod +x /usr/local/bin/bee            2>/dev/null || true
 chmod +x /usr/local/bin/bee-log-run    2>/dev/null || true
+chmod +x /usr/local/bin/bee-selfheal   2>/dev/null || true
 if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
--- a/iso/builder/smoketest.sh
+++ b/iso/builder/smoketest.sh
@@ -171,6 +171,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
    fi
 done

+if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
+    ok "timer active: bee-selfheal.timer"
+else
+    fail "timer NOT active: bee-selfheal.timer"
+fi
+
 echo ""
 echo "-- runtime health --"
 if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
--- a/iso/overlay/etc/systemd/system/bee-selfheal.service
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.service
@@ -0,0 +1,9 @@
+[Unit]
+Description=Bee: periodic runtime self-heal
+After=bee-web.service bee-audit.service bee-preflight.service
+
+[Service]
+Type=oneshot
+ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
+StandardOutput=journal
+StandardError=journal
--- a/iso/overlay/etc/systemd/system/bee-selfheal.timer
+++ b/iso/overlay/etc/systemd/system/bee-selfheal.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Bee: run self-heal checks periodically
+
+[Timer]
+OnBootSec=45sec
+OnUnitActiveSec=60sec
+AccuracySec=15sec
+Unit=bee-selfheal.service
+
+[Install]
+WantedBy=timers.target
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,11 +1,12 @@
 [Unit]
 Description=Bee: hardware audit web viewer
+StartLimitIntervalSec=0

 [Service]
 Type=simple
 ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
 Restart=always
-RestartSec=2
+RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
--- a/iso/overlay/usr/local/bin/bee-selfheal
+++ b/iso/overlay/usr/local/bin/bee-selfheal
@@ -0,0 +1,99 @@
+#!/bin/bash
+# bee-selfheal — periodic best-effort recovery for critical live ISO services.
+
+set -u
+
+LOG_PREFIX="bee-selfheal"
+EXPORT_DIR="/appdata/bee/export"
+AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
+RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
+LOCK_DIR="/run/bee-selfheal.lock"
+
+log() {
+    echo "[${LOG_PREFIX}] $*"
+}
+
+have_nvidia_gpu() {
+    lspci -nn 2>/dev/null | grep -qi '10de:'
+}
+
+service_active() {
+    systemctl is-active --quiet "$1" 2>/dev/null
+}
+
+# restart_service UNIT: restart UNIT with systemctl and log the outcome; returns 0 on success, 1 on failure.
+restart_service() {
+    if ! systemctl restart "$1" >/dev/null 2>&1; then
+        log "WARN: failed to restart $1"
+        return 1
+    fi
+    log "restarted $1"
+    return 0
+}
+
+file_ready() {
+    [ -s "$1" ]
+}
+
+# artifact_state PATH: print "ready" if PATH is non-empty, "interrupted" if PATH.tmp exists, else "missing".
+artifact_state() {
+    local path="$1"
+    if file_ready "${path}"; then
+        echo "ready"
+    elif [ -e "${path}.tmp" ]; then
+        echo "interrupted"
+    else
+        echo "missing"
+    fi
+    return 0
+}
+
+web_healthy() {
+    # Bound the probe: a bee-web that accepts the connection but never answers would otherwise block this run forever.
+    timeout 10 bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' >/dev/null 2>&1
+}
+
+mkdir -p "${EXPORT_DIR}" /run
+
+if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
+    log "another self-heal run is already active"
+    exit 0
+fi
+trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
+
+log "start"
+
+if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
+    log "NVIDIA GPU detected but /dev/nvidia0 is missing"
+    restart_service bee-nvidia.service || true
+fi
+
+runtime_state="$(artifact_state "${RUNTIME_JSON}")"
+if [ "${runtime_state}" != "ready" ]; then
+    if [ "${runtime_state}" = "interrupted" ]; then
+        log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
+    else
+        log "runtime-health.json missing or empty"
+    fi
+    restart_service bee-preflight.service || true
+fi
+
+audit_state="$(artifact_state "${AUDIT_JSON}")"
+if [ "${audit_state}" != "ready" ]; then
+    if [ "${audit_state}" = "interrupted" ]; then
+        log "bee-audit.json.tmp exists — interrupted audit write detected"
+    else
+        log "bee-audit.json missing or empty"
+    fi
+    restart_service bee-audit.service || true
+fi
+
+if ! service_active bee-web.service; then
+    log "bee-web.service is not active"
+    restart_service bee-web.service || true
+elif ! web_healthy; then
+    log "bee-web health check failed"
+    restart_service bee-web.service || true
+fi
+
+log "done"
Author	SHA1	Message	Date
Michael Chus	25af2df23a	Unify metrics charts on custom SVG renderer	2026-04-05 12:17:50 +03:00
Michael Chus	20abff7f90	WIP: checkpoint current tree	2026-04-05 12:05:00 +03:00
Michael Chus	a14ec8631c	Persist GPU chart mode and expand GPU charts	2026-04-05 11:52:32 +03:00
Michael Chus	f58c7e58d3	Fix webui streaming recovery regressions	2026-04-05 10:39:09 +03:00
Michael Chus	bf47c8dbd2	Add NVIDIA benchmark reporting flow	2026-04-05 10:30:56 +03:00
Michael Chus	143b7dca5d	Add stability hardening and self-heal recovery	2026-04-05 10:29:37 +03:00
Michael Chus	9826d437a5	Add GPU clock charts and grouped GPU metrics view	2026-04-05 09:57:38 +03:00