diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index 5fb6c1e..9bf7827 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -8,6 +8,7 @@ import ( "log/slog" "os" "runtime/debug" + "strconv" "strings" "bee/audit/internal/app" @@ -35,15 +36,13 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) { Level: slog.LevelInfo, }))) defer func() { - rec := recover() - if rec == nil { - return + if rec := recover(); rec != nil { + slog.Error("fatal panic", + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + exitCode = 1 } - slog.Error("fatal panic", - "panic", fmt.Sprint(rec), - "stack", string(debug.Stack()), - ) - exitCode = 1 }() if len(args) == 0 { @@ -70,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) { return runWeb(args[1:], stdout, stderr) case "sat": return runSAT(args[1:], stdout, stderr) + case "benchmark": + return runBenchmark(args[1:], stdout, stderr) case "version", "--version", "-version": fmt.Fprintln(stdout, Version) return 0 @@ -88,6 +89,7 @@ func printRootUsage(w io.Writer) { bee support-bundle --output stdout|file: bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` bee sat nvidia|memory|storage|cpu [--duration ] + bee benchmark nvidia [--profile standard|stability|overnight] bee version bee help [command]`) } @@ -106,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int { return runWeb([]string{"--help"}, stdout, stdout) case "sat": return runSAT([]string{"--help"}, stdout, stderr) + case "benchmark": + return runBenchmark([]string{"--help"}, stdout, stderr) case "version": fmt.Fprintln(stdout, "usage: bee version") return 0 @@ -395,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int { slog.Info("sat archive written", "target", target, "path", archive) return 0 } + +func runBenchmark(args []string, stdout, stderr io.Writer) int { + if len(args) == 0 { + fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile 
standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 2 + } + if args[0] == "help" || args[0] == "--help" || args[0] == "-h" { + fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 0 + } + target := args[0] + if target != "nvidia" { + fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target) + fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 2 + } + + fs := flag.NewFlagSet("benchmark", flag.ContinueOnError) + fs.SetOutput(stderr) + profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight") + devices := fs.String("devices", "", "comma-separated GPU indices to include") + exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude") + sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)") + skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark") + if err := fs.Parse(args[1:]); err != nil { + if err == flag.ErrHelp { + return 0 + } + return 2 + } + if fs.NArg() != 0 { + fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n") + return 2 + } + + includeIndices, err := parseBenchmarkIndexCSV(*devices) + if err != nil { + fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err) + return 2 + } + excludeIndices, err := parseBenchmarkIndexCSV(*exclude) + if err != nil { + fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err) + return 2 + } + + application := app.New(platform.New()) + logLine := func(s string) { fmt.Fprintln(stderr, s) } // use the injected stderr writer, matching the other run* commands + archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{ + Profile: *profile, + SizeMB: *sizeMB, + GPUIndices: includeIndices, + ExcludeGPUIndices: excludeIndices, + 
RunNCCL: !*skipNCCL, + }, logLine) + if err != nil { + slog.Error("run benchmark", "target", target, "err", err) + return 1 + } + slog.Info("benchmark archive written", "target", target, "path", archive) + return 0 +} + +func parseBenchmarkIndexCSV(raw string) ([]int, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, nil + } + var indices []int + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + value, err := strconv.Atoi(part) + if err != nil || value < 0 { + return nil, fmt.Errorf("bad gpu index %q", part) + } + indices = append(indices, value) + } + return indices, nil +} diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 1a18863..96fd5f7 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -19,17 +19,18 @@ import ( ) var ( - DefaultExportDir = "/appdata/bee/export" - DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" - DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" - DefaultWebLogPath = DefaultExportDir + "/bee-web.log" - DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" - DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" - DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" - DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" - DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" - DefaultTechDumpDir = DefaultExportDir + "/techdump" - DefaultSATBaseDir = DefaultExportDir + "/bee-sat" + DefaultExportDir = "/appdata/bee/export" + DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" + DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" + DefaultWebLogPath = DefaultExportDir + "/bee-web.log" + DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" + DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" + DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" + DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" + 
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" + DefaultTechDumpDir = DefaultExportDir + "/techdump" + DefaultSATBaseDir = DefaultExportDir + "/bee-sat" + DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark" ) type App struct { @@ -114,6 +115,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error { type satRunner interface { RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) + RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) @@ -195,11 +197,10 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - err := atomicWriteFile(path, append(data, '\n'), 0644) - if err == nil { - return path, nil + if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { + return "", err } - return "", err + return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } @@ -221,11 +222,10 @@ func (a *App) RunRuntimePreflight(output string) (string, error) { return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - err := atomicWriteFile(path, append(data, '\n'), 0644) - if err == nil { - return path, nil + if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { + return "", err } - return "", err + 
return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } @@ -532,6 +532,17 @@ func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOpti return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) } +func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBenchmarkBaseDir + } + return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) +} + func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 9fa9bc3..6ac733f 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -120,15 +120,16 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus { } type fakeSAT struct { - runNvidiaFn func(string) (string, error) - runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) - runMemoryFn func(string) (string, error) - runStorageFn func(string) (string, error) - runCPUFn func(string, int) (string, error) - detectVendorFn func() string - listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) - runAMDPackFn func(string) (string, error) - listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) + runNvidiaFn func(string) (string, error) + runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) + runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) + runMemoryFn func(string) 
(string, error) + runStorageFn func(string) (string, error) + runCPUFn func(string, int) (string, error) + detectVendorFn func() string + listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) + runAMDPackFn func(string) (string, error) + listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) } func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { @@ -139,6 +140,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s return f.runNvidiaFn(baseDir) } +func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) { + if f.runNvidiaBenchmarkFn != nil { + return f.runNvidiaBenchmarkFn(baseDir, opts) + } + return f.runNvidiaFn(baseDir) +} + func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) { if f.runNvidiaStressFn != nil { return f.runNvidiaStressFn(baseDir, opts) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go new file mode 100644 index 0000000..1b401ee --- /dev/null +++ b/audit/internal/platform/benchmark.go @@ -0,0 +1,1009 @@ +package platform + +import ( + "context" + "encoding/csv" + "encoding/json" + "fmt" + "math" + "os" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" + "time" +) + +const benchmarkVersion = "1" + +type benchmarkProfileSpec struct { + Name string + BaselineSec int + WarmupSec int + SteadySec int + NCCLSec int + CooldownSec int +} + +type benchmarkGPUInfo struct { + Index int + UUID string + Name string + BusID string + VBIOS string + PowerLimitW float64 + MaxGraphicsClockMHz float64 + MaxMemoryClockMHz float64 +} + +type benchmarkBurnProfile struct { + name string + category string + supported bool + lanes int + m uint64 + n uint64 + k uint64 + iterations uint64 + notes string +} + +type benchmarkBurnParseResult struct { + Device string + ComputeCapability string + 
Backend string + DurationSec int + Profiles []BenchmarkPrecisionResult + Fallback bool +} + +type benchmarkRestoreAction struct { + name string + fn func() +} + +var ( + benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`) + benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`) + benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`) +) + +func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if ctx == nil { + ctx = context.Background() + } + if logFunc == nil { + logFunc = func(string) {} + } + if strings.TrimSpace(baseDir) == "" { + baseDir = "/var/log/bee-benchmark" + } + spec := resolveBenchmarkProfile(opts.Profile) + opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) + + selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) + if err != nil { + return "", err + } + if len(selected) == 0 { + return "", fmt.Errorf("no NVIDIA GPUs selected") + } + + ts := time.Now().UTC().Format("20060102-150405") + runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) + if err := os.MkdirAll(runDir, 0755); err != nil { + return "", fmt.Errorf("mkdir %s: %w", runDir, err) + } + verboseLog := filepath.Join(runDir, "verbose.log") + + hostname, _ := os.Hostname() + result := NvidiaBenchmarkResult{ + BenchmarkVersion: benchmarkVersion, + GeneratedAt: time.Now().UTC(), + Hostname: hostname, + BenchmarkProfile: spec.Name, + SelectedGPUIndices: append([]int(nil), selected...), + Normalization: BenchmarkNormalization{ + Status: "full", + }, + } + + logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) + + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) + if infoErr != nil { + result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) + result.Normalization.Status = "partial" + } 
+ + if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { + _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) + } + + activeApps, err := queryActiveComputeApps(selected) + if err == nil && len(activeApps) > 0 { + result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark") + result.Normalization.Notes = append(result.Normalization.Notes, activeApps...) + result.Normalization.Status = "partial" + } + + restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result) + defer func() { + for i := len(restoreActions) - 1; i >= 0; i-- { + restoreActions[i].fn() + } + }() + + for _, idx := range selected { + gpuResult := BenchmarkGPUResult{ + Index: idx, + Status: "FAILED", + } + if info, ok := infoByIndex[idx]; ok { + gpuResult.UUID = info.UUID + gpuResult.Name = info.Name + gpuResult.BusID = info.BusID + gpuResult.VBIOS = info.VBIOS + gpuResult.PowerLimitW = info.PowerLimitW + gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz + gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz + } + if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { + gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz + gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz + } + + baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) + } + gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows) + + warmupCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.WarmupSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + } + logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) + warmupOut, _, 
warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc) + _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644) + if warmupErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) + continue + } + + beforeThrottle, _ := queryThrottleCounters(idx) + steadyCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.SteadySec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + } + logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec)) + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) + _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644) + afterThrottle, _ := queryThrottleCounters(idx) + if steadyErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) + } + + parseResult := parseBenchmarkBurnLog(string(steadyOut)) + gpuResult.ComputeCapability = parseResult.ComputeCapability + gpuResult.Backend = parseResult.Backend + gpuResult.PrecisionResults = parseResult.Profiles + if parseResult.Fallback { + gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") + } + + gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows) + gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle) + + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) + } + gpuResult.Cooldown = 
summarizeBenchmarkTelemetry(cooldownRows) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows) + + gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) + gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) + if steadyErr != nil { + gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) + } else if parseResult.Fallback { + gpuResult.Status = "PARTIAL" + } else { + gpuResult.Status = "OK" + } + + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) + } + + if len(selected) > 1 && opts.RunNCCL { + result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) + if result.Interconnect != nil && result.Interconnect.Supported { + for i := range result.GPUs { + result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps + result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores) + } + } + } + + result.Findings = buildBenchmarkFindings(result) + result.OverallStatus = benchmarkOverallStatus(result) + + resultJSON, err := json.MarshalIndent(result, "", " ") + if err != nil { + return "", fmt.Errorf("marshal benchmark result: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { + return "", fmt.Errorf("write result.json: %w", err) + } + + report := renderBenchmarkReport(result) + if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil { + return "", fmt.Errorf("write report.txt: %w", err) + } + + summary := renderBenchmarkSummary(result) + if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil { + return "", fmt.Errorf("write summary.txt: %w", err) + } + + archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz") + if err := createTarGz(archive, runDir); err != nil { + return "", fmt.Errorf("pack benchmark archive: %w", err) + } + return 
archive, nil +} + +func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions { + switch strings.TrimSpace(strings.ToLower(opts.Profile)) { + case NvidiaBenchmarkProfileStability: + opts.Profile = NvidiaBenchmarkProfileStability + case NvidiaBenchmarkProfileOvernight: + opts.Profile = NvidiaBenchmarkProfileOvernight + default: + opts.Profile = NvidiaBenchmarkProfileStandard + } + if opts.SizeMB < 0 { + opts.SizeMB = 0 + } + opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices) + opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices) + // NOTE(review): RunNCCL is an explicit caller decision (set from --skip-nccl). + // Forcing RunNCCL=true here made --skip-nccl a silent no-op, so the + // caller's value is preserved unchanged. + return opts +} + +func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { + switch strings.TrimSpace(strings.ToLower(profile)) { + case NvidiaBenchmarkProfileStability: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300} + case NvidiaBenchmarkProfileOvernight: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300} + default: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120} + } +} + +func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { + args := []string{ + "--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory", + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
+ } + out, err := satExecCommand("nvidia-smi", args...).Output() + if err != nil { + return nil, fmt.Errorf("nvidia-smi gpu info: %w", err) + } + + r := csv.NewReader(strings.NewReader(string(out))) + r.TrimLeadingSpace = true + r.FieldsPerRecord = -1 + rows, err := r.ReadAll() + if err != nil { + return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err) + } + + infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) + for _, row := range rows { + if len(row) < 8 { + continue + } + idx, err := strconv.Atoi(strings.TrimSpace(row[0])) + if err != nil { + continue + } + infoByIndex[idx] = benchmarkGPUInfo{ + Index: idx, + UUID: strings.TrimSpace(row[1]), + Name: strings.TrimSpace(row[2]), + BusID: strings.TrimSpace(row[3]), + VBIOS: strings.TrimSpace(row[4]), + PowerLimitW: parseBenchmarkFloat(row[5]), + MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), + MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), + } + } + return infoByIndex, nil +} + +func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { + if os.Geteuid() != 0 { + result.Normalization.Status = "partial" + result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks") + for _, idx := range gpuIndices { + result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{ + Index: idx, + Notes: []string{"normalization skipped: root privileges are required"}, + }) + } + return nil + } + + var restore []benchmarkRestoreAction + for _, idx := range gpuIndices { + rec := BenchmarkNormalizationGPU{Index: idx} + if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil { + rec.PersistenceMode = "failed" + rec.Notes = append(rec.Notes, "failed to enable 
persistence mode") + result.Normalization.Status = "partial" + } else { + rec.PersistenceMode = "applied" + } + + if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 { + target := int(math.Round(info.MaxGraphicsClockMHz)) + if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil { + rec.GPUClockLockStatus = "failed" + rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out))) + result.Normalization.Status = "partial" + } else { + rec.GPUClockLockStatus = "applied" + rec.GPUClockLockMHz = float64(target) + idxCopy := idx + restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() { + _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil) + }}) + } + } + + if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 { + target := int(math.Round(info.MaxMemoryClockMHz)) + out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil) + switch { + case err == nil: + rec.MemoryClockLockStatus = "applied" + rec.MemoryClockLockMHz = float64(target) + idxCopy := idx + restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() { + _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil) + }}) + case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"): + rec.MemoryClockLockStatus = "unsupported" + rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path") + 
result.Normalization.Status = "partial" + default: + rec.MemoryClockLockStatus = "failed" + rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out))) + result.Normalization.Status = "partial" + } + } + + result.Normalization.GPUs = append(result.Normalization.GPUs, rec) + } + return restore +} + +func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) { + if durationSec <= 0 { + return nil, nil + } + deadline := time.Now().Add(time.Duration(durationSec) * time.Second) + var rows []GPUMetricRow + start := time.Now() + for { + if ctx.Err() != nil { + return rows, ctx.Err() + } + samples, err := sampleGPUMetrics(gpuIndices) + if err == nil { + elapsed := time.Since(start).Seconds() + for i := range samples { + samples[i].ElapsedSec = elapsed + } + rows = append(rows, samples...) + } + if time.Now().After(deadline) { + break + } + select { + case <-ctx.Done(): + return rows, ctx.Err() + case <-time.After(time.Second): + } + } + return rows, nil +} + +func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) { + stopCh := make(chan struct{}) + doneCh := make(chan struct{}) + var metricRows []GPUMetricRow + start := time.Now() + + go func() { + defer close(doneCh) + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + for { + select { + case <-stopCh: + return + case <-ticker.C: + samples, err := sampleGPUMetrics(gpuIndices) + if err != nil { + continue + } + elapsed := time.Since(start).Seconds() + for i := range samples { + samples[i].ElapsedSec = elapsed + } + metricRows = append(metricRows, samples...) 
+ } + } + }() + + out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) + close(stopCh) + <-doneCh + + writeBenchmarkMetricsFiles(runDir, baseName, metricRows) + return out, metricRows, err +} + +func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) { + if len(rows) == 0 { + return + } + _ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows) + _ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows) + chart := RenderGPUTerminalChart(rows) + _ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644) +} + +func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { + result := benchmarkBurnParseResult{} + lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") + profiles := make(map[string]*benchmarkBurnProfile) + for _, line := range lines { + line = stripBenchmarkPrefix(strings.TrimSpace(line)) + if line == "" { + continue + } + switch { + case strings.HasPrefix(line, "device="): + result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device=")) + case strings.HasPrefix(line, "compute_capability="): + result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability=")) + case strings.HasPrefix(line, "backend="): + result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend=")) + result.Fallback = result.Backend == "driver-ptx" + case strings.HasPrefix(line, "duration_s="): + result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s="))) + default: + if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 { + profile := ensureBenchmarkProfile(profiles, m[1]) + profile.supported = true + profile.lanes++ + profile.m, _ = strconv.ParseUint(m[3], 10, 64) + profile.n, _ = strconv.ParseUint(m[4], 10, 64) + profile.k, _ = strconv.ParseUint(m[5], 10, 64) + continue + } + if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 { + profile := 
ensureBenchmarkProfile(profiles, m[1]) + profile.supported = false + profile.notes = strings.TrimSpace(m[2]) + continue + } + if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 { + profile := ensureBenchmarkProfile(profiles, m[1]) + iters, _ := strconv.ParseUint(m[2], 10, 64) + profile.iterations += iters + } + } + } + + keys := make([]string, 0, len(profiles)) + for key := range profiles { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + profile := profiles[key] + precision := BenchmarkPrecisionResult{ + Name: profile.name, + Category: profile.category, + Supported: profile.supported, + Lanes: profile.lanes, + M: profile.m, + N: profile.n, + K: profile.k, + Iterations: profile.iterations, + Notes: profile.notes, + } + if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 { + precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12 + } + result.Profiles = append(result.Profiles, precision) + } + return result +} + +func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile { + if profile, ok := profiles[name]; ok { + return profile + } + category := "other" + switch { + case strings.HasPrefix(name, "fp32"): + category = "fp32_tf32" + case strings.HasPrefix(name, "fp16"): + category = "fp16_bf16" + case strings.HasPrefix(name, "fp8"): + category = "fp8" + case strings.HasPrefix(name, "fp4"): + category = "fp4" + } + profile := &benchmarkBurnProfile{name: name, category: category, supported: true} + profiles[name] = profile + return profile +} + +func stripBenchmarkPrefix(line string) string { + if strings.HasPrefix(line, "[gpu ") { + if idx := strings.Index(line, "] "); idx >= 0 { + return line[idx+2:] + } + } + return line +} + +func summarizeBenchmarkTelemetry(rows []GPUMetricRow) 
BenchmarkTelemetrySummary { + summary := BenchmarkTelemetrySummary{} + if len(rows) == 0 { + return summary + } + temps := make([]float64, 0, len(rows)) + powers := make([]float64, 0, len(rows)) + clocks := make([]float64, 0, len(rows)) + memClocks := make([]float64, 0, len(rows)) + usages := make([]float64, 0, len(rows)) + memUsages := make([]float64, 0, len(rows)) + summary.DurationSec = rows[len(rows)-1].ElapsedSec + summary.Samples = len(rows) + for _, row := range rows { + temps = append(temps, row.TempC) + powers = append(powers, row.PowerW) + clocks = append(clocks, row.ClockMHz) + memClocks = append(memClocks, row.MemClockMHz) + usages = append(usages, row.UsagePct) + memUsages = append(memUsages, row.MemUsagePct) + } + summary.AvgTempC = benchmarkMean(temps) + summary.P95TempC = benchmarkPercentile(temps, 95) + summary.AvgPowerW = benchmarkMean(powers) + summary.P95PowerW = benchmarkPercentile(powers, 95) + summary.AvgGraphicsClockMHz = benchmarkMean(clocks) + summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95) + summary.AvgMemoryClockMHz = benchmarkMean(memClocks) + summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95) + summary.AvgUsagePct = benchmarkMean(usages) + summary.AvgMemUsagePct = benchmarkMean(memUsages) + summary.ClockCVPct = benchmarkCV(clocks) + summary.PowerCVPct = benchmarkCV(powers) + summary.TempCVPct = benchmarkCV(temps) + summary.ClockDriftPct = benchmarkClockDrift(clocks) + return summary +} + +func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { + score := BenchmarkScorecard{} + for _, precision := range gpu.PrecisionResults { + if precision.Supported { + score.ComputeScore += precision.TeraOpsPerSec + } + } + if gpu.PowerLimitW > 0 { + score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100) + } + runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) + thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS + 
score.ThermalSustainScore = clampScore(100 - thermalRatio*100) + score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2)) + score.CompositeScore = compositeBenchmarkScore(score) + return score +} + +func compositeBenchmarkScore(score BenchmarkScorecard) float64 { + quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0) + if score.InterconnectScore > 0 { + quality += 0.10 + } + if quality > 1.10 { + quality = 1.10 + } + return score.ComputeScore * quality +} + +func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { + var reasons []string + runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) + if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 { + reasons = append(reasons, "power_capped") + } + if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 { + reasons = append(reasons, "thermal_limited") + } + if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 { + reasons = append(reasons, "sync_boost_limited") + } + if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 { + reasons = append(reasons, "low_sm_clock_vs_target") + } + if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 { + reasons = append(reasons, "variance_too_high") + } + if normalizationStatus != "full" { + reasons = append(reasons, "normalization_partial") + } + return dedupeStrings(reasons) +} + +func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult { + result := &BenchmarkInterconnectResult{ + Status: "UNSUPPORTED", + Attempted: true, + SelectedGPUIndices: append([]int(nil), gpuIndices...), + } + cmd := []string{ + "all_reduce_perf", + "-b", "512M", + "-e", "4G", + "-f", "2", + "-g", 
strconv.Itoa(len(gpuIndices)), + "--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)), + } + env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)} + logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices))) + out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc) + _ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644) + if err != nil { + result.Notes = append(result.Notes, strings.TrimSpace(string(out))) + return result + } + avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out)) + result.Status = "OK" + result.Supported = true + result.AvgAlgBWGBps = avgAlg + result.MaxAlgBWGBps = maxAlg + result.AvgBusBWGBps = avgBus + result.MaxBusBWGBps = maxBus + return result +} + +func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) { + lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") + var algs []float64 + var buses []float64 + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + fields := strings.Fields(line) + if len(fields) < 8 { + continue + } + for i := 0; i+2 < len(fields); i++ { + timeVal, err1 := strconv.ParseFloat(fields[i], 64) + algVal, err2 := strconv.ParseFloat(fields[i+1], 64) + busVal, err3 := strconv.ParseFloat(fields[i+2], 64) + if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 { + algs = append(algs, algVal) + buses = append(buses, busVal) + break + } + } + } + if len(algs) == 0 { + return 0, 0, 0, 0 + } + return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses) +} + +func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) { + out, err := satExecCommand( + "nvidia-smi", + "--id="+strconv.Itoa(gpuIndex), + 
"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown", + "--format=csv,noheader,nounits", + ).Output() + if err != nil { + return BenchmarkThrottleCounters{}, err + } + fields := strings.Split(strings.TrimSpace(string(out)), ",") + if len(fields) < 5 { + return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out))) + } + return BenchmarkThrottleCounters{ + SWPowerCapUS: parseBenchmarkUint64(fields[0]), + SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]), + SyncBoostUS: parseBenchmarkUint64(fields[2]), + HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]), + HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]), + }, nil +} + +func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters { + return BenchmarkThrottleCounters{ + SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS), + SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS), + SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS), + HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS), + HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS), + } +} + +func queryActiveComputeApps(gpuIndices []int) ([]string, error) { + args := []string{ + "--query-compute-apps=gpu_uuid,pid,process_name", + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
+ } + out, err := satExecCommand("nvidia-smi", args...).Output() + if err != nil { + return nil, err + } + var lines []string + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + lines = append(lines, line) + } + return lines, nil +} + +func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { + if gpu.Status == "" { + gpu.Status = "OK" + } + if gpu.Scores.CompositeScore == 0 { + gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores) + } + return gpu +} + +func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { + var findings []string + if result.Normalization.Status != "full" { + findings = append(findings, "Environment normalization was partial; compare results with caution.") + } + for _, gpu := range result.GPUs { + if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" { + findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index)) + continue + } + for _, reason := range gpu.DegradationReasons { + switch reason { + case "power_capped": + findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) + case "thermal_limited": + findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)) + case "sync_boost_limited": + findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) + case "low_sm_clock_vs_target": + findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index)) + case "variance_too_high": + findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) + case "normalization_partial": + findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) + } + } + if 
gpu.Backend == "driver-ptx" { + findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) + } + } + if result.Interconnect != nil && result.Interconnect.Supported { + findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps)) + } + return dedupeStrings(findings) +} + +func benchmarkOverallStatus(result NvidiaBenchmarkResult) string { + if len(result.GPUs) == 0 { + return "FAILED" + } + hasOK := false + hasPartial := result.Normalization.Status != "full" + for _, gpu := range result.GPUs { + switch gpu.Status { + case "OK": + hasOK = true + case "PARTIAL", "UNSUPPORTED": + hasPartial = true + } + } + if !hasOK { + return "FAILED" + } + if hasPartial { + return "PARTIAL" + } + return "OK" +} + +func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU { + for i := range items { + if items[i].Index == idx { + return &items[i] + } + } + return nil +} + +func classifySATErrorStatus(out []byte, err error) string { + status, _ := classifySATResult("benchmark", out, err) + if status == "UNSUPPORTED" { + return "UNSUPPORTED" + } + return "FAILED" +} + +func parseBenchmarkFloat(raw string) float64 { + raw = strings.TrimSpace(raw) + if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { + return 0 + } + value, _ := strconv.ParseFloat(raw, 64) + return value +} + +func parseBenchmarkUint64(raw string) uint64 { + raw = strings.TrimSpace(raw) + if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { + return 0 + } + value, _ := strconv.ParseUint(raw, 10, 64) + return value +} + +func benchmarkMean(values []float64) float64 { + if len(values) == 0 { + return 0 + } + var sum float64 + for _, value := range values { + sum += value + } + return sum / float64(len(values)) +} + +func benchmarkPercentile(values []float64, p 
float64) float64 { + if len(values) == 0 { + return 0 + } + copyValues := append([]float64(nil), values...) + sort.Float64s(copyValues) + if len(copyValues) == 1 { + return copyValues[0] + } + rank := (p / 100.0) * float64(len(copyValues)-1) + lower := int(math.Floor(rank)) + upper := int(math.Ceil(rank)) + if lower == upper { + return copyValues[lower] + } + frac := rank - float64(lower) + return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac +} + +func benchmarkCV(values []float64) float64 { + if len(values) == 0 { + return 0 + } + mean := benchmarkMean(values) + if mean == 0 { + return 0 + } + var variance float64 + for _, value := range values { + diff := value - mean + variance += diff * diff + } + variance /= float64(len(values)) + return math.Sqrt(variance) / mean * 100 +} + +func benchmarkClockDrift(values []float64) float64 { + if len(values) < 4 { + return 0 + } + window := len(values) / 4 + if window < 1 { + window = 1 + } + head := benchmarkMean(values[:window]) + tail := benchmarkMean(values[len(values)-window:]) + if head <= 0 || tail >= head { + return 0 + } + return ((head - tail) / head) * 100 +} + +func benchmarkMax(values []float64) float64 { + var max float64 + for i, value := range values { + if i == 0 || value > max { + max = value + } + } + return max +} + +func clampScore(value float64) float64 { + switch { + case value < 0: + return 0 + case value > 100: + return 100 + default: + return value + } +} + +func dedupeStrings(values []string) []string { + if len(values) == 0 { + return nil + } + seen := make(map[string]struct{}, len(values)) + out := make([]string, 0, len(values)) + for _, value := range values { + value = strings.TrimSpace(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + return out +} + +func saturatingSub(after, before uint64) uint64 { + if after <= before { + return 0 + } + return after - before +} + +func 
maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go new file mode 100644 index 0000000..13a3dcf --- /dev/null +++ b/audit/internal/platform/benchmark_report.go @@ -0,0 +1,141 @@ +package platform + +import ( + "fmt" + "strings" + "time" +) + +func renderBenchmarkReport(result NvidiaBenchmarkResult) string { + var b strings.Builder + fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n") + fmt.Fprintf(&b, "===========================\n\n") + fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) + fmt.Fprintf(&b, "Host: %s\n", result.Hostname) + fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus) + fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices)) + fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status) + + if len(result.Findings) > 0 { + fmt.Fprintf(&b, "Executive Summary\n") + fmt.Fprintf(&b, "-----------------\n") + for _, finding := range result.Findings { + fmt.Fprintf(&b, "- %s\n", finding) + } + b.WriteString("\n") + } + + if len(result.Warnings) > 0 { + fmt.Fprintf(&b, "Warnings\n") + fmt.Fprintf(&b, "--------\n") + for _, warning := range result.Warnings { + fmt.Fprintf(&b, "- %s\n", warning) + } + b.WriteString("\n") + } + + fmt.Fprintf(&b, "Per GPU Scorecard\n") + fmt.Fprintf(&b, "-----------------\n") + for _, gpu := range result.GPUs { + fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name) + fmt.Fprintf(&b, " Status: %s\n", gpu.Status) + fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore) + fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore) + fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore) + fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore) + fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore) + if 
gpu.Scores.InterconnectScore > 0 { + fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore) + } + if len(gpu.DegradationReasons) > 0 { + fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", ")) + } + fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz) + fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz) + if len(gpu.PrecisionResults) > 0 { + fmt.Fprintf(&b, " Precision results:\n") + for _, precision := range gpu.PrecisionResults { + if precision.Supported { + fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations) + } else { + fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes) + } + } + } + fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n", + gpu.Throttle.SWPowerCapUS, + gpu.Throttle.SWThermalSlowdownUS, + gpu.Throttle.SyncBoostUS, + gpu.Throttle.HWThermalSlowdownUS, + gpu.Throttle.HWPowerBrakeSlowdownUS, + ) + if len(gpu.Notes) > 0 { + fmt.Fprintf(&b, " Notes:\n") + for _, note := range gpu.Notes { + fmt.Fprintf(&b, " - %s\n", note) + } + } + b.WriteString("\n") + } + + if result.Interconnect != nil { + fmt.Fprintf(&b, "Interconnect\n") + fmt.Fprintf(&b, "------------\n") + fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status) + if result.Interconnect.Supported { + fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps) + fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps) + } + for _, note := range result.Interconnect.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + b.WriteString("\n") + } + + fmt.Fprintf(&b, 
"Methodology\n") + fmt.Fprintf(&b, "-----------\n") + fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n") + fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n") + fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n") + + fmt.Fprintf(&b, "Raw Files\n") + fmt.Fprintf(&b, "---------\n") + fmt.Fprintf(&b, "- result.json\n") + fmt.Fprintf(&b, "- report.txt\n") + fmt.Fprintf(&b, "- summary.txt\n") + fmt.Fprintf(&b, "- verbose.log\n") + fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n") + fmt.Fprintf(&b, "- gpu-*-warmup.log\n") + fmt.Fprintf(&b, "- gpu-*-steady.log\n") + fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n") + fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n") + if result.Interconnect != nil { + fmt.Fprintf(&b, "- nccl-all-reduce.log\n") + } + return b.String() +} + +func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { + var b strings.Builder + fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) + fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) + fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) + fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status) + var best float64 + for i, gpu := range result.GPUs { + fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status) + fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore) + if i == 0 || gpu.Scores.CompositeScore > best { + best = gpu.Scores.CompositeScore + } + } + fmt.Fprintf(&b, "best_composite_score=%.2f\n", best) + if result.Interconnect != nil { + fmt.Fprintf(&b, 
"interconnect_status=%s\n", result.Interconnect.Status) + fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps) + } + return b.String() +} diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go new file mode 100644 index 0000000..51120e7 --- /dev/null +++ b/audit/internal/platform/benchmark_test.go @@ -0,0 +1,132 @@ +package platform + +import ( + "strings" + "testing" +) + +func TestResolveBenchmarkProfile(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + profile string + want benchmarkProfileSpec + }{ + { + name: "default", + profile: "", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}, + }, + { + name: "stability", + profile: "stability", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}, + }, + { + name: "overnight", + profile: "overnight", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := resolveBenchmarkProfile(tc.profile) + if got != tc.want { + t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want) + } + }) + } +} + +func TestParseBenchmarkBurnLog(t *testing.T) { + t.Parallel() + + raw := strings.Join([]string{ + "loader=bee-gpu-burn", + "[gpu 0] device=NVIDIA H100", + "[gpu 0] compute_capability=9.0", + "[gpu 0] backend=cublasLt", + "[gpu 0] duration_s=10", + "[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0", + "[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0", + "[gpu 0] fp16_tensor_iterations=200", + "[gpu 0] fp8_e4m3_iterations=50", + "[gpu 0] status=OK", + }, "\n") + + got := parseBenchmarkBurnLog(raw) + if 
got.Backend != "cublasLt" { + t.Fatalf("backend=%q want cublasLt", got.Backend) + } + if got.ComputeCapability != "9.0" { + t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability) + } + if len(got.Profiles) != 2 { + t.Fatalf("profiles=%d want 2", len(got.Profiles)) + } + if got.Profiles[0].TeraOpsPerSec <= 0 { + t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec) + } + if got.Profiles[1].Category != "fp8" { + t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category) + } +} + +func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { + t.Parallel() + + result := NvidiaBenchmarkResult{ + BenchmarkVersion: benchmarkVersion, + BenchmarkProfile: NvidiaBenchmarkProfileStandard, + OverallStatus: "PARTIAL", + SelectedGPUIndices: []int{0}, + Normalization: BenchmarkNormalization{ + Status: "partial", + }, + Findings: []string{"GPU 0 spent measurable time under SW power cap."}, + GPUs: []BenchmarkGPUResult{ + { + Index: 0, + Name: "NVIDIA H100", + Status: "OK", + Steady: BenchmarkTelemetrySummary{ + AvgPowerW: 680, + AvgTempC: 79, + AvgGraphicsClockMHz: 1725, + P95PowerW: 700, + P95TempC: 82, + P95GraphicsClockMHz: 1800, + }, + Scores: BenchmarkScorecard{ + ComputeScore: 1200, + PowerSustainScore: 96, + ThermalSustainScore: 88, + StabilityScore: 92, + CompositeScore: 1176, + }, + PrecisionResults: []BenchmarkPrecisionResult{ + {Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700}, + }, + Throttle: BenchmarkThrottleCounters{ + SWPowerCapUS: 1000000, + }, + DegradationReasons: []string{"power_capped"}, + }, + }, + } + + report := renderBenchmarkReport(result) + for _, needle := range []string{ + "Executive Summary", + "GPU 0 spent measurable time under SW power cap.", + "Composite score: 1176.00", + "fp16_tensor: 700.00 TOPS", + } { + if !strings.Contains(report, needle) { + t.Fatalf("report missing %q\n%s", needle, report) + } + } +} diff --git a/audit/internal/platform/benchmark_types.go 
// Supported benchmark profile names accepted by --profile.
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)

// NvidiaBenchmarkOptions carries caller-selected knobs for a benchmark run.
type NvidiaBenchmarkOptions struct {
	Profile           string // one of the NvidiaBenchmarkProfile* constants
	SizeMB            int    // per-GPU buffer size in MB; 0 means auto
	GPUIndices        []int  // GPUs to include; empty means all
	ExcludeGPUIndices []int  // GPUs to exclude
	RunNCCL           bool   // whether to run the multi-GPU NCCL phase
}

// NvidiaBenchmarkResult is the canonical machine-readable output of one run
// (serialized to result.json).
type NvidiaBenchmarkResult struct {
	BenchmarkVersion   string                       `json:"benchmark_version"`
	GeneratedAt        time.Time                    `json:"generated_at"`
	Hostname           string                       `json:"hostname,omitempty"`
	BenchmarkProfile   string                       `json:"benchmark_profile"`
	OverallStatus      string                       `json:"overall_status"`
	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
	Findings           []string                     `json:"findings,omitempty"`
	Warnings           []string                     `json:"warnings,omitempty"`
	Normalization      BenchmarkNormalization       `json:"normalization"`
	GPUs               []BenchmarkGPUResult         `json:"gpus"`
	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
}

// BenchmarkNormalization records how completely the environment was pinned
// (clock locks, persistence mode) before measurement.
type BenchmarkNormalization struct {
	Status string                      `json:"status"`
	Notes  []string                    `json:"notes,omitempty"`
	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
}

// BenchmarkNormalizationGPU is the per-GPU normalization detail.
type BenchmarkNormalizationGPU struct {
	Index                 int      `json:"index"`
	PersistenceMode       string   `json:"persistence_mode,omitempty"`
	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}

// BenchmarkGPUResult aggregates identity, phase telemetry, throttle counters,
// precision throughput, and scores for a single GPU.
type BenchmarkGPUResult struct {
	Index                  int                        `json:"index"`
	UUID                   string                     `json:"uuid,omitempty"`
	Name                   string                     `json:"name,omitempty"`
	BusID                  string                     `json:"bus_id,omitempty"`
	VBIOS                  string                     `json:"vbios,omitempty"`
	ComputeCapability      string                     `json:"compute_capability,omitempty"`
	Backend                string                     `json:"backend,omitempty"`
	Status                 string                     `json:"status"`
	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
	Scores                 BenchmarkScorecard         `json:"scores"`
	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
	Notes                  []string                   `json:"notes,omitempty"`
}

// BenchmarkTelemetrySummary is the statistical digest of one telemetry phase.
type BenchmarkTelemetrySummary struct {
	DurationSec         float64 `json:"duration_sec"`
	Samples             int     `json:"samples"`
	AvgTempC            float64 `json:"avg_temp_c"`
	P95TempC            float64 `json:"p95_temp_c"`
	AvgPowerW           float64 `json:"avg_power_w"`
	P95PowerW           float64 `json:"p95_power_w"`
	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
	AvgUsagePct         float64 `json:"avg_usage_pct"`
	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
	ClockCVPct          float64 `json:"clock_cv_pct"`
	PowerCVPct          float64 `json:"power_cv_pct"`
	TempCVPct           float64 `json:"temp_cv_pct"`
	ClockDriftPct       float64 `json:"clock_drift_pct"`
}

// BenchmarkThrottleCounters mirrors nvidia-smi clock-event-reason counters,
// each the cumulative microseconds spent under that limiter.
type BenchmarkThrottleCounters struct {
	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
	SyncBoostUS            uint64 `json:"sync_boost_us"`
	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}

// BenchmarkPrecisionResult is one precision lane's GEMM throughput result.
type BenchmarkPrecisionResult struct {
	Name          string  `json:"name"`
	Category      string  `json:"category"`
	Supported     bool    `json:"supported"`
	Lanes         int     `json:"lanes,omitempty"`
	M             uint64  `json:"m,omitempty"`
	N             uint64  `json:"n,omitempty"`
	K             uint64  `json:"k,omitempty"`
	Iterations    uint64  `json:"iterations,omitempty"`
	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
	Notes         string  `json:"notes,omitempty"`
}

// BenchmarkScorecard holds the per-GPU sub-scores and the composite.
type BenchmarkScorecard struct {
	ComputeScore        float64 `json:"compute_score"`
	PowerSustainScore   float64 `json:"power_sustain_score"`
	ThermalSustainScore float64 `json:"thermal_sustain_score"`
	StabilityScore      float64 `json:"stability_score"`
	InterconnectScore   float64 `json:"interconnect_score"`
	CompositeScore      float64 `json:"composite_score"`
}

// BenchmarkInterconnectResult captures the multi-GPU NCCL all_reduce phase.
type BenchmarkInterconnectResult struct {
	Status             string   `json:"status"`
	Attempted          bool     `json:"attempted"`
	Supported          bool     `json:"supported"`
	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
	Notes              []string `json:"notes,omitempty"`
}
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer - b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz,mem_clock_mhz\n") + b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") for _, r := range rows { - fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f,%.0f\n", - r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) + fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", + r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) } return os.WriteFile(path, b.Bytes(), 0644) } diff --git a/bible b/bible index 688b87e..1d89a49 160000 --- a/bible +++ b/bible @@ -1 +1 @@ -Subproject commit 688b87e98deed5fadd71e10e123073640d92c15a +Subproject commit 1d89a4918e6d4b42847e6dbeccbcc40b091a8369