fix(iso): clear stale bootloader templates in workdir

fix(iso): validate live boot entries in final ISO
Run NVIDIA DCGM diag tests on all selected GPUs simultaneously
2026-04-20 13:19:50 +03:00 · 2026-04-20 13:12:24 +03:00 · 2026-04-20 11:53:25 +03:00 · 2026-04-20 11:01:15 +03:00 · 2026-04-20 10:55:42 +03:00 · 2026-04-20 10:53:53 +03:00
49 changed files with 7138 additions and 3583 deletions
--- a/audit/bee
+++ b/audit/bee
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,20 +19,22 @@ import (
 )

 var (
-	DefaultExportDir        = "/appdata/bee/export"
-	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
-	DefaultBeeBenchBaseDir  = DefaultExportDir + "/bee-bench"
-	DefaultBeeBenchPerfDir  = DefaultBeeBenchBaseDir + "/perf"
-	DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
+	DefaultExportDir                     = "/appdata/bee/export"
+	DefaultAuditJSONPath                 = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath                  = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath                    = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath                = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath                 = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath                    = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath               = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath                = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir                   = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir                    = DefaultExportDir + "/bee-sat"
+	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
+	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
+	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
+	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
+	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
 )

 type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -304,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
 	}
 	filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
 	tmpPath := filepath.Join(os.TempDir(), filename)
-	data, err := os.ReadFile(DefaultAuditJSONPath)
+	data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
 	if err != nil {
 		return "", err
 	}
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPerfDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPowerDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
 }

+func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultBeeBenchAutotuneDir
+	}
+	return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
+}
+
+func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
+	return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
+}
+
+func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
+	cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
+		}
+		return *cfg, nil
+	}
+	if logFunc != nil {
+		logFunc("benchmark autotune: no saved power source config, running autotune first")
+	}
+	autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
+	if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
+	if err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	return *cfg, nil
+}
+
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -9,6 +9,7 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"

 	"bee/audit/internal/platform"
@@ -123,6 +124,7 @@ type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaAutotuneFn       func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
 	return f.runNvidiaFn(baseDir)
 }

+func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
+	if f.runNvidiaAutotuneFn != nil {
+		return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
+	}
+	return f.runNvidiaFn(baseDir)
+}
+
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
 		t.Fatal(err)
 	}
+	if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tr := tar.NewReader(gzr)
 	var names []string
 	var auditJSON string
+	var manifest string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			}
 			auditJSON = string(body)
 		}
+		if strings.HasSuffix(hdr.Name, "/manifest.txt") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read manifest entry: %v", err)
+			}
+			manifest = string(body)
+		}
 	}

 	for _, want := range []string{
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
 		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
 	}
+	if !contains(manifest, "files:") {
+		t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
+	}
+	if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
+		t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
+	}
 }

 func TestMainBanner(t *testing.T) {
--- a/audit/internal/app/atomic_write.go
+++ b/audit/internal/app/atomic_write.go
@@ -2,10 +2,29 @@ package app

 import (
 	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 )

+// readFileLimited reads path into memory, refusing files larger than maxBytes.
+// Prevents OOM on corrupted or unexpectedly large data files.
+func readFileLimited(path string, maxBytes int64) ([]byte, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
+	if err != nil {
+		return nil, err
+	}
+	if int64(len(data)) > maxBytes {
+		return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
+	}
+	return data, nil
+}
+
 func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
-	data, err := os.ReadFile(path)
+	data, err := readFileLimited(path, 10<<20)
 	if err != nil && !os.IsNotExist(err) {
 		return nil, err
 	}
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -2,6 +2,7 @@ package app

 import (
 	"archive/tar"
+	"bee/audit/internal/platform"
 	"compress/gzip"
 	"fmt"
 	"io"
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
 	fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
+		fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
+		fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
+		if strings.TrimSpace(cfg.Reason) != "" {
+			fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
+		}
+	}
 	fmt.Fprintf(&body, "\nfiles:\n")

 	var files []string
--- a/audit/internal/collector/psu.go
+++ b/audit/internal/collector/psu.go
@@ -160,11 +160,57 @@ type psuSDR struct {
 }

 var psuSlotPatterns = []*regexp.Regexp{
-	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
-	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
+	// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
+	// does not fire after the digit; match explicitly with underscore terminator.
+	regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
+	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                    // PSU1, PS1, ps 2
+	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                      // PS 6, PS6
+	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                     // PWS1
+	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
+	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                     // Bay 1
+	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
+	// Must be last: "power supply N" is already caught by the pattern above.
+	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
+}
+
+// isPSUInputPower reports whether a lower-cased sensor name is an AC-input power reading:
+//   MSI:     PSU1_POWER_IN, PSU1_PIN
+//   MLT:     PSU1_PIN
+//   xFusion: (not matched here — handled by the default branch in parsePSUSDR)
+//   HPE:     PS1 Input Power, PS1 Input Watts
+func isPSUInputPower(name string) bool {
+	return strings.Contains(name, "input power") ||
+		strings.Contains(name, "input watts") ||
+		strings.Contains(name, "_pin") ||
+		strings.Contains(name, " pin") ||
+		strings.Contains(name, "_power_in") ||
+		strings.Contains(name, "power_in")
+}
+
+// isPSUOutputPower matches DC-output power sensor names across vendors:
+//   MSI:     PSU1_POWER_OUT
+//   MLT:     PSU1_POUT
+//   xFusion: PS1 POut
+func isPSUOutputPower(name string) bool {
+	return strings.Contains(name, "output power") ||
+		strings.Contains(name, "output watts") ||
+		strings.Contains(name, "_pout") ||
+		strings.Contains(name, " pout") ||
+		strings.Contains(name, "_power_out") ||
+		strings.Contains(name, "power_out") ||
+		strings.Contains(name, "power supply bay") ||
+		strings.Contains(name, "psu bay")
+}
+
+// parseBoundedFloat parses a numeric value from an SDR value field and
+// validates it is within (0, max]. Returns nil for zero, negative, or
+// out-of-range values — these indicate missing/off/fault sensor readings.
+func parseBoundedFloat(raw string, max float64) *float64 {
+	v := parseFloatPtr(raw)
+	if v == nil || *v <= 0 || *v > max {
+		return nil
+	}
+	return v
 }

 func parsePSUSDR(raw string) map[int]psuSDR {
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {

 		lowerName := strings.ToLower(name)
 		switch {
-		case strings.Contains(lowerName, "input power"):
-			entry.inputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "output power"):
-			entry.outputPowerW = parseFloatPtr(value)
-		case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
-			entry.outputPowerW = parseFloatPtr(value)
+		case isPSUInputPower(lowerName):
+			entry.inputPowerW = parseBoundedFloat(value, 6000)
+		case isPSUOutputPower(lowerName):
+			entry.outputPowerW = parseBoundedFloat(value, 6000)
 		case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
 			entry.inputVoltage = parseFloatPtr(value)
 		case strings.Contains(lowerName, "temp"):
 			entry.temperatureC = parseFloatPtr(value)
 		case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
 			entry.healthPct = parsePercentPtr(value)
+		default:
+			// Generic PSU power reading: sensor matched a slot pattern but carries
+			// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as AC
+			// input if the value parses within (0, 6000] and no input power is set yet.
+			if entry.inputPowerW == nil {
+				entry.inputPowerW = parseBoundedFloat(value, 6000)
+			}
 		}
 		out[slot] = entry
 	}
 	return out
 }

+// PSUSlotPower holds SDR power readings for one PSU slot.
+// Slot key used by PSUSlotsFromSDR is the 0-based index string,
+// matching HardwarePowerSupply.Slot in the audit schema.
+type PSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`
+	OutputW *float64 `json:"output_w,omitempty"`
+	Status  string   `json:"status,omitempty"`
+}
+
+// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
+// using the same battle-tested slot patterns as the hardware audit collector.
+// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
+// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
+func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
+	sdr := parsePSUSDR(sdrOutput)
+	if len(sdr) == 0 {
+		return nil
+	}
+	out := make(map[string]PSUSlotPower, len(sdr))
+	for slot, entry := range sdr {
+		key := strconv.Itoa(slot - 1) // audit uses 0-based slot
+		out[key] = PSUSlotPower{
+			InputW:  entry.inputPowerW,
+			OutputW: entry.outputPowerW,
+			Status:  entry.status,
+		}
+	}
+	return out
+}
+
 func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
 	if len(sdr) == 0 {
 		return nil
--- a/audit/internal/collector/psu_sdr_test.go
+++ b/audit/internal/collector/psu_sdr_test.go
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 		{name: "PWS1 Status", want: 1},
 		{name: "Power Supply Bay 8", want: 8},
 		{name: "PS 6 Input Power", want: 6},
+		// MSI underscore format — \b does not fire between digit and '_'
+		{name: "PSU1_POWER_IN", want: 1},
+		{name: "PSU2_POWER_OUT", want: 2},
+		{name: "PSU4_STATUS", want: 4},
 	}

 	for _, tt := range tests {
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
 	}
 }

+func TestParsePSUSDRMSIFormat(t *testing.T) {
+	t.Parallel()
+	raw := `
+PSU1_STATUS      | F1h | ok
+PSU1_POWER_OUT   | 928 Watts | ok
+PSU1_POWER_IN    | 976 Watts | ok
+PSU2_STATUS      | F2h | ok
+PSU2_POWER_OUT   | 944 Watts | ok
+PSU2_POWER_IN    | 992 Watts | ok
+`
+	got := parsePSUSDR(raw)
+	if len(got) != 2 {
+		t.Fatalf("len(got)=%d want 2", len(got))
+	}
+	if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
+		t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
+	}
+	if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
+		t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
+	}
+	if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
+		t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
+	}
+}
+
 func TestSynthesizePSUsFromSDR(t *testing.T) {
 	t.Parallel()

--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
--- a/audit/internal/platform/benchmark_power_autotune.go
+++ b/audit/internal/platform/benchmark_power_autotune.go
@@ -0,0 +1,735 @@
+package platform
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+)
+
+const (
+	benchmarkPowerAutotuneVersion         = 1
+	benchmarkPowerAutotuneIdleSec         = 60
+	benchmarkPowerAutotuneLoadSec         = 90
+	benchmarkPowerAutotuneSampleInterval  = 3
+	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
+)
+
+func BenchmarkPowerSourceConfigPath(baseDir string) string {
+	baseDir = strings.TrimSpace(baseDir)
+	if baseDir == "" {
+		return defaultBenchmarkPowerSourceConfigPath
+	}
+	return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
+}
+
+func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var cfg BenchmarkPowerAutotuneConfig
+	if err := json.Unmarshal(raw, &cfg); err != nil {
+		return nil, err
+	}
+	if strings.TrimSpace(cfg.SelectedSource) == "" {
+		return nil, fmt.Errorf("autotune config missing selected_source")
+	}
+	return &cfg, nil
+}
+
+func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	if cfg.Version <= 0 {
+		cfg.Version = benchmarkPowerAutotuneVersion
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return err
+	}
+	data, err := json.MarshalIndent(cfg, "", "  ")
+	if err != nil {
+		return err
+	}
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, data, 0644); err != nil {
+		return err
+	}
+	return os.Rename(tmp, path)
+}
+
+func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
+	return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
+}
+
+func ResetBenchmarkPowerAutotuneConfig(path string) error {
+	if strings.TrimSpace(path) == "" {
+		return fmt.Errorf("empty autotune config path")
+	}
+	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	return nil
+}
+
+func normalizeBenchmarkPowerSource(source string) string {
+	switch strings.TrimSpace(strings.ToLower(source)) {
+	case BenchmarkPowerSourceSDRPSUInput:
+		return BenchmarkPowerSourceSDRPSUInput
+	default:
+		return BenchmarkPowerSourceDCMI
+	}
+}
+
+func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
+	cfg, err := LoadSystemPowerSourceConfig(exportDir)
+	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
+		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
+		return SystemPowerSourceDecision{
+			Configured:      true,
+			SelectedSource:  selected,
+			EffectiveSource: selected,
+			Mode:            "autotuned",
+			Reason:          strings.TrimSpace(cfg.Reason),
+			ConfiguredAt:    cfg.UpdatedAt,
+		}
+	}
+
+	sources := sampleBenchmarkPowerSources()
+	if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
+		return SystemPowerSourceDecision{
+			Configured:      false,
+			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
+			Mode:            "fallback",
+			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
+		}
+	}
+	return SystemPowerSourceDecision{
+		Configured:      false,
+		EffectiveSource: BenchmarkPowerSourceDCMI,
+		Mode:            "fallback",
+		Reason:          "autotune config not found; using temporary fallback source dcmi",
+	}
+}
+
+func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
+	decision := ResolveSystemPowerDecision(exportDir)
+	if decision.EffectiveSource != "" {
+		if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
+			return value, decision, nil
+		} else if decision.Configured {
+			fallback := BenchmarkPowerSourceDCMI
+			if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
+				fallback = BenchmarkPowerSourceSDRPSUInput
+			}
+			if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
+				decision.Mode = "degraded"
+				decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
+				decision.EffectiveSource = fallback
+				return fallbackValue, decision, nil
+			}
+			decision.Mode = "degraded"
+			decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
+			return 0, decision, err
+		}
+	}
+	return 0, decision, fmt.Errorf("system power source unavailable")
+}
+
+func queryBenchmarkPowerSourceW(source string) (float64, error) {
+	switch normalizeBenchmarkPowerSource(source) {
+	case BenchmarkPowerSourceSDRPSUInput:
+		sdr := sampleIPMISDRPowerSensors()
+		if sdr.PSUInW > 0 {
+			return sdr.PSUInW, nil
+		}
+		return 0, fmt.Errorf("sdr psu input unavailable")
+	default:
+		return queryIPMIServerPowerW()
+	}
+}
+
+func sampleBenchmarkPowerSources() map[string]float64 {
+	out := map[string]float64{}
+	if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
+		out[BenchmarkPowerSourceDCMI] = w
+	}
+	if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
+		out[BenchmarkPowerSourceSDRPSUInput] = w
+	}
+	return out
+}
+
+func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
+	if durationSec <= 0 {
+		return 0, false
+	}
+	samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
+	if len(samples) == 0 {
+		return 0, false
+	}
+	return benchmarkMean(samples), true
+}
+
+func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
+	if durationSec <= 0 {
+		return nil
+	}
+	stopCh := make(chan struct{})
+	doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
+	select {
+	case <-ctx.Done():
+	case <-time.After(time.Duration(durationSec) * time.Second):
+	}
+	close(stopCh)
+	return <-doneCh
+}
+
+func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
+	if intervalSec <= 0 {
+		intervalSec = benchmarkPowerAutotuneSampleInterval
+	}
+	ch := make(chan []float64, 1)
+	go func() {
+		defer close(ch)
+		var samples []float64
+		record := func() {
+			if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
+				samples = append(samples, w)
+			}
+		}
+		record()
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				record()
+			}
+		}
+	}()
+	return ch
+}
+
+type benchmarkPowerAutotuneSample struct {
+	ElapsedSec     float64
+	GPUAvgUsagePct float64
+	CPUUsagePct    float64
+	GPUSumPowerW   float64
+	Sources        map[string]float64
+}
+
+func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
+	if durationSec <= 0 {
+		return nil
+	}
+	var out []benchmarkPowerAutotuneSample
+	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
+	start := time.Now()
+	for {
+		if ctx.Err() != nil {
+			return out
+		}
+		row := benchmarkPowerAutotuneSample{
+			ElapsedSec:  time.Since(start).Seconds(),
+			CPUUsagePct: sampleCPULoadPct(),
+			Sources:     sampleBenchmarkPowerSources(),
+		}
+		if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
+			var usageSum float64
+			for _, gpu := range gpuRows {
+				row.GPUSumPowerW += gpu.PowerW
+				usageSum += gpu.UsagePct
+			}
+			row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
+		}
+		out = append(out, row)
+		logBenchmarkPowerAutotuneSample(phase, row, logFunc)
+		if time.Now().After(deadline) {
+			return out
+		}
+		select {
+		case <-ctx.Done():
+			return out
+		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
+		}
+	}
+}
+
+func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	var sourceParts []string
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		if value, ok := sample.Sources[source]; ok && value > 0 {
+			sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
+		} else {
+			sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
+		}
+	}
+	logFunc(fmt.Sprintf(
+		"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
+		phase,
+		sample.ElapsedSec,
+		sample.GPUAvgUsagePct,
+		sample.GPUSumPowerW,
+		sample.CPUUsagePct,
+		strings.Join(sourceParts, " "),
+	))
+}
+
+func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
+	if logFunc == nil || len(samples) == 0 {
+		return
+	}
+	var gpuUsage []float64
+	var cpuUsage []float64
+	var gpuPower []float64
+	sourceBuckets := map[string][]float64{}
+	for _, sample := range samples {
+		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
+		cpuUsage = append(cpuUsage, sample.CPUUsagePct)
+		gpuPower = append(gpuPower, sample.GPUSumPowerW)
+		for source, value := range sample.Sources {
+			if value > 0 {
+				sourceBuckets[source] = append(sourceBuckets[source], value)
+			}
+		}
+	}
+	var sourceParts []string
+	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
+		values := sourceBuckets[source]
+		if len(values) == 0 {
+			sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
+			continue
+		}
+		sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
+	}
+	logFunc(fmt.Sprintf(
+		"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
+		phase,
+		len(samples),
+		benchmarkMean(gpuUsage),
+		benchmarkPercentile(gpuUsage, 95),
+		benchmarkMean(gpuPower),
+		benchmarkMean(cpuUsage),
+		benchmarkPercentile(cpuUsage, 95),
+		strings.Join(sourceParts, " "),
+	))
+}
+
+func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
+	if logFunc == nil {
+		return
+	}
+	for _, candidate := range candidates {
+		if !candidate.Available {
+			logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
+			continue
+		}
+		logFunc(fmt.Sprintf(
+			"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
+			candidate.Source,
+			candidate.IdleAvgW,
+			candidate.LoadAvgW,
+			candidate.DeltaW,
+			gpuDelta,
+			candidate.RelativeError,
+			candidate.Confidence*100,
+			map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
+		))
+		if strings.TrimSpace(candidate.SelectionNotes) != "" {
+			logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
+		}
+	}
+}
+
+// validateBenchmarkPowerAutotuneIdle checks whether the idle sampling window
+// was quiet enough to serve as a baseline.
+//
+// Every sample contributes its GPU usage; CPU usage is only counted when it is
+// greater than zero. Averages and 95th percentiles are rounded to one decimal
+// and then compared against fixed limits: GPU avg 5%, GPU p95 10%, CPU avg
+// 20%, CPU p95 35%. The first limit exceeded is recorded in Reason; when none
+// is exceeded Valid is set. An empty sample set yields an invalid result with
+// an explanatory Reason.
+func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
+	if len(samples) == 0 {
+		return &BenchmarkPowerAutotuneValidation{Reason: "no idle telemetry samples collected"}
+	}
+
+	gpuLoads := make([]float64, 0, len(samples))
+	var cpuLoads []float64
+	for i := range samples {
+		gpuLoads = append(gpuLoads, samples[i].GPUAvgUsagePct)
+		if cpu := samples[i].CPUUsagePct; cpu > 0 {
+			cpuLoads = append(cpuLoads, cpu)
+		}
+	}
+
+	// Keep one decimal place so the thresholds are applied to the same values
+	// that end up in the report.
+	oneDecimal := func(v float64) float64 { return math.Round(v*10) / 10 }
+
+	verdict := &BenchmarkPowerAutotuneValidation{
+		GPUSamples:     len(gpuLoads),
+		CPUSamples:     len(cpuLoads),
+		GPUAvgUsagePct: oneDecimal(benchmarkMean(gpuLoads)),
+		GPUP95UsagePct: oneDecimal(benchmarkPercentile(gpuLoads, 95)),
+		CPUAvgUsagePct: oneDecimal(benchmarkMean(cpuLoads)),
+		CPUP95UsagePct: oneDecimal(benchmarkPercentile(cpuLoads, 95)),
+	}
+
+	if verdict.GPUAvgUsagePct > 5 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", verdict.GPUAvgUsagePct)
+		return verdict
+	}
+	if verdict.GPUP95UsagePct > 10 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", verdict.GPUP95UsagePct)
+		return verdict
+	}
+	if verdict.CPUAvgUsagePct > 20 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", verdict.CPUAvgUsagePct)
+		return verdict
+	}
+	if verdict.CPUP95UsagePct > 35 {
+		verdict.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", verdict.CPUP95UsagePct)
+		return verdict
+	}
+	verdict.Valid = true
+	return verdict
+}
+
+// chooseBenchmarkPowerAutotuneSource picks the server power source whose
+// load-minus-idle delta best matches the GPU-reported power delta.
+//
+// Per-source readings that are not greater than zero are ignored. The GPU
+// delta is the difference between the mean GPU power sums of the load and idle
+// samples; when that difference is not positive the load mean is used instead.
+// Candidates are built for the DCMI and SDR PSU input sources, and only those
+// that are available with a positive delta compete. They are ordered by
+// relative error, except that when two relative errors differ by at most the
+// preference tolerance the SDR PSU input source is ranked first.
+//
+// Returns the selected source name, all candidates (the winner flagged via
+// Selected and SelectionNotes), the idle and load GPU power means, and an
+// error when no candidate is usable.
+func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
+	// Relative errors closer together than this are treated as equivalent and
+	// resolved in favour of the SDR PSU input source.
+	const preferenceTolerance = 0.10
+
+	// split separates one phase's samples into the GPU power sums and the
+	// positive readings grouped by source name.
+	split := func(samples []benchmarkPowerAutotuneSample) ([]float64, map[string][]float64) {
+		var gpu []float64
+		bySource := map[string][]float64{}
+		for _, sample := range samples {
+			gpu = append(gpu, sample.GPUSumPowerW)
+			for source, value := range sample.Sources {
+				if value > 0 {
+					bySource[source] = append(bySource[source], value)
+				}
+			}
+		}
+		return gpu, bySource
+	}
+	idleGPU, idleBySource := split(idle)
+	loadGPU, loadBySource := split(load)
+
+	idleGPUAvg := benchmarkMean(idleGPU)
+	loadGPUAvg := benchmarkMean(loadGPU)
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 {
+		gpuDelta = loadGPUAvg
+	}
+
+	candidates := []BenchmarkPowerAutotuneCandidate{
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
+		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
+	}
+	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
+	for _, candidate := range candidates {
+		if candidate.Available && candidate.DeltaW > 0 {
+			available = append(available, candidate)
+		}
+	}
+	if len(available) == 0 {
+		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
+	}
+	sort.Slice(available, func(i, j int) bool {
+		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= preferenceTolerance {
+			if available[i].Source != available[j].Source {
+				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
+			}
+		}
+		if available[i].RelativeError != available[j].RelativeError {
+			return available[i].RelativeError < available[j].RelativeError
+		}
+		return available[i].Samples > available[j].Samples
+	})
+	selected := available[0]
+
+	// The winner is not necessarily the closest match: within the tolerance
+	// the source preference overrides the error ordering. Record the real
+	// reason so the saved note does not claim "closest" when it is not.
+	closest := true
+	for _, other := range available[1:] {
+		if other.RelativeError < selected.RelativeError {
+			closest = false
+			break
+		}
+	}
+	notes := fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
+	if !closest {
+		notes = fmt.Sprintf("selected because %s is preferred when relative errors are within %.2f of each other; delta %.0f W vs GPU delta %.0f W (relative error %.3f)", selected.Source, preferenceTolerance, selected.DeltaW, gpuDelta, selected.RelativeError)
+	}
+	for idx := range candidates {
+		if candidates[idx].Source == selected.Source {
+			candidates[idx].Selected = true
+			candidates[idx].SelectionNotes = notes
+		}
+	}
+	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
+}
+
+// buildBenchmarkPowerAutotuneCandidate summarises one power source's idle and
+// load readings into a candidate record.
+//
+// The candidate is available only when both slices are non-empty; Samples is
+// the shorter of the two lengths. For available candidates the idle and load
+// means and their difference are filled in. RelativeError (distance of the
+// delta from gpuDelta, as a fraction of gpuDelta) and Confidence
+// (1 - RelativeError, floored at zero) are computed only when gpuDelta is
+// positive and otherwise stay at their zero values.
+func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
+	usable := len(idle) > 0 && len(load) > 0
+	out := BenchmarkPowerAutotuneCandidate{
+		Source:    source,
+		Available: usable,
+		Samples:   minInt(len(idle), len(load)),
+	}
+	if !usable {
+		return out
+	}
+
+	idleAvg := benchmarkMean(idle)
+	loadAvg := benchmarkMean(load)
+	out.IdleAvgW = idleAvg
+	out.LoadAvgW = loadAvg
+	out.DeltaW = loadAvg - idleAvg
+
+	if gpuDelta > 0 {
+		relErr := math.Abs(out.DeltaW-gpuDelta) / gpuDelta
+		out.RelativeError = relErr
+		out.Confidence = math.Max(0, 1-relErr)
+	}
+	return out
+}
+
+// renderBenchmarkPowerAutotuneSummary renders the autotune result as
+// newline-terminated key=value lines.
+//
+// selected_source is emitted only when a source was selected, the idle_* keys
+// only when an idle validation is attached, and per-candidate metrics only for
+// available candidates.
+func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
+	var b strings.Builder
+	emit := func(format string, args ...interface{}) {
+		fmt.Fprintf(&b, format, args...)
+	}
+
+	emit("generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	emit("status=%s\n", result.Status)
+	emit("benchmark_kind=%s\n", result.BenchmarkKind)
+	emit("profile=%s\n", result.Profile)
+	emit("idle_duration_sec=%d\n", result.IdleDurationSec)
+	emit("load_duration_sec=%d\n", result.LoadDurationSec)
+	emit("sample_interval_sec=%d\n", result.SampleIntervalSec)
+	if result.SelectedSource != "" {
+		emit("selected_source=%s\n", result.SelectedSource)
+	}
+
+	if idle := result.IdleValidation; idle != nil {
+		emit("idle_valid=%t\n", idle.Valid)
+		emit("idle_gpu_avg_usage_pct=%.1f\n", idle.GPUAvgUsagePct)
+		emit("idle_gpu_p95_usage_pct=%.1f\n", idle.GPUP95UsagePct)
+		emit("idle_cpu_avg_usage_pct=%.1f\n", idle.CPUAvgUsagePct)
+		emit("idle_cpu_p95_usage_pct=%.1f\n", idle.CPUP95UsagePct)
+		if idle.Reason != "" {
+			emit("idle_validation_error=%s\n", idle.Reason)
+		}
+	}
+
+	for _, candidate := range result.Candidates {
+		emit("candidate_%s_available=%t\n", candidate.Source, candidate.Available)
+		if !candidate.Available {
+			continue
+		}
+		emit("candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
+		emit("candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
+		emit("candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
+		emit("candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
+	}
+	return b.String()
+}
+
+// renderBenchmarkPowerAutotuneReport renders the autotune result as a Markdown
+// report: a header with the run parameters, an optional idle-validation
+// section, an optional candidate table, and the result notes as a bullet list.
+func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
+	var b strings.Builder
+	line := func(format string, args ...interface{}) {
+		fmt.Fprintf(&b, format, args...)
+	}
+
+	b.WriteString("# Bee Bench Power Source Autotune\n\n")
+	line("**Status:** %s  \n", result.Status)
+	line("**Benchmark kind:** %s  \n", result.BenchmarkKind)
+	line("**Profile:** %s  \n", result.Profile)
+	line("**Idle window:** %ds  \n", result.IdleDurationSec)
+	line("**Load window:** %ds  \n", result.LoadDurationSec)
+	line("**Sample interval:** %ds  \n", result.SampleIntervalSec)
+	if result.SelectedSource != "" {
+		line("**Selected source:** `%s`  \n", result.SelectedSource)
+	}
+	b.WriteString("\n")
+
+	if idle := result.IdleValidation; idle != nil {
+		b.WriteString("## Idle Validation\n\n")
+		line("- valid: %t\n", idle.Valid)
+		line("- GPU avg usage: %.1f%%\n", idle.GPUAvgUsagePct)
+		line("- GPU p95 usage: %.1f%%\n", idle.GPUP95UsagePct)
+		line("- CPU avg usage: %.1f%%\n", idle.CPUAvgUsagePct)
+		line("- CPU p95 usage: %.1f%%\n", idle.CPUP95UsagePct)
+		if idle.Reason != "" {
+			line("- reason: %s\n", idle.Reason)
+		}
+		b.WriteString("\n")
+	}
+
+	if len(result.Candidates) > 0 {
+		b.WriteString("## Candidates\n\n")
+		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
+		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
+		for _, candidate := range result.Candidates {
+			if candidate.Available {
+				mark := "no"
+				if candidate.Selected {
+					mark = "yes"
+				}
+				line("| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
+					candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, mark)
+			} else {
+				// Sources without data get a placeholder row.
+				line("| %s | — | — | — | — | no |\n", candidate.Source)
+			}
+		}
+		b.WriteString("\n")
+	}
+
+	for _, note := range result.Notes {
+		line("- %s\n", note)
+	}
+	return b.String()
+}
+
+// benchmarkAutotuneLoadCommand builds the command line used for the autotune
+// full-load stage and returns it with the normalized benchmark kind.
+//
+// kind is matched case-insensitively after trimming whitespace. "power-fit",
+// "power" and "nvidia-bench-power" map to the "power-fit" kind: the command
+// comes from resolveBenchmarkPowerLoadCommand, falling back to the DCGM
+// "targeted_power" diagnostic when that resolver returns an error. Any other
+// kind maps to "performance" and runs bee-gpu-burn on all given GPUs, with
+// --size-mb appended only when sizeMB is positive.
+func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
+	allDevices := joinIndexList(gpuIndices)
+	normalized := strings.TrimSpace(strings.ToLower(kind))
+
+	if normalized == "power-fit" || normalized == "power" || normalized == "nvidia-bench-power" {
+		if cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices); err == nil {
+			return cmd, "power-fit"
+		}
+		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
+	}
+
+	args := []string{
+		"bee-gpu-burn",
+		"--seconds", fmt.Sprintf("%d", durationSec),
+		"--devices", allDevices,
+	}
+	if sizeMB > 0 {
+		args = append(args, "--size-mb", fmt.Sprintf("%d", sizeMB))
+	}
+	return args, "performance"
+}
+
+// RunNvidiaPowerSourceAutotune samples an idle window and a full-load window
+// on the GPUs returned by resolveNvidiaGPUSelection(nil, nil), picks a server
+// power source with chooseBenchmarkPowerAutotuneSource, and persists the
+// choice through SaveBenchmarkPowerAutotuneConfig.
+//
+// A nil ctx falls back to context.Background(), a nil logFunc to a no-op, and
+// a blank baseDir to "/var/log/bee-bench/autotune". Artifacts (result.json,
+// summary.txt, report.md, autotune-load.log) are written to a per-run
+// "autotune-<UTC timestamp>" directory under baseDir.
+//
+// Returns the run directory. When idle validation, the load command, source
+// selection, or saving the config fails, the run directory is returned
+// together with the error. When directory creation, GPU selection, or writing
+// the artifacts fails, the returned path is empty.
+func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = "/var/log/bee-bench/autotune"
+	}
+	if err := os.MkdirAll(baseDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
+	}
+	selected, err := resolveNvidiaGPUSelection(nil, nil)
+	if err != nil {
+		return "", err
+	}
+	if len(selected) == 0 {
+		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
+	}
+	// NOTE(review): the timestamp has one-second resolution and MkdirAll does
+	// not fail on an existing directory, so two runs started within the same
+	// second would share a run directory.
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "autotune-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+	hostname, _ := os.Hostname()
+	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
+	// Status starts as "FAILED" and is flipped to "OK" only after a source has
+	// been selected, so every early-exit artifact reports a failure.
+	result := BenchmarkPowerAutotuneResult{
+		GeneratedAt:       time.Now().UTC(),
+		Hostname:          hostname,
+		ServerModel:       readServerModel(),
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		Status:            "FAILED",
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+	}
+
+	// Stage 1: sample the idle window and refuse to continue if the machine
+	// was not idle enough.
+	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
+	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
+	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
+	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
+	if result.IdleValidation == nil || !result.IdleValidation.Valid {
+		if result.IdleValidation != nil {
+			result.IdleValidationError = result.IdleValidation.Reason
+			logFunc(result.IdleValidation.Reason)
+		}
+		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("%s", result.IdleValidationError)
+	}
+
+	// Stage 2: run the load command while a goroutine samples telemetry. The
+	// channel has capacity 1, so the sampler's send never blocks.
+	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
+	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
+	go func() {
+		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
+	}()
+	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
+	// Copy of the load command output; a write failure here is ignored.
+	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
+	// Waits for the sampler even when the load command has already failed.
+	// NOTE(review): presumably the sampler keeps running for the whole load
+	// window unless ctx is cancelled, which would delay the error return —
+	// confirm against collectBenchmarkPowerAutotuneSamples.
+	loadSamples := <-loadSamplesCh
+	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
+	if runErr != nil {
+		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
+	}
+
+	// Stage 3: compare the sources and record the candidates even when no
+	// source turns out to be usable.
+	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
+	result.Candidates = candidates
+	result.GPUPowerIdleW = idleGPUAvg
+	result.GPUPowerLoadW = loadGPUAvg
+	if chooseErr != nil {
+		result.Notes = append(result.Notes, chooseErr.Error())
+		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+			return "", err
+		}
+		return runDir, chooseErr
+	}
+	// Same fallback as in chooseBenchmarkPowerAutotuneSource, recomputed here
+	// because that function does not return the delta it used.
+	gpuDelta := loadGPUAvg - idleGPUAvg
+	if gpuDelta <= 0 {
+		gpuDelta = loadGPUAvg
+	}
+	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
+	result.SelectedSource = selectedSource
+	result.Status = "OK"
+	var confidence float64
+	// Generic reason, replaced by the selected candidate's own notes when it
+	// has any.
+	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
+	for _, candidate := range candidates {
+		if candidate.Selected {
+			confidence = candidate.Confidence
+			if strings.TrimSpace(candidate.SelectionNotes) != "" {
+				selectionReason = candidate.SelectionNotes
+			}
+			break
+		}
+	}
+	cfg := BenchmarkPowerAutotuneConfig{
+		Version:           benchmarkPowerAutotuneVersion,
+		UpdatedAt:         time.Now().UTC(),
+		SelectedSource:    selectedSource,
+		BenchmarkKind:     normalizedKind,
+		Profile:           opts.Profile,
+		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
+		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
+		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+		Confidence:        confidence,
+		Reason:            selectionReason,
+	}
+	result.Config = &cfg
+	configPath := BenchmarkPowerSourceConfigPath(baseDir)
+	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
+		// The selection succeeded but could not be persisted, so the run is
+		// reported as failed.
+		result.Status = "FAILED"
+		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
+		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
+			return "", writeErr
+		}
+		return runDir, err
+	}
+	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
+	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
+	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
+		return "", err
+	}
+	return runDir, nil
+}
+
+// writeBenchmarkPowerAutotuneArtifacts writes the autotune result into runDir
+// as result.json (indented JSON), summary.txt (key=value lines) and report.md
+// (Markdown), in that order, stopping at the first failure. Files are created
+// with mode 0644.
+func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
+	encoded, err := json.MarshalIndent(result, "", "  ")
+	if err != nil {
+		return fmt.Errorf("marshal autotune result: %w", err)
+	}
+
+	// Bodies are produced lazily so a later artifact is only rendered once the
+	// earlier ones have been written.
+	artifacts := []struct {
+		name string
+		body func() []byte
+	}{
+		{"result.json", func() []byte { return encoded }},
+		{"summary.txt", func() []byte { return []byte(renderBenchmarkPowerAutotuneSummary(result)) }},
+		{"report.md", func() []byte { return []byte(renderBenchmarkPowerAutotuneReport(result)) }},
+	}
+	for _, artifact := range artifacts {
+		if err := os.WriteFile(filepath.Join(runDir, artifact.name), artifact.body(), 0644); err != nil {
+			return fmt.Errorf("write autotune %s: %w", artifact.name, err)
+		}
+	}
+	return nil
+}
+
+// minInt returns the smaller of a and b.
+func minInt(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// Blank-identifier reference to the exec package. Presumably this only keeps
+// the import in use — TODO confirm whether anything else in this file still
+// references exec, in which case this line can go.
+var _ = exec.ErrNotFound
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {

 	// Perspective 1: Compatibility — hard stops
 	b.WriteString("### 1. Compatibility\n\n")
-	b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
-	b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
-	for _, gpu := range result.GPUs {
-		thermalThrottle := "-"
-		if gpu.Scores.ThermalThrottlePct > 0 {
-			thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			thermalThrottle := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			fanAtThrottle := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
+				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			ecc := "-"
+			if gpu.ECC.Uncorrected > 0 {
+				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
+			}
+			compatStatus := "✓ OK"
+			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
+				compatStatus = "⛔ HARD STOP"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
 		}
-		fanAtThrottle := "-"
-		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
-			fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
-		}
-		ecc := "-"
-		if gpu.ECC.Uncorrected > 0 {
-			ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
-		}
-		compatStatus := "✓ OK"
-		if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
-			compatStatus = "⛔ HARD STOP"
-		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
-			gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
+		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// Perspective 2: Thermal headroom
 	b.WriteString("### 2. Thermal Headroom\n\n")
-	b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
-	b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
-	for _, gpu := range result.GPUs {
-		shutdownTemp := gpu.ShutdownTempC
-		if shutdownTemp <= 0 {
-			shutdownTemp = 90
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			shutdownTemp := gpu.ShutdownTempC
+			if shutdownTemp <= 0 {
+				shutdownTemp = 90
+			}
+			slowdownTemp := gpu.SlowdownTempC
+			if slowdownTemp <= 0 {
+				slowdownTemp = 80
+			}
+			headroom := gpu.Scores.TempHeadroomC
+			thermalStatus := "✓ OK"
+			switch {
+			case headroom < 10:
+				thermalStatus = "⛔ CRITICAL"
+			case gpu.Steady.P95TempC >= slowdownTemp:
+				thermalStatus = "⚠ WARNING"
+			}
+			throttlePct := "-"
+			if gpu.Scores.ThermalThrottlePct > 0 {
+				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
+				fmt.Sprintf("%.0f°C", slowdownTemp),
+				fmt.Sprintf("%.0f°C", shutdownTemp),
+				fmt.Sprintf("%.1f°C", headroom),
+				throttlePct,
+				thermalStatus,
+			})
 		}
-		slowdownTemp := gpu.SlowdownTempC
-		if slowdownTemp <= 0 {
-			slowdownTemp = 80
-		}
-		headroom := gpu.Scores.TempHeadroomC
-		thermalStatus := "✓ OK"
-		switch {
-		case headroom < 10:
-			thermalStatus = "⛔ CRITICAL"
-		case gpu.Steady.P95TempC >= slowdownTemp:
-			thermalStatus = "⚠ WARNING"
-		}
-		throttlePct := "-"
-		if gpu.Scores.ThermalThrottlePct > 0 {
-			throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
-		}
-		fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
-			gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
+		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// Perspective 3: Power delivery
 	b.WriteString("### 3. Power Delivery\n\n")
-	b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
-	b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
-	for _, gpu := range result.GPUs {
-		powerCap := "-"
-		if gpu.Scores.PowerCapThrottlePct > 0 {
-			powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			powerCap := "-"
+			if gpu.Scores.PowerCapThrottlePct > 0 {
+				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
+			}
+			fanDuty := "-"
+			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
+				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
+			}
+			powerStatus := "✓ OK"
+			if gpu.Scores.PowerCapThrottlePct > 5 {
+				powerStatus = "⚠ POWER LIMITED"
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				powerCap,
+				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
+				fanDuty,
+				powerStatus,
+			})
 		}
-		fanDuty := "-"
-		if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
-			fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
-		}
-		powerStatus := "✓ OK"
-		if gpu.Scores.PowerCapThrottlePct > 5 {
-			powerStatus = "⚠ POWER LIMITED"
-		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
-			gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
+		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// Perspective 4: Performance
 	b.WriteString("### 4. Performance\n\n")
-	b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
-	b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
-	for _, gpu := range result.GPUs {
-		synthetic := "-"
-		if gpu.Scores.SyntheticScore > 0 {
-			synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			synthetic := "-"
+			if gpu.Scores.SyntheticScore > 0 {
+				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
+			}
+			mixed := "-"
+			if gpu.Scores.MixedScore > 0 {
+				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+			}
+			mixedEff := "-"
+			if gpu.Scores.MixedEfficiency > 0 {
+				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
+			}
+			topsPerSM := "-"
+			if gpu.Scores.TOPSPerSMPerGHz > 0 {
+				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
+			}
+			rows = append(rows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
+				synthetic, mixed, mixedEff, topsPerSM,
+			})
 		}
-		mixed := "-"
-		if gpu.Scores.MixedScore > 0 {
-			mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
+		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
+		if len(result.PerformanceRampSteps) > 0 {
+			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
 		}
-		mixedEff := "-"
-		if gpu.Scores.MixedEfficiency > 0 {
-			mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
-		}
-		topsPerSM := "-"
-		if gpu.Scores.TOPSPerSMPerGHz > 0 {
-			topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
-		}
-		fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
-			gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
+		b.WriteString("\n")
 	}
-	if len(result.PerformanceRampSteps) > 0 {
-		fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
-	}
-	b.WriteString("\n")

 	// Perspective 5: Anomaly flags
 	b.WriteString("### 5. Anomalies\n\n")
-	b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
-	b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
-	for _, gpu := range result.GPUs {
-		eccCorr := "-"
-		if gpu.ECC.Corrected > 0 {
-			eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+	{
+		var rows [][]string
+		for _, gpu := range result.GPUs {
+			eccCorr := "-"
+			if gpu.ECC.Corrected > 0 {
+				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
+			}
+			syncBoost := "-"
+			if gpu.Scores.SyncBoostThrottlePct > 0 {
+				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
+			}
+			powerVar := "OK"
+			if gpu.Scores.PowerSustainScore < 70 {
+				powerVar = "⚠ unstable"
+			}
+			thermalVar := "OK"
+			if gpu.Scores.ThermalSustainScore < 70 {
+				thermalVar = "⚠ unstable"
+			}
+			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
 		}
-		syncBoost := "-"
-		if gpu.Scores.SyncBoostThrottlePct > 0 {
-			syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
-		}
-		powerVar := "OK"
-		if gpu.Scores.PowerSustainScore < 70 {
-			powerVar = "⚠ unstable"
-		}
-		thermalVar := "OK"
-		if gpu.Scores.ThermalSustainScore < 70 {
-			thermalVar = "⚠ unstable"
-		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
-			gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
+		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
+		b.WriteString("\n")
 	}
-	b.WriteString("\n")

 	// ── Per GPU detail ────────────────────────────────────────────────────────
 	b.WriteString("## Per-GPU Details\n\n")
@@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Steady-state telemetry
 		if benchmarkTelemetryAvailable(gpu.Steady) {
 			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
-			b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
-			fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
-			fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
-			fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
-			fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
-			fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
+			b.WriteString(fmtMDTable(
+				[]string{"", "Avg", "P95"},
+				[][]string{
+					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
+					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
+					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
+					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
+					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
+				},
+			))
 			b.WriteString("\n")
 		} else {
 			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
@@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Per-precision stability phases.
 		if len(gpu.PrecisionSteady) > 0 {
 			b.WriteString("**Per-precision stability:**\n\n")
-			b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
+			var precRows [][]string
 			for _, p := range gpu.PrecisionSteady {
 				eccCorr := "—"
 				eccUncorr := "—"
@@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 				if strings.TrimSpace(status) == "" {
 					status = "OK"
 				}
-				fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
-					p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
-					eccCorr, eccUncorr)
+				precRows = append(precRows, []string{
+					p.Precision, status,
+					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
+					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
+					eccCorr, eccUncorr,
+				})
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
 			b.WriteString("\n")
 		} else {
 			// Legacy: show combined-window variance.
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		// Precision results
 		if len(gpu.PrecisionResults) > 0 {
 			b.WriteString("**Precision results:**\n\n")
-			b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
+			var presRows [][]string
 			for _, p := range gpu.PrecisionResults {
 				if p.Supported {
-					weightStr := fmt.Sprintf("×%.3g", p.Weight)
-					fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
-						p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
+					presRows = append(presRows, []string{
+						p.Name,
+						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
+						fmt.Sprintf("×%.3g", p.Weight),
+						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
+						fmt.Sprintf("%d", p.Lanes),
+						fmt.Sprintf("%d", p.Iterations),
+					})
 				} else {
-					fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
+					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
 				}
 			}
+			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
 			b.WriteString("\n")
 		}

@@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("## Interconnect (NCCL)\n\n")
 		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
 		if result.Interconnect.Supported {
-			b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
-			fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
-			fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Avg", "Max"},
+				[][]string{
+					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
+					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
+				},
+			))
 			b.WriteString("\n")
 		}
 		for _, note := range result.Interconnect.Notes {
@@ -359,20 +401,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

-	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	// ── Server Power ───────────────────────────────────────────────────────────
 	if sp := result.ServerPower; sp != nil {
-		b.WriteString("## Server Power (IPMI)\n\n")
+		title := "## Server Power\n\n"
+		if sp.Source != "" {
+			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
+		}
+		b.WriteString(title)
 		if !sp.Available {
-			b.WriteString("IPMI power measurement unavailable.\n\n")
+			b.WriteString("Server power measurement unavailable.\n\n")
 		} else {
-			b.WriteString("| | Value |\n|---|---|\n")
-			fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
-			fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
-			if sp.ReportingRatio > 0 {
-				fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
+			spRows := [][]string{
+				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
+				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
+				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
+				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
 			}
+			if sp.ReportingRatio > 0 {
+				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
+			}
+			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
 			b.WriteString("\n")
 		}
 		for _, note := range sp.Notes {
@@ -383,19 +431,33 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

+	// ── PSU Issues ────────────────────────────────────────────────────────────
+	if len(result.PSUIssues) > 0 {
+		b.WriteString("## PSU Issues\n\n")
+		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
+		for _, issue := range result.PSUIssues {
+			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
+		}
+		b.WriteString("\n")
+	}
+
 	// ── Cooling ───────────────────────────────────────────────────────────────
 	if cooling := result.Cooling; cooling != nil {
 		b.WriteString("## Cooling\n\n")
 		if cooling.Available {
-			b.WriteString("| Metric | Value |\n|--------|-------|\n")
-			fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
+			dutyAvg, dutyP95 := "N/A", "N/A"
 			if cooling.FanDutyCycleAvailable {
-				fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
-				fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
-			} else {
-				b.WriteString("| Average fan duty cycle | N/A |\n")
-				b.WriteString("| P95 fan duty cycle | N/A |\n")
+				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
+				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
 			}
+			b.WriteString(fmtMDTable(
+				[]string{"Metric", "Value"},
+				[][]string{
+					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
+					{"Average fan duty cycle", dutyAvg},
+					{"P95 fan duty cycle", dutyP95},
+				},
+			))
 			b.WriteString("\n")
 		} else {
 			b.WriteString("Cooling telemetry unavailable.\n\n")
@@ -412,12 +474,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 	if len(result.PerformanceRampSteps) > 0 {
 		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
 		fmt.Fprintf(&b, "**Platform power score:** %.1f%%  \n\n", result.PlatformPowerScore)
-		b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
-		b.WriteString("|--------|-------------|----------------------|-------------|\n")
+		var scalRows [][]string
 		for _, step := range result.PerformanceRampSteps {
-			fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
-				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
+			scalRows = append(scalRows, []string{
+				fmt.Sprintf("%d", step.StepIndex),
+				joinIndexList(step.GPUIndices),
+				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
+				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
+			})
 		}
+		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
 		b.WriteString("\n")
 	}

--- a/audit/internal/platform/benchmark_table.go
+++ b/audit/internal/platform/benchmark_table.go
@@ -0,0 +1,75 @@
+package platform
+
+import (
+	"strings"
+)
+
+// fmtMDTable renders a markdown table whose columns are space-padded to a
+// common width, so the table is also readable as plain text without a
+// markdown renderer.
+//
+// headers holds the column titles and fixes the column count; an empty
+// headers slice yields "". Rows shorter than headers are padded with empty
+// cells, and cells beyond len(headers) are dropped. Widths are measured in
+// runes rather than bytes so that non-ASCII text (for example the "−" in
+// "Server delta (load − idle)") does not skew the alignment.
+func fmtMDTable(headers []string, rows [][]string) string {
+	ncols := len(headers)
+	if ncols == 0 {
+		return ""
+	}
+
+	// width counts runes; len(s) would count UTF-8 bytes and misalign cells.
+	width := func(s string) int { return len([]rune(s)) }
+	// cellAt returns row[i], or "" when the row has fewer than i+1 cells.
+	cellAt := func(row []string, i int) string {
+		if i < len(row) {
+			return row[i]
+		}
+		return ""
+	}
+
+	// Widest cell per column, header included.
+	widths := make([]int, ncols)
+	for i, h := range headers {
+		widths[i] = width(h)
+	}
+	for _, row := range rows {
+		for i := range widths {
+			if w := width(cellAt(row, i)); w > widths[i] {
+				widths[i] = w
+			}
+		}
+	}
+
+	var b strings.Builder
+	// writeRow emits one "| a | b |" line, padding each cell to its column width.
+	writeRow := func(row []string) {
+		b.WriteByte('|')
+		for i, w := range widths {
+			cell := cellAt(row, i)
+			b.WriteByte(' ')
+			b.WriteString(cell)
+			b.WriteString(strings.Repeat(" ", w-width(cell)))
+			b.WriteString(" |")
+		}
+		b.WriteByte('\n')
+	}
+
+	// Header row.
+	writeRow(headers)
+
+	// Separator row: dashes span the cell plus its two padding spaces.
+	b.WriteByte('|')
+	for _, w := range widths {
+		b.WriteString(strings.Repeat("-", w+2))
+		b.WriteByte('|')
+	}
+	b.WriteByte('\n')
+
+	// Data rows.
+	for _, row := range rows {
+		writeRow(row)
+	}
+	return b.String()
+}
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -1,8 +1,13 @@
 package platform

 import (
+	"context"
+	"os"
+	"os/exec"
+	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 )

 func TestResolveBenchmarkProfile(t *testing.T) {
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
 	}
 }

+func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
+	t.Parallel()
+	// Power-cap and power-brake deltas must yield no reason; thermal deltas must be named.
+	for _, tc := range []struct {
+		after         BenchmarkThrottleCounters
+		want, failFmt string
+	}{
+		{BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}, "", "sw_power_cap should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}, "", "hw_power_brake should be ignored, got %q"},
+		{BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}, "hw_thermal", "hw_thermal mismatch: got %q"},
+		{BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}, "sw_thermal", "sw_thermal mismatch: got %q"},
+	} {
+		if got := benchmarkCalibrationThrottleReason(BenchmarkThrottleCounters{}, tc.after); got != tc.want {
+			t.Fatalf(tc.failFmt, got)
+		}
+	}
+}
+
+func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
+	// Not parallel on purpose: this test replaces the package-level
+	// benchmarkGeteuid and satExecCommand hooks, so running it next to other
+	// parallel tests that read or replace them would be a data race.
+	oldGeteuid, oldExec := benchmarkGeteuid, satExecCommand
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		satExecCommand = oldExec
+	})
+	benchmarkGeteuid = func() int { return 1000 } // any non-zero euid, i.e. not root
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		t.Fatalf("unexpected command: %s %v", name, args)
+		return nil
+	}
+
+	var logs []string
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
+		logs = append(logs, line)
+	})
+	if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
+		t.Fatalf("logs=%q want substring %q", got, want)
+	}
+	if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
+		t.Fatalf("failed=%v want [0 2]", failed)
+	}
+}
+
+func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
+	// Not parallel on purpose: this test replaces the package-level
+	// benchmarkGeteuid, benchmarkSleep and satLookPath hooks, so running it next
+	// to other parallel tests that read or replace them would be a data race.
+	dir := t.TempDir()
+	script := filepath.Join(dir, "nvidia-smi")
+	argsLog := filepath.Join(dir, "args.log")
+	// Fake nvidia-smi: appends its arguments to argsLog and prints "ok".
+	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
+		t.Fatalf("write script: %v", err)
+	}
+
+	oldGeteuid, oldSleep, oldLookPath := benchmarkGeteuid, benchmarkSleep, satLookPath
+	t.Cleanup(func() {
+		benchmarkGeteuid = oldGeteuid
+		benchmarkSleep = oldSleep
+		satLookPath = oldLookPath
+	})
+	benchmarkGeteuid = func() int { return 0 }
+	benchmarkSleep = func(time.Duration) {}
+	satLookPath = func(file string) (string, error) {
+		if file == "nvidia-smi" {
+			return script, nil
+		}
+		return exec.LookPath(file)
+	}
+
+	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
+	if len(failed) != 0 {
+		t.Fatalf("failed=%v want no failures", failed)
+	}
+	raw, err := os.ReadFile(argsLog)
+	if err != nil {
+		t.Fatalf("read args log: %v", err)
+	}
+	got := strings.Fields(string(raw))
+	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
+	if strings.Join(got, " ") != strings.Join(want, " ") {
+		t.Fatalf("args=%v want %v", got, want)
+	}
+}
+
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()

@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	}
 }

+func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		name string
+		info benchmarkGPUInfo
+		want int
+	}{
+		{
+			name: "prefers default tdp over current derated limit",
+			info: benchmarkGPUInfo{
+				PowerLimitW:        500,
+				DefaultPowerLimitW: 600,
+				MaxPowerLimitW:     600,
+			},
+			want: 600, // the default limit is expected, not the lower current limit
+		},
+		{
+			name: "caps default tdp to reported max limit",
+			info: benchmarkGPUInfo{
+				PowerLimitW:        500,
+				DefaultPowerLimitW: 700,
+				MaxPowerLimitW:     650,
+			},
+			want: 650, // default (700) exceeds max (650), so max is expected
+		},
+		{
+			name: "falls back to current limit when default missing",
+			info: benchmarkGPUInfo{
+				PowerLimitW:    525,
+				MaxPowerLimitW: 600,
+			},
+			want: 525, // no default set: the current limit is expected
+		},
+		{
+			name: "falls back to max limit when only that is known",
+			info: benchmarkGPUInfo{
+				MaxPowerLimitW: 575,
+			},
+			want: 575, // neither default nor current set: max is expected
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc // per-iteration copy for the closure (needed before Go 1.22 loop-variable semantics)
+		t.Run(tc.name, func(t *testing.T) {
+			if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
+				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()

@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	}
 }

-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Min Power Limit                       : 200.00 W
+    Max Power Limit                       : 600.00 W
+    Default Power Limit                   : 575.00 W
+    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
+	}
+	if infoByIndex[0].MaxPowerLimitW != 600 {
+		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
+	}
+	if infoByIndex[0].DefaultPowerLimitW != 575 {
+		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
+	}
+	if infoByIndex[0].PowerLimitW != 560 {
+		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
+	}
 }

-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
+    Min Power Limit                       : 100.00 W
+    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
+	}
 }
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -43,18 +43,120 @@ const (
 	NvidiaBenchmarkProfileOvernight = "overnight"
 )

+const (
+	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
+	BenchmarkPowerEngineTargetedPower  = "targeted_power"
+)
+
+// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
+// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
+// re-measure from actual task logs and update the constants here.
+//
+// Sources:
+//   - BenchmarkEstimatedPerfStandardSec:   MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
+//   - BenchmarkEstimatedPerfStabilitySec:  xFusion v8.22 ramp 1-8: 5532 s
+//   - BenchmarkEstimatedPerfOvernightSec:  derived from profile phases (SteadySec=27000)
+//   - BenchmarkEstimatedPowerStandardSec:  MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
+//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
+const (
+	// Performance Benchmark (bee-gpu-burn).
+	// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
+	// Sequential per-GPU mode scales approximately linearly.
+	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
+	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
+	BenchmarkEstimatedPerfOvernightSec = 8 * 3600
+
+	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
+	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
+	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
+	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
+	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
+)
+
 type NvidiaBenchmarkOptions struct {
 	Profile           string
 	SizeMB            int
 	GPUIndices        []int
 	ExcludeGPUIndices []int
 	RunNCCL           bool
+	ServerPowerSource string
 	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
 	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
 	RampTotal         int    // total number of ramp-up steps in this run
 	RampRunID         string // shared identifier across all steps of the same ramp-up run
 }

+const (
+	BenchmarkPowerSourceDCMI        = "dcmi"
+	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
+)
+
+type BenchmarkPowerAutotuneConfig struct {
+	Version           int       `json:"version"`
+	UpdatedAt         time.Time `json:"updated_at"`
+	SelectedSource    string    `json:"selected_source"`
+	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
+	Profile           string    `json:"profile,omitempty"`
+	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
+	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
+	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
+	Confidence        float64   `json:"confidence,omitempty"`
+	Reason            string    `json:"reason,omitempty"`
+}
+
+type SystemPowerSourceDecision struct {
+	Configured      bool      `json:"configured"`
+	SelectedSource  string    `json:"selected_source,omitempty"`
+	EffectiveSource string    `json:"effective_source,omitempty"`
+	Mode            string    `json:"mode,omitempty"` // autotuned, fallback, degraded
+	Reason          string    `json:"reason,omitempty"`
+	ConfiguredAt    time.Time `json:"configured_at,omitempty"`
+}
+
+type BenchmarkPowerAutotuneResult struct {
+	GeneratedAt         time.Time                         `json:"generated_at"`
+	Hostname            string                            `json:"hostname,omitempty"`
+	ServerModel         string                            `json:"server_model,omitempty"`
+	BenchmarkKind       string                            `json:"benchmark_kind,omitempty"`
+	Profile             string                            `json:"profile,omitempty"`
+	Status              string                            `json:"status"`
+	IdleDurationSec     int                               `json:"idle_duration_sec"`
+	LoadDurationSec     int                               `json:"load_duration_sec"`
+	SampleIntervalSec   int                               `json:"sample_interval_sec"`
+	SelectedSource      string                            `json:"selected_source,omitempty"`
+	IdleValidationError string                            `json:"idle_validation_error,omitempty"`
+	IdleValidation      *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
+	GPUPowerIdleW       float64                           `json:"gpu_power_idle_w,omitempty"`
+	GPUPowerLoadW       float64                           `json:"gpu_power_load_w,omitempty"`
+	Candidates          []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
+	Notes               []string                          `json:"notes,omitempty"`
+	Config              *BenchmarkPowerAutotuneConfig     `json:"config,omitempty"`
+}
+
+type BenchmarkPowerAutotuneValidation struct {
+	Valid          bool    `json:"valid"`
+	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
+	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
+	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
+	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
+	GPUSamples     int     `json:"gpu_samples,omitempty"`
+	CPUSamples     int     `json:"cpu_samples,omitempty"`
+	Reason         string  `json:"reason,omitempty"`
+}
+
+type BenchmarkPowerAutotuneCandidate struct {
+	Source         string  `json:"source"`
+	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
+	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
+	DeltaW         float64 `json:"delta_w,omitempty"`
+	Samples        int     `json:"samples,omitempty"`
+	RelativeError  float64 `json:"relative_error,omitempty"`
+	Confidence     float64 `json:"confidence,omitempty"`
+	Selected       bool    `json:"selected,omitempty"`
+	Available      bool    `json:"available"`
+	SelectionNotes string  `json:"selection_notes,omitempty"`
+}
+
 type NvidiaBenchmarkResult struct {
 	BenchmarkVersion string    `json:"benchmark_version"`
 	GeneratedAt      time.Time `json:"generated_at"`
@@ -82,6 +184,10 @@ type NvidiaBenchmarkResult struct {
 	GPUs                 []BenchmarkGPUResult         `json:"gpus"`
 	Interconnect         *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower          *BenchmarkServerPower        `json:"server_power,omitempty"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type BenchmarkNormalization struct {
@@ -246,18 +352,59 @@ type BenchmarkScorecard struct {
 	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
 }

-// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
-// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
-// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
-// over-reporting its power consumption.
+// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
+// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
+// so benchmark and audit data can be correlated by slot.
+type BenchmarkPSUSlotPower struct {
+	InputW  *float64 `json:"input_w,omitempty"`  // AC wall input (PSUx_POWER_IN)
+	OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
+	Status  string   `json:"status,omitempty"`
+}
+
+// BenchmarkServerPower captures server-side power from multiple independent
+// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
+// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
+// covers only a subset of installed PSUs (partial coverage).
+//
+// Source legend:
+//   - DCMI      — `ipmitool dcmi power reading`; fast but may miss PSUs
+//   - SDR       — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
+//   - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
 type BenchmarkServerPower struct {
-	Available       bool     `json:"available"`
-	IdleW           float64  `json:"idle_w,omitempty"`
-	LoadedW         float64  `json:"loaded_w,omitempty"`
-	DeltaW          float64  `json:"delta_w,omitempty"`
-	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
-	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
-	Notes           []string `json:"notes,omitempty"`
+	Available         bool    `json:"available"`
+	Source            string  `json:"source,omitempty"`
+	Mode              string  `json:"mode,omitempty"`
+	Reason            string  `json:"reason,omitempty"`
+	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
+	IdleW             float64 `json:"idle_w,omitempty"`   // DCMI at idle
+	LoadedW           float64 `json:"loaded_w,omitempty"` // DCMI at peak load
+	DeltaW            float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
+	GPUReportedSumW   float64 `json:"gpu_reported_sum_w,omitempty"`
+	ReportingRatio    float64 `json:"reporting_ratio,omitempty"`
+
+	// PSU AC input sum — sampled at idle and at peak load using collector's
+	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
+	PSUInputIdleW   float64 `json:"psu_input_idle_w,omitempty"`
+	PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
+
+	// PSU DC output sum — power delivered to server internals after conversion.
+	PSUOutputIdleW   float64 `json:"psu_output_idle_w,omitempty"`
+	PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
+
+	// Per-slot PSU readings at idle and at peak load.
+	// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
+	PSUSlotReadingsIdle   map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
+	PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
+
+	// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
+	// PCIe slot delivery only (excludes 16-pin connector power).
+	GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
+
+	// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
+	// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
+	DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
+
+	Notes []string `json:"notes,omitempty"`
 }

 // BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
@@ -308,6 +455,10 @@ type NvidiaPowerBenchResult struct {
 	ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
 	Findings    []string              `json:"findings,omitempty"`
 	GPUs        []NvidiaPowerBenchGPU `json:"gpus"`
+	// PSUIssues holds power supply fault events detected by comparing IPMI PSU
+	// sensor states before and after the power benchmark run. Empty when IPMI is
+	// unavailable or no PSU faults occurred during the test.
+	PSUIssues []string `json:"psu_issues,omitempty"`
 }

 type NvidiaPowerBenchGPU struct {
@@ -338,6 +489,9 @@ type NvidiaPowerBenchGPU struct {
 	// Telemetry holds the aggregated stats from the final converged calibration
 	// attempt for this GPU (temperature, power, fan, clock percentiles).
 	Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
+	// Fan state sampled at the end of single-card calibration.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
 }

 type NvidiaPowerBenchStep struct {
@@ -356,6 +510,13 @@ type NvidiaPowerBenchStep struct {
 	// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
 	ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
 	ServerDeltaW  float64 `json:"server_delta_w,omitempty"`
+	// PSU slot readings sampled at end of this ramp step.
+	PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
+	// Fan state at end of this ramp step.
+	AvgFanRPM          float64 `json:"avg_fan_rpm,omitempty"`
+	AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
+	// Per-GPU telemetry from this step's calibration, keyed by GPU index.
+	PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
 }

 // NvidiaPerformanceRampStep holds per-step performance data for the
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -12,6 +12,7 @@ import (
 )

 const installToRAMDir = "/dev/shm/bee-live"
+const copyProgressLogStep int64 = 100 * 1024 * 1024

 func (s *System) IsLiveMediaInRAM() bool {
 	return s.LiveMediaRAMState().InRAM
@@ -319,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	defer out.Close()
 	total := fi.Size()
 	var copied int64
+	var lastLogged int64
 	buf := make([]byte, 4*1024*1024)
 	for {
 		if err := ctx.Err(); err != nil {
@@ -330,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 				return werr
 			}
 			copied += int64(n)
-			if logFunc != nil && total > 0 {
+			if logFunc != nil && shouldLogCopyProgress(copied, total, lastLogged) { // logFunc is optional; the replaced condition guarded nil too
+				lastLogged = copied
 				pct := int(float64(copied) / float64(total) * 100)
 				logFunc(fmt.Sprintf("  %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
 			}
@@ -345,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
 	return out.Sync()
 }

+// shouldLogCopyProgress reports whether a progress line is due: once per
+// copyProgressLogStep bytes copied, plus a final line on completion.
+func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
+	switch {
+	case total <= 0 || copied <= 0:
+		return false
+	case copied >= total:
+		return copied > lastLogged
+	}
+	// Mid-copy: stay silent before the first full step, then once per step.
+	return copied >= copyProgressLogStep && copied-lastLogged >= copyProgressLogStep
+}
+
 func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
 	return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
 		if ctx.Err() != nil {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -101,3 +101,26 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
 		}
 	})
 }
+
+func TestShouldLogCopyProgress(t *testing.T) {
+	t.Parallel()
+
+	total, step := int64(250<<20), int64(100<<20)
+
+	checks := []struct {
+		copied, lastLogged int64
+		want               bool
+		failMsg            string
+	}{
+		{step - 1, 0, false, "progress logged too early"},
+		{step, 0, true, "expected log at first 100MB boundary"},
+		{step + 16<<20, step, false, "progress logged again before next 100MB"},
+		{2 * step, step, true, "expected log at second 100MB boundary"},
+		{total, 2 * step, true, "expected final completion log"},
+	}
+	for _, c := range checks {
+		if shouldLogCopyProgress(c.copied, total, c.lastLogged) != c.want {
+			t.Fatal(c.failMsg)
+		}
+	}
+}
--- a/audit/internal/platform/kill_workers.go
+++ b/audit/internal/platform/kill_workers.go
@@ -1,11 +1,14 @@
 package platform

 import (
+	"context"
 	"fmt"
+	"log/slog"
 	"os"
 	"strconv"
 	"strings"
 	"syscall"
+	"time"
 )

 // workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
@@ -30,7 +33,12 @@ type KilledProcess struct {
 // KillTestWorkers scans /proc for running test worker processes and sends
 // SIGKILL to each one found. It returns a list of killed processes.
 // Errors for individual processes (e.g. already exited) are silently ignored.
+// The scan runs under a 5-second deadline to avoid blocking if the process
+// table is very large (e.g. after a stress test with thousands of children).
 func KillTestWorkers() []KilledProcess {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
 	entries, err := os.ReadDir("/proc")
 	if err != nil {
 		return nil
@@ -38,6 +46,13 @@ func KillTestWorkers() []KilledProcess {

 	var killed []KilledProcess
 	for _, e := range entries {
+		select {
+		case <-ctx.Done():
+			slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
+			return killed
+		default:
+		}
+
 		if !e.IsDir() {
 			continue
 		}
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -1,8 +1,10 @@
 package platform

 import (
+	"bee/audit/internal/collector"
 	"bufio"
 	"encoding/json"
+	"fmt"
 	"os"
 	"os/exec"
 	"sort"
@@ -14,14 +16,17 @@ import (
 // LiveMetricSample is a single point-in-time snapshot of server metrics
 // collected for the web UI metrics page.
 type LiveMetricSample struct {
-	Timestamp  time.Time      `json:"ts"`
-	Fans       []FanReading   `json:"fans"`
-	Temps      []TempReading  `json:"temps"`
-	PowerW     float64        `json:"power_w"`
-	PSUs       []PSUReading   `json:"psus,omitempty"`
-	CPULoadPct float64        `json:"cpu_load_pct"`
-	MemLoadPct float64        `json:"mem_load_pct"`
-	GPUs       []GPUMetricRow `json:"gpus"`
+	Timestamp   time.Time      `json:"ts"`
+	Fans        []FanReading   `json:"fans"`
+	Temps       []TempReading  `json:"temps"`
+	PowerW      float64        `json:"power_w"`
+	PowerSource string         `json:"power_source,omitempty"`
+	PowerMode   string         `json:"power_mode,omitempty"`
+	PowerReason string         `json:"power_reason,omitempty"`
+	PSUs        []PSUReading   `json:"psus,omitempty"`
+	CPULoadPct  float64        `json:"cpu_load_pct"`
+	MemLoadPct  float64        `json:"mem_load_pct"`
+	GPUs        []GPUMetricRow `json:"gpus"`
 }

 // PSUReading is a per-slot power supply input power reading.
@@ -62,12 +67,18 @@ func SampleLiveMetrics() LiveMetricSample {
 		}
 	}

-	// System power — returns 0 if unavailable
-	s.PowerW = sampleSystemPower()
-
 	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
 	s.PSUs = samplePSUPower()

+	// System power: use the global autotune-selected source when configured,
+	// otherwise fall back to the historical heuristic and mark the mode.
+	if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
+		s.PowerW = powerW
+		s.PowerSource = decision.EffectiveSource
+		s.PowerMode = decision.Mode
+		s.PowerReason = decision.Reason
+	}
+
 	// CPU load — from /proc/stat
 	s.CPULoadPct = sampleCPULoadPct()

@@ -339,63 +350,44 @@ func compactAmbientTempName(chip, name string) string {
 }

 // samplePSUPower reads per-PSU input power via IPMI SDR.
-// It parses `ipmitool sdr elist full` output looking for Power Supply entity
-// sensors (entity ID "10.N") that report a value in Watts.
+// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
+// vendors where PSU sensors may not carry entity ID "10.N".
 // Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
 func samplePSUPower() []PSUReading {
-	out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
+	out, err := exec.Command("ipmitool", "sdr").Output()
 	if err != nil || len(out) == 0 {
 		return nil
 	}
-	// map slot → reading (keep highest-watt value per slot in case of duplicates)
-	type entry struct {
-		name   string
-		powerW float64
-	}
-	bySlot := map[int]entry{}
-	for _, line := range strings.Split(string(out), "\n") {
-		parts := strings.Split(line, "|")
-		if len(parts) < 5 {
-			continue
-		}
-		entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
-		if !strings.HasPrefix(entityID, "10.") {
-			continue // not a Power Supply entity
-		}
-		slotStr := strings.TrimPrefix(entityID, "10.")
-		slot, err := strconv.Atoi(slotStr)
-		if err != nil {
-			continue
-		}
-		valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
-		if !strings.Contains(strings.ToLower(valueField), "watts") {
-			continue
-		}
-		valueFields := strings.Fields(valueField)
-		if len(valueFields) < 2 {
-			continue
-		}
-		w, err := strconv.ParseFloat(valueFields[0], 64)
-		if err != nil || w <= 0 {
-			continue
-		}
-		sensorName := strings.TrimSpace(parts[0])
-		if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
-			bySlot[slot] = entry{name: sensorName, powerW: w}
-		}
-	}
-	if len(bySlot) == 0 {
+	slots := collector.PSUSlotsFromSDR(string(out))
+	if len(slots) == 0 {
 		return nil
 	}
-	slots := make([]int, 0, len(bySlot))
-	for s := range bySlot {
-		slots = append(slots, s)
+	// Collect slot keys and sort them numerically for stable output.
+	// Keys that do not parse as integers are dropped here.
+	keys := make([]int, 0, len(slots))
+	for k := range slots {
+		n, err := strconv.Atoi(k)
+		if err == nil {
+			keys = append(keys, n)
+		}
 	}
-	sort.Ints(slots)
-	psus := make([]PSUReading, 0, len(slots))
-	for _, s := range slots {
-		e := bySlot[s]
-		psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
+	sort.Ints(keys)
+	psus := make([]PSUReading, 0, len(keys))
+	for _, k := range keys {
+		// NOTE(review): the map is re-indexed with strconv.Itoa(k), so a key such
+		// as "01" would parse above but miss here and yield a zero-value entry
+		// (skipped below). Confirm PSUSlotsFromSDR only emits canonical decimal keys.
+		entry := slots[strconv.Itoa(k)]
+		// Prefer AC input power; fall back to DC output power.
+		var w float64
+		if entry.InputW != nil && *entry.InputW > 0 {
+			w = *entry.InputW
+		} else if entry.OutputW != nil && *entry.OutputW > 0 {
+			w = *entry.OutputW
+		}
+		if w <= 0 {
+			continue
+		}
+		// Slot and Name are reported as key+1 (presumably the map keys are
+		// zero-based — confirm against PSUSlotsFromSDR).
+		psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
+	}
+	if len(psus) == 0 {
+		return nil
 	}
 	return psus
 }
--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -0,0 +1,30 @@
+package platform
+
+import (
+	"fmt"
+	"os/exec"
+	"time"
+)
+
+const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
+
+// runNvidiaRecover runs the bee-nvidia-recover helper through sudo with the
+// given arguments and returns its combined stdout/stderr plus the exec error.
+// When systemd-run is found on PATH the helper is wrapped in a transient
+// oneshot unit whose name carries a nanosecond timestamp; otherwise the helper
+// is invoked directly.
+func runNvidiaRecover(args ...string) (string, error) {
+	sudoArgs := append([]string{nvidiaRecoverHelper}, args...)
+	if _, lookErr := exec.LookPath("systemd-run"); lookErr == nil {
+		unitName := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
+		wrapper := []string{
+			"systemd-run",
+			"--quiet", "--pipe", "--wait", "--collect",
+			"--service-type=oneshot",
+			"--unit", unitName,
+		}
+		// Prefix the helper invocation with the systemd-run wrapper.
+		sudoArgs = append(wrapper, sudoArgs...)
+	}
+	out, err := exec.Command("sudo", sudoArgs...).CombinedOutput()
+	return string(out), err
+}
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -20,6 +20,54 @@ import (
 	"time"
 )

+// Estimated wall-clock durations, in seconds, for each SAT/validate test,
+// derived from real production logs in _benchmark/_v8/.
+//
+// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
+// the corresponding Run*Pack function change, re-measure the wall-clock duration
+// from actual task logs and update the matching constant here.
+//
+// Sources:
+//   - SATEstimatedCPUValidateSec:                 xFusion v8.6 — 62 s
+//   - SATEstimatedMemoryValidateSec:               xFusion v8.6 — 68 s
+//   - SATEstimatedNvidiaGPUValidateSec:            xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaGPUStressSec:              xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedStressSec:         xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaTargetedPowerSec:          MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
+//   - SATEstimatedNvidiaPulseTestSec:              xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaInterconnectSec:           xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
+//   - SATEstimatedNvidiaBandwidthSec:              xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
+//   - SATEstimatedCPUStressSec, SATEstimatedMemoryStressSec: no measured source
+//     listed; see the per-constant comments below for how each was chosen.
+const (
+	// CPU validate: stress-ng 60 s + lscpu/sensors overhead.
+	SATEstimatedCPUValidateSec = 65
+	// CPU stress: stress-ng 1800 s (stress mode default).
+	SATEstimatedCPUStressSec = 1800
+
+	// RAM validate: memtester 256 MB / 1 pass.
+	SATEstimatedMemoryValidateSec = 70
+	// RAM stress: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
+	SATEstimatedMemoryStressSec = 140
+
+	// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUValidateSec = 85
+	// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
+	SATEstimatedNvidiaGPUStressSec = 450
+
+	// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedStressSec = 350
+	// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
+	SATEstimatedNvidiaTargetedPowerSec = 350
+
+	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
+	SATEstimatedNvidiaPulseTestSec = 5000
+
+	// NCCL all_reduce_perf, all GPUs simultaneously.
+	SATEstimatedNvidiaInterconnectSec = 300
+	// nvbandwidth, all GPUs simultaneously. The tool runs all built-in tests
+	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
+	SATEstimatedNvidiaBandwidthSec = 2700
+)
+
 var (
 	satExecCommand  = exec.Command
 	satLookPath     = exec.LookPath
@@ -359,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	if index < 0 {
 		return "", fmt.Errorf("gpu index must be >= 0")
 	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
-	if len(raw) == 0 && err == nil {
-		raw = []byte("GPU reset completed.\n")
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
+	if strings.TrimSpace(out) == "" && err == nil {
+		out = "GPU reset completed.\n"
 	}
-	return string(raw), err
+	return out, err
 }

 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
@@ -395,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
 		profCmd []string
 		profEnv []string
 	)
-	if staggerSec > 0 && len(selected) > 1 {
+	if len(selected) > 1 {
+		// For multiple GPUs, always spawn one dcgmproftester process per GPU via
+		// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
+		// A single dcgmproftester process without -i only loads GPU 0 regardless
+		// of CUDA_VISIBLE_DEVICES.
+		stagger := staggerSec
+		if stagger < 0 {
+			stagger = 0
+		}
 		profCmd = []string{
 			"bee-dcgmproftester-staggered",
 			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
-			"--stagger-seconds", strconv.Itoa(staggerSec),
+			"--stagger-seconds", strconv.Itoa(stagger),
 			"--devices", joinIndexList(selected),
 		}
 	} else {
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -43,17 +43,22 @@ type GPUStressMetric struct {

 // FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
 type FanStressRow struct {
-	TimestampUTC string
-	ElapsedSec   float64
-	Phase        string // "baseline", "load1", "pause", "load2", "cooldown"
-	GPUs         []GPUStressMetric
-	Fans         []FanReading
-	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
-	SysPowerW    float64 // DCMI system power reading
+	TimestampUTC   string
+	ElapsedSec     float64
+	Phase          string // "baseline", "load1", "pause", "load2", "cooldown"
+	GPUs           []GPUStressMetric
+	Fans           []FanReading
+	CPUMaxTempC    float64 // highest CPU temperature from ipmitool / sensors
+	SysPowerW      float64 // system power value returned by sampleSystemPowerResolved
+	SysPowerSource string  // source label returned alongside SysPowerW
+	SysPowerMode   string  // mode label returned alongside SysPowerW
 }

 type cachedPowerReading struct {
 	Value     float64
+	Source    string // source label stored together with the last positive reading
+	Mode      string // mode label stored together with the last positive reading
+	Reason    string // reason text stored together with the last positive reading
 	UpdatedAt time.Time
 }

@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
 	row.GPUs = sampleGPUStressMetrics(gpuIndices)
 	row.Fans, _ = sampleFanSpeeds()
 	row.CPUMaxTempC = sampleCPUMaxTemp()
-	row.SysPowerW = sampleSystemPower()
+	row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
 	return row
 }

@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
 	return max
 }

-// sampleSystemPower reads system power draw via DCMI.
-func sampleSystemPower() float64 {
+// sampleSystemPowerResolved reads system power via the global autotune source,
+// falling back to the historical heuristic before autotune or when degraded.
+//
+// It returns the effective power value followed by the Source and Mode of the
+// cache entry produced by effectiveSystemPowerReading. A sampling error is
+// treated as a zero reading, so a still-valid cached value may be returned
+// instead. The sample is taken before the cache mutex is acquired.
+func sampleSystemPowerResolved() (float64, string, string) {
 	now := time.Now()
-	current := 0.0
-	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err == nil {
-		current = parseDCMIPowerReading(string(out))
-	}
+	current, decision, err := SampleSystemPowerResolved("")
 	systemPowerCacheMu.Lock()
 	defer systemPowerCacheMu.Unlock()
-	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	if err != nil {
+		// Treat a failed sample as "no reading" so the cache logic decides.
+		current = 0
+	}
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
 	systemPowerCache = updated
-	return value
+	return value, updated.Source, updated.Mode
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

-func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
 	if current > 0 {
-		cache = cachedPowerReading{Value: current, UpdatedAt: now}
+		cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
 		return current, cache
 	}
 	if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	now := time.Now()
 	cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}

-	got, updated := effectiveSystemPowerReading(cache, 0, now)
+	got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
 	if got != 480 {
 		t.Fatalf("got=%v want cached 480", got)
 	}
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 		t.Fatalf("updated=%+v", updated)
 	}

-	got, updated = effectiveSystemPowerReading(cache, 530, now)
+	got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
 	if got != 530 {
 		t.Fatalf("got=%v want 530", got)
 	}
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	}

 	expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
-	got, _ = effectiveSystemPowerReading(expired, 0, now)
+	got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
 	if got != 0 {
 		t.Fatalf("expired cache returned %v want 0", got)
 	}
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }

 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
+	if name == "bee-nvidia" && action == ServiceRestart {
+		return runNvidiaRecover("restart-drivers")
+	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstallToRAM
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-bench-perf", "nvidia-bench-power":
+	case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 	}
 }

+// handleAPIBenchmarkAutotuneRun returns a handler that enqueues one
+// "nvidia-bench-autotune" task on the global queue and writes the task-run
+// response. The optional JSON body may carry profile, benchmark_kind and
+// size_mb; a blank profile defaults to "standard" and a blank benchmark_kind
+// to "power-fit". An empty body is accepted; any other decode error is
+// answered with http.StatusBadRequest.
+func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
+	// trimmedOr returns v without surrounding whitespace, or fallback when
+	// nothing is left.
+	trimmedOr := func(v, fallback string) string {
+		if v = strings.TrimSpace(v); v != "" {
+			return v
+		}
+		return fallback
+	}
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
+			return
+		}
+		var req struct {
+			Profile       string `json:"profile"`
+			BenchmarkKind string `json:"benchmark_kind"`
+			SizeMB        int    `json:"size_mb"`
+		}
+		if r.Body != nil {
+			decodeErr := json.NewDecoder(r.Body).Decode(&req)
+			if decodeErr != nil && !errors.Is(decodeErr, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}
+		profile := trimmedOr(req.Profile, "standard")
+		kind := trimmedOr(req.BenchmarkKind, "power-fit")
+		createdAt := time.Now()
+		title := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, kind)
+		task := &Task{
+			ID:        newJobID("bee-bench-autotune"),
+			Name:      title,
+			Target:    "nvidia-bench-autotune",
+			Priority:  defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params: taskParams{
+				BenchmarkProfile: profile,
+				BenchmarkKind:    kind,
+				SizeMB:           req.SizeMB,
+				DisplayName:      title,
+			},
+		}
+		globalQueue.enqueue(task)
+		writeTaskRunResponse(w, []*Task{task})
+	}
+}
+
+// handleAPIBenchmarkAutotuneStatus reports the stored benchmark power autotune
+// configuration together with the currently resolved system power decision.
+//
+// Outcomes:
+//   - writeError with http.StatusServiceUnavailable when no App is configured;
+//   - 200 {"configured": false, "decision": ...} when the config does not exist;
+//   - 200 {"configured": true, "config": ..., "decision": ...} when it loads;
+//   - writeError with http.StatusInternalServerError for any other load error.
+func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
+	configured := err == nil
+	// errors.Is also recognises a not-exist error that was wrapped (%w) on its
+	// way up; os.IsNotExist does not unwrap such errors and would turn a merely
+	// missing config into a 500.
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// NOTE(review): the status line is written before writeJSON runs; if
+	// writeJSON sets response headers they would be ignored — confirm.
+	w.WriteHeader(http.StatusOK)
+	resp := map[string]any{
+		"configured": configured,
+		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+	}
+	if configured {
+		resp["config"] = cfg
+	}
+	writeJSON(w, resp)
+}
+
 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
 	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -178,16 +178,54 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
 	}
 	globalQueue.mu.Lock()
 	defer globalQueue.mu.Unlock()
-	if len(globalQueue.tasks) != 3 {
-		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
+	// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
+	// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
+	if len(globalQueue.tasks) != 1 {
+		t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
 	}
-	for i, task := range globalQueue.tasks {
-		if task.Target != "nvidia-bench-power" {
-			t.Fatalf("task[%d] target=%q", i, task.Target)
-		}
-		if task.Priority != taskPriorityBenchmark {
-			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
-		}
+	task := globalQueue.tasks[0]
+	if task.Target != "nvidia-bench-power" {
+		t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
+	}
+	if task.Priority != taskPriorityBenchmark {
+		t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
+	}
+	if task.params.RampTotal != 3 {
+		t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
+	}
+}
+
+// TestHandleAPIBenchmarkAutotuneRunQueuesTask posts a standard/power-fit
+// request to the autotune run handler and expects status 200 and exactly one
+// queued "nvidia-bench-autotune" task carrying the requested benchmark kind.
+func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
+	// Start from an empty global queue and put the previous tasks back afterwards.
+	globalQueue.mu.Lock()
+	savedTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		defer globalQueue.mu.Unlock()
+		globalQueue.tasks = savedTasks
+	})
+
+	const payload = `{"profile":"standard","benchmark_kind":"power-fit"}`
+	handlerUnderTest := &handler{opts: HandlerOptions{App: &app.App{}}}
+	request := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(payload))
+	recorder := httptest.NewRecorder()
+
+	handlerUnderTest.handleAPIBenchmarkAutotuneRun().ServeHTTP(recorder, request)
+
+	if recorder.Code != 200 {
+		t.Fatalf("status=%d body=%s", recorder.Code, recorder.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if queuedCount := len(globalQueue.tasks); queuedCount != 1 {
+		t.Fatalf("tasks=%d want 1", queuedCount)
+	}
+	queued := globalQueue.tasks[0]
+	if queued.Target != "nvidia-bench-autotune" {
+		t.Fatalf("task target=%q want nvidia-bench-autotune", queued.Target)
+	}
+	if queued.params.BenchmarkKind != "power-fit" {
+		t.Fatalf("task benchmark kind=%q want power-fit", queued.params.BenchmarkKind)
 	}
 }

--- a/audit/internal/webui/jobs.go
+++ b/audit/internal/webui/jobs.go
@@ -1,6 +1,9 @@
 package webui

 import (
+	"bufio"
+	"fmt"
+	"io"
 	"os"
 	"strings"
 	"sync"
@@ -17,6 +20,25 @@ type jobState struct {
 	cancel       func() // optional cancel function; nil if job is not cancellable
 	logPath      string
 	serialPrefix string
+	logFile      *os.File    // kept open for the task lifetime to avoid per-line open/close
+	logBuf       *bufio.Writer
+}
+
+// readTaskLogFile returns the contents of the task log at path. At most
+// 50 MB plus one byte is read; when more than 50 MB came back the data is
+// discarded and an error is returned instead.
+func readTaskLogFile(path string) ([]byte, error) {
+	const maxTaskLogBytes = 50 << 20
+
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	// Reading one byte past the cap is what reveals an oversized file.
+	contents, err := io.ReadAll(io.LimitReader(file, maxTaskLogBytes+1))
+	switch {
+	case err != nil:
+		return nil, err
+	case len(contents) > maxTaskLogBytes:
+		return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
+	}
+	return contents, nil
 }

 // abort cancels the job if it has a cancel function and is not yet done.
@@ -35,7 +57,7 @@ func (j *jobState) append(line string) {
 	defer j.mu.Unlock()
 	j.lines = append(j.lines, line)
 	if j.logPath != "" {
-		appendJobLog(j.logPath, line)
+		j.writeLogLineLocked(line)
 	}
 	if j.serialPrefix != "" {
 		taskSerialWriteLine(j.serialPrefix + line)
@@ -48,6 +70,35 @@ func (j *jobState) append(line string) {
 	}
 }

+// writeLogLineLocked appends line plus a trailing newline to the task's
+// persistent log through a 64 KiB buffered writer. The file at j.logPath is
+// opened on first use (created if missing, append-only) and then kept open;
+// if it cannot be opened the line is dropped and the open is retried on the
+// next call. Write errors are ignored. Must be called with j.mu held.
+func (j *jobState) writeLogLineLocked(line string) {
+	if j.logFile != nil {
+		_, _ = j.logBuf.WriteString(line + "\n")
+		return
+	}
+	const openFlags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
+	file, err := os.OpenFile(j.logPath, openFlags, 0644)
+	if err != nil {
+		return
+	}
+	j.logFile, j.logBuf = file, bufio.NewWriterSize(file, 64*1024)
+	_, _ = j.logBuf.WriteString(line + "\n")
+}
+
+// closeLog flushes any buffered log output and closes the log file, then
+// clears both handles. It takes j.mu itself; flush and close errors are
+// ignored. Calling it when no log file is open is a no-op.
+func (j *jobState) closeLog() {
+	j.mu.Lock()
+	defer j.mu.Unlock()
+	if buf := j.logBuf; buf != nil {
+		_ = buf.Flush()
+	}
+	file := j.logFile
+	if file == nil {
+		return
+	}
+	_ = file.Close()
+	j.logFile, j.logBuf = nil, nil
+}
+
 func (j *jobState) finish(errMsg string) {
 	j.mu.Lock()
 	defer j.mu.Unlock()
@@ -119,7 +170,7 @@ func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
 	if logPath == "" {
 		return j
 	}
-	data, err := os.ReadFile(logPath)
+	data, err := readTaskLogFile(logPath)
 	if err != nil || len(data) == 0 {
 		return j
 	}
--- a/audit/internal/webui/layout.go
+++ b/audit/internal/webui/layout.go
@@ -0,0 +1,137 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"os"
+	"strings"
+)
+
+// layoutHead returns the opening markup shared by the web UI pages: the
+// doctype, the <head> element with the HTML-escaped title and the inline
+// stylesheet, and the opening <body> tag. The returned string leaves <body>
+// and <html> unclosed.
+func layoutHead(title string) string {
+	return `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width,initial-scale=1">
+<title>` + html.EscapeString(title) + `</title>
+<style>
+:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
+*{box-sizing:border-box;margin:0;padding:0}
+body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
+a{color:var(--accent);text-decoration:none}
+/* Sidebar */
+.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
+.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
+.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
+.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
+.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
+.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
+.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
+.nav{flex:1}
+.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
+.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
+.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
+/* Content */
+.main{flex:1;display:flex;flex-direction:column;overflow:auto}
+.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
+.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
+.content{padding:24px;flex:1}
+/* Cards */
+.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
+.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
+.card-head-actions{justify-content:space-between}
+.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
+.card-body{padding:16px}
+/* Buttons */
+.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
+.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
+.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
+.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
+.btn-sm{padding:5px 10px;font-size:12px}
+/* Tables */
+table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
+th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
+td{padding:9px 14px;border-top:1px solid var(--border-lite)}
+tr:first-child td{border-top:0}
+tbody tr:hover td{background:rgba(0,0,0,.03)}
+/* Status badges */
+.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
+.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Component chips — one small square per device */
+.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
+.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
+.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
+.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
+.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
+.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
+/* Output terminal */
+.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
+.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
+/* Forms */
+.form-row{margin-bottom:14px}
+.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
+.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
+.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
+/* Grid */
+.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
+.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
+@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
+/* iframe viewer */
+.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
+/* Alerts */
+.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
+.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
+.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
+</style>
+</head>
+<body>
+`
+}
+
+// layoutNav renders the sidebar <aside>: the logo, a "Version <buildLabel>"
+// line (HTML-escaped, "dev" when buildLabel is blank), an optional NVIDIA GSP
+// badge derived from /run/bee-nvidia-mode, and one link per navigation entry.
+// The entry whose id equals active additionally gets the "active" class.
+func layoutNav(active string, buildLabel string) string {
+	type navEntry struct{ id, label, href, onclick string }
+	entries := []navEntry{
+		{"dashboard", "Dashboard", "/", ""},
+		{"audit", "Audit", "/audit", ""},
+		{"validate", "Validate", "/validate", ""},
+		{"burn", "Burn", "/burn", ""},
+		{"benchmark", "Benchmark", "/benchmark", ""},
+		{"tasks", "Tasks", "/tasks", ""},
+		{"tools", "Tools", "/tools", ""},
+	}
+	if strings.TrimSpace(buildLabel) == "" {
+		buildLabel = "dev"
+	}
+
+	var out strings.Builder
+	out.WriteString(`<aside class="sidebar">`)
+	out.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
+	out.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
+
+	// The badge is shown only when the mode file is readable and holds one of
+	// the two recognised values; any other content renders nothing.
+	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
+		switch strings.TrimSpace(string(raw)) {
+		case "gsp-off":
+			out.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
+		case "gsp-stuck":
+			out.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
+		}
+	}
+
+	out.WriteString(`<nav class="nav">`)
+	for _, entry := range entries {
+		class := "nav-item"
+		if entry.id == active {
+			class = "nav-item active"
+		}
+		if entry.onclick == "" {
+			fmt.Fprintf(&out, `<a class="%s" href="%s">%s</a>`, class, entry.href, entry.label)
+			continue
+		}
+		fmt.Fprintf(&out, `<a class="%s" href="%s" onclick="%s">%s</a>`,
+			class, entry.href, entry.onclick, entry.label)
+	}
+	out.WriteString(`</nav>`)
+	out.WriteString(`</aside>`)
+	return out.String()
+}
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
+  power_source TEXT,
+  power_mode   TEXT,
+  power_reason TEXT,
  PRIMARY KEY (ts)
 );
 CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
-	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+	if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
 }

 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	defer func() { _ = tx.Rollback() }()

 	_, err = tx.Exec(
-		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
-		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
+		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
 	)
 	if err != nil {
 		return err
@@ -161,14 +173,64 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	return tx.Commit()
 }

+// Downsample thins old metrics rows down to one sample per minute.
+// It touches only rows whose ts lies in the half-open window
+// [deleteOlderThan, downsampleBefore); rows at or after downsampleBefore are
+// left untouched. Within each 60-second bucket (ts / 60) the rows carrying
+// the smallest ts survive and every other row of that bucket is deleted.
+// A nil receiver, a nil db handle, or an empty window is a no-op.
+//
+// Called hourly by the metrics collector background goroutine.
+func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	lo, hi := deleteOlderThan.Unix(), downsampleBefore.Unix()
+	if hi <= lo {
+		return nil
+	}
+	tables := []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+	for _, name := range tables {
+		// The subquery lists the MIN(ts) of every bucket inside the window;
+		// the outer DELETE removes each in-window row that is not on that list.
+		query := `
+DELETE FROM ` + name + ` WHERE ts >= ? AND ts < ?
+  AND ts NOT IN (
+    SELECT MIN(ts) FROM ` + name + `
+    WHERE ts >= ? AND ts < ?
+    GROUP BY ts / 60
+  )`
+		if _, err := m.db.Exec(query, lo, hi, lo, hi); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Prune drops every row whose ts is earlier than before from all metrics tables.
+// Called hourly by the metrics collector to keep the DB size bounded.
+func (m *MetricsDB) Prune(before time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	tables := []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+	for _, name := range tables {
+		if _, err := m.db.Exec("DELETE FROM "+name+" WHERE ts < ?", before.Unix()); err != nil {
+			return err
+		}
+	}
+	_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)") // best effort: result and error are deliberately discarded
+	return nil
+}
+
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
 }

 // LoadAll returns all persisted samples in chronological order (oldest first).
 func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
 }

 // LoadBetween returns samples in chronological order within the given time window.
@@ -183,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
 		start, end = end, start
 	}
 	return m.loadSamples(
-		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
+		`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
 		start.Unix(), end.Unix(),
 	)
 }
@@ -199,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	type sysRow struct {
 		ts            int64
 		cpu, mem, pwr float64
+		powerSource   string
+		powerMode     string
+		powerReason   string
 	}
 	var sysRows []sysRow
 	for rows.Next() {
 		var r sysRow
-		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
 			continue
 		}
 		sysRows = append(sysRows, r)
@@ -313,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	samples := make([]platform.LiveMetricSample, len(sysRows))
 	for i, r := range sysRows {
 		s := platform.LiveMetricSample{
-			Timestamp:  time.Unix(r.ts, 0).UTC(),
-			CPULoadPct: r.cpu,
-			MemLoadPct: r.mem,
-			PowerW:     r.pwr,
+			Timestamp:   time.Unix(r.ts, 0).UTC(),
+			CPULoadPct:  r.cpu,
+			MemLoadPct:  r.mem,
+			PowerW:      r.pwr,
+			PowerSource: r.powerSource,
+			PowerMode:   r.powerMode,
+			PowerReason: r.powerReason,
 		}
 		for _, idx := range gpuIndices {
 			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
--- a/audit/internal/webui/page_benchmark.go
+++ b/audit/internal/webui/page_benchmark.go
@@ -0,0 +1,613 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+type benchmarkHistoryRun struct {
+	generatedAt   time.Time
+	displayTime   string
+	gpuScores     map[int]float64
+	gpuStatuses   map[int]string
+	overallStatus string
+}
+
+func renderBenchmark(opts HandlerOptions) string {
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="grid2">
+  <div class="card">
+    <div class="card-head">Benchmark Setup</div>
+    <div class="card-body">
+      <div class="form-row">
+        <label>Profile</label>
+        <select id="benchmark-profile">
+          <option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
+          <option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
+          <option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
+        </select>
+      </div>
+      <div class="form-row">
+        <label>GPU Selection</label>
+        <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
+          <button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
+        </div>
+        <div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+          <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+        </div>
+      </div>
+      <label class="benchmark-cb-row">
+        <input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
+        <span>Sequential — one GPU at a time</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-parallel-label">
+        <input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
+        <span>Parallel — all selected GPUs simultaneously</span>
+      </label>
+      <label class="benchmark-cb-row" id="benchmark-ramp-label">
+        <input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
+        <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
+      </label>
+      <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
+      <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
+        <button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
+        <button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
+        <button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
+      </div>
+      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
+      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
+      <div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
+      <div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
+    </div>
+  </div>
+
+  <div class="card">
+    <div class="card-head">Method Split</div>
+    <div class="card-body">
+      <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
+      <table>
+        <tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
+        <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
+        <tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
+      </table>
+      <p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
+    </div>
+  </div>
+</div>
+
+` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
+
+<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
+  <div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
+.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+</style>
+
+<script>
+let benchmarkES = null;
+function benchmarkTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+function benchmarkSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function benchmarkMode() {
+  const el = document.querySelector('input[name="benchmark-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+function benchmarkUpdateSelectionNote() {
+  const selected = benchmarkSelectedGPUIndices();
+  const perfBtn = document.getElementById('benchmark-run-performance-btn');
+  const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
+  const note = document.getElementById('benchmark-selection-note');
+  if (!selected.length) {
+    perfBtn.disabled = true;
+    fitBtn.disabled = true;
+    note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
+    return;
+  }
+  perfBtn.disabled = false;
+  fitBtn.disabled = false;
+  const mode = benchmarkMode();
+  if (mode === 'ramp-up') {
+    note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
+  } else if (mode === 'parallel') {
+    note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
+  } else {
+    note.textContent = 'Sequential: each selected GPU benchmarked separately.';
+  }
+}
+function benchmarkRenderGPUList(gpus) {
+  const root = document.getElementById('benchmark-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    benchmarkUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="benchmark-gpu-row">'
+      + '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+      + '</label>';
+  }).join('');
+  benchmarkApplyMultiGPUState(gpus.length);
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="benchmark-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkLoadGPUs() {
+  const status = document.getElementById('benchmark-run-status');
+  status.textContent = '';
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    benchmarkRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
+    benchmarkUpdateSelectionNote();
+  });
+}
+function benchmarkSelectAll() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  benchmarkUpdateSelectionNote();
+}
+function benchmarkSelectNone() {
+  document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  benchmarkUpdateSelectionNote();
+}
+function runNvidiaBenchmark(kind) {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  if (!selected.length) {
+    status.textContent = 'Select at least one GPU.';
+    return;
+  }
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  const mode = benchmarkMode();
+  const rampUp = mode === 'ramp-up' && selected.length > 1;
+  const parallelGPUs = mode === 'parallel' && kind === 'performance';
+  if (kind === 'power-fit' && mode === 'parallel') {
+    status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
+    return;
+  }
+  const body = {
+    profile: document.getElementById('benchmark-profile').value || 'standard',
+    gpu_indices: selected,
+    run_nccl: kind === 'performance' && selected.length > 1,
+    parallel_gpus: parallelGPUs,
+    ramp_up: rampUp,
+    display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
+  };
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
+  const term = document.getElementById('benchmark-terminal');
+  term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
+  status.textContent = 'Queueing...';
+  const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
+  fetch(endpoint, {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No benchmark task was queued.');
+    status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
+    const streamNext = function(idx, failures) {
+      if (idx >= taskIds.length) {
+        status.textContent = failures ? 'Completed with failures.' : 'Completed.';
+        return;
+      }
+      const taskId = taskIds[idx];
+      term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
+      benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+      benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      benchmarkES.addEventListener('done', function(e) {
+        benchmarkES.close();
+        benchmarkES = null;
+        if (e.data) failures += 1;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        const isLast = (idx + 1 >= taskIds.length);
+        streamNext(idx + 1, failures);
+        if (isLast) { benchmarkRefreshResults(); }
+      });
+      benchmarkES.onerror = function() {
+        if (benchmarkES) {
+          benchmarkES.close();
+          benchmarkES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        streamNext(idx + 1, failures + 1);
+      };
+    };
+    streamNext(0, 0);
+  }).catch(function(err) {
+    status.textContent = 'Error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+function benchmarkRenderAutotuneStatus(payload) {
+  const el = document.getElementById('benchmark-autotune-status');
+  if (!el) return;
+  if (!payload || !payload.configured || !payload.config) {
+    el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
+    return;
+  }
+  const cfg = payload.config || {};
+  const decision = payload.decision || {};
+  const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
+  const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
+  const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
+  const mode = decision.mode ? (' · mode ' + decision.mode) : '';
+  el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
+}
+function loadBenchmarkAutotuneStatus() {
+  fetch('/api/bee-bench/nvidia/autotune/status')
+    .then(function(r) {
+      return r.json().then(function(body) {
+        if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+        return body;
+      });
+    })
+    .then(function(body) { benchmarkRenderAutotuneStatus(body); })
+    .catch(function(err) {
+      const el = document.getElementById('benchmark-autotune-status');
+      if (el) el.textContent = 'Autotune status error: ' + err.message;
+    });
+}
+function runBenchmarkAutotune() {
+  const selected = benchmarkSelectedGPUIndices();
+  const status = document.getElementById('benchmark-run-status');
+  const term = document.getElementById('benchmark-terminal');
+  if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+  document.getElementById('benchmark-output').style.display = 'block';
+  document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
+  term.textContent = 'Enqueuing benchmark autotune...\n';
+  status.textContent = 'Queueing autotune...';
+  fetch('/api/bee-bench/nvidia/autotune/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify({
+      profile: document.getElementById('benchmark-profile').value || 'standard',
+      benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
+      gpu_indices: selected
+    })
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  }).then(function(d) {
+    const taskIds = benchmarkTaskIDs(d);
+    if (!taskIds.length) throw new Error('No autotune task was queued.');
+    const taskId = taskIds[0];
+    status.textContent = 'Autotune queued: ' + taskId;
+    benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
+    benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    benchmarkES.addEventListener('done', function(e) {
+      if (benchmarkES) {
+        benchmarkES.close();
+        benchmarkES = null;
+      }
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
+      loadBenchmarkAutotuneStatus();
+    });
+  }).catch(function(err) {
+    status.textContent = 'Autotune error.';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+benchmarkLoadGPUs();
+loadBenchmarkAutotuneStatus();
+function benchmarkRefreshResults() {
+  fetch('/api/benchmark/results')
+    .then(function(r) { return r.text(); })
+    .then(function(html) {
+      const el = document.getElementById('benchmark-results-section');
+      if (el) el.innerHTML = html;
+    })
+    .catch(function() {});
+}
+</script>`
+}
+
+// renderBenchmarkResultsCard returns the HTML for the saved-results section:
+// the performance history card followed by the power / thermal fit card.
+func renderBenchmarkResultsCard(exportDir string) string {
+	maxGPUIndex, history := loadBenchmarkHistory(exportDir)
+	perfCard := renderBenchmarkResultsCardFromRuns(
+		"Perf Results",
+		"Composite score by saved benchmark run and GPU.",
+		"No saved performance benchmark runs yet.",
+		maxGPUIndex, history,
+	)
+	return perfCard + "\n" + renderPowerBenchmarkResultsCard(exportDir)
+}
+
+// renderBenchmarkResultsCardFromRuns renders a results card with one table row
+// per run and one score column per GPU index in 0..maxGPUIndex. With no runs the
+// card holds only emptyMessage. An empty status counts as OK, FAILED uses the
+// critical color, and every other status uses the warning color.
+func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
+	const critStyle = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
+	const warnStyle = ` style="color:var(--warn);font-weight:600"`
+	cardOpen := `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`
+	if len(runs) == 0 {
+		return cardOpen + `<p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
+	}
+	var out strings.Builder
+	out.WriteString(cardOpen)
+	if strings.TrimSpace(description) != "" {
+		out.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
+	}
+	out.WriteString(`<div style="overflow-x:auto"><table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
+	for col := 0; col <= maxGPUIndex; col++ {
+		out.WriteString(`<th>GPU ` + strconv.Itoa(col) + `</th>`)
+	}
+	out.WriteString(`</tr></thead><tbody>`)
+	for n, run := range runs {
+		label, color := run.overallStatus, "var(--warn)"
+		if label == "" {
+			label = "OK"
+		}
+		switch label {
+		case "OK":
+			color = "var(--ok)"
+		case "FAILED":
+			color = "var(--crit-fg,#9f3a38)"
+		}
+		out.WriteString(`<tr><td>#` + strconv.Itoa(n+1) + `</td><td>` + html.EscapeString(run.displayTime) + `</td>`)
+		out.WriteString(`<td style="color:` + color + `;font-weight:600">` + html.EscapeString(label) + `</td>`)
+		for col := 0; col <= maxGPUIndex; col++ {
+			score, found := run.gpuScores[col]
+			if !found {
+				out.WriteString(`<td style="color:var(--muted)">-</td>`)
+				continue
+			}
+			cellStyle := warnStyle
+			switch run.gpuStatuses[col] {
+			case "", "OK":
+				cellStyle = ""
+			case "FAILED":
+				cellStyle = critStyle
+			}
+			out.WriteString(`<td` + cellStyle + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
+		}
+		out.WriteString(`</tr>`)
+	}
+	out.WriteString(`</tbody></table></div></div></div>`)
+	return out.String()
+}
+
+// loadBenchmarkHistory loads every perf-*/result.json under the perf benchmark dir; (-1, nil) when the glob fails or is empty.
+func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
+	perfDir := app.DefaultBeeBenchPerfDir
+	if strings.TrimSpace(exportDir) != "" {
+		perfDir = filepath.Join(exportDir, "bee-bench", "perf")
+	}
+	if found, err := filepath.Glob(filepath.Join(perfDir, "perf-*", "result.json")); err == nil && len(found) > 0 {
+		sort.Strings(found)
+		return loadBenchmarkHistoryFromPaths(found)
+	}
+	return -1, nil
+}
+
+// loadBenchmarkHistoryFromPaths decodes each result.json in paths into a history
+// run, silently skipping files that cannot be read or parsed. It returns the
+// highest GPU index seen (-1 if none) and the runs ordered newest first.
+func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
+	highest := -1
+	history := make([]benchmarkHistoryRun, 0, len(paths))
+	for _, p := range paths {
+		data, err := os.ReadFile(p)
+		if err != nil {
+			continue
+		}
+		var parsed platform.NvidiaBenchmarkResult
+		if json.Unmarshal(data, &parsed) != nil {
+			continue
+		}
+		entry := benchmarkHistoryRun{
+			generatedAt:   parsed.GeneratedAt,
+			displayTime:   parsed.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			gpuScores:     make(map[int]float64, len(parsed.GPUs)),
+			gpuStatuses:   make(map[int]string, len(parsed.GPUs)),
+			overallStatus: parsed.OverallStatus,
+		}
+		for _, g := range parsed.GPUs {
+			entry.gpuScores[g.Index], entry.gpuStatuses[g.Index] = g.Scores.CompositeScore, g.Status
+			if g.Index > highest {
+				highest = g.Index
+			}
+		}
+		history = append(history, entry)
+	}
+	sort.Slice(history, func(a, b int) bool { return history[a].generatedAt.After(history[b].generatedAt) })
+	return highest, history
+}
+
+// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results" card
+// from the power-*/result.json files under the power benchmark directory. Files
+// that cannot be read or parsed are skipped; when no usable run remains the same
+// empty card is returned as when no result file exists at all.
+func renderPowerBenchmarkResultsCard(exportDir string) string {
+	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
+	baseDir := app.DefaultBeeBenchPowerDir
+	if strings.TrimSpace(exportDir) != "" {
+		baseDir = filepath.Join(exportDir, "bee-bench", "power")
+	}
+	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
+	if err != nil || len(paths) == 0 {
+		return emptyCard
+	}
+	sort.Strings(paths)
+
+	type powerRun struct {
+		generatedAt time.Time
+		displayTime string
+		result      platform.NvidiaPowerBenchResult
+	}
+	var runs []powerRun
+	for _, path := range paths {
+		raw, err := os.ReadFile(path)
+		if err != nil {
+			continue
+		}
+		var r platform.NvidiaPowerBenchResult
+		if err := json.Unmarshal(raw, &r); err != nil {
+			continue
+		}
+		runs = append(runs, powerRun{
+			generatedAt: r.GeneratedAt,
+			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
+			result:      r,
+		})
+	}
+	// Every file may have been skipped above; runs[0] below would panic on an empty slice.
+	if len(runs) == 0 {
+		return emptyCard
+	}
+	sort.Slice(runs, func(i, j int) bool {
+		return runs[i].generatedAt.After(runs[j].generatedAt)
+	})
+
+	var b strings.Builder
+	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
+
+	latest := runs[0].result
+	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
+	if latest.Hostname != "" {
+		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
+	}
+	if latest.OverallStatus != "" {
+		statusColor := "var(--ok)"
+		if latest.OverallStatus != "OK" {
+			statusColor = "var(--warn)"
+		}
+		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
+	}
+	b.WriteString(`</p>`)
+
+	if len(latest.GPUs) > 0 {
+		// fmtW formats a wattage as a whole number, showing "-" unless the value is positive.
+		fmtW := func(w float64) string {
+			if w > 0 {
+				return fmt.Sprintf("%.0f", w)
+			}
+			return "-"
+		}
+		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
+		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
+		b.WriteString(`</tr></thead><tbody>`)
+		for _, gpu := range latest.GPUs {
+			finalLimitW := gpu.StablePowerLimitW
+			if finalLimitW <= 0 {
+				finalLimitW = gpu.AppliedPowerLimitW
+			}
+			derated := gpu.Derated ||
+				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
+			rowStyle := ""
+			finalStyle := ""
+			if derated {
+				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
+				finalStyle = ` style="color:#e6a000;font-weight:600"`
+			}
+			statusLabel := gpu.Status
+			if statusLabel == "" {
+				statusLabel = "OK"
+			}
+			statusColor := "var(--ok)"
+			if statusLabel == "FAILED" {
+				statusColor = "var(--crit-fg,#9f3a38)"
+			} else if statusLabel != "OK" {
+				statusColor = "var(--warn)"
+			}
+			b.WriteString(`<tr` + rowStyle + `>`)
+			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
+			b.WriteString(`<td>` + fmtW(gpu.DefaultPowerLimitW) + `</td>`)
+			b.WriteString(`<td>` + fmtW(gpu.AppliedPowerLimitW) + `</td>`)
+			b.WriteString(`<td` + finalStyle + `>` + fmtW(gpu.StablePowerLimitW) + `</td>`)
+			b.WriteString(`<td>` + fmtW(gpu.MaxObservedPowerW) + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div>`)
+	}
+
+	if len(runs) > 1 {
+		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
+		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
+		for i, run := range runs {
+			statusColor := "var(--ok)"
+			if run.result.OverallStatus != "OK" {
+				statusColor = "var(--warn)"
+			}
+			b.WriteString(`<tr>`)
+			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
+			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
+			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
+			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
+			b.WriteString(`</tr>`)
+		}
+		b.WriteString(`</tbody></table></div></details>`)
+	}
+
+	b.WriteString(`</div></div>`)
+	return b.String()
+}
--- a/audit/internal/webui/page_burn.go
+++ b/audit/internal/webui/page_burn.go
@@ -0,0 +1,383 @@
+package webui
+
+// renderBurn returns the static HTML, CSS, and JavaScript for the Burn page.
+//
+// The markup contains the burn profile selector, the NVIDIA GPU selection
+// card, and the "GPU Max Load" / "Compute Stress" cards. The embedded script
+// loads GPUs from /api/gpu/nvidia and tool availability from /api/gpu/tools,
+// enqueues tasks with POST /api/sat/<target>/run, and streams task output
+// from /api/tasks/<id>/stream into the output terminal.
+//
+// Values received from the API are HTML-escaped before being written through
+// innerHTML, and a failure in a task chain is reported exactly once.
+func renderBurn() string {
+	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Burn Profile</div>
+  <div class="card-body burn-profile-body">
+    <div class="burn-profile-col">
+      <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
+      <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
+      <p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
+    </div>
+    <div class="burn-profile-col burn-profile-action">
+      <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
+      <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
+    </div>
+  </div>
+  <div class="card-body" style="padding-top:0;display:flex;justify-content:center">
+    <span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
+    </div>
+	    <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+	      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+	    </div>
+	    <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
+	    <div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
+	      <label class="cb-row">
+	        <input type="radio" name="burn-nvidia-mode" value="sequential" checked>
+	        <span>Sequential — selected GPUs one at a time</span>
+	      </label>
+	      <label class="cb-row" id="burn-parallel-label">
+	        <input type="radio" name="burn-nvidia-mode" value="parallel">
+	        <span>Parallel — all selected GPUs simultaneously</span>
+	      </label>
+	      <label class="cb-row" id="burn-ramp-label">
+	        <input type="radio" name="burn-nvidia-mode" value="ramp-up">
+	        <span>Ramp-up — add one GPU at a time</span>
+	      </label>
+	    </div>
+	  </div>
+	</div>
+
+<div class="burn-section">Core Burn Paths</div>
+<div class="grid2 burn-grid" style="margin-bottom:16px">
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
+  </div>
+</div>
+
+<div class="card burn-card">
+  <div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
+  <div class="card-body burn-card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
+    <label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
+    <label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
+  </div>
+</div>
+</div>
+
+<div id="bi-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Output <span id="bi-title"></span></div>
+  <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
+</div>
+
+<style>
+.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
+.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
+.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
+.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
+.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.burn-profile-col { min-width:0; }
+.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
+.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
+.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
+.burn-grid { align-items:stretch; }
+.burn-card { height:100%; display:flex; flex-direction:column; }
+.burn-card-body { flex:1; display:flex; flex-direction:column; }
+.card-head-actions { justify-content:space-between; }
+.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
+@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
+</style>
+
+<script>
+let biES = null;
+// Escape a value for safe interpolation into HTML text or attribute values.
+function burnEsc(v) {
+  return String(v == null ? '' : v).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+function burnTaskIDs(payload) {
+  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
+  if (payload && payload.task_id) return [payload.task_id];
+  return [];
+}
+function burnProfile() {
+  const selected = document.querySelector('input[name="burn-profile"]:checked');
+  return selected ? selected.value : 'smoke';
+}
+function burnSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function burnNvidiaMode() {
+  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
+  return el ? el.value : 'sequential';
+}
+function burnApplyMultiGPUState(gpuCount) {
+  var multiValues = ['parallel', 'ramp-up'];
+  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
+  radios.forEach(function(el) {
+    var isMulti = multiValues.indexOf(el.value) >= 0;
+    if (gpuCount < 2 && isMulti) {
+      el.disabled = true;
+      if (el.checked) {
+        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
+        if (seq) seq.checked = true;
+      }
+      var label = el.closest('label');
+      if (label) label.style.opacity = '0.4';
+    } else {
+      el.disabled = false;
+      var label = el.closest('label');
+      if (label) label.style.opacity = '';
+    }
+  });
+}
+function burnUpdateSelectionNote() {
+  const note = document.getElementById('burn-selection-note');
+  const selected = burnSelectedGPUIndices();
+  if (!selected.length) {
+    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
+    return;
+  }
+  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
+}
+function burnRenderGPUList(gpus) {
+  const root = document.getElementById('burn-gpu-list');
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    // Without GPUs the multi-GPU modes are meaningless; disable them here too.
+    burnApplyMultiGPUState(0);
+    burnUpdateSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + burnEsc(gpu.memory_mb) + ' MiB' : '';
+    return '<label class="burn-gpu-row">'
+      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + burnEsc(gpu.index) + '" checked onchange="burnUpdateSelectionNote()">'
+      + '<span><strong>GPU ' + burnEsc(gpu.index) + '</strong> — ' + burnEsc(gpu.name) + mem + '</span>'
+      + '</label>';
+  }).join('');
+  burnApplyMultiGPUState(gpus.length);
+  burnUpdateSelectionNote();
+}
+function burnSelectAll() {
+  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
+  burnUpdateSelectionNote();
+}
+function burnSelectNone() {
+  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
+  burnUpdateSelectionNote();
+}
+function burnLoadGPUs() {
+  fetch('/api/gpu/nvidia').then(function(r) {
+    return r.json().then(function(body) {
+      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
+      return body;
+    });
+  }).then(function(gpus) {
+    burnRenderGPUList(gpus);
+  }).catch(function(err) {
+    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEsc(err.message) + '</p>';
+    burnUpdateSelectionNote();
+  });
+}
+function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
+  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
+  if (useSelectedNvidia) {
+    const selected = burnSelectedGPUIndices();
+    if (!selected.length) {
+      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+    }
+    body.gpu_indices = selected;
+    const bMode = burnNvidiaMode();
+    if (bMode === 'ramp-up' && selected.length > 1) {
+      body.stagger_gpu_start = true;
+    } else if (bMode === 'parallel' && selected.length > 1) {
+      body.parallel_gpus = true;
+    }
+  }
+  return fetch('/api/sat/' + target + '/run', {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(body)
+  }).then(function(r) {
+    return r.json().then(function(payload) {
+      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
+      return payload;
+    });
+  });
+}
+function streamTask(taskId, label) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
+  biES = new EventSource('/api/tasks/' + taskId + '/stream');
+  biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+  biES.addEventListener('done', function(e) {
+    biES.close();
+    biES = null;
+    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+    term.scrollTop = term.scrollHeight;
+  });
+}
+function streamBurnTask(taskId, label, resetTerminal) {
+  return streamBurnTaskSet([taskId], label, resetTerminal);
+}
+function streamBurnTaskSet(taskIds, label, resetTerminal) {
+  if (biES) { biES.close(); biES = null; }
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
+  const term = document.getElementById('bi-terminal');
+  if (resetTerminal) {
+    term.textContent = '';
+  }
+  if (!Array.isArray(taskIds) || !taskIds.length) {
+    term.textContent += 'ERROR: no tasks queued.\n';
+    return Promise.resolve({ok:false, error:'no tasks queued'});
+  }
+  const streamNext = function(idx, failures) {
+    if (idx >= taskIds.length) {
+      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
+    }
+    const taskId = taskIds[idx];
+    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
+    return new Promise(function(resolve) {
+      biES = new EventSource('/api/tasks/' + taskId + '/stream');
+      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+      biES.addEventListener('done', function(e) {
+        biES.close();
+        biES = null;
+        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + (e.data ? 1 : 0));
+      });
+      biES.onerror = function() {
+        if (biES) {
+          biES.close();
+          biES = null;
+        }
+        term.textContent += '\nERROR: stream disconnected.\n';
+        term.scrollTop = term.scrollHeight;
+        resolve(failures + 1);
+      };
+    }).then(function(nextFailures) {
+      return streamNext(idx + 1, nextFailures);
+    });
+  };
+  return streamNext(0, 0);
+}
+function runBurnTaskSet(tasks, statusElId) {
+  const enabled = tasks.filter(function(t) {
+    const el = document.getElementById(t.id);
+    return el && el.checked && !el.disabled;
+  });
+  const status = statusElId ? document.getElementById(statusElId) : null;
+  if (status) status.textContent = '';
+  if (!enabled.length) {
+    if (status) status.textContent = 'No tasks selected.';
+    return;
+  }
+  const term = document.getElementById('bi-terminal');
+  document.getElementById('bi-output').style.display = 'block';
+  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
+  term.textContent = '';
+  // failed counts entries whose streamed task set did not finish cleanly.
+  const runNext = function(idx, failed) {
+    if (idx >= enabled.length) {
+      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s)' + (failed ? ', ' + failed + ' with errors.' : '.');
+      return Promise.resolve();
+    }
+    const t = enabled[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
+    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
+    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
+      .then(function(d) {
+        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
+      })
+      .then(function(res) {
+        return runNext(idx + 1, failed + (res && res.ok ? 0 : 1));
+      });
+  };
+  // One handler for the whole chain: a per-step catch that re-rejects would run
+  // once per recursion level (duplicating the message) and leave the caller
+  // with an unhandled rejection.
+  return runNext(0, 0).catch(function(err) {
+    if (status) status.textContent = 'Error: ' + err.message;
+    document.getElementById('bi-output').style.display = 'block';
+    term.textContent += 'ERROR: ' + err.message + '\n';
+  });
+}
+function runPlatformStress() {
+  const comps = [];
+  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
+  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
+  const hasChecked = function(ids) {
+    return ids.some(function(id) {
+      const el = document.getElementById(id);
+      return el && el.checked && !el.disabled;
+    });
+  };
+  if (hasChecked(computeIDs)) comps.push('cpu');
+  if (hasChecked(gpuIDs)) comps.push('gpu');
+  const status = document.getElementById('burn-all-status');
+  if (!comps.length) {
+    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
+    return;
+  }
+  if (status) status.textContent = '';
+  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false).then(function(d) {
+    streamTask(d.task_id, 'Platform Thermal Cycling');
+  }).catch(function(err) {
+    // Surface enqueue failures instead of dropping them silently.
+    if (status) status.textContent = 'Error: ' + err.message;
+  });
+}
+function runAllBurnTasks() {
+  const status = document.getElementById('burn-all-status');
+  const all = [
+    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
+    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
+    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
+    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
+    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
+    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
+    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
+  ];
+  status.textContent = 'Enqueuing...';
+  runBurnTaskSet(all, 'burn-all-status');
+}
+fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
+  const map = {
+    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
+    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
+    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
+    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
+  };
+  tools.forEach(function(t) {
+    const spec = map[t.id];
+    if (!spec) return;
+    const cb = document.getElementById(spec.cb);
+    const note = document.getElementById(spec.note);
+    if (!cb) return;
+    if (t.available) {
+      cb.disabled = false;
+    } else if (note) {
+      note.textContent = '— ' + spec.reason;
+    }
+  });
+}).catch(function() {});
+burnLoadGPUs();
+</script>`
+}
--- a/audit/internal/webui/page_export_tools.go
+++ b/audit/internal/webui/page_export_tools.go
@@ -0,0 +1,434 @@
+package webui
+
+import (
+	"fmt"
+	"html"
+	"net/url"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// renderExport builds the Export page body: a support-bundle card, a table of
+// the files found under exportDir (each linked to /export/file?path=<rel>),
+// and the USB export card.
+//
+// File names are query-escaped in the link and HTML-escaped in the label. When
+// the directory listing fails, the table shows the error instead of the
+// misleading "No export files found." placeholder.
+func renderExport(exportDir string) string {
+	entries, err := listExportFiles(exportDir)
+	var rows strings.Builder
+	switch {
+	case err != nil:
+		// Surface the failure; an unreadable directory is not an empty one.
+		fmt.Fprintf(&rows, `<tr><td style="color:var(--crit-fg)">Failed to list export files: %s</td></tr>`,
+			html.EscapeString(err.Error()))
+	case len(entries) == 0:
+		rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
+	}
+	for _, e := range entries {
+		fmt.Fprintf(&rows, `<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
+			url.QueryEscape(e), html.EscapeString(e))
+	}
+	return `<div class="grid2">
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+</div></div>
+<div class="card"><div class="card-head">Export Files</div><div class="card-body">
+<table><tr><th>File</th></tr>` + rows.String() + `</table>
+</div></div>
+</div>
+
+` + renderUSBExportCard()
+}
+
+// listExportFiles returns the paths of all non-directory entries found under
+// exportDir, relative to exportDir and sorted lexicographically.
+//
+// Leading and trailing whitespace in exportDir is ignored. A not-exist error
+// (for example a missing export directory) is not treated as a failure and
+// yields whatever was collected so far with a nil error. Any other walk error
+// is returned together with a nil slice.
+func listExportFiles(exportDir string) ([]string, error) {
+	// Trim once so the walk root and the Rel base are the same path. Using the
+	// untrimmed value as the Rel base fails for inputs such as " /dir ".
+	root := strings.TrimSpace(exportDir)
+	var entries []string
+	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			return nil
+		}
+		rel, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		entries = append(entries, rel)
+		return nil
+	})
+	if err != nil && !os.IsNotExist(err) {
+		return nil, err
+	}
+	sort.Strings(entries)
+	return entries, nil
+}
+
+// renderSupportBundleInline returns the "Download Support Bundle" button, its
+// status line, and the script that fetches /export/support.tar.gz as a blob
+// and triggers a browser download.
+//
+// The file name is taken from the Content-Disposition header when present and
+// defaults to bee-support.tar.gz. After the request settles the button label
+// is restored with the same down-arrow glyph (U+2193) used in the initial
+// markup (&#8595;).
+func renderSupportBundleInline() string {
+	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">&#8595; Download Support Bundle</button>
+<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
+<script>
+window.supportBundleDownload = function() {
+  var btn = document.getElementById('support-bundle-btn');
+  var status = document.getElementById('support-bundle-status');
+  btn.disabled = true;
+  btn.textContent = 'Building...';
+  status.textContent = 'Collecting logs and export data\u2026';
+  status.style.color = 'var(--muted)';
+  var filename = 'bee-support.tar.gz';
+  fetch('/export/support.tar.gz')
+    .then(function(r) {
+      if (!r.ok) throw new Error('HTTP ' + r.status);
+      var cd = r.headers.get('Content-Disposition') || '';
+      var m = cd.match(/filename="?([^";]+)"?/);
+      if (m) filename = m[1];
+      return r.blob();
+    })
+    .then(function(blob) {
+      var url = URL.createObjectURL(blob);
+      var a = document.createElement('a');
+      a.href = url;
+      a.download = filename;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+      status.textContent = 'Download started.';
+      status.style.color = 'var(--ok-fg)';
+    })
+    .catch(function(e) {
+      status.textContent = 'Error: ' + e.message;
+      status.style.color = 'var(--crit-fg)';
+    })
+    .finally(function() {
+      btn.disabled = false;
+      btn.textContent = '\u2193 Download Support Bundle';
+    });
+};
+</script>`
+}
+
+func renderUSBExportCard() string {
+	return `<div class="card" style="margin-top:16px">
+  <div class="card-head">Export to USB
+    <button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">&#8635; Refresh</button>
+  </div>
+  <div class="card-body">` + renderUSBExportInline() + `</div>
+</div>`
+}
+
+// renderUSBExportInline returns the USB export widget: a status line, a table
+// of removable targets loaded from /api/export/usb, and per-device buttons
+// that POST the chosen target to /api/export/usb/<type> ("audit" or "bundle").
+//
+// Device, label, model, filesystem and size values are HTML-escaped before
+// being written through innerHTML, because a volume label is controlled by
+// whoever prepared the USB drive. Non-2xx and non-array list responses are
+// reported as errors instead of being rendered.
+func renderUSBExportInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
+<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
+<div id="usb-targets" style="margin-top:12px"></div>
+<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
+<script>
+(function(){
+// Escape a value for safe interpolation into HTML text.
+function usbEsc(v) {
+  return String(v == null ? '' : v).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+function usbRefresh() {
+  document.getElementById('usb-status').textContent = 'Scanning...';
+  document.getElementById('usb-targets').innerHTML = '';
+  document.getElementById('usb-msg').textContent = '';
+  fetch('/api/export/usb').then(r => r.json().then(body => {
+    if (!r.ok) throw new Error((body && body.error) || ('HTTP ' + r.status));
+    return body;
+  })).then(targets => {
+    // Anything that is not an array is treated as "no devices".
+    const list = Array.isArray(targets) ? targets : [];
+    window._usbTargets = list;
+    const st = document.getElementById('usb-status');
+    const ct = document.getElementById('usb-targets');
+    if (list.length === 0) {
+      st.textContent = 'No removable USB devices found.';
+      return;
+    }
+    st.textContent = list.length + ' device(s) found:';
+    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
+      list.map((t, idx) => {
+        const dev = usbEsc(t.device);
+        const label = usbEsc(t.label);
+        const model = usbEsc(t.model);
+        return '<tr>' +
+          '<td style="font-family:monospace">'+dev+'</td>' +
+          '<td>'+usbEsc(t.fs_type)+'</td>' +
+          '<td>'+usbEsc(t.size)+'</td>' +
+          '<td>'+label+'</td>' +
+          '<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
+          '<td style="white-space:nowrap">' +
+            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
+            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
+            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
+          '</td></tr>';
+      }).join('') + '</table>';
+  }).catch(e => {
+    document.getElementById('usb-status').textContent = 'Error: ' + e;
+  });
+}
+window.usbExport = function(type, targetIndex, btn) {
+  const target = (window._usbTargets || [])[targetIndex];
+  if (!target) {
+    const msg = document.getElementById('usb-msg');
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: USB target not found. Refresh and try again.';
+    return;
+  }
+  const msg = document.getElementById('usb-msg');
+  const row = btn ? btn.closest('td') : null;
+  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
+  const originalText = btn ? btn.textContent : '';
+  if (btn) {
+    btn.disabled = true;
+    btn.textContent = 'Exporting...';
+  }
+  if (rowMsg) {
+    rowMsg.style.color = 'var(--muted)';
+    rowMsg.textContent = 'Working...';
+  }
+  msg.style.color = 'var(--muted)';
+  msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
+  fetch('/api/export/usb/'+type, {
+    method: 'POST',
+    headers: {'Content-Type':'application/json'},
+    body: JSON.stringify(target)
+  }).then(async r => {
+    const d = await r.json();
+    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
+    return d;
+  }).then(d => {
+    msg.style.color = 'var(--ok,green)';
+    msg.textContent = d.message || 'Done.';
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--ok,green)';
+      rowMsg.textContent = d.message || 'Done.';
+    }
+  }).catch(e => {
+    msg.style.color = 'var(--err,red)';
+    msg.textContent = 'Error: '+e;
+    if (rowMsg) {
+      rowMsg.style.color = 'var(--err,red)';
+      rowMsg.textContent = 'Error: ' + e;
+    }
+  }).finally(() => {
+    if (btn) {
+      btn.disabled = false;
+      btn.textContent = originalText;
+    }
+  });
+};
+window.usbRefresh = usbRefresh;
+usbRefresh();
+})();
+</script>`
+}
+
+// renderNvidiaSelfHealInline returns the NVIDIA self-heal widget: a GPU status
+// table loaded from /api/gpu/nvidia-status, a "Restart GPU Drivers" button
+// that POSTs a restart of the bee-nvidia service to /api/services/action, and
+// per-GPU "Reset GPU" buttons that POST to /api/gpu/nvidia-reset.
+//
+// Values from the status API are HTML-escaped before being written through
+// innerHTML. While a request is in flight the result panel shows a neutral
+// "running" state rather than "done", and error text returned by either
+// action endpoint is shown in the output panel.
+func renderNvidiaSelfHealInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
+<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
+  <button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
+  <button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">&#8635; Refresh</button>
+</div>
+<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
+<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+// Escape a value for safe interpolation into HTML text.
+function nvidiaSelfHealEsc(v) {
+  return String(v == null ? '' : v).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+function nvidiaSelfHealShowResult(label, status, output) {
+  var out = document.getElementById('nvidia-self-heal-out');
+  var term = document.getElementById('nvidia-self-heal-terminal');
+  var statusEl = document.getElementById('nvidia-self-heal-out-status');
+  var labelEl = document.getElementById('nvidia-self-heal-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = label;
+  term.textContent = output || '(no output)';
+  term.scrollTop = term.scrollHeight;
+  if (status === 'running') {
+    // In-flight request: do not claim success before the response arrives.
+    statusEl.textContent = '… running';
+    statusEl.style.color = 'var(--muted)';
+  } else if (status === 'ok') {
+    statusEl.textContent = '✓ done';
+    statusEl.style.color = 'var(--ok-fg, #2c662d)';
+  } else {
+    statusEl.textContent = '✗ failed';
+    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+  }
+}
+function nvidiaRestartDrivers() {
+  var btn = document.getElementById('nvidia-restart-btn');
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Restarting...';
+  nvidiaSelfHealShowResult('restart bee-nvidia', 'running', 'Running...');
+  fetch('/api/services/action', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(function() {
+      // loadServices is not defined in this widget; skip it when the host page lacks it.
+      if (typeof loadServices === 'function') loadServices();
+      loadNvidiaSelfHeal();
+    }, 800);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function nvidiaResetGPU(index, btn) {
+  var original = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = 'Resetting...';
+  nvidiaSelfHealShowResult('reset gpu ' + index, 'running', 'Running...');
+  fetch('/api/gpu/nvidia-reset', {
+    method:'POST',
+    headers:{'Content-Type':'application/json'},
+    body:JSON.stringify({index:index})
+  }).then(r=>r.json()).then(d => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
+    setTimeout(loadNvidiaSelfHeal, 1000);
+  }).catch(e => {
+    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
+  }).finally(() => {
+    btn.disabled = false;
+    btn.textContent = original;
+  });
+}
+function loadNvidiaSelfHeal() {
+  var status = document.getElementById('nvidia-self-heal-status');
+  var table = document.getElementById('nvidia-self-heal-table');
+  status.textContent = 'Loading NVIDIA GPU status...';
+  status.style.color = 'var(--muted)';
+  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
+  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
+    if (!Array.isArray(gpus) || gpus.length === 0) {
+      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
+      table.innerHTML = '';
+      return;
+    }
+    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
+    const rows = gpus.map(g => {
+      const serial = g.serial || '';
+      const bdf = g.bdf || '';
+      const id = serial || bdf || ('gpu-' + g.index);
+      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
+      const details = [];
+      if (serial) details.push('serial ' + serial);
+      if (bdf) details.push('bdf ' + bdf);
+      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
+      return '<tr>'
+        + '<td style="white-space:nowrap">' + nvidiaSelfHealEsc(g.index) + '</td>'
+        + '<td>' + nvidiaSelfHealEsc(g.name || 'unknown') + '</td>'
+        + '<td style="font-family:monospace">' + nvidiaSelfHealEsc(id) + '</td>'
+        + '<td><span class="badge ' + badge + '">' + nvidiaSelfHealEsc(g.status || 'UNKNOWN') + '</span>'
+        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + nvidiaSelfHealEsc(details.join(' | ')) + '</div>' : '')
+        + '</td>'
+        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + Number(g.index) + ', this)">Reset GPU</button></td>'
+        + '</tr>';
+    }).join('');
+    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
+  }).catch(e => {
+    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
+    status.style.color = 'var(--crit-fg, #9f3a38)';
+    table.innerHTML = '';
+  });
+}
+loadNvidiaSelfHeal();
+</script>`
+}
+
+// renderTools returns the HTML and inline JavaScript for the Tools page:
+// the System Install card (copy to RAM plus the inline disk installer), the
+// Support Bundle / USB export card, the Tool Check table, and the embedded
+// NVIDIA Self Heal, Network and Services sections.
+//
+// The inline scripts report fetch failures in the page instead of leaving the
+// "Checking..." placeholders (or a disabled button) in place, and HTML-escape
+// the tool names and paths returned by /api/tools/check before inserting them
+// as markup.
+func renderTools() string {
+	return `<div class="card" style="margin-bottom:16px">
+  <div class="card-head">System Install</div>
+  <div class="card-body">
+    <div style="margin-bottom:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
+    <p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
+    <p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
+    <button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">&#9654; Copy to RAM</button>
+    </div>
+    <div style="border-top:1px solid var(--line);padding-top:20px">
+    <div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
+		renderInstallInline() + `
+    </div>
+  </div>
+</div>
+<script>
+fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
+  const boot = document.getElementById('boot-source-text');
+  const txt = document.getElementById('ram-status-text');
+  const btn = document.getElementById('ram-install-btn');
+  let source = d.device || d.source || 'unknown source';
+  let kind = d.kind || 'unknown';
+  let label = source;
+  if (kind === 'ram') label = 'RAM';
+  else if (kind === 'usb') label = 'USB (' + source + ')';
+  else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
+  else if (kind === 'disk') label = 'disk (' + source + ')';
+  boot.textContent = 'Current boot source: ' + label + '.';
+  txt.textContent = d.message || 'Checking...';
+  if (d.status === 'ok' || d.in_ram) {
+    txt.style.color = 'var(--ok, green)';
+  } else if (d.status === 'failed') {
+    txt.style.color = 'var(--err, #b91c1c)';
+  } else {
+    txt.style.color = 'var(--muted)';
+  }
+  if (d.can_start_task) {
+    btn.style.display = '';
+    btn.disabled = false;
+  } else {
+    btn.style.display = 'none';
+  }
+}).catch(e=>{
+  const txt = document.getElementById('ram-status-text');
+  document.getElementById('boot-source-text').textContent = 'Current boot source: unknown.';
+  txt.textContent = 'Failed to load RAM status: ' + e;
+  txt.style.color = 'var(--err, #b91c1c)';
+});
+function installToRAM() {
+  const btn = document.getElementById('ram-install-btn');
+  btn.disabled = true;
+  fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
+    if (!d || !d.task_id) throw new Error((d && d.error) || 'missing task id');
+    window.location.href = '/tasks#' + d.task_id;
+  }).catch(e=>{
+    const txt = document.getElementById('ram-status-text');
+    btn.disabled = false;
+    txt.textContent = 'Failed to start copy to RAM: ' + e;
+    txt.style.color = 'var(--err, #b91c1c)';
+  });
+}
+</script>
+
+<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
+<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
+` + renderSupportBundleInline() + `
+<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
+  <div style="font-weight:600;margin-bottom:8px">Export to USB</div>
+  ` + renderUSBExportInline() + `
+</div>
+</div></div>
+
+<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
+<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
+
+<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
+		renderNvidiaSelfHealInline() + `</div></div>
+
+<div class="card"><div class="card-head">Network</div><div class="card-body">` +
+		renderNetworkInline() + `</div></div>
+
+<div class="card"><div class="card-head">Services</div><div class="card-body">` +
+		renderServicesInline() + `</div></div>
+
+
+<script>
+function checkTools() {
+  const host = document.getElementById('tools-table');
+  const esc = v => String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
+  host.innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
+  fetch('/api/tools/check').then(r=>r.json()).then(tools => {
+    const rows = tools.map(t =>
+      '<tr><td>'+esc(t.Name)+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '&#10003; '+esc(t.Path) : '&#10007; missing')+'</span></td></tr>'
+    ).join('');
+    host.innerHTML =
+      '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
+  }).catch(e => {
+    host.innerHTML = '<p style="color:var(--crit-fg, #9f3a38);font-size:13px">Failed to check tools: '+esc(e)+'</p>';
+  });
+}
+checkTools();
+</script>`
+}
+
+// renderExportIndex builds a minimal standalone HTML page that lists the
+// entries returned by listExportFiles(exportDir). Each entry becomes a link
+// to /export/file?path=<query-escaped entry> whose text is the HTML-escaped
+// entry; when there are no entries a single placeholder item is rendered.
+// An error from listExportFiles is returned unchanged with an empty page.
+func renderExportIndex(exportDir string) (string, error) {
+	files, err := listExportFiles(exportDir)
+	if err != nil {
+		return "", err
+	}
+	var page strings.Builder
+	page.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
+	page.WriteString(`<h1>Bee Export Files</h1><ul>`)
+	if len(files) == 0 {
+		page.WriteString(`<li>No export files found.</li>`)
+	} else {
+		for _, name := range files {
+			page.WriteString(`<li><a href="/export/file?path=`)
+			page.WriteString(url.QueryEscape(name))
+			page.WriteString(`">`)
+			page.WriteString(html.EscapeString(name))
+			page.WriteString(`</a></li>`)
+		}
+	}
+	page.WriteString(`</ul></body></html>`)
+	return page.String(), nil
+}
--- a/audit/internal/webui/page_install_tasks.go
+++ b/audit/internal/webui/page_install_tasks.go
@@ -0,0 +1,314 @@
+package webui
+
+// renderInstallInline returns the "Install to Disk" UI without a wrapping
+// card so it can be embedded in other pages: a warning banner, the disk
+// table loaded from /api/install/disks, a type-the-device-name confirmation
+// step, and a progress terminal fed by the task's event stream.
+//
+// Disk fields (device, model, size) and warning strings come from the API
+// and are HTML-escaped with installEscHtml before being concatenated into
+// innerHTML, so unexpected characters in them cannot inject markup.
+func renderInstallInline() string {
+	return `
+    <div class="alert alert-warn" style="margin-bottom:16px">
+      <strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
+      disk and write the live system onto it. All existing data on the target disk will be lost.
+      This operation cannot be undone.
+    </div>
+    <div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
+    <div id="install-disk-section" style="display:none">
+      <div class="card" style="margin-bottom:0">
+        <table id="install-disk-table">
+          <thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
+          <tbody id="install-disk-tbody"></tbody>
+        </table>
+      </div>
+      <div style="margin-top:12px">
+        <button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
+      </div>
+    </div>
+    <div id="install-confirm-section" style="display:none;margin-top:20px">
+      <div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
+      <div class="form-row" style="max-width:360px">
+        <label>Type the device name to confirm (e.g. /dev/sda)</label>
+        <input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
+      </div>
+      <button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
+      <button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
+    </div>
+    <div id="install-progress-section" style="display:none;margin-top:20px">
+      <div class="card-head" style="margin-bottom:8px">Installation Progress</div>
+      <div id="install-terminal" class="terminal" style="max-height:500px"></div>
+      <div id="install-status" style="margin-top:12px;font-size:13px"></div>
+    </div>
+
+<style>
+#install-disk-tbody tr{cursor:pointer}
+#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
+#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
+</style>
+
+<script>
+var _installSelected = null;
+
+function installEscHtml(v) {
+  return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
+}
+
+function installRefreshDisks() {
+  document.getElementById('install-loading').style.display = '';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-confirm-section').style.display = 'none';
+  _installSelected = null;
+  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
+    document.getElementById('install-loading').style.display = 'none';
+    var tbody = document.getElementById('install-disk-tbody');
+    tbody.innerHTML = '';
+    if (!disks || disks.length === 0) {
+      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
+    } else {
+      disks.forEach(function(d) {
+        var warnings = (d.warnings || []);
+        var statusHtml;
+        if (warnings.length === 0) {
+          statusHtml = '<span class="badge badge-ok">OK</span>';
+        } else {
+          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
+          statusHtml = warnings.map(function(w){
+            var cls = hasSmall ? 'badge-err' : 'badge-warn';
+            return '<span class="badge ' + cls + '" title="' + installEscHtml(w) + '">' +
+              installEscHtml(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
+          }).join(' ');
+        }
+        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
+          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
+        var tr = document.createElement('tr');
+        tr.dataset.device = d.device;
+        tr.dataset.model = d.model || 'Unknown';
+        tr.dataset.size = d.size;
+        tr.dataset.warnings = JSON.stringify(warnings);
+        tr.innerHTML =
+          '<td><input type="radio" name="install-disk" value="' + installEscHtml(d.device) + '"></td>' +
+          '<td><code>' + installEscHtml(d.device) + '</code>' + mountedNote + '</td>' +
+          '<td>' + installEscHtml(d.model || '—') + '</td>' +
+          '<td>' + installEscHtml(d.size) + '</td>' +
+          '<td>' + statusHtml + '</td>';
+        tr.addEventListener('click', function(){ installSelectDisk(this); });
+        tbody.appendChild(tr);
+      });
+    }
+    document.getElementById('install-disk-section').style.display = '';
+  }).catch(function(e){
+    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
+  });
+}
+
+function installSelectDisk(tr) {
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  tr.classList.add('selected');
+  var radio = tr.querySelector('input[type=radio]');
+  if (radio) radio.checked = true;
+  _installSelected = {
+    device: tr.dataset.device,
+    model: tr.dataset.model,
+    size: tr.dataset.size,
+    warnings: JSON.parse(tr.dataset.warnings || '[]')
+  };
+  var warnBox = document.getElementById('install-confirm-warn');
+  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEscHtml(_installSelected.device) +
+    ' (' + installEscHtml(_installSelected.model) + ', ' + installEscHtml(_installSelected.size) + ')' +
+    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
+  if (_installSelected.warnings.length > 0) {
+    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEscHtml(w); }).join('<br>');
+  }
+  warnBox.innerHTML = warnLines;
+  document.getElementById('install-confirm-input').value = '';
+  document.getElementById('install-start-btn').disabled = true;
+  document.getElementById('install-confirm-section').style.display = '';
+  document.getElementById('install-progress-section').style.display = 'none';
+}
+
+function installDeselect() {
+  _installSelected = null;
+  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
+  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
+  document.getElementById('install-confirm-section').style.display = 'none';
+}
+
+function installCheckConfirm() {
+  var val = document.getElementById('install-confirm-input').value.trim();
+  var ok = _installSelected && val === _installSelected.device;
+  document.getElementById('install-start-btn').disabled = !ok;
+}
+
+function installStart() {
+  if (!_installSelected) return;
+  document.getElementById('install-confirm-section').style.display = 'none';
+  document.getElementById('install-disk-section').style.display = 'none';
+  document.getElementById('install-loading').style.display = 'none';
+  var prog = document.getElementById('install-progress-section');
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  prog.style.display = '';
+  term.textContent = '';
+  status.textContent = 'Starting installation…';
+  status.style.color = 'var(--muted)';
+
+  fetch('/api/install/run', {
+    method: 'POST',
+    headers: {'Content-Type': 'application/json'},
+    body: JSON.stringify({device: _installSelected.device})
+  }).then(function(r){
+    return r.json().then(function(j){
+      if (!r.ok) throw new Error(j.error || r.statusText);
+      return j;
+    });
+  }).then(function(j){
+    if (!j.task_id) throw new Error('missing task id');
+    installStreamLog(j.task_id);
+  }).catch(function(e){
+    status.textContent = 'Error: ' + e;
+    status.style.color = 'var(--crit-fg)';
+  });
+}
+
+function installStreamLog(taskId) {
+  var term = document.getElementById('install-terminal');
+  var status = document.getElementById('install-status');
+  var es = new EventSource('/api/tasks/' + taskId + '/stream');
+  es.onmessage = function(e) {
+    term.textContent += e.data + '\n';
+    term.scrollTop = term.scrollHeight;
+  };
+  es.addEventListener('done', function(e) {
+    es.close();
+    if (!e.data) {
+      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
+      var rebootBtn = document.createElement('button');
+      rebootBtn.className = 'btn btn-primary btn-sm';
+      rebootBtn.style.marginLeft = '12px';
+      rebootBtn.textContent = 'Reboot now';
+      rebootBtn.onclick = function(){
+        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
+          body: JSON.stringify({name:'', action:'reboot'})});
+      };
+      status.appendChild(rebootBtn);
+    } else {
+      status.textContent = '✗ Installation failed: ' + e.data;
+      status.style.color = 'var(--crit-fg)';
+    }
+  });
+  es.onerror = function() {
+    es.close();
+    status.textContent = '✗ Stream disconnected.';
+    status.style.color = 'var(--crit-fg)';
+  };
+}
+
+installRefreshDisks();
+</script>
+`
+}
+
+func renderInstall() string {
+	return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
+		renderInstallInline() +
+		`</div></div>`
+}
+
+// renderTasks returns the HTML and inline JavaScript for the Tasks page: a
+// toolbar (Cancel All / Kill Workers) and a paginated task table that is
+// reloaded from /api/tasks every 2 seconds.
+//
+// Task ids reach the action buttons through an HTML-escaped data-task-id
+// attribute rather than being spliced into inline JavaScript string literals,
+// and are passed through encodeURIComponent when building the cancel and
+// priority URLs, matching how the "Open" link already encodes them. Unknown
+// status values are HTML-escaped before being shown.
+func renderTasks() string {
+	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
+<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
+<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
+<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
+<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
+</div>
+<div class="card">
+<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
+</div>
+<script>
+var _taskRefreshTimer = null;
+var _tasksAll = [];
+var _taskPage = 1;
+var _taskPageSize = 50;
+
+function loadTasks() {
+  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
+    _tasksAll = Array.isArray(tasks) ? tasks : [];
+    if (_tasksAll.length === 0) {
+      _taskPage = 1;
+      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
+      return;
+    }
+    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+    if (_taskPage > totalPages) _taskPage = totalPages;
+    if (_taskPage < 1) _taskPage = 1;
+    const start = (_taskPage - 1) * _taskPageSize;
+    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
+    const rows = pageTasks.map(t => {
+      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
+      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
+      const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||escHtml(String(t.status||''));
+      const idAttr = ' data-task-id="'+escHtml(String(t.id))+'"';
+      let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
+      if (t.status === 'running' || t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-danger"'+idAttr+' onclick="cancelTask(this.dataset.taskId)">Cancel</button>';
+      }
+      if (t.status === 'pending') {
+        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,1)" title="Increase priority">&#8679;</button>';
+        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,-1)" title="Decrease priority">&#8681;</button>';
+      }
+      return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
+        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
+        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
+        '<td>'+t.priority+'</td>' +
+        '<td>'+actions+'</td></tr>';
+    }).join('');
+    const showingFrom = start + 1;
+    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
+    const pager =
+      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
+        '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
+        '<div style="display:flex;align-items:center;gap:8px">' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
+          '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
+        '</div>' +
+      '</div>';
+    document.getElementById('tasks-table').innerHTML =
+      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
+  });
+}
+
+function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
+function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
+function formatDurSec(sec) {
+  sec = Math.max(0, Math.round(sec||0));
+  if (sec < 60) return sec+'s';
+  const m = Math.floor(sec/60), ss = sec%60;
+  return m+'m '+ss+'s';
+}
+function setTaskPage(page) {
+  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+  _taskPage = Math.min(totalPages, Math.max(1, page));
+  loadTasks();
+}
+
+function cancelTask(id) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
+}
+function cancelAll() {
+  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
+}
+function killWorkers() {
+  if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
+  var toast = document.getElementById('kill-toast');
+  fetch('/api/tasks/kill-workers',{method:'POST'})
+    .then(r=>r.json())
+    .then(d=>{
+      loadTasks();
+      var parts = [];
+      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
+      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
+      toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
+      toast.style.display = '';
+      setTimeout(()=>{ toast.style.display='none'; }, 5000);
+    })
+    .catch(e=>{
+      toast.textContent = 'Kill request failed: ' + e;
+      toast.style.display = '';
+      setTimeout(()=>{ toast.style.display='none'; }, 5000);
+    });
+}
+function setPriority(id, delta) {
+  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
+    .then(()=>loadTasks());
+}
+
+loadTasks();
+_taskRefreshTimer = setInterval(loadTasks, 2000);
+</script>`
+}
--- a/audit/internal/webui/page_metrics.go
+++ b/audit/internal/webui/page_metrics.go
@@ -0,0 +1,238 @@
+package webui
+
+// renderMetrics returns the HTML and inline JavaScript for the live metrics
+// page: server load, temperature, power and fan charts, plus a GPU section
+// that can switch between one chart per metric and one overview chart per
+// GPU. The per-GPU mode choice is kept in sessionStorage.
+//
+// Chart images are re-fetched every 3 seconds (the intro text states the same
+// interval) and the layout is re-synced from /api/metrics/latest every
+// 5 seconds. GPU labels, which include the name reported by /api/gpu/nvidia,
+// are HTML-escaped before being inserted as markup.
+func renderMetrics() string {
+	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 3 seconds.</p>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Load</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — CPU</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
+  </div>
+</div>
+
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Temperature — Ambient Sensors</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
+  </div>
+</div>
+
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">Server — Power</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
+  </div>
+</div>
+
+<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
+  <div class="card-head">Server — Fan RPM</div>
+  <div class="card-body" style="padding:8px">
+    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
+  </div>
+</div>
+
+<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
+  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
+    <div>
+      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
+      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
+    </div>
+    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
+      <input id="gpu-chart-toggle" type="checkbox">
+      <span>One chart per GPU</span>
+    </label>
+  </div>
+
+  <div id="gpu-metrics-by-metric">
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Compute Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Memory Load</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Core Clock</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Power</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
+      </div>
+    </div>
+    <div class="card" style="margin-bottom:16px">
+      <div class="card-head">GPU — Temperature</div>
+      <div class="card-body" style="padding:8px">
+        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
+      </div>
+    </div>
+  </div>
+
+  <div id="gpu-metrics-by-gpu" style="display:none"></div>
+</section>
+
+<script>
+let gpuChartKey = '';
+const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
+let metricsNvidiaGPUsPromise = null;
+
+function loadMetricsNvidiaGPUs() {
+  if (!metricsNvidiaGPUsPromise) {
+    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(function(r) {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(function(list) { return Array.isArray(list) ? list : []; })
+      .catch(function() { return []; });
+  }
+  return metricsNvidiaGPUsPromise;
+}
+
+function metricsGPUNameMap(list) {
+  const out = {};
+  (list || []).forEach(function(gpu) {
+    const idx = Number(gpu.index);
+    if (!Number.isFinite(idx) || !gpu.name) return;
+    out[idx] = gpu.name;
+  });
+  return out;
+}
+
+function metricsGPUDisplayLabel(idx, names) {
+  const name = names && names[idx];
+  return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
+}
+
+function loadGPUChartModePreference() {
+  try {
+    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
+  } catch (_) {
+    return false;
+  }
+}
+
+function saveGPUChartModePreference(perGPU) {
+  try {
+    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
+  } catch (_) {}
+}
+
+function refreshChartImage(el) {
+  if (!el || el.dataset.loading === '1') return;
+  if (el.offsetParent === null) return;
+  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
+  const nextSrc = baseSrc + '?t=' + Date.now();
+  const probe = new Image();
+  el.dataset.baseSrc = baseSrc;
+  el.dataset.loading = '1';
+  probe.onload = function() {
+    el.src = nextSrc;
+    el.dataset.loading = '0';
+  };
+  probe.onerror = function() {
+    el.dataset.loading = '0';
+  };
+  probe.src = nextSrc;
+}
+
+function refreshCharts() {
+  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
+}
+
+function gpuIndices(rows) {
+  const seen = {};
+  const out = [];
+  (rows || []).forEach(function(row) {
+    const idx = Number(row.index);
+    if (!Number.isFinite(idx) || seen[idx]) return;
+    seen[idx] = true;
+    out.push(idx);
+  });
+  return out.sort(function(a, b) { return a - b; });
+}
+
+function metricsEscHtml(v) {
+  return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
+}
+
+function renderGPUOverviewCards(indices, names) {
+  const host = document.getElementById('gpu-metrics-by-gpu');
+  if (!host) return;
+  host.innerHTML = indices.map(function(idx) {
+    const label = metricsEscHtml(metricsGPUDisplayLabel(idx, names));
+    return '<div class="card" style="margin-bottom:16px">' +
+      '<div class="card-head">' + label + ' — Overview</div>' +
+      '<div class="card-body" style="padding:8px">' +
+      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
+      '</div></div>';
+  }).join('');
+}
+
+function applyGPUChartMode() {
+  const perMetric = document.getElementById('gpu-metrics-by-metric');
+  const perGPU = document.getElementById('gpu-metrics-by-gpu');
+  const toggle = document.getElementById('gpu-chart-toggle');
+  const gpuModePerGPU = !!(toggle && toggle.checked);
+  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
+  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
+}
+
+function syncMetricsLayout(d) {
+  const fanCard = document.getElementById('card-server-fans');
+  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
+  const section = document.getElementById('gpu-metrics-section');
+  const summary = document.getElementById('gpu-metrics-summary');
+  const indices = gpuIndices(d.gpus);
+  loadMetricsNvidiaGPUs().then(function(gpus) {
+    const names = metricsGPUNameMap(gpus);
+    if (section) section.style.display = indices.length > 0 ? '' : 'none';
+    if (summary) {
+      summary.textContent = indices.length > 0
+        ? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
+        : 'No GPUs detected in live metrics.';
+    }
+    const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
+    if (nextKey !== gpuChartKey) {
+      renderGPUOverviewCards(indices, names);
+      gpuChartKey = nextKey;
+    }
+    applyGPUChartMode();
+  });
+}
+
+function loadMetricsLayout() {
+  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
+}
+
+const gpuChartToggle = document.getElementById('gpu-chart-toggle');
+if (gpuChartToggle) {
+  gpuChartToggle.checked = loadGPUChartModePreference();
+}
+applyGPUChartMode();
+
+if (gpuChartToggle) {
+  gpuChartToggle.addEventListener('change', function() {
+    saveGPUChartModePreference(!!gpuChartToggle.checked);
+    applyGPUChartMode();
+    refreshCharts();
+  });
+}
+
+loadMetricsLayout();
+setInterval(refreshCharts, 3000);
+setInterval(loadMetricsLayout, 5000);
+</script>`
+}
--- a/audit/internal/webui/page_network_services.go
+++ b/audit/internal/webui/page_network_services.go
@@ -0,0 +1,213 @@
+package webui
+
+import "html"
+
+// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
+//
+// The returned markup contains a hidden pending-change banner with Confirm and
+// Rollback buttons, an interface table placeholder, DHCP and static IPv4
+// forms, and an inline script. The script loads /api/network immediately and
+// then every 5 seconds, and posts to /api/network/{toggle,dhcp,static,confirm,rollback}.
+//
+// Values received from /api/network are HTML-escaped before being written via
+// innerHTML, and click handlers read the interface name and state from data-*
+// attributes rather than from string-built JavaScript literals, so a name
+// containing quotes or angle brackets cannot break the handler or inject markup.
+func renderNetworkInline() string {
+	return `<div id="net-pending" style="display:none" class="alert alert-warn">
+<strong>&#9888; Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
+<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
+<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
+</div>
+<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div class="grid2" style="margin-top:16px">
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
+<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
+<button class="btn btn-primary" onclick="runDHCP()">&#9654; Run DHCP</button>
+<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
+<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
+<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
+<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
+<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
+<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
+<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
+<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
+</div>
+</div>
+<script>
+var _netCountdownTimer = null;
+var _netRefreshTimer = null;
+const NET_ROLLBACK_SECS = 60;
+// Escapes a value for safe use in HTML text and double-quoted attribute values.
+function netEscapeHTML(v) {
+  return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
+}
+function loadNetwork() {
+  fetch('/api/network').then(r=>r.json()).then(d => {
+    const rows = (d.interfaces||[]).map(i => {
+      const name = netEscapeHTML(i.Name);
+      const state = netEscapeHTML(i.State);
+      // Handlers read name/state from data-* attributes so no JS string literal is built from API data.
+      return '<tr><td style="cursor:pointer" data-iface="'+name+'" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">'+name+'</span></td>' +
+        '<td style="cursor:pointer" data-iface="'+name+'" data-state="'+state+'" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+state+'</span></td>' +
+        '<td>'+(i.IPv4||[]).map(netEscapeHTML).join(', ')+'</td></tr>';
+    }).join('');
+    document.getElementById('iface-table').innerHTML =
+      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
+      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+netEscapeHTML(d.default_route)+'</p>' : '');
+    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    else hideNetPending();
+  }).catch(function() {});
+}
+function selectIface(iface) {
+  document.getElementById('dhcp-iface').value = iface;
+  document.getElementById('st-iface').value = iface;
+}
+function toggleIface(iface, currentState) {
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
+    .then(r=>r.json()).then(d => {
+      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
+      loadNetwork();
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function hideNetPending() {
+  const el = document.getElementById('net-pending');
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  _netCountdownTimer = null;
+  el.style.display = 'none';
+}
+function showNetPending(secs) {
+  if (!secs || secs < 1) { hideNetPending(); return; }
+  const el = document.getElementById('net-pending');
+  el.style.display = 'block';
+  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
+  let remaining = secs;
+  document.getElementById('net-countdown').textContent = remaining;
+  _netCountdownTimer = setInterval(function() {
+    remaining--;
+    document.getElementById('net-countdown').textContent = remaining;
+    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
+  }, 1000);
+}
+function confirmNetChange() {
+  hideNetPending();
+  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function rollbackNetChange() {
+  hideNetPending();
+  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
+}
+function runDHCP() {
+  const iface = document.getElementById('dhcp-iface').value.trim();
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
+    .then(r=>r.json()).then(d => {
+      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
+      if (d.error) { hideNetPending(); return; }
+      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+      loadNetwork();
+    }).catch(function() {
+      setTimeout(loadNetwork, 1500);
+    });
+}
+function setStatic() {
+  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
+  showNetPending(NET_ROLLBACK_SECS);
+  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
+    interface: document.getElementById('st-iface').value,
+    address: document.getElementById('st-addr').value,
+    prefix: document.getElementById('st-prefix').value,
+    gateway: document.getElementById('st-gw').value,
+    dns: dns,
+  })}).then(r=>r.json()).then(d => {
+    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
+    if (d.error) { hideNetPending(); return; }
+    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
+    loadNetwork();
+  }).catch(function() {
+    setTimeout(loadNetwork, 1500);
+  });
+}
+loadNetwork();
+if (_netRefreshTimer) clearInterval(_netRefreshTimer);
+_netRefreshTimer = setInterval(loadNetwork, 5000);
+</script>`
+}
+
+func renderNetwork() string {
+	return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
+		renderNetworkInline() +
+		`</div></div>`
+}
+
+// renderServicesInline returns the services UI without a wrapping card:
+// an explanatory note, a Refresh button, a table placeholder, an output
+// terminal, and an inline script.
+//
+// The script loads /api/services once on page load and again on Refresh or
+// shortly after an action; Start/Stop/Restart post to /api/services/action.
+// Service name, state and body text are HTML-escaped before being written via
+// innerHTML, and the action buttons read the unit name from a data-svc
+// attribute instead of a string-built JavaScript literal. A failed or
+// malformed /api/services response is shown in the table area instead of
+// leaving the "Loading..." placeholder in place.
+func renderServicesInline() string {
+	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
+<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
+<div id="svc-out" style="display:none;margin-top:12px">
+  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
+    <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
+    <span id="svc-out-status" style="font-size:12px"></span>
+  </div>
+  <div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
+</div>
+<script>
+// Escapes a value for safe use in HTML text and double-quoted attribute values.
+function svcEscapeHTML(v) {
+  return String(v == null ? '' : v).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
+}
+function loadServices() {
+  fetch('/api/services').then(r=>r.json()).then(svcs => {
+    // An error object instead of a list would otherwise throw inside map().
+    if (!Array.isArray(svcs)) throw new Error((svcs && svcs.error) || 'unexpected response');
+    const rows = svcs.map(s => {
+      const rawName = String(s.name||'');
+      const name = svcEscapeHTML(rawName);
+      const st = s.state||'unknown';
+      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
+      const id = 'svc-body-'+rawName.replace(/[^a-z0-9]/g,'-');
+      const body = svcEscapeHTML(s.body);
+      return '<tr>' +
+        '<td style="white-space:nowrap">'+name+'</td>' +
+        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+svcEscapeHTML(st)+' ▾</span>' +
+        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
+        '</td>' +
+        '<td style="white-space:nowrap">' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-start"   data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'start\')">Start</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-stop"    data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'stop\')">Stop</button> ' +
+        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-restart" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'restart\')">Restart</button>' +
+        '</td></tr>';
+    }).join('');
+    document.getElementById('svc-table').innerHTML =
+      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
+  }).catch(err => {
+    document.getElementById('svc-table').innerHTML =
+      '<p style="color:var(--crit-fg, #9f3a38);font-size:13px">Failed to load services: '+svcEscapeHTML(err && err.message ? err.message : err)+'</p>';
+  });
+}
+function toggleBody(id) {
+  const el = document.getElementById(id);
+  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
+}
+function svcAction(btn, name, action) {
+  var label = btn.textContent;
+  btn.disabled = true;
+  btn.textContent = '...';
+  var out = document.getElementById('svc-out');
+  var term = document.getElementById('svc-terminal');
+  var statusEl = document.getElementById('svc-out-status');
+  var labelEl = document.getElementById('svc-out-label');
+  out.style.display = 'block';
+  labelEl.textContent = action + ' ' + name;
+  term.textContent = 'Running...';
+  statusEl.textContent = '';
+  statusEl.style.color = '';
+  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
+    .then(r=>r.json()).then(d => {
+      term.textContent = d.output || d.error || '(no output)';
+      term.scrollTop = term.scrollHeight;
+      if (d.status === 'ok') {
+        statusEl.textContent = '✓ done';
+        statusEl.style.color = 'var(--ok-fg, #2c662d)';
+      } else {
+        statusEl.textContent = '✗ failed';
+        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      }
+      btn.textContent = label;
+      btn.disabled = false;
+      setTimeout(loadServices, 800);
+    }).catch(e => {
+      term.textContent = 'Request failed: ' + e;
+      statusEl.textContent = '✗ error';
+      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
+      btn.textContent = label;
+      btn.disabled = false;
+    });
+}
+loadServices();
+</script>`
+}
+
+func renderServices() string {
+	return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
+		renderServicesInline() +
+		`</div></div>`
+}
--- a/audit/internal/webui/page_validate.go
+++ b/audit/internal/webui/page_validate.go
@@ -0,0 +1,663 @@
+package webui
+
+import (
+	"encoding/json"
+	"fmt"
+	"html"
+	"sort"
+	"strings"
+
+	"bee/audit/internal/platform"
+	"bee/audit/internal/schema"
+)
+
+// validateInventory holds the per-component text and GPU counts that
+// renderValidate reads when building the Validate page.
+type validateInventory struct {
+	// Text passed by renderValidate into the matching component cards
+	// (CPU, Memory, Storage, NVIDIA GPU cards, AMD GPU card).
+	// NOTE(review): presumably short human-readable hardware summaries
+	// filled in by loadValidateInventory — confirm against that function.
+	CPU, Memory, Storage, NVIDIA, AMD string
+
+	// NvidiaGPUCount selects the duration estimates and the "(N GPU)" note
+	// in renderValidate. AMDGPUCount has no reader in the visible part of
+	// this file.
+	NvidiaGPUCount, AMDGPUCount int
+}
+
+func validateFmtDur(secs int) string {
+	if secs < 120 {
+		return fmt.Sprintf("~%d s", secs)
+	}
+	mins := (secs + 29) / 60
+	return fmt.Sprintf("~%d min", mins)
+}
+
+// validateTotalValidateSec returns the estimated total run time, in seconds,
+// of the Validate-mode sequence for a host with n NVIDIA GPUs.
+//
+// The CPU, memory, NVIDIA interconnect and NVIDIA bandwidth estimates are
+// always included. The NVIDIA GPU validate estimate is added once when n is
+// positive; n only gates that term, so the total does not grow with larger n.
+// Zero and negative n are treated the same.
+func validateTotalValidateSec(n int) int {
+	sum := 0
+	for _, sec := range []int{
+		platform.SATEstimatedCPUValidateSec,
+		platform.SATEstimatedMemoryValidateSec,
+		platform.SATEstimatedNvidiaInterconnectSec,
+		platform.SATEstimatedNvidiaBandwidthSec,
+	} {
+		sum += sec
+	}
+	if n >= 1 {
+		sum += platform.SATEstimatedNvidiaGPUValidateSec
+	}
+	return sum
+}
+
+// validateTotalStressSec returns the estimated total run time, in seconds,
+// of the Stress-mode sequence for a host with n NVIDIA GPUs.
+//
+// The CPU, memory, NVIDIA pulse test, interconnect and bandwidth estimates
+// are always included. The NVIDIA GPU stress, targeted stress and targeted
+// power estimates are added once each when n is positive; n only gates those
+// terms, so the total does not grow with larger n. Zero and negative n are
+// treated the same.
+func validateTotalStressSec(n int) int {
+	sum := 0
+	for _, sec := range []int{
+		platform.SATEstimatedCPUStressSec,
+		platform.SATEstimatedMemoryStressSec,
+		platform.SATEstimatedNvidiaPulseTestSec,
+		platform.SATEstimatedNvidiaInterconnectSec,
+		platform.SATEstimatedNvidiaBandwidthSec,
+	} {
+		sum += sec
+	}
+	if n >= 1 {
+		for _, sec := range []int{
+			platform.SATEstimatedNvidiaGPUStressSec,
+			platform.SATEstimatedNvidiaTargetedStressSec,
+			platform.SATEstimatedNvidiaTargetedPowerSec,
+		} {
+			sum += sec
+		}
+	}
+	return sum
+}
+
+func renderValidate(opts HandlerOptions) string {
+	inv := loadValidateInventory(opts)
+	n := inv.NvidiaGPUCount
+	validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
+	stressTotalStr := validateFmtDur(validateTotalStressSec(n))
+	gpuNote := ""
+	if n > 0 {
+		gpuNote = fmt.Sprintf(" (%d GPU)", n)
+	}
+	return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
+<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
+
+	<div class="card" style="margin-bottom:16px">
+	  <div class="card-head">Validate Profile</div>
+	  <div class="card-body validate-profile-body">
+	    <div class="validate-profile-col">
+	      <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
+	      <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
+	    </div>
+	    <div class="validate-profile-col validate-profile-action">
+	      <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
+	      <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
+	      <div style="margin-top:12px">
+	        <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
+	      </div>
+	    </div>
+	  </div>
+	</div>
+
+<div class="grid3">
+` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
+		inv.CPU,
+		`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
+		`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
+		validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
+	)) +
+		renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
+			inv.Memory,
+			`Runs a RAM validation pass and records memory state around the test.`,
+			`<code>free</code>, <code>memtester</code>`,
+			validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
+		)) +
+		renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
+			inv.Storage,
+			`Scans all storage devices and runs the matching health or self-test path for each device type.`,
+			`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
+			`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
+		)) +
+		`</div>
+<div style="height:1px;background:var(--border);margin:16px 0"></div>
+<div class="card" style="margin-bottom:16px">
+  <div class="card-head">NVIDIA GPU Selection</div>
+  <div class="card-body">
+    <p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
+    <p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
+    <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
+      <button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
+    </div>
+    <div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
+      <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
+    </div>
+    <p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
+  </div>
+</div>
+
+<div class="grid3">
+` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
+		inv.NVIDIA,
+		`Runs NVIDIA diagnostics and board inventory checks.`,
+		`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
+		fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
+			validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
+			validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
+	)) +
+		`<div id="sat-card-nvidia-targeted-stress">` +
+		renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
+			`<code>dcgmi diag targeted_stress</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-targeted-power">` +
+		renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
+			`<code>dcgmi diag targeted_power</code>`,
+		"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-pulse">` +
+		renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
+			`<code>dcgmi diag pulse_test</code>`,
+			`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-interconnect">` +
+		renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
+			`<code>all_reduce_perf</code> (NCCL tests)`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
+		)) +
+		`</div>` +
+		`<div id="sat-card-nvidia-bandwidth">` +
+		renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
+			`<code>nvbandwidth</code>`,
+			`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
+		)) +
+		`</div>` +
+		`</div>
+<div class="grid3" style="margin-top:16px">
+` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
+		inv.AMD,
+		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
+		`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
+		`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
+	)) +
+		`</div>
+<div id="sat-output" style="display:none;margin-top:16px" class="card">
+  <div class="card-head">Test Output <span id="sat-title"></span></div>
+  <div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
+</div>
+<style>
+.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
+.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
+.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
+.validate-card-body { padding:0; }
+.validate-card-section { padding:12px 16px 0; }
+.validate-card-section:last-child { padding-bottom:16px; }
+.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
+.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
+@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
+</style>
+<script>
+let satES = null;
+function satStressMode() {
+  return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
+}
+function satModeChanged() {
+  const stress = satStressMode();
+  [
+    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
+    {card: 'sat-card-nvidia-targeted-power',  hint: 'sat-tp-mode-hint'},
+    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
+  ].forEach(function(item) {
+    const card = document.getElementById(item.card);
+    if (card) {
+      card.style.opacity = stress ? '1' : '0.5';
+      const hint = document.getElementById(item.hint);
+      if (hint) hint.style.display = stress ? 'none' : '';
+    }
+  });
+}
+function satLabels() {
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+}
+let satNvidiaGPUsPromise = null;
+function loadSatNvidiaGPUs() {
+  if (!satNvidiaGPUsPromise) {
+    satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
+      .then(r => {
+        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
+        return r.json();
+      })
+      .then(list => Array.isArray(list) ? list : []);
+  }
+  return satNvidiaGPUsPromise;
+}
+function satSelectedGPUIndices() {
+  return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
+    .filter(function(el) { return el.checked && !el.disabled; })
+    .map(function(el) { return parseInt(el.value, 10); })
+    .filter(function(v) { return !Number.isNaN(v); })
+    .sort(function(a, b) { return a - b; });
+}
+function satUpdateGPUSelectionNote() {
+  const note = document.getElementById('sat-gpu-selection-note');
+  if (!note) return;
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) {
+    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
+    return;
+  }
+  note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
+}
+function satRenderGPUList(gpus) {
+  const root = document.getElementById('sat-gpu-list');
+  if (!root) return;
+  if (!gpus || !gpus.length) {
+    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
+    satUpdateGPUSelectionNote();
+    return;
+  }
+  root.innerHTML = gpus.map(function(gpu) {
+    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
+    return '<label class="sat-gpu-row">'
+      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
+      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+      + '</label>';
+  }).join('');
+  satUpdateGPUSelectionNote();
+}
+function satSelectAllGPUs() {
+  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
+  satUpdateGPUSelectionNote();
+}
+function satSelectNoGPUs() {
+  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
+  satUpdateGPUSelectionNote();
+}
+function satLoadGPUs() {
+  loadSatNvidiaGPUs().then(function(gpus) {
+    satRenderGPUList(gpus);
+  }).catch(function(err) {
+    const root = document.getElementById('sat-gpu-list');
+    if (root) {
+      root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
+    }
+    satUpdateGPUSelectionNote();
+  });
+}
+function satGPUDisplayName(gpu) {
+  const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
+  const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
+  return 'GPU ' + idx + ' — ' + name;
+}
+function satRequestBody(target, overrides) {
+  const body = {};
+  const labels = satLabels();
+  body.display_name = labels[target] || ('Validate ' + target);
+  body.stress_mode = satStressMode();
+  if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
+  if (overrides) {
+    Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
+  }
+  return body;
+}
+function enqueueSATTarget(target, overrides) {
+  return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
+    .then(r => r.json());
+}
+function streamSATTask(taskId, title, resetTerminal) {
+  if (satES) { satES.close(); satES = null; }
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  const term = document.getElementById('sat-terminal');
+  if (resetTerminal) {
+    term.textContent = '';
+  }
+  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
+  return new Promise(function(resolve) {
+    satES = new EventSource('/api/tasks/' + taskId + '/stream');
+    satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
+    satES.addEventListener('done', function(e) {
+      satES.close();
+      satES = null;
+      term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: !e.data, error: e.data || ''});
+    });
+    satES.onerror = function() {
+      if (satES) {
+        satES.close();
+        satES = null;
+      }
+      term.textContent += '\nERROR: stream disconnected.\n';
+      term.scrollTop = term.scrollHeight;
+      resolve({ok: false, error: 'stream disconnected'});
+    };
+  });
+}
+function selectedAMDValidateTargets() {
+  const targets = [];
+  const gpu = document.getElementById('sat-amd-target');
+  const mem = document.getElementById('sat-amd-mem-target');
+  const bw = document.getElementById('sat-amd-bandwidth-target');
+  if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
+  if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
+  if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
+  return targets;
+}
+function runSAT(target) {
+  return runSATWithOverrides(target, null);
+}
+function runSATWithOverrides(target, overrides) {
+  const title = (overrides && overrides.display_name) || target;
+  const term = document.getElementById('sat-terminal');
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— ' + title;
+  term.textContent = 'Enqueuing ' + title + ' test...\n';
+  return enqueueSATTarget(target, overrides)
+    .then(d => streamSATTask(d.task_id, title, false));
+}
+const nvidiaPerGPUTargets = [];
+const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
+function satAllGPUIndicesForMulti() {
+  return Promise.resolve(satSelectedGPUIndices());
+}
+function expandSATTarget(target) {
+  if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
+    return satAllGPUIndicesForMulti().then(function(indices) {
+      if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
+      return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
+    });
+  }
+  if (nvidiaPerGPUTargets.indexOf(target) < 0) {
+    return Promise.resolve([{target: target}]);
+  }
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) {
+    return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
+  }
+  return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
+    target: target,
+    overrides: {
+      gpu_indices: [Number(gpu.index)],
+      display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
+    },
+    label: satGPUDisplayName(gpu),
+  })));
+}
+function runNvidiaFabricValidate(target) {
+  satAllGPUIndicesForMulti().then(function(indices) {
+    if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
+    runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
+  });
+}
+function runNvidiaValidateSet(target) {
+  const selected = satSelectedGPUIndices();
+  if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
+  return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
+}
+function runAMDValidateSet() {
+  const targets = selectedAMDValidateTargets();
+  if (!targets.length) return;
+  if (targets.length === 1) return runSAT(targets[0]);
+  document.getElementById('sat-output').style.display='block';
+  document.getElementById('sat-title').textContent = '— amd';
+  const term = document.getElementById('sat-terminal');
+  term.textContent = 'Running AMD validate set one by one...\n';
+  const labels = satLabels();
+  const runNext = (idx) => {
+    if (idx >= targets.length) return Promise.resolve();
+    const target = targets[idx];
+    term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
+    return enqueueSATTarget(target)
+      .then(d => {
+        return streamSATTask(d.task_id, labels[target], false);
+      }).then(function() {
+        return runNext(idx + 1);
+      });
+  };
+  return runNext(0);
+}
+function runAllSAT() {
+  const cycles = 1;
+  const status = document.getElementById('sat-all-status');
+  status.textContent = 'Enqueuing...';
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const activeTargets = baseTargets.filter(target => {
+    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
+    const btn = document.getElementById('sat-btn-' + target);
+    return !(btn && btn.disabled);
+  });
+  Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
+    const expanded = [];
+    for (let cycle = 0; cycle < cycles; cycle++) {
+      groups.forEach(group => group.forEach(item => expanded.push(item)));
+    }
+    const total = expanded.length;
+    let enqueued = 0;
+    if (!total) {
+      status.textContent = 'No tasks selected.';
+      return;
+    }
+    const runNext = (idx) => {
+      if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
+      const item = expanded[idx];
+      status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
+      return enqueueSATTarget(item.target, item.overrides)
+        .then(() => {
+          enqueued++;
+          return runNext(idx + 1);
+        });
+    };
+    return runNext(0);
+  }).catch(err => {
+    status.textContent = 'Error: ' + err.message;
+  });
+}
+</script>
+<script>
+fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
+    if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
+    if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
+    if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
+    if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
+});
+satLoadGPUs();
+function disableSATAMDOptions(reason) {
+    ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
+        const cb = document.getElementById(id);
+        if (!cb) return;
+        cb.disabled = true;
+        cb.checked = false;
+        cb.title = reason;
+    });
+}
+function disableSATCard(id, reason) {
+    const btn = document.getElementById('sat-btn-' + id);
+    if (!btn) return;
+    btn.disabled = true;
+    btn.title = reason;
+    btn.style.opacity = '0.4';
+    const card = btn.closest('.card');
+    if (card) {
+        let note = card.querySelector('.sat-unavail');
+        if (!note) {
+            note = document.createElement('p');
+            note.className = 'sat-unavail';
+            note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
+            const body = card.querySelector('.card-body');
+            if (body) body.insertBefore(note, body.firstChild);
+        }
+        note.textContent = reason;
+    }
+}
+</script>`
+}
+
+// loadValidateInventory builds a validateInventory from the audit snapshot at
+// opts.AuditPath. When the snapshot cannot be loaded or decoded, every summary
+// field keeps the "Audit snapshot not loaded." placeholder. Components whose
+// Present flag is explicitly false are left out of the counts.
+func loadValidateInventory(opts HandlerOptions) validateInventory {
+	const notLoaded = "Audit snapshot not loaded."
+	inv := validateInventory{
+		CPU:     notLoaded,
+		Memory:  notLoaded,
+		Storage: notLoaded,
+		NVIDIA:  notLoaded,
+		AMD:     notLoaded,
+	}
+	raw, err := loadSnapshot(opts.AuditPath)
+	if err != nil {
+		return inv
+	}
+	var snapshot schema.HardwareIngestRequest
+	if err := json.Unmarshal(raw, &snapshot); err != nil {
+		return inv
+	}
+
+	// tally counts the devices of one kind, grouped by model name.
+	type tally struct {
+		total  int
+		models map[string]int
+	}
+	newTally := func() *tally { return &tally{models: map[string]int{}} }
+	// record counts one device under the first non-empty candidate name.
+	record := func(t *tally, candidates ...string) {
+		t.total++
+		addValidateModel(t.models, validateFirstNonEmpty(candidates...))
+	}
+	summarize := func(t *tally, unit string) string {
+		return formatValidateDeviceSummary(t.total, t.models, unit)
+	}
+
+	cpus := newTally()
+	for _, cpu := range snapshot.Hardware.CPUs {
+		if cpu.Present == nil || *cpu.Present {
+			record(cpus, validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown")
+		}
+	}
+
+	dimms := newTally()
+	for _, dimm := range snapshot.Hardware.Memory {
+		if dimm.Present == nil || *dimm.Present {
+			record(dimms, validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown")
+		}
+	}
+
+	disks := newTally()
+	for _, disk := range snapshot.Hardware.Storage {
+		if disk.Present == nil || *disk.Present {
+			record(disks, validateTrimPtr(disk.Model), validateTrimPtr(disk.Manufacturer), "unknown")
+		}
+	}
+
+	nvidia, amd := newTally(), newTally()
+	for _, pci := range snapshot.Hardware.PCIeDevices {
+		if pci.Present != nil && !*pci.Present {
+			continue
+		}
+		// The two vendor checks are independent; a device is tested against both.
+		if validateIsVendorGPU(pci, "nvidia") {
+			record(nvidia, validateTrimPtr(pci.Model), validateTrimPtr(pci.Manufacturer), "unknown")
+		}
+		if validateIsVendorGPU(pci, "amd") {
+			record(amd, validateTrimPtr(pci.Model), validateTrimPtr(pci.Manufacturer), "unknown")
+		}
+	}
+
+	inv.CPU = summarize(cpus, "CPU")
+	inv.Memory = summarize(dimms, "module")
+	inv.Storage = summarize(disks, "device")
+	inv.NVIDIA = summarize(nvidia, "GPU")
+	inv.AMD = summarize(amd, "GPU")
+	inv.NvidiaGPUCount = nvidia.total
+	inv.AMDGPUCount = amd.total
+	return inv
+}
+
+func renderValidateCardBody(devices, description, commands, settings string) string {
+	return `<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + devices + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px">` + description + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px">` + commands + `</div></div>` +
+		`<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + settings + `</div></div>`
+}
+
+// formatValidateDeviceSummary renders a one-line inventory summary.
+//
+// total is the number of devices, models maps model name to count, and unit is
+// the singular noun ("GPU", "module", ...). Model names are HTML-escaped and
+// listed in sorted order. With a single model the result is "<n> x <model>
+// <unit(s)>"; with several it is "<total> <unit(s)>: <n> x <model>, ...".
+// A zero total yields "0 <unit>s detected.".
+func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
+	if total == 0 {
+		return "0 " + unit + "s detected."
+	}
+
+	names := make([]string, 0, len(models))
+	for name := range models {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	entries := make([]string, len(names))
+	for i, name := range names {
+		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
+	}
+
+	noun := unit
+	if total != 1 {
+		noun = unit + "s"
+	}
+	switch len(entries) {
+	case 1:
+		return entries[0] + " " + noun
+	default:
+		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
+	}
+}
+
+// addValidateModel increments the count for name in counts. The name is
+// trimmed first; a blank name is counted under "unknown". counts must be a
+// non-nil map.
+func addValidateModel(counts map[string]int, name string) {
+	if trimmed := strings.TrimSpace(name); trimmed != "" {
+		counts[trimmed]++
+		return
+	}
+	counts["unknown"]++
+}
+
+// validateTrimPtr dereferences value and trims surrounding whitespace; a nil
+// pointer yields the empty string.
+func validateTrimPtr(value *string) string {
+	if value != nil {
+		return strings.TrimSpace(*value)
+	}
+	return ""
+}
+
+// validateFirstNonEmpty returns the first argument that is non-empty after
+// trimming whitespace, in its trimmed form, or "" when there is none.
+func validateFirstNonEmpty(values ...string) string {
+	for i := range values {
+		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
+			return trimmed
+		}
+	}
+	return ""
+}
+
+// validateIsVendorGPU reports whether dev looks like a GPU from vendor, which
+// must be "nvidia" or "amd"; any other vendor yields false.
+//
+// Matching is case-insensitive on the trimmed Model, Manufacturer and
+// DeviceClass fields. Devices naming ASPEED in the model or manufacturer are
+// always rejected. NVIDIA matches when either field contains "nvidia". AMD
+// requires a processing-accelerator, display-controller or video-controller
+// device class plus an AMD-looking manufacturer or model.
+//
+// The legacy "ati" vendor name is matched as a whole word. As a plain
+// substring it also occurs inside "corporation", which classified e.g. a video
+// controller from "NVIDIA Corporation" or "Intel Corporation" as an AMD GPU.
+func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
+	model := strings.ToLower(validateTrimPtr(dev.Model))
+	manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
+	class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
+	if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
+		return false
+	}
+	switch vendor {
+	case "nvidia":
+		return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
+	case "amd":
+		isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
+		isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") ||
+			strings.Contains(manufacturer, "amd") ||
+			validateContainsWord(manufacturer, "ati")
+		isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
+		return isGPUClass && (isAMDVendor || isAMDModel)
+	default:
+		return false
+	}
+}
+
+// validateContainsWord reports whether word occurs in s as a complete token,
+// where tokens are maximal runs of ASCII lowercase letters and digits. Both
+// arguments are expected to be lowercase already.
+func validateContainsWord(s, word string) bool {
+	isSeparator := func(r rune) bool {
+		return !(r >= 'a' && r <= 'z') && !(r >= '0' && r <= '9')
+	}
+	for _, token := range strings.FieldsFunc(s, isSeparator) {
+		if token == word {
+			return true
+		}
+	}
+	return false
+}
+
+func renderSATCard(id, label, runAction, headerActions, body string) string {
+	actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
+	if strings.TrimSpace(headerActions) != "" {
+		actions += headerActions
+	}
+	return fmt.Sprintf(`<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`,
+		label, actions, body)
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
 // At metricsCollectInterval = 5 s this covers 30 minutes of live history.
 const metricsChartWindow = 360

+// metricsDownsampleAge is the age after which old metrics rows are downsampled
+// to 1 sample per minute. Data fresher than this is kept at full resolution.
+const metricsDownsampleAge = 2 * time.Hour
+
+// metricsRetainWindow is the total retention period for metrics rows.
+// Rows older than this are deleted entirely by the background compactor.
+const metricsRetainWindow = 48 * time.Hour
+
 var metricsCollectInterval = 5 * time.Second

 // pendingNetChange tracks a network state change awaiting confirmation.
@@ -263,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
 	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
+	mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
 	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

 	// Tasks
@@ -335,13 +345,24 @@ func (h *handler) startMetricsCollector() {
 	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
-		for range ticker.C {
-			sample := platform.SampleLiveMetrics()
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+		pruneTicker := time.NewTicker(time.Hour)
+		defer pruneTicker.Stop()
+		for {
+			select {
+			case <-ticker.C:
+				sample := platform.SampleLiveMetrics()
+				if h.metricsDB != nil {
+					_ = h.metricsDB.Write(sample)
+				}
+				h.feedRings(sample)
+				h.setLatestMetric(sample)
+			case <-pruneTicker.C:
+				if h.metricsDB != nil {
+					now := time.Now().UTC()
+					_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
+					_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
+				}
 			}
-			h.feedRings(sample)
-			h.setLatestMetric(sample)
 		}
 	})
 }
@@ -668,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat

 	case path == "server-power":
 		title = "System Power"
-		// Use per-PSU stacked chart when PSU SDR data is available.
-		// Collect the union of PSU slots seen across all samples.
-		psuSlots := psuSlotsFromSamples(samples)
-		if len(psuSlots) > 1 {
-			// Build one dataset per PSU slot.
-			psuDatasets := make([][]float64, len(psuSlots))
-			psuNames := make([]string, len(psuSlots))
-			for si, slot := range psuSlots {
-				ds := make([]float64, len(samples))
-				for i, s := range samples {
-					for _, psu := range s.PSUs {
-						if psu.Slot == slot {
-							ds[i] = psu.PowerW
-							break
-						}
-					}
+		power := make([]float64, len(samples))
+		label := "Power W"
+		for i, s := range samples {
+			power[i] = s.PowerW
+			if strings.TrimSpace(s.PowerSource) != "" {
+				label = fmt.Sprintf("Power W · %s", s.PowerSource)
+				if strings.TrimSpace(s.PowerMode) != "" {
+					label += fmt.Sprintf(" (%s)", s.PowerMode)
 				}
-				psuDatasets[si] = normalizePowerSeries(ds)
-				psuNames[si] = fmt.Sprintf("PSU %d", slot)
 			}
-			datasets = psuDatasets
-			names = psuNames
-			stacked = true
-			yMax = autoMax120(psuStackedTotal(psuDatasets))
-		} else {
-			power := make([]float64, len(samples))
-			for i, s := range samples {
-				power[i] = s.PowerW
-			}
-			power = normalizePowerSeries(power)
-			datasets = [][]float64{power}
-			names = []string{"Power W"}
-			yMin = floatPtr(0)
-			yMax = autoMax120(power)
 		}
+		power = normalizePowerSeries(power)
+		datasets = [][]float64{power}
+		names = []string{label}
+		yMin = floatPtr(0)
+		yMax = autoMax120(power)

 	case path == "server-fans":
 		title = "Fan RPM"
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
 	}
 }

+// TestChartDataFromSamplesServerPowerUsesResolvedSystemPower checks that the
+// "server-power" chart is a single, non-stacked series labelled with the power
+// source and mode, even when the samples also carry per-PSU readings.
+func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
+	base := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
+	// newSample builds a two-PSU sample with a resolved system power value.
+	newSample := func(offset time.Duration, psu1W, psu2W, totalW float64) platform.LiveMetricSample {
+		return platform.LiveMetricSample{
+			Timestamp: base.Add(offset),
+			PSUs: []platform.PSUReading{
+				{Slot: 1, PowerW: psu1W},
+				{Slot: 2, PowerW: psu2W},
+			},
+			PowerW:      totalW,
+			PowerSource: "sdr_psu_input",
+			PowerMode:   "autotuned",
+		}
+	}
+	samples := []platform.LiveMetricSample{
+		newSample(0, 120, 130, 250),
+		newSample(time.Minute, 140, 135, 275),
+	}
+
+	datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
+	switch {
+	case !ok:
+		t.Fatal("expected server-power chart data")
+	case title != "System Power":
+		t.Fatalf("title=%q", title)
+	case stacked:
+		t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
+	case len(datasets) != 1 || len(names) != 1:
+		t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
+	case names[0] != "Power W · sdr_psu_input (autotuned)":
+		t.Fatalf("names=%v", names)
+	}
+}
+
 func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
 	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
 	want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -650,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 		`/api/gpu/nvidia`,
 		`/api/bee-bench/nvidia/perf/run`,
 		`/api/bee-bench/nvidia/power/run`,
+		`/api/bee-bench/nvidia/autotune/run`,
+		`/api/bee-bench/nvidia/autotune/status`,
 		`benchmark-run-nccl`,
 		`Run Performance Benchmark`,
 		`Run Power / Thermal Fit`,
+		`Autotune`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -754,9 +800,9 @@ func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
 	body := rec.Body.String()
 	for _, needle := range []string{
 		`NVIDIA Interconnect (NCCL)`,
-		`Runs in Validate and Stress.`,
+		`Validate and Stress:`,
 		`NVIDIA Bandwidth (NVBandwidth)`,
-		`Intended to stay short enough for Validate.`,
+		`nvbandwidth runs all built-in tests without a time limit`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("validate page missing %q: %s", needle, body)
--- a/audit/internal/webui/stability.go
+++ b/audit/internal/webui/stability.go
@@ -7,14 +7,43 @@ import (
 	"time"
 )

+const (
+	recoverLoopMaxDelay   = 60 * time.Second
+	recoverLoopResetAfter = 30 * time.Second
+)
+
+// goRecoverLoop starts fn in a goroutine, restarting after panics.
+// restartDelay is the initial delay; successive panics double it up to
+// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
+// successfully for recoverLoopResetAfter without panicking.
 func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
 	go func() {
+		delay := restartDelay
+		consecutive := 0
 		for {
-			if !runRecoverable(name, fn) {
+			start := time.Now()
+			panicked := runRecoverable(name, fn)
+			if !panicked {
 				return
 			}
-			if restartDelay > 0 {
-				time.Sleep(restartDelay)
+			consecutive++
+			if time.Since(start) >= recoverLoopResetAfter {
+				delay = restartDelay
+				consecutive = 1
+			}
+			slog.Warn("goroutine restarting after panic",
+				"component", name,
+				"consecutive_panics", consecutive,
+				"next_delay", delay,
+			)
+			if delay > 0 {
+				time.Sleep(delay)
+			}
+			if delay < recoverLoopMaxDelay {
+				delay *= 2
+				if delay > recoverLoopMaxDelay {
+					delay = recoverLoopMaxDelay
+				}
 			}
 		}
 	}()
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -34,6 +34,7 @@ var taskNames = map[string]string{
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
 	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
 	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
+	"nvidia-bench-autotune":  "NVIDIA Bee Bench Power Source Autotune",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -125,6 +126,7 @@ type taskParams struct {
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	BenchmarkKind      string   `json:"benchmark_kind,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	RampStep           int      `json:"ramp_step,omitempty"`
@@ -585,6 +587,7 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
 	if err := writeTaskReportArtifacts(t); err != nil {
 		appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
 	}
+	j.closeLog()
 	if t.ErrMsg != "" {
 		taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
 		return
@@ -685,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
--- a/bible-local/docs/gpu-model-propagation.md
+++ b/bible-local/docs/gpu-model-propagation.md
@@ -110,8 +110,12 @@ nvidia-smi / lspci (audit collection)

 ---

-## What Needs Fixing
+## Fixed Issues

-1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
-2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
-3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
+All previously open items are resolved, except item 5, which cannot be fixed retroactively:
+
+1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
+2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
+3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
+4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
+5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --debootstrap-options "--include=ca-certificates" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -126,6 +126,37 @@ resolve_iso_version() {
    resolve_audit_version
 }

+sync_builder_workdir() {
+    src_dir="$1"
+    dst_dir="$2"
+
+    mkdir -p "$dst_dir"
+
+    # Historical bug: old workdirs could keep config/bootloaders/grub-pc even
+    # after the source tree moved to grub-efi only. Remove bootloaders eagerly
+    # so reused workdirs cannot leak stale templates into a new ISO build.
+    rm -rf "$dst_dir/config/bootloaders"
+
+    rsync -a --delete \
+        --exclude='cache/' \
+        --exclude='chroot/' \
+        --exclude='.build/' \
+        --exclude='*.iso' \
+        --exclude='*.packages' \
+        --exclude='*.contents' \
+        --exclude='*.files' \
+        "$src_dir/" "$dst_dir/"
+
+    if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
+        echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
+        exit 1
+    fi
+    if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
+        echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
+        exit 1
+    fi
+}
+
 iso_list_files() {
    iso_path="$1"

@@ -203,7 +234,7 @@ dump_memtest_debug() {

        echo "-- source bootloader templates --"
        for cfg in \
-            "${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
+            "${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
            "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
            if [ -f "$cfg" ]; then
                echo "  file: $cfg"
@@ -466,6 +497,75 @@ validate_iso_memtest() {
    echo "=== memtest validation OK ==="
 }

+validate_iso_live_boot_entries() {
+    iso_path="$1"
+    echo "=== validating live boot entries in ISO ==="
+
+    [ -f "$iso_path" ] || {
+        echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
+        exit 1
+    }
+    require_iso_reader "$iso_path" >/dev/null 2>&1 || {
+        echo "ERROR: ISO reader unavailable for live boot validation" >&2
+        exit 1
+    }
+
+    grub_cfg="$(mktemp)"
+    isolinux_cfg="$(mktemp)"
+
+    iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
+        echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+    iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
+        echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+
+    if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
+        echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    fi
+
+    grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
+        echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+    grep -q 'menuentry "EASY-BEE — load to RAM (toram)"' "$grub_cfg" || {
+        echo "ERROR: GRUB toram entry is missing" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+    grep -q 'linux .*boot=live ' "$grub_cfg" || {
+        echo "ERROR: GRUB live entry is missing boot=live" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+    grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
+        echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+
+    grep -q 'append .*boot=live ' "$isolinux_cfg" || {
+        echo "ERROR: isolinux live entry is missing boot=live" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+    grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
+        echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
+        rm -f "$grub_cfg" "$isolinux_cfg"
+        exit 1
+    }
+
+    rm -f "$grub_cfg" "$isolinux_cfg"
+    echo "=== live boot validation OK ==="
+}
+
 validate_iso_nvidia_runtime() {
    iso_path="$1"
    [ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
@@ -542,6 +642,186 @@ label memtest
 EOF
 }

+extract_live_grub_entry() {
+    cfg="$1"
+    live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    [ -n "$live_linux" ] || return 1
+    [ -n "$live_initrd" ] || return 1
+
+    grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
+    grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
+    grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
+    [ -n "$grub_kernel" ] || return 1
+    [ -n "$grub_append" ] || return 1
+    [ -n "$grub_initrd" ] || return 1
+    return 0
+}
+
+extract_live_isolinux_entry() {
+    cfg="$1"
+    isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
+    isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
+    [ -n "$isolinux_linux" ] || return 1
+    [ -n "$isolinux_initrd" ] || return 1
+    [ -n "$isolinux_append" ] || return 1
+
+    isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
+    isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
+    [ -n "$isolinux_kernel" ] || return 1
+    [ -n "$isolinux_initrd_path" ] || return 1
+    return 0
+}
+
+# write_canonical_grub_cfg CFG KERNEL APPEND_LIVE INITRD
+# Overwrite CFG with the EASY-BEE GRUB menu: the default entry, a toram entry,
+# an advanced submenu (GSP=off, KMS, KMS + GSP=off, fail-safe), memtest86+ and
+# a UEFI firmware settings entry. The heredoc is unquoted, so ${kernel},
+# ${append_live} and ${initrd} are expanded when the file is written, while
+# the escaped \${grub_platform} is left for GRUB to evaluate.
+write_canonical_grub_cfg() {
+    cfg="$1"
+    kernel="$2"
+    append_live="$3"
+    initrd="$4"
+
+    cat > "$cfg" <<EOF
+source /boot/grub/config.cfg
+
+echo ""
+echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
+echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
+echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
+echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
+echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
+echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
+echo "  Hardware Audit LiveCD"
+echo ""
+
+menuentry "EASY-BEE" {
+    linux   ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  ${initrd}
+}
+
+menuentry "EASY-BEE — load to RAM (toram)" {
+    linux   ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+    initrd  ${initrd}
+}
+
+submenu "EASY-BEE (advanced options) -->" {
+    menuentry "EASY-BEE — GSP=off" {
+        linux   ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  ${initrd}
+    }
+
+    menuentry "EASY-BEE — KMS (no nomodeset)" {
+        linux   ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  ${initrd}
+    }
+
+    menuentry "EASY-BEE — KMS + GSP=off" {
+        linux   ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+        initrd  ${initrd}
+    }
+
+    menuentry "EASY-BEE — fail-safe" {
+        linux   ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+        initrd  ${initrd}
+    }
+}
+
+if [ "\${grub_platform}" = "efi" ]; then
+    menuentry "Memory Test (memtest86+)" {
+        chainloader /boot/memtest86+x64.efi
+    }
+else
+    menuentry "Memory Test (memtest86+)" {
+        linux16 /boot/memtest86+x64.bin
+    }
+fi
+
+if [ "\${grub_platform}" = "efi" ]; then
+    menuentry "UEFI Firmware Settings" {
+        fwsetup
+    }
+fi
+EOF
+}
+
+# write_canonical_isolinux_cfg CFG KERNEL INITRD APPEND_LIVE
+# Overwrite CFG with the EASY-BEE isolinux menu: normal (default), toram,
+# GSP=off, KMS, KMS + GSP=off, fail-safe and memtest86+ labels. Note the
+# argument order differs from write_canonical_grub_cfg: INITRD comes before
+# APPEND_LIVE here.
+# NOTE(review): the labels contain a literal @FLAVOUR@ token that this
+# function does not substitute — confirm whether a later step resolves it.
+write_canonical_isolinux_cfg() {
+    cfg="$1"
+    kernel="$2"
+    initrd="$3"
+    append_live="$4"
+
+    cat > "$cfg" <<EOF
+label live-@FLAVOUR@-normal
+    menu label ^EASY-BEE
+    menu default
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-toram
+    menu label EASY-BEE (^load to RAM)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-gsp-off
+    menu label EASY-BEE (^NVIDIA GSP=off)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms
+    menu label EASY-BEE (^KMS, no nomodeset)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-kms-gsp-off
+    menu label EASY-BEE (KMS, ^GSP=off)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
+
+label live-@FLAVOUR@-failsafe
+    menu label EASY-BEE (^fail-safe)
+    linux ${kernel}
+    initrd ${initrd}
+    append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
+
+label memtest
+    menu label ^Memory Test (memtest86+)
+    linux /boot/memtest86+x64.bin
+EOF
+}
+
+# enforce_live_build_bootloader_assets LB_DIR
+# Rewrite the bootloader configs under LB_DIR/binary with the canonical
+# EASY-BEE menus, reusing the kernel, initrd and append values extracted from
+# the configs already there. For GRUB the grub-efi config.cfg, theme.cfg and
+# live-theme directory are copied from BUILDER_DIR first. A config that does
+# not exist is skipped; one whose live entry cannot be extracted only
+# produces a warning on stderr.
+enforce_live_build_bootloader_assets() {
+    lb_dir="$1"
+    grub_dir="$lb_dir/binary/boot/grub"
+    grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
+    isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
+
+    if [ ! -f "$grub_cfg" ]; then
+        : # no GRUB config was generated; nothing to rewrite
+    elif extract_live_grub_entry "$grub_cfg"; then
+        mkdir -p "$grub_dir/live-theme"
+        cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
+        cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
+        cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
+        write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
+        echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
+    else
+        echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
+    fi
+
+    if [ ! -f "$isolinux_cfg" ]; then
+        : # no isolinux config was generated; nothing to rewrite
+    elif extract_live_isolinux_entry "$isolinux_cfg"; then
+        write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
+        echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
+    else
+        echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
+    fi
+}
+
 copy_memtest_from_deb() {
    deb="$1"
    dst_boot="$2"
@@ -932,15 +1212,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
 mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"

 # Sync builder config into variant work dir, preserving lb cache.
-rsync -a --delete \
-    --exclude='cache/' \
-    --exclude='chroot/' \
-    --exclude='.build/' \
-    --exclude='*.iso' \
-    --exclude='*.packages' \
-    --exclude='*.contents' \
-    --exclude='*.files' \
-    "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
+sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"

 # Share deb package cache across variants.
 # Restore: populate work dir cache from shared cache before build.
@@ -954,86 +1226,6 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
    rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
 fi

-if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
-source /boot/grub/config.cfg
-
-echo ""
-echo "  ███████╗ █████╗ ███████╗██╗   ██╗      ██████╗ ███████╗███████╗"
-echo "  ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝      ██╔══██╗██╔════╝██╔════╝"
-echo "  █████╗  ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗  █████╗"
-echo "  ██╔══╝  ██╔══██║╚════██║  ╚██╔╝  ╚════╝██╔══██╗██╔══╝  ██╔══╝"
-echo "  ███████╗██║  ██║███████║   ██║         ██████╔╝███████╗███████╗"
-echo "  ╚══════╝╚═╝  ╚═╝╚══════╝   ╚═╝         ╚═════╝ ╚══════╝╚══════╝"
-echo "  Hardware Audit LiveCD"
-echo ""
-
-menuentry "EASY-BEE" {
-    linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-    initrd  @INITRD_LIVE@
-}
-
-submenu "EASY-BEE (advanced options) -->" {
-    menuentry "EASY-BEE — KMS (no nomodeset)" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
-        initrd  @INITRD_LIVE@
-    }
-
-    menuentry "EASY-BEE — fail-safe" {
-        linux   @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
-        initrd  @INITRD_LIVE@
-    }
-}
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "Memory Test (memtest86+)" {
-        chainloader /boot/memtest86+x64.efi
-    }
-else
-    menuentry "Memory Test (memtest86+)" {
-        linux16 /boot/memtest86+x64.bin
-    }
-fi
-
-if [ "${grub_platform}" = "efi" ]; then
-    menuentry "UEFI Firmware Settings" {
-        fwsetup
-    }
-fi
-EOF
-
-    cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
-label live-@FLAVOUR@-normal
-    menu label ^EASY-BEE
-    menu default
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@
-
-label live-@FLAVOUR@-kms
-    menu label EASY-BEE (^graphics/KMS)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ bee.display=kms
-
-label live-@FLAVOUR@-toram
-    menu label EASY-BEE (^load to RAM)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ toram
-
-label live-@FLAVOUR@-failsafe
-    menu label EASY-BEE (^fail-safe)
-    linux @LINUX@
-    initrd @INITRD@
-    append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
-
-label memtest
-    menu label ^Memory Test (memtest86+)
-    linux /boot/memtest86+x64.bin
-EOF
-fi
-
 rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
 rm -f \
    "${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -1309,6 +1501,14 @@ run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
 run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
 dump_memtest_debug "pre-build" "${LB_DIR}"
 run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
+echo "=== enforcing canonical bootloader assets ==="
+enforce_live_build_bootloader_assets "${LB_DIR}"
+reset_live_build_stage "${LB_DIR}" "binary_checksums"
+reset_live_build_stage "${LB_DIR}" "binary_iso"
+reset_live_build_stage "${LB_DIR}" "binary_zsync"
+run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
+run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
+run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"

 # --- persist deb package cache back to shared location ---
 # This allows the second variant to reuse all downloaded packages.
@@ -1333,6 +1533,7 @@ if [ -f "$ISO_RAW" ]; then
        fi
    fi
    validate_iso_memtest "$ISO_RAW"
+    validate_iso_live_boot_entries "$ISO_RAW"
    validate_iso_nvidia_runtime "$ISO_RAW"
    cp "$ISO_RAW" "$ISO_OUT"
    echo ""
--- a/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
+++ b/iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
--- a/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
+++ b/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt
@@ -5,6 +5,15 @@ title-text: ""
 message-font: "Unifont Regular 16"
 terminal-font: "Unifont Regular 16"

+#bee logo — centered, upper third of screen
++ image {
+        top = 4%
+        left = 50%-200
+        width = 400
+        height = 400
+        file = "bee-logo.png"
+}
+
 #help bar at the bottom
 + label {
        top = 100%-50
@@ -21,8 +30,8 @@ terminal-font: "Unifont Regular 16"
 + boot_menu {
        left = 20%
        width = 60%
-        top = 62%
-        height = 38%-80
+        top = 65%
+        height = 35%-80
        item_color = "#c88000"
        item_font = "Unifont Regular 16"
        selected_item_color= "#f5a800"
--- a/iso/builder/config/bootloaders/grub-efi/theme.cfg
+++ b/iso/builder/config/bootloaders/grub-efi/theme.cfg
@@ -1,7 +1,7 @@
 set color_normal=light-gray/black
 set color_highlight=yellow/black

-if [ -e /boot/grub/splash.png ]; then
+if [ -e /boot/grub/live-theme/theme.txt ]; then
    set theme=/boot/grub/live-theme/theme.txt
 else
    set menu_color_normal=yellow/black
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -10,6 +10,8 @@ RestartSec=3
 StandardOutput=journal
 StandardError=journal
 LimitMEMLOCK=infinity
+# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
+# that legitimately use several GB; a cgroup limit kills them via OOM.
 # Keep the web server responsive during GPU/CPU stress (children inherit nice+10
 # via Setpriority in runCmdJob, but the bee-web parent stays at 0).
 Nice=0
--- a/iso/overlay/usr/local/bin/bee-nvidia-recover
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -0,0 +1,178 @@
+#!/bin/sh
+# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
+
+set -u
+
+# log MESSAGE... — write one status line, tagged with the script name, to stdout.
+# printf is used rather than echo: echo's handling of backslashes is
+# implementation-defined, so a message containing "\n" or "\c" could be
+# mangled or truncated depending on which /bin/sh runs the script.
+log() {
+    printf '%s\n' "[bee-nvidia-recover] $*"
+}
+
+log_blocker() {
+    echo "[bee-nvidia-recover] blocker: $*"
+}
+
+usage() {
+    cat <<'EOF'
+usage:
+  bee-nvidia-recover restart-drivers
+  bee-nvidia-recover reset-gpu <index>
+EOF
+}
+
+# unit_exists UNIT — succeed when `systemctl cat` can show UNIT; all output is
+# discarded, only the exit status matters.
+unit_exists() {
+    systemctl cat "$1" 1>/dev/null 2>/dev/null
+}
+
+# unit_is_active UNIT — succeed when systemd reports UNIT as active. Nothing is
+# printed; stderr is discarded.
+unit_is_active() {
+    systemctl --quiet is-active "$1" 2>/dev/null
+}
+
+stop_unit_if_active() {
+    unit="$1"
+    if unit_is_active "$unit"; then
+        log "stopping $unit"
+        systemctl stop "$unit"
+        return 0
+    fi
+    return 1
+}
+
+start_unit_if_marked() {
+    unit="$1"
+    marker="$2"
+    if [ "$marker" = "1" ] && unit_exists "$unit"; then
+        log "starting $unit"
+        systemctl start "$unit"
+    fi
+}
+
+# wait_for_process_exit NAME — poll once per second until no process with the
+# exact name NAME is left. Gives up on the 15th check that still finds the
+# process: logs a warning and returns 1. Returns 0 once the process is gone.
+wait_for_process_exit() {
+    name="$1"
+    tries=0
+    until ! pgrep -x "$name" >/dev/null 2>&1; do
+        tries=$((tries + 1))
+        [ "$tries" -lt 15 ] || {
+            log "WARN: $name is still running after stop request"
+            return 1
+        }
+        sleep 1
+    done
+    return 0
+}
+
+kill_pattern() {
+    pattern="$1"
+    if pgrep -f "$pattern" >/dev/null 2>&1; then
+        pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
+            [ -n "$line" ] || continue
+            log_blocker "$line"
+        done
+        log "killing processes matching: $pattern"
+        pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
+        sleep 1
+        pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
+    fi
+}
+
+# drain_gpu_clients — stop everything that may be holding the NVIDIA devices.
+#
+# Order: display manager units, nvidia-fabricmanager, the nv-hostengine daemon,
+# then a fixed list of GPU tools and X servers matched by command line.
+# Side effects: sets the script-wide flags display_was_active and
+# fabric_was_active to 1 for units that were running and have been stopped;
+# restore_gpu_clients reads these flags to decide what to start again.
+drain_gpu_clients() {
+    display_was_active=0
+    fabric_was_active=0
+
+    for unit in display-manager.service lightdm.service; do
+        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
+            log_blocker "service $unit"
+            display_was_active=1
+        fi
+    done
+
+    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
+        log_blocker "service nvidia-fabricmanager.service"
+        fabric_was_active=1
+    fi
+
+    if pgrep -x nv-hostengine >/dev/null 2>&1; then
+        # List the daemon with the same exact-name match used to detect it.
+        # Matching the full command line against "^nv-hostengine$" would miss
+        # a daemon started with an absolute path or with arguments, leaving
+        # the blocker unreported.
+        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
+            [ -n "$line" ] || continue
+            log_blocker "$line"
+        done
+        log "stopping nv-hostengine"
+        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
+        # Escalate to SIGKILL only if the daemon ignores SIGTERM.
+        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
+    fi
+
+    for pattern in \
+        "nvidia-smi" \
+        "dcgmi" \
+        "nvvs" \
+        "dcgmproftester" \
+        "all_reduce_perf" \
+        "nvtop" \
+        "bee-gpu-burn" \
+        "bee-john-gpu-stress" \
+        "bee-nccl-gpu-stress" \
+        "Xorg" \
+        "Xwayland"; do
+        kill_pattern "$pattern"
+    done
+}
+
+restore_gpu_clients() {
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        if nvidia-smi -pm 1 >/dev/null 2>&1; then
+            log "enabled NVIDIA persistence mode"
+        else
+            log "WARN: failed to enable NVIDIA persistence mode"
+        fi
+    fi
+
+    if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
+        log "starting nv-hostengine"
+        nv-hostengine
+    fi
+
+    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
+    start_unit_if_marked display-manager.service "${display_was_active:-0}"
+    if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
+        start_unit_if_marked lightdm.service "1"
+    fi
+}
+
+restart_drivers() {
+    drain_gpu_clients
+    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
+        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
+            log "unloading module $mod"
+            rmmod "$mod"
+        fi
+    done
+    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
+    log "reloading NVIDIA driver stack"
+    /usr/local/bin/bee-nvidia-load
+    restore_gpu_clients
+}
+
+# reset_gpu INDEX — drain GPU clients, reset the GPU selected by INDEX with
+# `nvidia-smi -r -i INDEX`, then restore the clients.
+#
+# Clients are restored whether or not the reset worked. The return status is
+# nvidia-smi's exit code when the reset failed (previously it was discarded and
+# a failed reset looked like success), otherwise 1 if restoring clients failed,
+# otherwise 0.
+reset_gpu() {
+    index="$1"
+    reset_rc=0
+    drain_gpu_clients
+    log "resetting GPU $index"
+    nvidia-smi -r -i "$index" || reset_rc=$?
+    if [ "$reset_rc" -ne 0 ]; then
+        log "WARN: nvidia-smi failed to reset GPU $index (exit $reset_rc)"
+    fi
+    if ! restore_gpu_clients && [ "$reset_rc" -eq 0 ]; then
+        reset_rc=1
+    fi
+    return "$reset_rc"
+}
+
+# Command dispatch. The first argument selects the action; a missing or
+# unknown command, or reset-gpu without exactly one index argument, prints the
+# usage text to stderr and exits with status 2.
+cmd="${1:-}"
+case "$cmd" in
+    restart-drivers)
+        restart_drivers
+        ;;
+    reset-gpu)
+        [ "$#" -eq 2 ] || { usage >&2; exit 2; }
+        reset_gpu "$2"
+        ;;
+    *)
+        usage >&2
+        exit 2
+        ;;
+esac
--- a/scripts/deploy.sh
+++ b/scripts/deploy.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+REMOTE_USER="bee"
+REMOTE_BIN="/usr/local/bin/bee"
+LOCAL_BIN="audit/bee"
+SERVICES="bee-audit bee-web"
+
+# --- IP ---
+if [[ $# -ge 1 ]]; then
+    HOST="$1"
+else
+    read -rp "IP адрес хоста: " HOST
+fi
+[[ -z "$HOST" ]] && { echo "Ошибка: IP не указан"; exit 1; }
+
+# --- SSH options ---
+SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)
+
+# Проверяем, нужен ли пароль
+SSH_PASS=""
+if ! ssh "${SSH_OPTS[@]}" -o BatchMode=yes "${REMOTE_USER}@${HOST}" true 2>/dev/null; then
+    if command -v sshpass &>/dev/null; then
+        read -rsp "Пароль для ${REMOTE_USER}@${HOST}: " SSH_PASS
+        echo
+        SSH_CMD=(sshpass -p "$SSH_PASS" ssh "${SSH_OPTS[@]}")
+        SCP_CMD=(sshpass -p "$SSH_PASS" scp "${SSH_OPTS[@]}")
+    else
+        echo "sshpass не установлен. Введите пароль вручную при запросе (или установите SSH-ключ)."
+        SSH_CMD=(ssh "${SSH_OPTS[@]}")
+        SCP_CMD=(scp "${SSH_OPTS[@]}")
+    fi
+else
+    SSH_CMD=(ssh "${SSH_OPTS[@]}")
+    SCP_CMD=(scp "${SSH_OPTS[@]}")
+fi
+
+REMOTE="${REMOTE_USER}@${HOST}"
+
+# --- Build ---
+echo "==> Сборка бинарника..."
+(
+    cd audit
+    VERSION=$(sh ./scripts/resolve-version.sh 2>/dev/null || echo "dev")
+    CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
+        go build -ldflags "-X main.Version=${VERSION}" -o bee ./cmd/bee
+)
+echo "    OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
+
+# --- Deploy ---
+echo "==> Копирование на ${REMOTE}..."
+"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"
+
+echo "==> Замена бинарника и перезапуск сервисов..."
+"${SSH_CMD[@]}" "$REMOTE" bash -s <<EOF
+set -e
+sudo mv /tmp/bee-new ${REMOTE_BIN}
+sudo chmod +x ${REMOTE_BIN}
+sudo systemctl restart ${SERVICES}
+sleep 2
+systemctl status ${SERVICES} --no-pager -l
+EOF
+
+echo "==> Готово."
Author	SHA1	Message	Date
Mikhail Chusavitin	a35e90a93e	fix(iso): clear stale bootloader templates in workdir	2026-04-20 13:19:50 +03:00
Mikhail Chusavitin	1ced81707f	fix(iso): validate live boot entries in final ISO	2026-04-20 13:12:24 +03:00
Mikhail Chusavitin	679aeb9947	Run NVIDIA DCGM diag tests on all selected GPUs simultaneously targeted_stress, targeted_power, and the Level 2/3 diag were dispatched one GPU at a time from the UI, turning a single dcgmi command into 8 sequential ~350–450 s runs. DCGM supports -i with a comma-separated list of GPU indices and runs the diagnostic on all of them in parallel. Move nvidia, nvidia-targeted-stress, nvidia-targeted-power into nvidiaAllGPUTargets so expandSATTarget passes all selected indices in one API call. Simplify runNvidiaValidateSet to match runNvidiaFabricValidate. Update sat.go constants and page_validate.go estimates to reflect all-GPU simultaneous execution (remove n× multiplier from total time estimates). Stress test on 8-GPU system: ~5.3 h → ~2.5 h. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-20 11:53:25 +03:00
Mikhail Chusavitin	647e99b697	Fix post-sync live-build ISO rebuild	2026-04-20 11:01:15 +03:00
Mikhail Chusavitin	4af997f436	Update audit bee binary	2026-04-20 10:55:42 +03:00
Mikhail Chusavitin	6caace0cc0	Make power benchmark report phase-averaged	2026-04-20 10:53:53 +03:00
Mikhail Chusavitin	5f0103635b	Update power benchmark GPU reset flow	2026-04-20 09:46:00 +03:00
Mikhail Chusavitin	84a2551dc0	Fix NVIDIA self-heal recovery flow	2026-04-20 09:43:22 +03:00
Mikhail Chusavitin	1cfabc9230	Reset GPUs before power benchmark	2026-04-20 09:42:19 +03:00
Mikhail Chusavitin	5dc711de23	Start power calibration from full GPU TDP	2026-04-20 09:28:58 +03:00
Mikhail Chusavitin	ab802719f8	Use real NVIDIA power-limit bounds in benchmark	2026-04-20 09:26:56 +03:00
Mikhail Chusavitin	a94e8007f8	Ignore power throttling in benchmark calibration	2026-04-20 09:26:29 +03:00
Michael Chus	c69bf07b27	Commit remaining workspace changes	2026-04-20 07:02:31 +03:00
Michael Chus	b3cf8e3893	Globalize autotuned system power source	2026-04-20 07:02:12 +03:00
Michael Chus	17118298bd	audit: switch power benchmark load to dcgmproftester	2026-04-20 06:57:14 +03:00
Michael Chus	65bcc9ce81	refactor(webui): split pages into task modules	2026-04-20 06:56:52 +03:00
Michael Chus	0cdfbc5875	fix(iso): restore boot UX and boot logs	2026-04-19 23:08:09 +03:00
Michael Chus	cf9b54b600	Use last ramp-step SDR snapshot for PSU loaded power; add deploy script - benchmark.go: retain sdrLastStep from final ramp step instead of re-sampling after test when GPUs are already idle - scripts/deploy.sh: build+deploy bee binary to remote host over SSH Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 21:26:44 +03:00
Michael Chus	0bfb3fe954	Use PSU SDR sum for system power chart when available DCMI reports only the managed power domain (~CPU+MB), missing GPU draw. PSU AC input sensors cover full wall power. When samplePSUPower returns data, sum the slots for PowerW; fall back to DCMI otherwise. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 19:10:01 +03:00
Michael Chus	3053cb0710	Fix PSU slot regex: match MSI underscore format PSU1_POWER_IN \b does not fire between a digit and '_' because '_' is \w in RE2. The pattern \bpsu?\s*([0-9]+)\b never matched PSU1_POWER_IN style sensors, so parsePSUSDR (and PSUSlotsFromSDR / samplePSUPower) returned empty results for MSI servers — causing all power graphs to fall back to DCMI which reports ~half actual draw. Added an explicit underscore-terminated pattern first in the list and tests covering the MSI format. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 19:03:02 +03:00
Michael Chus	2038489961	Remove MemoryMax=3G from bee-web.service to fix OOM kill during GPU tests dcgmproftester and other GPU test subprocesses run inside the bee-web cgroup and exceed 3G with 8 GPUs. OOM killer terminates the whole service. No memory cap is appropriate on a LiveCD where GPU tests legitimately use several GB. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:52:41 +03:00
Michael Chus	e35484013e	Use SDR PSU AC input for single-card calibration server power Same fix as ramp steps: take sdrSingle snapshot after calibration and prefer PSUInW over DCMI for singleIPMILoadedW. DCMI kept as fallback. Log message indicates source. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:44:13 +03:00
Michael Chus	2cdf034bb0	Use SDR PSU AC input for per-step server power in power ramp When sdrStep.PSUInW is available, prefer it over DCMI for ramp.ServerLoadedW and ServerDeltaW. DCMI on this platform (MSI 4-PSU) reports ~half actual draw; SDR sums all PSU_POWER_IN sensors correctly. Delta is now SDR-to-SDR (sdrStep.PSUInW - sdrIdle.PSUInW) for consistency. DCMI path kept as fallback when SDR has no PSU data. Log message now indicates the source (SDR PSU AC input vs DCMI). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:43:36 +03:00
Michael Chus	b89580c24d	Fix PSU power chart: use name-based SDR matching instead of entity ID MSI servers place PSU_POWER_IN/OUT sensors on entity 3.0, not 10.N (the IPMI "Power Supply" entity). The old parser filtered by entity ID and found nothing, so the dashboard fell back to DCMI which reports roughly half the actual draw. Now delegates to collector.PSUSlotsFromSDR — the same name-based matching already used in the Power Fit benchmark. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:39:21 +03:00
Michael Chus	df1385d3d6	Fix dcgmproftester parallel mode: use staggered script for all multi-GPU runs A single dcgmproftester process without -i only loads GPU 0 regardless of CUDA_VISIBLE_DEVICES. Now always routes multi-GPU runs through bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode), which spawns one process per GPU so all GPUs are loaded simultaneously. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:31:34 +03:00
Michael Chus	f8cd9a7376	Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections - Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (was discarded) - Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU - Rewrite renderPowerBenchReport: replace Per-Slot Results with Single GPU section, rework Ramp Sequence rows=runs/cols=GPUs, add PSU Performance section (conditional on IPMI data), add transposed Single vs All-GPU comparison table in per-GPU sections - Add fmtMDTable helper (benchmark_table.go) and apply to all tables in both power and performance reports so columns align in plain-text view Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 18:04:12 +03:00
Michael Chus	d52ec67f8f	Stability hardening, build script fixes, GRUB bee logo Stability hardening (webui/app): - readFileLimited(): защита от OOM при чтении audit JSON (100 MB), component-status DB (10 MB) и лога задачи (50 MB) - jobs.go: буферизованный лог задачи — один открытый fd на задачу вместо open/write/close на каждую строку (устраняет тысячи syscall/сек при GPU стресс-тестах) - stability.go: экспоненциальный backoff в goRecoverLoop (2s→4s→…→60s), сброс при успешном прогоне >30s, счётчик перезапусков в slog - kill_workers.go: таймаут 5s на скан /proc, warn при срабатывании - bee-web.service: MemoryMax=3G — OOM killer защищён Build script: - build.sh: удалён блок генерации grub-pc/grub.cfg + live.cfg.in — мёртвый код с v8.25; grub-pc игнорируется live-build, а генерируемый live.cfg.in перезаписывал правильный статический файл устаревшей версией без tuning-параметров ядра и пунктов gsp-off/kms+gsp-off - build.sh: dump_memtest_debug теперь логирует grub-efi/grub.cfg вместо grub-pc/grub.cfg (было всегда "missing") GRUB: - live-theme/bee-logo.png: логотип пчелы 400×400px на чёрном фоне - live-theme/theme.txt: + image компонент по центру в верхней трети экрана; меню сдвинуто с 62% до 65% Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 13:08:31 +03:00
Michael Chus	61c7abaa80	Add multi-source PSU power triangulation and per-slot distribution table - collector/psu.go: export PSUSlotsFromSDR() reusing slot regex patterns; add isPSUInputPower/isPSUOutputPower helpers covering MSI/MLT/xFusion/HPE naming; add xFusion Power<N> slot pattern; parseBoundedFloat for self-healing (rejects zero/negative/out-of-range sensor readings); default fallback treats unclassified PSU sensors as AC input - benchmark_types.go: BenchmarkPSUSlotPower struct; BenchmarkServerPower gains PSUInputIdle/Loaded, PSUOutputIdle/Loaded, PSUSlotReadingsIdle/Loaded, GPUSlotTotalW, DCMICoverageRatio fields - benchmark.go: sampleIPMISDRPowerSensors uses collector.PSUSlotsFromSDR instead of custom classifier; detectDCMIPartialCoverage replaces ramp heuristic — compares DCMI idle vs SDR PSU sum, flags <0.70 ratio as partial coverage; detectIPMISaturationFallback kept for servers without SDR PSU sensors; report gains PSU Load Distribution table (per-slot AC/DC idle vs loaded, Δ) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 13:07:48 +03:00
Michael Chus	d60f7758ba	Fix grub-pc directory missing before writing grub.cfg Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-19 08:42:17 +03:00
Michael Chus	52c3a24b76	Compact metrics DB in background to prevent CPU spin under load As metrics.db grew (1 sample/5 s × hours), handleMetricsChartSVG called LoadAll() on every chart request — loading all rows across 4 tables through a single SQLite connection. With ~10 charts auto-refreshing in parallel, requests queued behind each other, saturating the connection pool and pegging a CPU core. Fix: add a background compactor that runs every hour via the metrics collector: • Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts) per ts/60 bucket) — retains chart shape while cutting row count by ~92 %. • Prune: rows older than 48 h are deleted entirely. • After prune: WAL checkpoint/truncate to release disk space. LoadAll() in handleMetricsChartSVG is unchanged — it now stays fast because the DB is kept small rather than capping the query window. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:28:05 +03:00
Michael Chus	028bb30333	Detect PSU faults during perf and power benchmarks Snapshot IPMI "Power Supply" sensor states before and after each benchmark run. Compare before/after to surface only new anomalies (pre-existing faults are excluded). Results land in NvidiaBenchmarkResult.PSUIssues and NvidiaPowerBenchResult.PSUIssues (JSON: psu_issues) and are printed in the text benchmark report under a "PSU Issues" section. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:08:41 +03:00
Michael Chus	7d64e5d215	Fix two stale failing tests - TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks: ramp-up mode intentionally creates a single task (the runner handles 1→N internally to avoid redundant repetition of earlier ramp steps). Updated the test to expect 1 task and verify RampTotal=3 instead of asserting 3 separate tasks. - TestBenchmarkPageRendersSavedResultsTable: benchmark page used "Performance Results" as heading while the test looked for "Perf Results". Aligned the page heading with the shorter label used everywhere else (task reports, etc.). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 15:07:27 +03:00
Michael Chus	51b721aeb3	Add real-data duration estimates to benchmark and burn pages - Add BenchmarkEstimated* constants to benchmark_types.go from _v8 logs (Standard Perf ~16 min, Standard Power Fit ~43 min, Stability Perf ~92 min) - Update benchmark profile dropdown to show Perf / Power Fit timing per profile - Add timing columns to Method Split table (Standard vs Stability per run type) - Update burn preset labels to show "N min/GPU (sequential) or N min (parallel)" - Clarify burn "one by one" description with sequential vs parallel scaling Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:54:50 +03:00
Michael Chus	bac89bb6e5	Add real-data duration estimates to validate tab profiles - Add SATEstimated* constants to sat.go derived from _v8 production logs, with a rule to recalculate them whenever the script changes - Extend validateInventory with NvidiaGPUCount to make estimates GPU-aware - Update all validate card duration strings: CPU, memory, storage, NVIDIA GPU, targeted stress/power, pulse test, NCCL, nvbandwidth - Fix nvbandwidth description ("intended to stay short" → actual ~45 min) - Top-level profile labels show computed total including GPU count Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-18 10:51:15 +03:00