Globalize autotuned system power source

2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -19,20 +19,22 @@ import (
 )

 var (
-	DefaultExportDir        = "/appdata/bee/export"
-	DefaultAuditJSONPath    = DefaultExportDir + "/bee-audit.json"
-	DefaultAuditLogPath     = DefaultExportDir + "/bee-audit.log"
-	DefaultWebLogPath       = DefaultExportDir + "/bee-web.log"
-	DefaultNetworkLogPath   = DefaultExportDir + "/bee-network.log"
-	DefaultNvidiaLogPath    = DefaultExportDir + "/bee-nvidia.log"
-	DefaultSSHLogPath       = DefaultExportDir + "/bee-sshsetup.log"
-	DefaultRuntimeJSONPath  = DefaultExportDir + "/runtime-health.json"
-	DefaultRuntimeLogPath   = DefaultExportDir + "/runtime-health.log"
-	DefaultTechDumpDir      = DefaultExportDir + "/techdump"
-	DefaultSATBaseDir       = DefaultExportDir + "/bee-sat"
-	DefaultBeeBenchBaseDir  = DefaultExportDir + "/bee-bench"
-	DefaultBeeBenchPerfDir  = DefaultBeeBenchBaseDir + "/perf"
-	DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
+	DefaultExportDir                     = "/appdata/bee/export"
+	DefaultAuditJSONPath                 = DefaultExportDir + "/bee-audit.json"
+	DefaultAuditLogPath                  = DefaultExportDir + "/bee-audit.log"
+	DefaultWebLogPath                    = DefaultExportDir + "/bee-web.log"
+	DefaultNetworkLogPath                = DefaultExportDir + "/bee-network.log"
+	DefaultNvidiaLogPath                 = DefaultExportDir + "/bee-nvidia.log"
+	DefaultSSHLogPath                    = DefaultExportDir + "/bee-sshsetup.log"
+	DefaultRuntimeJSONPath               = DefaultExportDir + "/runtime-health.json"
+	DefaultRuntimeLogPath                = DefaultExportDir + "/runtime-health.log"
+	DefaultTechDumpDir                   = DefaultExportDir + "/techdump"
+	DefaultSATBaseDir                    = DefaultExportDir + "/bee-sat"
+	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
+	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
+	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
+	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
+	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
 )

 type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
+	RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPerfDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }

@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultBeeBenchPowerDir
 	}
+	resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
+	if err != nil {
+		return "", err
+	}
+	opts.ServerPowerSource = resolved.SelectedSource
 	return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
 }

+// RunNvidiaPowerSourceAutotuneCtx runs the NVIDIA power-source autotune
+// through the configured SAT runner. A blank baseDir selects
+// DefaultBeeBenchAutotuneDir; every other argument is passed through as given.
+func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultBeeBenchAutotuneDir
+	}
+	return a.sat.RunNvidiaPowerSourceAutotune(ctx, dir, opts, benchmarkKind, logFunc)
+}
+
+// LoadBenchmarkPowerAutotune loads the power-source autotune config stored at
+// DefaultBeeBenchPowerSourceConfigPath.
+func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
+	cfgPath := DefaultBeeBenchPowerSourceConfigPath
+	return platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
+}
+
+// ensureBenchmarkPowerAutotune returns the saved power-source autotune config
+// for baseDir, running the autotune first when no usable config is available.
+//
+// The config path is platform.BenchmarkPowerSourceConfigPath(baseDir). Any
+// load failure, or a nil config, is treated as "not configured": the autotune
+// is run in the "autotune" directory next to baseDir and the config is loaded
+// again. An error is returned when the autotune fails or when the config still
+// cannot be loaded afterwards.
+func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
+	cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
+	// The nil check keeps the dereference below from panicking should the
+	// loader ever return (nil, nil).
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil && cfg != nil {
+		if logFunc != nil {
+			logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
+		}
+		return *cfg, nil
+	}
+	if logFunc != nil {
+		logFunc("benchmark autotune: no saved power source config, running autotune first")
+	}
+	autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
+	if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, err
+	}
+	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
+	if err != nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, fmt.Errorf("load benchmark power autotune config %q after autotune: %w", cfgPath, err)
+	}
+	if cfg == nil {
+		return platform.BenchmarkPowerAutotuneConfig{}, fmt.Errorf("benchmark power autotune config %q is empty after autotune", cfgPath)
+	}
+	return *cfg, nil
+}
+
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -9,6 +9,7 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"

 	"bee/audit/internal/platform"
@@ -123,6 +124,7 @@ type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
+	runNvidiaAutotuneFn       func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
 	return f.runNvidiaFn(baseDir)
 }

+// RunNvidiaPowerSourceAutotune dispatches to runNvidiaAutotuneFn when the test
+// installed one and otherwise falls back to the generic runNvidiaFn stub.
+func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
+	autotune := f.runNvidiaAutotuneFn
+	if autotune == nil {
+		return f.runNvidiaFn(baseDir)
+	}
+	return autotune(baseDir, opts, benchmarkKind)
+}
+
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
 		t.Fatal(err)
 	}
+	if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
+		t.Fatal(err)
+	}
 	if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
 		t.Fatal(err)
 	}
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	tr := tar.NewReader(gzr)
 	var names []string
 	var auditJSON string
+	var manifest string
 	for {
 		hdr, err := tr.Next()
 		if errors.Is(err, io.EOF) {
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 			}
 			auditJSON = string(body)
 		}
+		if strings.HasSuffix(hdr.Name, "/manifest.txt") {
+			body, err := io.ReadAll(tr)
+			if err != nil {
+				t.Fatalf("read manifest entry: %v", err)
+			}
+			manifest = string(body)
+		}
 	}

 	for _, want := range []string{
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 	if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
 		t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
 	}
+	if !contains(manifest, "files:") {
+		t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
+	}
+	if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
+		t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
+	}
 }

 func TestMainBanner(t *testing.T) {
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -2,6 +2,7 @@ package app

 import (
 	"archive/tar"
+	"bee/audit/internal/platform"
 	"compress/gzip"
 	"fmt"
 	"io"
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
 	fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
 	fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
 	fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
+	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
+		fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
+		fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
+		if strings.TrimSpace(cfg.Reason) != "" {
+			fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
+		}
+	}
 	fmt.Fprintf(&body, "\nfiles:\n")

 	var files []string
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

-	// ── Server Power (IPMI) ───────────────────────────────────────────────────
+	// ── Server Power ───────────────────────────────────────────────────────────
 	if sp := result.ServerPower; sp != nil {
-		b.WriteString("## Server Power (IPMI)\n\n")
+		title := "## Server Power\n\n"
+		if sp.Source != "" {
+			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
+		}
+		b.WriteString(title)
 		if !sp.Available {
-			b.WriteString("IPMI power measurement unavailable.\n\n")
+			b.WriteString("Server power measurement unavailable.\n\n")
 		} else {
 			spRows := [][]string{
 				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
--- a/audit/internal/platform/live_metrics.go
+++ b/audit/internal/platform/live_metrics.go
@@ -16,14 +16,17 @@ import (
 // LiveMetricSample is a single point-in-time snapshot of server metrics
 // collected for the web UI metrics page.
 type LiveMetricSample struct {
-	Timestamp  time.Time      `json:"ts"`
-	Fans       []FanReading   `json:"fans"`
-	Temps      []TempReading  `json:"temps"`
-	PowerW     float64        `json:"power_w"`
-	PSUs       []PSUReading   `json:"psus,omitempty"`
-	CPULoadPct float64        `json:"cpu_load_pct"`
-	MemLoadPct float64        `json:"mem_load_pct"`
-	GPUs       []GPUMetricRow `json:"gpus"`
+	Timestamp   time.Time      `json:"ts"`
+	Fans        []FanReading   `json:"fans"`
+	Temps       []TempReading  `json:"temps"`
+	PowerW      float64        `json:"power_w"`
+	PowerSource string         `json:"power_source,omitempty"`
+	PowerMode   string         `json:"power_mode,omitempty"`
+	PowerReason string         `json:"power_reason,omitempty"`
+	PSUs        []PSUReading   `json:"psus,omitempty"`
+	CPULoadPct  float64        `json:"cpu_load_pct"`
+	MemLoadPct  float64        `json:"mem_load_pct"`
+	GPUs        []GPUMetricRow `json:"gpus"`
 }

 // PSUReading is a per-slot power supply input power reading.
@@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample {
 	// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
 	s.PSUs = samplePSUPower()

-	// System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI.
-	if len(s.PSUs) > 0 {
-		var total float64
-		for _, p := range s.PSUs {
-			total += p.PowerW
-		}
-		s.PowerW = total
-	} else {
-		s.PowerW = sampleSystemPower()
+	// System power: use the global autotune-selected source when configured,
+	// otherwise fall back to the historical heuristic and mark the mode.
+	if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
+		s.PowerW = powerW
+		s.PowerSource = decision.EffectiveSource
+		s.PowerMode = decision.Mode
+		s.PowerReason = decision.Reason
 	}

 	// CPU load — from /proc/stat
--- a/audit/internal/platform/sat_fan_stress.go
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -43,17 +43,22 @@ type GPUStressMetric struct {

 // FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
 type FanStressRow struct {
-	TimestampUTC string
-	ElapsedSec   float64
-	Phase        string // "baseline", "load1", "pause", "load2", "cooldown"
-	GPUs         []GPUStressMetric
-	Fans         []FanReading
-	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
-	SysPowerW    float64 // DCMI system power reading
+	TimestampUTC   string
+	ElapsedSec     float64
+	Phase          string // "baseline", "load1", "pause", "load2", "cooldown"
+	GPUs           []GPUStressMetric
+	Fans           []FanReading
+	CPUMaxTempC    float64 // highest CPU temperature from ipmitool / sensors
+	SysPowerW      float64
+	SysPowerSource string
+	SysPowerMode   string
 }

+// cachedPowerReading is the last system power reading kept by
+// effectiveSystemPowerReading. Value is stored only for readings > 0, together
+// with the Source, Mode and Reason passed in for that reading. UpdatedAt is
+// the time the reading was stored and is compared against systemPowerHoldTTL.
 type cachedPowerReading struct {
 	Value     float64
+	Source    string
+	Mode      string
+	Reason    string
 	UpdatedAt time.Time
 }

@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
 	row.GPUs = sampleGPUStressMetrics(gpuIndices)
 	row.Fans, _ = sampleFanSpeeds()
 	row.CPUMaxTempC = sampleCPUMaxTemp()
-	row.SysPowerW = sampleSystemPower()
+	row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
 	return row
 }

@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
 	return max
 }

-// sampleSystemPower reads system power draw via DCMI.
-func sampleSystemPower() float64 {
+// sampleSystemPowerResolved reads system power via the global autotune source,
+// falling back to the historical heuristic before autotune or when degraded.
+//
+// It returns the effective power value plus the Source and Mode recorded on
+// the cache entry handed back by effectiveSystemPowerReading.
+func sampleSystemPowerResolved() (float64, string, string) {
 	now := time.Now()
-	current := 0.0
-	out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
-	if err == nil {
-		current = parseDCMIPowerReading(string(out))
-	}
+	// The sample is taken before systemPowerCacheMu is acquired.
+	current, decision, err := SampleSystemPowerResolved("")
 	systemPowerCacheMu.Lock()
 	defer systemPowerCacheMu.Unlock()
-	value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
+	if err != nil {
+		// A failed read is passed on as 0: effectiveSystemPowerReading only
+		// stores readings > 0, so the cached entry is not overwritten by it.
+		current = 0
+	}
+	value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
 	systemPowerCache = updated
-	return value
+	return value, updated.Source, updated.Mode
 }

 // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
 	return 0
 }

-func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
+func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
 	if current > 0 {
-		cache = cachedPowerReading{Value: current, UpdatedAt: now}
+		cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
 		return current, cache
 	}
 	if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
--- a/audit/internal/platform/sat_fan_stress_test.go
+++ b/audit/internal/platform/sat_fan_stress_test.go
@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	now := time.Now()
 	cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}

-	got, updated := effectiveSystemPowerReading(cache, 0, now)
+	got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
 	if got != 480 {
 		t.Fatalf("got=%v want cached 480", got)
 	}
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 		t.Fatalf("updated=%+v", updated)
 	}

-	got, updated = effectiveSystemPowerReading(cache, 530, now)
+	got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
 	if got != 530 {
 		t.Fatalf("got=%v want 530", got)
 	}
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
 	}

 	expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
-	got, _ = effectiveSystemPowerReading(expired, 0, now)
+	got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
 	if got != 0 {
 		t.Fatalf("expired cache returned %v want 0", got)
 	}
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityInstallToRAM
 	case "audit":
 		return taskPriorityAudit
-	case "nvidia-bench-perf", "nvidia-bench-power":
+	case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
 		return taskPriorityBenchmark
 	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
 		return taskPriorityBurn
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
 	}
 }

+// handleAPIBenchmarkAutotuneRun returns the handler that queues one
+// "nvidia-bench-autotune" task on the global queue.
+//
+// The optional JSON body may carry "profile", "benchmark_kind" and "size_mb".
+// A blank profile becomes "standard" and a blank benchmark kind becomes
+// "power-fit". The handler reports http.StatusServiceUnavailable when no App
+// is configured and http.StatusBadRequest when the body cannot be decoded; an
+// empty body is accepted.
+func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
+	type autotuneRunRequest struct {
+		Profile       string `json:"profile"`
+		BenchmarkKind string `json:"benchmark_kind"`
+		SizeMB        int    `json:"size_mb"`
+	}
+	const target = "nvidia-bench-autotune"
+
+	return func(w http.ResponseWriter, r *http.Request) {
+		if h.opts.App == nil {
+			writeError(w, http.StatusServiceUnavailable, "app not configured")
+			return
+		}
+
+		var req autotuneRunRequest
+		if r.Body != nil {
+			// io.EOF only means the body was empty, so the defaults below apply.
+			decodeErr := json.NewDecoder(r.Body).Decode(&req)
+			if decodeErr != nil && !errors.Is(decodeErr, io.EOF) {
+				writeError(w, http.StatusBadRequest, "invalid request body")
+				return
+			}
+		}
+
+		profile := strings.TrimSpace(req.Profile)
+		if profile == "" {
+			profile = "standard"
+		}
+		kind := strings.TrimSpace(req.BenchmarkKind)
+		if kind == "" {
+			kind = "power-fit"
+		}
+
+		createdAt := time.Now()
+		displayName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, kind)
+		task := &Task{
+			ID:        newJobID("bee-bench-autotune"),
+			Name:      displayName,
+			Target:    target,
+			Priority:  defaultTaskPriority(target, taskParams{}),
+			Status:    TaskPending,
+			CreatedAt: createdAt,
+			params: taskParams{
+				BenchmarkProfile: profile,
+				BenchmarkKind:    kind,
+				SizeMB:           req.SizeMB,
+				DisplayName:      displayName,
+			},
+		}
+		globalQueue.enqueue(task)
+		writeTaskRunResponse(w, []*Task{task})
+	}
+}
+
+// handleAPIBenchmarkAutotuneStatus reports whether a power-source autotune
+// config has been saved, together with the currently resolved system power
+// decision for the handler's export directory.
+//
+// A missing config file is not an error: the response then carries
+// "configured": false. Any other load failure is reported as
+// http.StatusInternalServerError, and a missing App as
+// http.StatusServiceUnavailable.
+func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
+	if h.opts.App == nil {
+		writeError(w, http.StatusServiceUnavailable, "app not configured")
+		return
+	}
+	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
+	if err != nil {
+		// errors.Is also matches a not-exist error that the loader wrapped,
+		// which os.IsNotExist would miss.
+		if errors.Is(err, os.ErrNotExist) {
+			// No explicit WriteHeader here: the first body write implies 200, and
+			// calling it first would freeze the headers before writeJSON runs.
+			writeJSON(w, map[string]any{
+				"configured": false,
+				"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+			})
+			return
+		}
+		writeError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	writeJSON(w, map[string]any{
+		"configured": true,
+		"config":     cfg,
+		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
+	})
+}
+
 func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
 	h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
 }
--- a/audit/internal/webui/api_test.go
+++ b/audit/internal/webui/api_test.go
@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
 	}
 }

+// TestHandleAPIBenchmarkAutotuneRunQueuesTask posts an autotune run request and
+// checks that exactly one "nvidia-bench-autotune" task carrying the requested
+// benchmark kind lands in the global queue.
+func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
+	// Swap in an empty task list for the duration of the test.
+	globalQueue.mu.Lock()
+	savedTasks := globalQueue.tasks
+	globalQueue.tasks = nil
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		defer globalQueue.mu.Unlock()
+		globalQueue.tasks = savedTasks
+	})
+
+	const payload = `{"profile":"standard","benchmark_kind":"power-fit"}`
+	hdl := &handler{opts: HandlerOptions{App: &app.App{}}}
+	request := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(payload))
+	recorder := httptest.NewRecorder()
+
+	hdl.handleAPIBenchmarkAutotuneRun().ServeHTTP(recorder, request)
+
+	if recorder.Code != 200 {
+		t.Fatalf("status=%d body=%s", recorder.Code, recorder.Body.String())
+	}
+	globalQueue.mu.Lock()
+	defer globalQueue.mu.Unlock()
+	if queued := len(globalQueue.tasks); queued != 1 {
+		t.Fatalf("tasks=%d want 1", queued)
+	}
+	got := globalQueue.tasks[0]
+	if got.Target != "nvidia-bench-autotune" {
+		t.Fatalf("task target=%q want nvidia-bench-autotune", got.Target)
+	}
+	if got.params.BenchmarkKind != "power-fit" {
+		t.Fatalf("task benchmark kind=%q want power-fit", got.params.BenchmarkKind)
+	}
+}
+
 func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
 	globalQueue.mu.Lock()
 	originalTasks := globalQueue.tasks
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
  cpu_load_pct REAL,
  mem_load_pct REAL,
  power_w      REAL,
+  power_source TEXT,
+  power_mode   TEXT,
+  power_reason TEXT,
  PRIMARY KEY (ts)
 );
 CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
 	if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
 		return err
 	}
-	return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
+	if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
+		return err
+	}
+	if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
+		return err
+	}
+	return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
 }

 func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	defer func() { _ = tx.Rollback() }()

 	_, err = tx.Exec(
-		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
-		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
+		`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
+		ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
 	)
 	if err != nil {
 		return err
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {

 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
 }

 // LoadAll returns all persisted samples in chronological order (oldest first).
 func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
-	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
+	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
 }

 // LoadBetween returns samples in chronological order within the given time window.
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
 		start, end = end, start
 	}
 	return m.loadSamples(
-		`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
+		`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
 		start.Unix(), end.Unix(),
 	)
 }
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	type sysRow struct {
 		ts            int64
 		cpu, mem, pwr float64
+		powerSource   string
+		powerMode     string
+		powerReason   string
 	}
 	var sysRows []sysRow
 	for rows.Next() {
 		var r sysRow
-		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
+		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
 			continue
 		}
 		sysRows = append(sysRows, r)
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
 	samples := make([]platform.LiveMetricSample, len(sysRows))
 	for i, r := range sysRows {
 		s := platform.LiveMetricSample{
-			Timestamp:  time.Unix(r.ts, 0).UTC(),
-			CPULoadPct: r.cpu,
-			MemLoadPct: r.mem,
-			PowerW:     r.pwr,
+			Timestamp:   time.Unix(r.ts, 0).UTC(),
+			CPULoadPct:  r.cpu,
+			MemLoadPct:  r.mem,
+			PowerW:      r.pwr,
+			PowerSource: r.powerSource,
+			PowerMode:   r.powerMode,
+			PowerReason: r.powerReason,
 		}
 		for _, idx := range gpuIndices {
 			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
--- a/audit/internal/webui/page_benchmark.go
+++ b/audit/internal/webui/page_benchmark.go
@@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
      <span id="benchmark-run-nccl" hidden>nccl-auto</span>
      <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
      <div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
+      <div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
    </div>
  </div>

--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
 	mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
 	mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
+	mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
+	mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
 	mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)

 	// Tasks
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat

 	case path == "server-power":
 		title = "System Power"
-		// Use per-PSU stacked chart when PSU SDR data is available.
-		// Collect the union of PSU slots seen across all samples.
-		psuSlots := psuSlotsFromSamples(samples)
-		if len(psuSlots) > 0 {
-			// Build one dataset per PSU slot.
-			psuDatasets := make([][]float64, len(psuSlots))
-			psuNames := make([]string, len(psuSlots))
-			for si, slot := range psuSlots {
-				ds := make([]float64, len(samples))
-				for i, s := range samples {
-					for _, psu := range s.PSUs {
-						if psu.Slot == slot {
-							ds[i] = psu.PowerW
-							break
-						}
-					}
+		power := make([]float64, len(samples))
+		label := "Power W"
+		for i, s := range samples {
+			power[i] = s.PowerW
+			if strings.TrimSpace(s.PowerSource) != "" {
+				label = fmt.Sprintf("Power W · %s", s.PowerSource)
+				if strings.TrimSpace(s.PowerMode) != "" {
+					label += fmt.Sprintf(" (%s)", s.PowerMode)
 				}
-				psuDatasets[si] = normalizePowerSeries(ds)
-				psuNames[si] = fmt.Sprintf("PSU %d", slot)
 			}
-			datasets = psuDatasets
-			names = psuNames
-			stacked = len(psuDatasets) > 0
-			yMax = autoMax120(psuStackedTotal(psuDatasets))
-		} else {
-			power := make([]float64, len(samples))
-			for i, s := range samples {
-				power[i] = s.PowerW
-			}
-			power = normalizePowerSeries(power)
-			datasets = [][]float64{power}
-			names = []string{"Power W"}
-			yMin = floatPtr(0)
-			yMax = autoMax120(power)
 		}
+		power = normalizePowerSeries(power)
+		datasets = [][]float64{power}
+		names = []string{label}
+		yMin = floatPtr(0)
+		yMax = autoMax120(power)

 	case path == "server-fans":
 		title = "Fan RPM"
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
 	}
 }

-func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
+func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
 	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
 	samples := []platform.LiveMetricSample{
 		{
@@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
 				{Slot: 1, PowerW: 120},
 				{Slot: 2, PowerW: 130},
 			},
-			PowerW: 250,
+			PowerW:      250,
+			PowerSource: "sdr_psu_input",
+			PowerMode:   "autotuned",
 		},
 		{
 			Timestamp: start.Add(time.Minute),
@@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
 				{Slot: 1, PowerW: 140},
 				{Slot: 2, PowerW: 135},
 			},
-			PowerW: 275,
+			PowerW:      275,
+			PowerSource: "sdr_psu_input",
+			PowerMode:   "autotuned",
 		},
 	}

@@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
 	if title != "System Power" {
 		t.Fatalf("title=%q", title)
 	}
-	if !stacked {
-		t.Fatal("expected stacked PSU chart")
+	if stacked {
+		t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
 	}
-	if len(datasets) != 2 || len(names) != 2 {
-		t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names))
+	if len(datasets) != 1 || len(names) != 1 {
+		t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
 	}
-	if names[0] != "PSU 1" || names[1] != "PSU 2" {
+	if names[0] != "Power W · sdr_psu_input (autotuned)" {
 		t.Fatalf("names=%v", names)
 	}
 }
@@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
 		`/api/gpu/nvidia`,
 		`/api/bee-bench/nvidia/perf/run`,
 		`/api/bee-bench/nvidia/power/run`,
+		`/api/bee-bench/nvidia/autotune/run`,
+		`/api/bee-bench/nvidia/autotune/status`,
 		`benchmark-run-nccl`,
 		`Run Performance Benchmark`,
 		`Run Power / Thermal Fit`,
+		`Autotune`,
 	} {
 		if !strings.Contains(body, needle) {
 			t.Fatalf("benchmark page missing %q: %s", needle, body)
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -34,6 +34,7 @@ var taskNames = map[string]string{
 	"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
 	"nvidia-bench-perf":      "NVIDIA Bee Bench Perf",
 	"nvidia-bench-power":     "NVIDIA Bee Bench Power",
+	"nvidia-bench-autotune":  "NVIDIA Bee Bench Power Source Autotune",
 	"nvidia-compute":         "NVIDIA Max Compute Load (dcgmproftester)",
 	"nvidia-targeted-power":  "NVIDIA Targeted Power (dcgmi diag targeted_power)",
 	"nvidia-pulse":           "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -125,6 +126,7 @@ type taskParams struct {
 	Loader             string   `json:"loader,omitempty"`
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
+	BenchmarkKind      string   `json:"benchmark_kind,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
 	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	RampStep           int      `json:"ramp_step,omitempty"`
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			RampTotal:         t.params.RampTotal,
 			RampRunID:         t.params.RampRunID,
 		}, j.append)
+	case "nvidia-bench-autotune":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
+			Profile: t.params.BenchmarkProfile,
+			SizeMB:  t.params.SizeMB,
+		}, t.params.BenchmarkKind, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")