diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 81c3450..1bc1356 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -19,20 +19,22 @@ import ( ) var ( - DefaultExportDir = "/appdata/bee/export" - DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" - DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" - DefaultWebLogPath = DefaultExportDir + "/bee-web.log" - DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" - DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" - DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" - DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" - DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" - DefaultTechDumpDir = DefaultExportDir + "/techdump" - DefaultSATBaseDir = DefaultExportDir + "/bee-sat" - DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench" - DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf" - DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power" + DefaultExportDir = "/appdata/bee/export" + DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" + DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" + DefaultWebLogPath = DefaultExportDir + "/bee-web.log" + DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" + DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" + DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" + DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" + DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" + DefaultTechDumpDir = DefaultExportDir + "/techdump" + DefaultSATBaseDir = DefaultExportDir + "/bee-sat" + DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench" + DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune" + DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf" + DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power" + DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json" ) type App struct { @@ -125,6 +127,7 @@ type satRunner interface { RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) + RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) @@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl if strings.TrimSpace(baseDir) == "" { baseDir = DefaultBeeBenchPerfDir } + resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc) + if err != nil { + return "", err + } + opts.ServerPowerSource = resolved.SelectedSource return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) } @@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p if strings.TrimSpace(baseDir) == "" { baseDir = DefaultBeeBenchPowerDir } + resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc) + if err != nil { + return "", err + } + opts.ServerPowerSource = resolved.SelectedSource return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc) } +func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBeeBenchAutotuneDir + } + return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc) +} + +func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) { + return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath) +} + +func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) { + cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir) + if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil { + if logFunc != nil { + logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource)) + } + return *cfg, nil + } + if logFunc != nil { + logFunc("benchmark autotune: no saved power source config, running autotune first") + } + autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune") + if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil { + return platform.BenchmarkPowerAutotuneConfig{}, err + } + cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath) + if err != nil { + return platform.BenchmarkPowerAutotuneConfig{}, err + } + return *cfg, nil +} + func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 2000e8b..850cc19 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -9,6 +9,7 @@ import ( "io" "os" "path/filepath" + "strings" "testing" "bee/audit/internal/platform" @@ -123,6 +124,7 @@ type fakeSAT struct { runNvidiaFn func(string) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error) + runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error) @@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla return f.runNvidiaFn(baseDir) } +func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) { + if f.runNvidiaAutotuneFn != nil { + return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind) + } + return f.runNvidiaFn(baseDir) +} + func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) { if f.runNvidiaTargetedStressFn != nil { return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices) @@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) { if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil { t.Fatal(err) } + if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil { + t.Fatal(err) + } if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil { t.Fatal(err) } @@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) { tr := tar.NewReader(gzr) var names []string var auditJSON string + var manifest string for { hdr, err := tr.Next() if errors.Is(err, io.EOF) { @@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) { } auditJSON = string(body) } + if strings.HasSuffix(hdr.Name, "/manifest.txt") { + body, err := io.ReadAll(tr) + if err != nil { + t.Fatalf("read manifest entry: %v", err) + } + manifest = string(body) + } } for _, want := range []string{ @@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) { if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") { t.Fatalf("support bundle should keep real devices:\n%s", auditJSON) } + if !contains(manifest, "files:") { + t.Fatalf("support bundle manifest missing files section:\n%s", manifest) + } + if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") { + t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest) + } } func TestMainBanner(t *testing.T) { diff --git a/audit/internal/app/support_bundle.go b/audit/internal/app/support_bundle.go index b1a97f8..d9f5158 100644 --- a/audit/internal/app/support_bundle.go +++ b/audit/internal/app/support_bundle.go @@ -2,6 +2,7 @@ package app import ( "archive/tar" + "bee/audit/internal/platform" "compress/gzip" "fmt" "io" @@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error { fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown")) fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) fmt.Fprintf(&body, "export_dir=%s\n", exportDir) + if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil { + fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource) + fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339)) + if strings.TrimSpace(cfg.Reason) != "" { + fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason) + } + } fmt.Fprintf(&body, "\nfiles:\n") var files []string diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 142e63a..5183460 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { } } - // ── Server Power (IPMI) ─────────────────────────────────────────────────── + // ── Server Power ─────────────────────────────────────────────────────────── if sp := result.ServerPower; sp != nil { - b.WriteString("## Server Power (IPMI)\n\n") + title := "## Server Power\n\n" + if sp.Source != "" { + title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source) + } + b.WriteString(title) if !sp.Available { - b.WriteString("IPMI power measurement unavailable.\n\n") + b.WriteString("Server power measurement unavailable.\n\n") } else { spRows := [][]string{ {"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)}, diff --git a/audit/internal/platform/live_metrics.go b/audit/internal/platform/live_metrics.go index 0830b4b..5381a0f 100644 --- a/audit/internal/platform/live_metrics.go +++ b/audit/internal/platform/live_metrics.go @@ -16,14 +16,17 @@ import ( // LiveMetricSample is a single point-in-time snapshot of server metrics // collected for the web UI metrics page. type LiveMetricSample struct { - Timestamp time.Time `json:"ts"` - Fans []FanReading `json:"fans"` - Temps []TempReading `json:"temps"` - PowerW float64 `json:"power_w"` - PSUs []PSUReading `json:"psus,omitempty"` - CPULoadPct float64 `json:"cpu_load_pct"` - MemLoadPct float64 `json:"mem_load_pct"` - GPUs []GPUMetricRow `json:"gpus"` + Timestamp time.Time `json:"ts"` + Fans []FanReading `json:"fans"` + Temps []TempReading `json:"temps"` + PowerW float64 `json:"power_w"` + PowerSource string `json:"power_source,omitempty"` + PowerMode string `json:"power_mode,omitempty"` + PowerReason string `json:"power_reason,omitempty"` + PSUs []PSUReading `json:"psus,omitempty"` + CPULoadPct float64 `json:"cpu_load_pct"` + MemLoadPct float64 `json:"mem_load_pct"` + GPUs []GPUMetricRow `json:"gpus"` } // PSUReading is a per-slot power supply input power reading. @@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample { // Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings s.PSUs = samplePSUPower() - // System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI. - if len(s.PSUs) > 0 { - var total float64 - for _, p := range s.PSUs { - total += p.PowerW - } - s.PowerW = total - } else { - s.PowerW = sampleSystemPower() + // System power: use the global autotune-selected source when configured, + // otherwise fall back to the historical heuristic and mark the mode. + if powerW, decision, err := SampleSystemPowerResolved(""); err == nil { + s.PowerW = powerW + s.PowerSource = decision.EffectiveSource + s.PowerMode = decision.Mode + s.PowerReason = decision.Reason } // CPU load — from /proc/stat diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go index 28c430a..399f7cf 100644 --- a/audit/internal/platform/sat_fan_stress.go +++ b/audit/internal/platform/sat_fan_stress.go @@ -43,17 +43,22 @@ type GPUStressMetric struct { // FanStressRow is one second-interval telemetry sample covering all monitored dimensions. type FanStressRow struct { - TimestampUTC string - ElapsedSec float64 - Phase string // "baseline", "load1", "pause", "load2", "cooldown" - GPUs []GPUStressMetric - Fans []FanReading - CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors - SysPowerW float64 // DCMI system power reading + TimestampUTC string + ElapsedSec float64 + Phase string // "baseline", "load1", "pause", "load2", "cooldown" + GPUs []GPUStressMetric + Fans []FanReading + CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors + SysPowerW float64 + SysPowerSource string + SysPowerMode string } type cachedPowerReading struct { Value float64 + Source string + Mode string + Reason string UpdatedAt time.Time } @@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre row.GPUs = sampleGPUStressMetrics(gpuIndices) row.Fans, _ = sampleFanSpeeds() row.CPUMaxTempC = sampleCPUMaxTemp() - row.SysPowerW = sampleSystemPower() + row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved() return row } @@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 { return max } -// sampleSystemPower reads system power draw via DCMI. -func sampleSystemPower() float64 { +// sampleSystemPowerResolved reads system power via the global autotune source, +// falling back to the historical heuristic before autotune or when degraded. +func sampleSystemPowerResolved() (float64, string, string) { now := time.Now() - current := 0.0 - out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output() - if err == nil { - current = parseDCMIPowerReading(string(out)) - } + current, decision, err := SampleSystemPowerResolved("") systemPowerCacheMu.Lock() defer systemPowerCacheMu.Unlock() - value, updated := effectiveSystemPowerReading(systemPowerCache, current, now) + if err != nil { + current = 0 + } + value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now) systemPowerCache = updated - return value + return value, updated.Source, updated.Mode } // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output. @@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 { return 0 } -func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) { +func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) { if current > 0 { - cache = cachedPowerReading{Value: current, UpdatedAt: now} + cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now} return current, cache } if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL { diff --git a/audit/internal/platform/sat_fan_stress_test.go b/audit/internal/platform/sat_fan_stress_test.go index 7b248b4..20ac394 100644 --- a/audit/internal/platform/sat_fan_stress_test.go +++ b/audit/internal/platform/sat_fan_stress_test.go @@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) { now := time.Now() cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)} - got, updated := effectiveSystemPowerReading(cache, 0, now) + got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now) if got != 480 { t.Fatalf("got=%v want cached 480", got) } @@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) { t.Fatalf("updated=%+v", updated) } - got, updated = effectiveSystemPowerReading(cache, 530, now) + got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now) if got != 530 { t.Fatalf("got=%v want 530", got) } @@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) { } expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)} - got, _ = effectiveSystemPowerReading(expired, 0, now) + got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now) if got != 0 { t.Fatalf("expired cache returned %v want 0", got) } diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 3a9f259..e5a53ca 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int { return taskPriorityInstallToRAM case "audit": return taskPriorityAudit - case "nvidia-bench-perf", "nvidia-bench-power": + case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune": return taskPriorityBenchmark case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute": return taskPriorityBurn @@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun } } +func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + var body struct { + Profile string `json:"profile"` + BenchmarkKind string `json:"benchmark_kind"` + SizeMB int `json:"size_mb"` + } + if r.Body != nil { + if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) { + writeError(w, http.StatusBadRequest, "invalid request body") + return + } + } + profile := strings.TrimSpace(body.Profile) + if profile == "" { + profile = "standard" + } + benchmarkKind := strings.TrimSpace(body.BenchmarkKind) + if benchmarkKind == "" { + benchmarkKind = "power-fit" + } + now := time.Now() + taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind) + t := &Task{ + ID: newJobID("bee-bench-autotune"), + Name: taskName, + Target: "nvidia-bench-autotune", + Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}), + Status: TaskPending, + CreatedAt: now, + params: taskParams{ + BenchmarkProfile: profile, + BenchmarkKind: benchmarkKind, + SizeMB: body.SizeMB, + DisplayName: taskName, + }, + } + globalQueue.enqueue(t) + writeTaskRunResponse(w, []*Task{t}) + } +} + +func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) { + if h.opts.App == nil { + writeError(w, http.StatusServiceUnavailable, "app not configured") + return + } + cfg, err := h.opts.App.LoadBenchmarkPowerAutotune() + if err != nil { + if os.IsNotExist(err) { + w.WriteHeader(http.StatusOK) + writeJSON(w, map[string]any{ + "configured": false, + "decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir), + }) + return + } + writeError(w, http.StatusInternalServerError, err.Error()) + return + } + w.WriteHeader(http.StatusOK) + writeJSON(w, map[string]any{ + "configured": true, + "config": cfg, + "decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir), + }) +} + func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r) } diff --git a/audit/internal/webui/api_test.go b/audit/internal/webui/api_test.go index 77f5826..d132ad7 100644 --- a/audit/internal/webui/api_test.go +++ b/audit/internal/webui/api_test.go @@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T } } +func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) { + globalQueue.mu.Lock() + originalTasks := globalQueue.tasks + globalQueue.tasks = nil + globalQueue.mu.Unlock() + t.Cleanup(func() { + globalQueue.mu.Lock() + globalQueue.tasks = originalTasks + globalQueue.mu.Unlock() + }) + + h := &handler{opts: HandlerOptions{App: &app.App{}}} + req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`)) + rec := httptest.NewRecorder() + + h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req) + + if rec.Code != 200 { + t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String()) + } + globalQueue.mu.Lock() + defer globalQueue.mu.Unlock() + if len(globalQueue.tasks) != 1 { + t.Fatalf("tasks=%d want 1", len(globalQueue.tasks)) + } + task := globalQueue.tasks[0] + if task.Target != "nvidia-bench-autotune" { + t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target) + } + if task.params.BenchmarkKind != "power-fit" { + t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind) + } +} + func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { globalQueue.mu.Lock() originalTasks := globalQueue.tasks diff --git a/audit/internal/webui/metricsdb.go b/audit/internal/webui/metricsdb.go index 21977cf..1919c8b 100644 --- a/audit/internal/webui/metricsdb.go +++ b/audit/internal/webui/metricsdb.go @@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics ( cpu_load_pct REAL, mem_load_pct REAL, power_w REAL, + power_source TEXT, + power_mode TEXT, + power_reason TEXT, PRIMARY KEY (ts) ); CREATE TABLE IF NOT EXISTS gpu_metrics ( @@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics ( if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil { return err } - return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL") + if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil { + return err + } + if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil { + return err + } + if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil { + return err + } + return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT") } func ensureMetricsColumn(db *sql.DB, table, column, definition string) error { @@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error { defer func() { _ = tx.Rollback() }() _, err = tx.Exec( - `INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`, - ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, + `INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`, + ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason, ) if err != nil { return err @@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error { // LoadRecent returns up to n samples in chronological order (oldest first). func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) { - return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n) + return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n) } // LoadAll returns all persisted samples in chronological order (oldest first). func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) { - return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil) + return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil) } // LoadBetween returns samples in chronological order within the given time window. @@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp start, end = end, start } return m.loadSamples( - `SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`, + `SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`, start.Unix(), end.Unix(), ) } @@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri type sysRow struct { ts int64 cpu, mem, pwr float64 + powerSource string + powerMode string + powerReason string } var sysRows []sysRow for rows.Next() { var r sysRow - if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil { + if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil { continue } sysRows = append(sysRows, r) @@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri samples := make([]platform.LiveMetricSample, len(sysRows)) for i, r := range sysRows { s := platform.LiveMetricSample{ - Timestamp: time.Unix(r.ts, 0).UTC(), - CPULoadPct: r.cpu, - MemLoadPct: r.mem, - PowerW: r.pwr, + Timestamp: time.Unix(r.ts, 0).UTC(), + CPULoadPct: r.cpu, + MemLoadPct: r.mem, + PowerW: r.pwr, + PowerSource: r.powerSource, + PowerMode: r.powerMode, + PowerReason: r.powerReason, } for _, idx := range gpuIndices { if g, ok := gpuData[gpuKey{r.ts, idx}]; ok { diff --git a/audit/internal/webui/page_benchmark.go b/audit/internal/webui/page_benchmark.go index f118805..f8b876d 100644 --- a/audit/internal/webui/page_benchmark.go +++ b/audit/internal/webui/page_benchmark.go @@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
Autotune status: loading…
+
Autotune overwrites the saved system-power source and applies it to all new power charts and tests.
diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index 0e20960..9b523d8 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler { mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf")) mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power")) + mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun()) + mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus) mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults) // Tasks @@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat case path == "server-power": title = "System Power" - // Use per-PSU stacked chart when PSU SDR data is available. - // Collect the union of PSU slots seen across all samples. - psuSlots := psuSlotsFromSamples(samples) - if len(psuSlots) > 0 { - // Build one dataset per PSU slot. - psuDatasets := make([][]float64, len(psuSlots)) - psuNames := make([]string, len(psuSlots)) - for si, slot := range psuSlots { - ds := make([]float64, len(samples)) - for i, s := range samples { - for _, psu := range s.PSUs { - if psu.Slot == slot { - ds[i] = psu.PowerW - break - } - } + power := make([]float64, len(samples)) + label := "Power W" + for i, s := range samples { + power[i] = s.PowerW + if strings.TrimSpace(s.PowerSource) != "" { + label = fmt.Sprintf("Power W · %s", s.PowerSource) + if strings.TrimSpace(s.PowerMode) != "" { + label += fmt.Sprintf(" (%s)", s.PowerMode) } - psuDatasets[si] = normalizePowerSeries(ds) - psuNames[si] = fmt.Sprintf("PSU %d", slot) } - datasets = psuDatasets - names = psuNames - stacked = len(psuDatasets) > 0 - yMax = autoMax120(psuStackedTotal(psuDatasets)) - } else { - power := make([]float64, len(samples)) - for i, s := range samples { - power[i] = s.PowerW - } - power = normalizePowerSeries(power) - datasets = [][]float64{power} - names = []string{"Power W"} - yMin = floatPtr(0) - yMax = autoMax120(power) } + power = normalizePowerSeries(power) + datasets = [][]float64{power} + names = []string{label} + yMin = floatPtr(0) + yMax = autoMax120(power) case path == "server-fans": title = "Fan RPM" diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index d402902..80c6524 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) { } } -func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) { +func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) { start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC) samples := []platform.LiveMetricSample{ { @@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) { {Slot: 1, PowerW: 120}, {Slot: 2, PowerW: 130}, }, - PowerW: 250, + PowerW: 250, + PowerSource: "sdr_psu_input", + PowerMode: "autotuned", }, { Timestamp: start.Add(time.Minute), @@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) { {Slot: 1, PowerW: 140}, {Slot: 2, PowerW: 135}, }, - PowerW: 275, + PowerW: 275, + PowerSource: "sdr_psu_input", + PowerMode: "autotuned", }, } @@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) { if title != "System Power" { t.Fatalf("title=%q", title) } - if !stacked { - t.Fatal("expected stacked PSU chart") + if stacked { + t.Fatal("server-power should use resolved system power, not stacked PSU inputs") } - if len(datasets) != 2 || len(names) != 2 { - t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names)) + if len(datasets) != 1 || len(names) != 1 { + t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names)) } - if names[0] != "PSU 1" || names[1] != "PSU 2" { + if names[0] != "Power W · sdr_psu_input (autotuned)" { t.Fatalf("names=%v", names) } } @@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) { `/api/gpu/nvidia`, `/api/bee-bench/nvidia/perf/run`, `/api/bee-bench/nvidia/power/run`, + `/api/bee-bench/nvidia/autotune/run`, + `/api/bee-bench/nvidia/autotune/status`, `benchmark-run-nccl`, `Run Performance Benchmark`, `Run Power / Thermal Fit`, + `Autotune`, } { if !strings.Contains(body, needle) { t.Fatalf("benchmark page missing %q: %s", needle, body) diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index d8bb9c3..6672369 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -34,6 +34,7 @@ var taskNames = map[string]string{ "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", "nvidia-bench-perf": "NVIDIA Bee Bench Perf", "nvidia-bench-power": "NVIDIA Bee Bench Power", + "nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)", @@ -125,6 +126,7 @@ type taskParams struct { Loader string `json:"loader,omitempty"` BurnProfile string `json:"burn_profile,omitempty"` BenchmarkProfile string `json:"benchmark_profile,omitempty"` + BenchmarkKind string `json:"benchmark_kind,omitempty"` RunNCCL bool `json:"run_nccl,omitempty"` ParallelGPUs bool `json:"parallel_gpus,omitempty"` RampStep int `json:"ramp_step,omitempty"` @@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { RampTotal: t.params.RampTotal, RampRunID: t.params.RampRunID, }, j.append) + case "nvidia-bench-autotune": + if a == nil { + err = fmt.Errorf("app not configured") + break + } + archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{ + Profile: t.params.BenchmarkProfile, + SizeMB: t.params.SizeMB, + }, t.params.BenchmarkKind, j.append) case "nvidia-compute": if a == nil { err = fmt.Errorf("app not configured")