Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -19,20 +19,22 @@ import (
) )
var ( var (
DefaultExportDir = "/appdata/bee/export" DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log" DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat" DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench" DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf" DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power" DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
) )
type App struct { type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPerfDir baseDir = DefaultBeeBenchPerfDir
} }
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir baseDir = DefaultBeeBenchPowerDir
} }
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
} }
// RunNvidiaPowerSourceAutotuneCtx runs the NVIDIA power-source autotune through
// the SAT runner held by the App.
//
// A blank (empty or whitespace-only) baseDir is replaced with
// DefaultBeeBenchAutotuneDir; all other arguments are forwarded unchanged and
// the runner's result is returned as-is.
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	dir := baseDir
	if len(strings.TrimSpace(dir)) == 0 {
		dir = DefaultBeeBenchAutotuneDir
	}
	return a.sat.RunNvidiaPowerSourceAutotune(ctx, dir, opts, benchmarkKind, logFunc)
}
// LoadBenchmarkPowerAutotune loads the power-source autotune config stored at
// DefaultBeeBenchPowerSourceConfigPath and returns the loader's config and
// error unchanged.
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
	return cfg, err
}
// ensureBenchmarkPowerAutotune returns the power-source autotune config to use
// for a benchmark run, running the autotune first when no saved config loads.
//
// The config path is derived from baseDir via
// platform.BenchmarkPowerSourceConfigPath. If the config loads, it is returned
// immediately. Otherwise (any load failure, or a nil config) the autotune is run
// in the "autotune" sibling directory of baseDir and the config is loaded again.
//
// logFunc may be nil, in which case progress messages are dropped. On error the
// returned config is the zero value.
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
	cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)

	// The loader returns a pointer; require it to be non-nil before
	// dereferencing so a (nil, nil) result falls through to autotune instead of
	// panicking.
	if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil && cfg != nil {
		if logFunc != nil {
			logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
		}
		return *cfg, nil
	}

	if logFunc != nil {
		logFunc("benchmark autotune: no saved power source config, running autotune first")
	}
	autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
	// The autotune error is returned unwrapped so callers that inspect it
	// (e.g. for cancellation) see exactly what the runner reported.
	if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
		return platform.BenchmarkPowerAutotuneConfig{}, err
	}

	cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
	if err != nil {
		return platform.BenchmarkPowerAutotuneConfig{}, fmt.Errorf("load benchmark power autotune config %q after autotune: %w", cfgPath, err)
	}
	if cfg == nil {
		return platform.BenchmarkPowerAutotuneConfig{}, fmt.Errorf("benchmark power autotune config %q is empty after autotune", cfgPath)
	}
	return *cfg, nil
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) { func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir

View File

@@ -9,6 +9,7 @@ import (
"io" "io"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"testing" "testing"
"bee/audit/internal/platform" "bee/audit/internal/platform"
@@ -123,6 +124,7 @@ type fakeSAT struct {
runNvidiaFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
// RunNvidiaPowerSourceAutotune is the fake autotune entry point. It calls
// runNvidiaAutotuneFn when the test supplied one, and otherwise falls back to
// runNvidiaFn, which only receives baseDir. The context and log callback are
// ignored.
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
	if hook := f.runNvidiaAutotuneFn; hook != nil {
		return hook(baseDir, opts, benchmarkKind)
	}
	return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil { if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices) return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil { if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil { if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
tr := tar.NewReader(gzr) tr := tar.NewReader(gzr)
var names []string var names []string
var auditJSON string var auditJSON string
var manifest string
for { for {
hdr, err := tr.Next() hdr, err := tr.Next()
if errors.Is(err, io.EOF) { if errors.Is(err, io.EOF) {
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
} }
auditJSON = string(body) auditJSON = string(body)
} }
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
body, err := io.ReadAll(tr)
if err != nil {
t.Fatalf("read manifest entry: %v", err)
}
manifest = string(body)
}
} }
for _, want := range []string{ for _, want := range []string{
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") { if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON) t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
} }
if !contains(manifest, "files:") {
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
}
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
}
} }
func TestMainBanner(t *testing.T) { func TestMainBanner(t *testing.T) {

View File

@@ -2,6 +2,7 @@ package app
import ( import (
"archive/tar" "archive/tar"
"bee/audit/internal/platform"
"compress/gzip" "compress/gzip"
"fmt" "fmt"
"io" "io"
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown")) fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
fmt.Fprintf(&body, "export_dir=%s\n", exportDir) fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
if strings.TrimSpace(cfg.Reason) != "" {
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
}
}
fmt.Fprintf(&body, "\nfiles:\n") fmt.Fprintf(&body, "\nfiles:\n")
var files []string var files []string

View File

@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
} }
} }
// ── Server Power (IPMI) ─────────────────────────────────────────────────── // ── Server Power ───────────────────────────────────────────────────────────
if sp := result.ServerPower; sp != nil { if sp := result.ServerPower; sp != nil {
b.WriteString("## Server Power (IPMI)\n\n") title := "## Server Power\n\n"
if sp.Source != "" {
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
}
b.WriteString(title)
if !sp.Available { if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n") b.WriteString("Server power measurement unavailable.\n\n")
} else { } else {
spRows := [][]string{ spRows := [][]string{
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)}, {"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},

View File

@@ -16,14 +16,17 @@ import (
// LiveMetricSample is a single point-in-time snapshot of server metrics // LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page. // collected for the web UI metrics page.
type LiveMetricSample struct { type LiveMetricSample struct {
Timestamp time.Time `json:"ts"` Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"` Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"` Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"` PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"` PowerSource string `json:"power_source,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"` PowerMode string `json:"power_mode,omitempty"`
MemLoadPct float64 `json:"mem_load_pct"` PowerReason string `json:"power_reason,omitempty"`
GPUs []GPUMetricRow `json:"gpus"` PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
} }
// PSUReading is a per-slot power supply input power reading. // PSUReading is a per-slot power supply input power reading.
@@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample {
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings // Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower() s.PSUs = samplePSUPower()
// System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI. // System power: use the global autotune-selected source when configured,
if len(s.PSUs) > 0 { // otherwise fall back to the historical heuristic and mark the mode.
var total float64 if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
for _, p := range s.PSUs { s.PowerW = powerW
total += p.PowerW s.PowerSource = decision.EffectiveSource
} s.PowerMode = decision.Mode
s.PowerW = total s.PowerReason = decision.Reason
} else {
s.PowerW = sampleSystemPower()
} }
// CPU load — from /proc/stat // CPU load — from /proc/stat

View File

@@ -43,17 +43,22 @@ type GPUStressMetric struct {
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions. // FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
type FanStressRow struct { type FanStressRow struct {
TimestampUTC string TimestampUTC string
ElapsedSec float64 ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown" Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric GPUs []GPUStressMetric
Fans []FanReading Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64 // DCMI system power reading SysPowerW float64
SysPowerSource string
SysPowerMode string
} }
type cachedPowerReading struct { type cachedPowerReading struct {
Value float64 Value float64
Source string
Mode string
Reason string
UpdatedAt time.Time UpdatedAt time.Time
} }
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
row.GPUs = sampleGPUStressMetrics(gpuIndices) row.GPUs = sampleGPUStressMetrics(gpuIndices)
row.Fans, _ = sampleFanSpeeds() row.Fans, _ = sampleFanSpeeds()
row.CPUMaxTempC = sampleCPUMaxTemp() row.CPUMaxTempC = sampleCPUMaxTemp()
row.SysPowerW = sampleSystemPower() row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
return row return row
} }
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
return max return max
} }
// sampleSystemPower reads system power draw via DCMI. // sampleSystemPowerResolved reads system power via the global autotune source,
func sampleSystemPower() float64 { // falling back to the historical heuristic before autotune or when degraded.
func sampleSystemPowerResolved() (float64, string, string) {
now := time.Now() now := time.Now()
current := 0.0 current, decision, err := SampleSystemPowerResolved("")
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
if err == nil {
current = parseDCMIPowerReading(string(out))
}
systemPowerCacheMu.Lock() systemPowerCacheMu.Lock()
defer systemPowerCacheMu.Unlock() defer systemPowerCacheMu.Unlock()
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now) if err != nil {
current = 0
}
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
systemPowerCache = updated systemPowerCache = updated
return value return value, updated.Source, updated.Mode
} }
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output. // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
return 0 return 0
} }
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) { func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
if current > 0 { if current > 0 {
cache = cachedPowerReading{Value: current, UpdatedAt: now} cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
return current, cache return current, cache
} }
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL { if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {

View File

@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
now := time.Now() now := time.Now()
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)} cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
got, updated := effectiveSystemPowerReading(cache, 0, now) got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
if got != 480 { if got != 480 {
t.Fatalf("got=%v want cached 480", got) t.Fatalf("got=%v want cached 480", got)
} }
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
t.Fatalf("updated=%+v", updated) t.Fatalf("updated=%+v", updated)
} }
got, updated = effectiveSystemPowerReading(cache, 530, now) got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
if got != 530 { if got != 530 {
t.Fatalf("got=%v want 530", got) t.Fatalf("got=%v want 530", got)
} }
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
} }
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)} expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
got, _ = effectiveSystemPowerReading(expired, 0, now) got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
if got != 0 { if got != 0 {
t.Fatalf("expired cache returned %v want 0", got) t.Fatalf("expired cache returned %v want 0", got)
} }

View File

@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM return taskPriorityInstallToRAM
case "audit": case "audit":
return taskPriorityAudit return taskPriorityAudit
case "nvidia-bench-perf", "nvidia-bench-power": case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
return taskPriorityBenchmark return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute": case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn return taskPriorityBurn
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
} }
} }
// handleAPIBenchmarkAutotuneRun returns the HTTP handler that queues an NVIDIA
// benchmark autotune task on the global queue.
//
// The optional JSON body may carry "profile", "benchmark_kind" and "size_mb".
// A blank profile becomes "standard" and a blank benchmark kind becomes
// "power-fit". An empty body is accepted; any other decode failure is answered
// with 400. When no App is configured the handler answers 503.
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		if h.opts.App == nil {
			writeError(w, http.StatusServiceUnavailable, "app not configured")
			return
		}

		var req struct {
			Profile       string `json:"profile"`
			BenchmarkKind string `json:"benchmark_kind"`
			SizeMB        int    `json:"size_mb"`
		}
		if r.Body != nil {
			err := json.NewDecoder(r.Body).Decode(&req)
			// io.EOF means the body was empty, which is allowed.
			if err != nil && !errors.Is(err, io.EOF) {
				writeError(w, http.StatusBadRequest, "invalid request body")
				return
			}
		}

		// Trim the string inputs and substitute the defaults for blanks.
		profile, kind := strings.TrimSpace(req.Profile), strings.TrimSpace(req.BenchmarkKind)
		if profile == "" {
			profile = "standard"
		}
		if kind == "" {
			kind = "power-fit"
		}

		createdAt := time.Now()
		displayName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, kind)
		params := taskParams{
			BenchmarkProfile: profile,
			BenchmarkKind:    kind,
			SizeMB:           req.SizeMB,
			DisplayName:      displayName,
		}
		task := &Task{
			ID:        newJobID("bee-bench-autotune"),
			Name:      displayName,
			Target:    "nvidia-bench-autotune",
			Priority:  defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
			Status:    TaskPending,
			CreatedAt: createdAt,
			params:    params,
		}

		globalQueue.enqueue(task)
		writeTaskRunResponse(w, []*Task{task})
	}
}
// handleAPIBenchmarkAutotuneStatus reports whether a power-source autotune
// config has been saved, together with the currently resolved system-power
// decision for the handler's export directory.
//
// Responses:
//   - 503 when no App is configured.
//   - 200 with {"configured": false, "decision": ...} when the config file does
//     not exist.
//   - 200 with {"configured": true, "config": ..., "decision": ...} when the
//     config loads.
//   - 500 with the error text for any other load failure.
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
	if h.opts.App == nil {
		writeError(w, http.StatusServiceUnavailable, "app not configured")
		return
	}

	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
	// errors.Is also matches a not-exist error that the loader wrapped with
	// extra context; os.IsNotExist would not unwrap it and the request would
	// be answered with 500 instead of "not configured".
	if err != nil && !errors.Is(err, os.ErrNotExist) {
		writeError(w, http.StatusInternalServerError, err.Error())
		return
	}

	resp := map[string]any{
		"configured": err == nil,
		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
	}
	if err == nil {
		resp["config"] = cfg
	}
	// No explicit WriteHeader here: net/http sends 200 on the first write, and
	// any response header set after WriteHeader would be silently ignored.
	writeJSON(w, resp)
}
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r) h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
} }

View File

@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
} }
} }
// TestHandleAPIBenchmarkAutotuneRunQueuesTask checks that a POST to the
// autotune run handler answers 200 and enqueues exactly one
// "nvidia-bench-autotune" task carrying the requested benchmark kind.
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
	// Swap in an empty task list for the test and restore the original after.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		defer globalQueue.mu.Unlock()
		globalQueue.tasks = savedTasks
	})

	const payload = `{"profile":"standard","benchmark_kind":"power-fit"}`
	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(payload))

	h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}

	// Hold the queue lock while inspecting what the handler enqueued.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if queued := len(globalQueue.tasks); queued != 1 {
		t.Fatalf("tasks=%d want 1", queued)
	}
	got := globalQueue.tasks[0]
	if got.Target != "nvidia-bench-autotune" {
		t.Fatalf("task target=%q want nvidia-bench-autotune", got.Target)
	}
	if got.params.BenchmarkKind != "power-fit" {
		t.Fatalf("task benchmark kind=%q want power-fit", got.params.BenchmarkKind)
	}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock() globalQueue.mu.Lock()
originalTasks := globalQueue.tasks originalTasks := globalQueue.tasks

View File

@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
cpu_load_pct REAL, cpu_load_pct REAL,
mem_load_pct REAL, mem_load_pct REAL,
power_w REAL, power_w REAL,
power_source TEXT,
power_mode TEXT,
power_reason TEXT,
PRIMARY KEY (ts) PRIMARY KEY (ts)
); );
CREATE TABLE IF NOT EXISTS gpu_metrics ( CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil { if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
return err return err
} }
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL") if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
return err
}
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
} }
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error { func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
defer func() { _ = tx.Rollback() }() defer func() { _ = tx.Rollback() }()
_, err = tx.Exec( _, err = tx.Exec(
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`, `INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
) )
if err != nil { if err != nil {
return err return err
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
// LoadRecent returns up to n samples in chronological order (oldest first). // LoadRecent returns up to n samples in chronological order (oldest first).
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) { func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n) return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
} }
// LoadAll returns all persisted samples in chronological order (oldest first). // LoadAll returns all persisted samples in chronological order (oldest first).
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) { func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil) return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
} }
// LoadBetween returns samples in chronological order within the given time window. // LoadBetween returns samples in chronological order within the given time window.
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
start, end = end, start start, end = end, start
} }
return m.loadSamples( return m.loadSamples(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`, `SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
start.Unix(), end.Unix(), start.Unix(), end.Unix(),
) )
} }
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
type sysRow struct { type sysRow struct {
ts int64 ts int64
cpu, mem, pwr float64 cpu, mem, pwr float64
powerSource string
powerMode string
powerReason string
} }
var sysRows []sysRow var sysRows []sysRow
for rows.Next() { for rows.Next() {
var r sysRow var r sysRow
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil { if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
continue continue
} }
sysRows = append(sysRows, r) sysRows = append(sysRows, r)
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
samples := make([]platform.LiveMetricSample, len(sysRows)) samples := make([]platform.LiveMetricSample, len(sysRows))
for i, r := range sysRows { for i, r := range sysRows {
s := platform.LiveMetricSample{ s := platform.LiveMetricSample{
Timestamp: time.Unix(r.ts, 0).UTC(), Timestamp: time.Unix(r.ts, 0).UTC(),
CPULoadPct: r.cpu, CPULoadPct: r.cpu,
MemLoadPct: r.mem, MemLoadPct: r.mem,
PowerW: r.pwr, PowerW: r.pwr,
PowerSource: r.powerSource,
PowerMode: r.powerMode,
PowerReason: r.powerReason,
} }
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok { if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {

View File

@@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
<span id="benchmark-run-nccl" hidden>nccl-auto</span> <span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span> <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div> <div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
</div> </div>
</div> </div>

View File

@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf")) mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power")) mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults) mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
// Tasks // Tasks
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
case path == "server-power": case path == "server-power":
title = "System Power" title = "System Power"
// Use per-PSU stacked chart when PSU SDR data is available. power := make([]float64, len(samples))
// Collect the union of PSU slots seen across all samples. label := "Power W"
psuSlots := psuSlotsFromSamples(samples) for i, s := range samples {
if len(psuSlots) > 0 { power[i] = s.PowerW
// Build one dataset per PSU slot. if strings.TrimSpace(s.PowerSource) != "" {
psuDatasets := make([][]float64, len(psuSlots)) label = fmt.Sprintf("Power W · %s", s.PowerSource)
psuNames := make([]string, len(psuSlots)) if strings.TrimSpace(s.PowerMode) != "" {
for si, slot := range psuSlots { label += fmt.Sprintf(" (%s)", s.PowerMode)
ds := make([]float64, len(samples))
for i, s := range samples {
for _, psu := range s.PSUs {
if psu.Slot == slot {
ds[i] = psu.PowerW
break
}
}
} }
psuDatasets[si] = normalizePowerSeries(ds)
psuNames[si] = fmt.Sprintf("PSU %d", slot)
} }
datasets = psuDatasets
names = psuNames
stacked = len(psuDatasets) > 0
yMax = autoMax120(psuStackedTotal(psuDatasets))
} else {
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
} }
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{label}
yMin = floatPtr(0)
yMax = autoMax120(power)
case path == "server-fans": case path == "server-fans":
title = "Fan RPM" title = "Fan RPM"

View File

@@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
} }
} }
func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) { func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC) start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
samples := []platform.LiveMetricSample{ samples := []platform.LiveMetricSample{
{ {
@@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 120}, {Slot: 1, PowerW: 120},
{Slot: 2, PowerW: 130}, {Slot: 2, PowerW: 130},
}, },
PowerW: 250, PowerW: 250,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
}, },
{ {
Timestamp: start.Add(time.Minute), Timestamp: start.Add(time.Minute),
@@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 140}, {Slot: 1, PowerW: 140},
{Slot: 2, PowerW: 135}, {Slot: 2, PowerW: 135},
}, },
PowerW: 275, PowerW: 275,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
}, },
} }
@@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
if title != "System Power" { if title != "System Power" {
t.Fatalf("title=%q", title) t.Fatalf("title=%q", title)
} }
if !stacked { if stacked {
t.Fatal("expected stacked PSU chart") t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
} }
if len(datasets) != 2 || len(names) != 2 { if len(datasets) != 1 || len(names) != 1 {
t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names)) t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
} }
if names[0] != "PSU 1" || names[1] != "PSU 2" { if names[0] != "Power W · sdr_psu_input (autotuned)" {
t.Fatalf("names=%v", names) t.Fatalf("names=%v", names)
} }
} }
@@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`/api/gpu/nvidia`, `/api/gpu/nvidia`,
`/api/bee-bench/nvidia/perf/run`, `/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`, `/api/bee-bench/nvidia/power/run`,
`/api/bee-bench/nvidia/autotune/run`,
`/api/bee-bench/nvidia/autotune/status`,
`benchmark-run-nccl`, `benchmark-run-nccl`,
`Run Performance Benchmark`, `Run Performance Benchmark`,
`Run Power / Thermal Fit`, `Run Power / Thermal Fit`,
`Autotune`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body) t.Fatalf("benchmark page missing %q: %s", needle, body)

View File

@@ -34,6 +34,7 @@ var taskNames = map[string]string{
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-bench-perf": "NVIDIA Bee Bench Perf", "nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power", "nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -125,6 +126,7 @@ type taskParams struct {
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"` BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"` BenchmarkProfile string `json:"benchmark_profile,omitempty"`
BenchmarkKind string `json:"benchmark_kind,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"` RunNCCL bool `json:"run_nccl,omitempty"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"` ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"` RampStep int `json:"ramp_step,omitempty"`
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal, RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID, RampRunID: t.params.RampRunID,
}, j.append) }, j.append)
case "nvidia-bench-autotune":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
}, t.params.BenchmarkKind, j.append)
case "nvidia-compute": case "nvidia-compute":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")