Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -19,20 +19,22 @@ import (
)
var (
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPerfDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchAutotuneDir
}
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
}
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
}
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
if logFunc != nil {
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
}
return *cfg, nil
}
if logFunc != nil {
logFunc("benchmark autotune: no saved power source config, running autotune first")
}
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
if err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
return *cfg, nil
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir

View File

@@ -9,6 +9,7 @@ import (
"io"
"os"
"path/filepath"
"strings"
"testing"
"bee/audit/internal/platform"
@@ -123,6 +124,7 @@ type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
if f.runNvidiaAutotuneFn != nil {
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err)
}
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err)
}
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
tr := tar.NewReader(gzr)
var names []string
var auditJSON string
var manifest string
for {
hdr, err := tr.Next()
if errors.Is(err, io.EOF) {
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
}
auditJSON = string(body)
}
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
body, err := io.ReadAll(tr)
if err != nil {
t.Fatalf("read manifest entry: %v", err)
}
manifest = string(body)
}
}
for _, want := range []string{
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
}
if !contains(manifest, "files:") {
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
}
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
}
}
func TestMainBanner(t *testing.T) {

View File

@@ -2,6 +2,7 @@ package app
import (
"archive/tar"
"bee/audit/internal/platform"
"compress/gzip"
"fmt"
"io"
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
if strings.TrimSpace(cfg.Reason) != "" {
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
}
}
fmt.Fprintf(&body, "\nfiles:\n")
var files []string

View File

@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
}
}
// ── Server Power (IPMI) ───────────────────────────────────────────────────
// ── Server Power ───────────────────────────────────────────────────────────
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server Power (IPMI)\n\n")
title := "## Server Power\n\n"
if sp.Source != "" {
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
}
b.WriteString(title)
if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n")
b.WriteString("Server power measurement unavailable.\n\n")
} else {
spRows := [][]string{
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},

View File

@@ -16,14 +16,17 @@ import (
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page.
type LiveMetricSample struct {
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PowerSource string `json:"power_source,omitempty"`
PowerMode string `json:"power_mode,omitempty"`
PowerReason string `json:"power_reason,omitempty"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// PSUReading is a per-slot power supply input power reading.
@@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample {
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower()
// System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI.
if len(s.PSUs) > 0 {
var total float64
for _, p := range s.PSUs {
total += p.PowerW
}
s.PowerW = total
} else {
s.PowerW = sampleSystemPower()
// System power: use the global autotune-selected source when configured,
// otherwise fall back to the historical heuristic and mark the mode.
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
s.PowerW = powerW
s.PowerSource = decision.EffectiveSource
s.PowerMode = decision.Mode
s.PowerReason = decision.Reason
}
// CPU load — from /proc/stat

View File

@@ -43,17 +43,22 @@ type GPUStressMetric struct {
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
type FanStressRow struct {
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64 // DCMI system power reading
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64
SysPowerSource string
SysPowerMode string
}
type cachedPowerReading struct {
Value float64
Source string
Mode string
Reason string
UpdatedAt time.Time
}
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
row.GPUs = sampleGPUStressMetrics(gpuIndices)
row.Fans, _ = sampleFanSpeeds()
row.CPUMaxTempC = sampleCPUMaxTemp()
row.SysPowerW = sampleSystemPower()
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
return row
}
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
return max
}
// sampleSystemPower reads system power draw via DCMI.
func sampleSystemPower() float64 {
// sampleSystemPowerResolved reads system power via the global autotune source,
// falling back to the historical heuristic before autotune or when degraded.
func sampleSystemPowerResolved() (float64, string, string) {
now := time.Now()
current := 0.0
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
if err == nil {
current = parseDCMIPowerReading(string(out))
}
current, decision, err := SampleSystemPowerResolved("")
systemPowerCacheMu.Lock()
defer systemPowerCacheMu.Unlock()
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
if err != nil {
current = 0
}
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
systemPowerCache = updated
return value
return value, updated.Source, updated.Mode
}
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
return 0
}
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
if current > 0 {
cache = cachedPowerReading{Value: current, UpdatedAt: now}
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
return current, cache
}
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {

View File

@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
now := time.Now()
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
got, updated := effectiveSystemPowerReading(cache, 0, now)
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
if got != 480 {
t.Fatalf("got=%v want cached 480", got)
}
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
t.Fatalf("updated=%+v", updated)
}
got, updated = effectiveSystemPowerReading(cache, 530, now)
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
if got != 530 {
t.Fatalf("got=%v want 530", got)
}
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
}
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
got, _ = effectiveSystemPowerReading(expired, 0, now)
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
if got != 0 {
t.Fatalf("expired cache returned %v want 0", got)
}

View File

@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM
case "audit":
return taskPriorityAudit
case "nvidia-bench-perf", "nvidia-bench-power":
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}
}
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var body struct {
Profile string `json:"profile"`
BenchmarkKind string `json:"benchmark_kind"`
SizeMB int `json:"size_mb"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
}
profile := strings.TrimSpace(body.Profile)
if profile == "" {
profile = "standard"
}
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
if benchmarkKind == "" {
benchmarkKind = "power-fit"
}
now := time.Now()
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
t := &Task{
ID: newJobID("bee-bench-autotune"),
Name: taskName,
Target: "nvidia-bench-autotune",
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
Status: TaskPending,
CreatedAt: now,
params: taskParams{
BenchmarkProfile: profile,
BenchmarkKind: benchmarkKind,
SizeMB: body.SizeMB,
DisplayName: taskName,
},
}
globalQueue.enqueue(t)
writeTaskRunResponse(w, []*Task{t})
}
}
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
if err != nil {
if os.IsNotExist(err) {
w.WriteHeader(http.StatusOK)
writeJSON(w, map[string]any{
"configured": false,
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
})
return
}
writeError(w, http.StatusInternalServerError, err.Error())
return
}
w.WriteHeader(http.StatusOK)
writeJSON(w, map[string]any{
"configured": true,
"config": cfg,
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
})
}
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
}

View File

@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
}
}
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 1 {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
}
task := globalQueue.tasks[0]
if task.Target != "nvidia-bench-autotune" {
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
}
if task.params.BenchmarkKind != "power-fit" {
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks

View File

@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
cpu_load_pct REAL,
mem_load_pct REAL,
power_w REAL,
power_source TEXT,
power_mode TEXT,
power_reason TEXT,
PRIMARY KEY (ts)
);
CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
return err
}
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
return err
}
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
}
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
defer func() { _ = tx.Rollback() }()
_, err = tx.Exec(
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
)
if err != nil {
return err
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
// LoadRecent returns up to n samples in chronological order (oldest first).
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
}
// LoadAll returns all persisted samples in chronological order (oldest first).
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
}
// LoadBetween returns samples in chronological order within the given time window.
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
start, end = end, start
}
return m.loadSamples(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
start.Unix(), end.Unix(),
)
}
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
type sysRow struct {
ts int64
cpu, mem, pwr float64
powerSource string
powerMode string
powerReason string
}
var sysRows []sysRow
for rows.Next() {
var r sysRow
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
continue
}
sysRows = append(sysRows, r)
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
samples := make([]platform.LiveMetricSample, len(sysRows))
for i, r := range sysRows {
s := platform.LiveMetricSample{
Timestamp: time.Unix(r.ts, 0).UTC(),
CPULoadPct: r.cpu,
MemLoadPct: r.mem,
PowerW: r.pwr,
Timestamp: time.Unix(r.ts, 0).UTC(),
CPULoadPct: r.cpu,
MemLoadPct: r.mem,
PowerW: r.pwr,
PowerSource: r.powerSource,
PowerMode: r.powerMode,
PowerReason: r.powerReason,
}
for _, idx := range gpuIndices {
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {

View File

@@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
</div>
</div>

View File

@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
// Tasks
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
case path == "server-power":
title = "System Power"
// Use per-PSU stacked chart when PSU SDR data is available.
// Collect the union of PSU slots seen across all samples.
psuSlots := psuSlotsFromSamples(samples)
if len(psuSlots) > 0 {
// Build one dataset per PSU slot.
psuDatasets := make([][]float64, len(psuSlots))
psuNames := make([]string, len(psuSlots))
for si, slot := range psuSlots {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, psu := range s.PSUs {
if psu.Slot == slot {
ds[i] = psu.PowerW
break
}
}
power := make([]float64, len(samples))
label := "Power W"
for i, s := range samples {
power[i] = s.PowerW
if strings.TrimSpace(s.PowerSource) != "" {
label = fmt.Sprintf("Power W · %s", s.PowerSource)
if strings.TrimSpace(s.PowerMode) != "" {
label += fmt.Sprintf(" (%s)", s.PowerMode)
}
psuDatasets[si] = normalizePowerSeries(ds)
psuNames[si] = fmt.Sprintf("PSU %d", slot)
}
datasets = psuDatasets
names = psuNames
stacked = len(psuDatasets) > 0
yMax = autoMax120(psuStackedTotal(psuDatasets))
} else {
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{label}
yMin = floatPtr(0)
yMax = autoMax120(power)
case path == "server-fans":
title = "Fan RPM"

View File

@@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
}
}
func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
samples := []platform.LiveMetricSample{
{
@@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 120},
{Slot: 2, PowerW: 130},
},
PowerW: 250,
PowerW: 250,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
},
{
Timestamp: start.Add(time.Minute),
@@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 140},
{Slot: 2, PowerW: 135},
},
PowerW: 275,
PowerW: 275,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
},
}
@@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
if title != "System Power" {
t.Fatalf("title=%q", title)
}
if !stacked {
t.Fatal("expected stacked PSU chart")
if stacked {
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
}
if len(datasets) != 2 || len(names) != 2 {
t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names))
if len(datasets) != 1 || len(names) != 1 {
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
}
if names[0] != "PSU 1" || names[1] != "PSU 2" {
if names[0] != "Power W · sdr_psu_input (autotuned)" {
t.Fatalf("names=%v", names)
}
}
@@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`/api/gpu/nvidia`,
`/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`,
`/api/bee-bench/nvidia/autotune/run`,
`/api/bee-bench/nvidia/autotune/status`,
`benchmark-run-nccl`,
`Run Performance Benchmark`,
`Run Power / Thermal Fit`,
`Autotune`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)

View File

@@ -34,6 +34,7 @@ var taskNames = map[string]string{
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -125,6 +126,7 @@ type taskParams struct {
Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
BenchmarkKind string `json:"benchmark_kind,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-autotune":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
}, t.params.BenchmarkKind, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")