Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM
case "audit":
return taskPriorityAudit
case "nvidia-bench-perf", "nvidia-bench-power":
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
}
}
// handleAPIBenchmarkAutotuneRun builds the handler that queues an
// "nvidia-bench-autotune" task.
//
// The optional JSON body may carry "profile", "benchmark_kind" and "size_mb".
// Profile and benchmark kind are whitespace-trimmed and fall back to
// "standard" and "power-fit" when empty; size_mb is forwarded unchanged.
// The handler answers 503 when no App is configured and 400 when the body is
// present but is not valid JSON. On success the new task is enqueued on
// globalQueue and reported through writeTaskRunResponse.
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
	const target = "nvidia-bench-autotune"

	// orDefault trims value and substitutes fallback when nothing is left.
	orDefault := func(value, fallback string) string {
		if trimmed := strings.TrimSpace(value); trimmed != "" {
			return trimmed
		}
		return fallback
	}

	return func(w http.ResponseWriter, r *http.Request) {
		if h.opts.App == nil {
			writeError(w, http.StatusServiceUnavailable, "app not configured")
			return
		}

		var req struct {
			Profile       string `json:"profile"`
			BenchmarkKind string `json:"benchmark_kind"`
			SizeMB        int    `json:"size_mb"`
		}
		if r.Body != nil {
			err := json.NewDecoder(r.Body).Decode(&req)
			// io.EOF means the body was empty; that is accepted and leaves
			// every field at its zero value so the defaults below apply.
			if err != nil && !errors.Is(err, io.EOF) {
				writeError(w, http.StatusBadRequest, "invalid request body")
				return
			}
		}

		params := taskParams{
			BenchmarkProfile: orDefault(req.Profile, "standard"),
			BenchmarkKind:    orDefault(req.BenchmarkKind, "power-fit"),
			SizeMB:           req.SizeMB,
		}
		createdAt := time.Now()
		// The same label serves as the task name and as its display name.
		params.DisplayName = fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", params.BenchmarkProfile, params.BenchmarkKind)

		task := &Task{
			ID:        newJobID("bee-bench-autotune"),
			Name:      params.DisplayName,
			Target:    target,
			Priority:  defaultTaskPriority(target, taskParams{}),
			Status:    TaskPending,
			CreatedAt: createdAt,
			params:    params,
		}
		globalQueue.enqueue(task)
		writeTaskRunResponse(w, []*Task{task})
	}
}
// handleAPIBenchmarkAutotuneStatus reports the saved benchmark power autotune
// state as JSON.
//
// The response always carries "configured" and "decision", the latter being
// whatever platform.ResolveSystemPowerDecision returns for the export
// directory. When the saved autotune config loads successfully, "configured"
// is true and the config is included as "config". When loading fails because
// the config does not exist, "configured" is false and "config" is omitted.
// Any other load error produces a 500, and a missing App produces a 503.
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
	if h.opts.App == nil {
		writeError(w, http.StatusServiceUnavailable, "app not configured")
		return
	}

	cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
	// errors.Is follows the wrap chain, so a not-exist error that gained
	// context on its way up is still treated as "not configured" rather than
	// as an internal error. os.IsNotExist does not unwrap.
	if err != nil && !errors.Is(err, os.ErrNotExist) {
		writeError(w, http.StatusInternalServerError, err.Error())
		return
	}

	configured := err == nil
	resp := map[string]any{
		"configured": configured,
		"decision":   platform.ResolveSystemPowerDecision(h.opts.ExportDir),
	}
	if configured {
		resp["config"] = cfg
	}

	// No explicit WriteHeader(200): the status defaults to 200 on the first
	// body write, and committing the header here would discard any header
	// set afterwards.
	// NOTE(review): assumes writeJSON sets Content-Type itself — confirm.
	writeJSON(w, resp)
}
// handleAPIBenchmarkNvidiaRun serves the request with the handler that
// handleAPIBenchmarkNvidiaRunKind builds for the "nvidia-bench-perf" target.
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
	perfHandler := h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf")
	perfHandler.ServeHTTP(w, r)
}

View File

@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
}
}
// TestHandleAPIBenchmarkAutotuneRunQueuesTask posts a profile and benchmark
// kind to the autotune run handler and checks that exactly one
// "nvidia-bench-autotune" task carrying that benchmark kind lands on
// globalQueue.
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
	// Start from an empty queue and put the previous contents back afterwards.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})

	payload := strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`)
	request := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", payload)
	recorder := httptest.NewRecorder()
	hdl := &handler{opts: HandlerOptions{App: &app.App{}}}

	hdl.handleAPIBenchmarkAutotuneRun().ServeHTTP(recorder, request)

	if recorder.Code != 200 {
		t.Fatalf("status=%d body=%s", recorder.Code, recorder.Body.String())
	}

	// Hold the queue lock while inspecting what the handler enqueued.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()

	if got := len(globalQueue.tasks); got != 1 {
		t.Fatalf("tasks=%d want 1", got)
	}
	queued := globalQueue.tasks[0]
	if got := queued.Target; got != "nvidia-bench-autotune" {
		t.Fatalf("task target=%q want nvidia-bench-autotune", got)
	}
	if got := queued.params.BenchmarkKind; got != "power-fit" {
		t.Fatalf("task benchmark kind=%q want power-fit", got)
	}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks

View File

@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
cpu_load_pct REAL,
mem_load_pct REAL,
power_w REAL,
power_source TEXT,
power_mode TEXT,
power_reason TEXT,
PRIMARY KEY (ts)
);
CREATE TABLE IF NOT EXISTS gpu_metrics (
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
return err
}
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
return err
}
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
return err
}
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
}
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
defer func() { _ = tx.Rollback() }()
_, err = tx.Exec(
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
)
if err != nil {
return err
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
// LoadRecent returns up to n samples in chronological order (oldest first).
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
}
// LoadAll returns all persisted samples in chronological order (oldest first).
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
}
// LoadBetween returns samples in chronological order within the given time window.
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
start, end = end, start
}
return m.loadSamples(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
start.Unix(), end.Unix(),
)
}
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
type sysRow struct {
ts int64
cpu, mem, pwr float64
powerSource string
powerMode string
powerReason string
}
var sysRows []sysRow
for rows.Next() {
var r sysRow
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
continue
}
sysRows = append(sysRows, r)
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
samples := make([]platform.LiveMetricSample, len(sysRows))
for i, r := range sysRows {
s := platform.LiveMetricSample{
Timestamp: time.Unix(r.ts, 0).UTC(),
CPULoadPct: r.cpu,
MemLoadPct: r.mem,
PowerW: r.pwr,
Timestamp: time.Unix(r.ts, 0).UTC(),
CPULoadPct: r.cpu,
MemLoadPct: r.mem,
PowerW: r.pwr,
PowerSource: r.powerSource,
PowerMode: r.powerMode,
PowerReason: r.powerReason,
}
for _, idx := range gpuIndices {
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {

View File

@@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
</div>
</div>

View File

@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
// Tasks
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
case path == "server-power":
title = "System Power"
// Use per-PSU stacked chart when PSU SDR data is available.
// Collect the union of PSU slots seen across all samples.
psuSlots := psuSlotsFromSamples(samples)
if len(psuSlots) > 0 {
// Build one dataset per PSU slot.
psuDatasets := make([][]float64, len(psuSlots))
psuNames := make([]string, len(psuSlots))
for si, slot := range psuSlots {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, psu := range s.PSUs {
if psu.Slot == slot {
ds[i] = psu.PowerW
break
}
}
power := make([]float64, len(samples))
label := "Power W"
for i, s := range samples {
power[i] = s.PowerW
if strings.TrimSpace(s.PowerSource) != "" {
label = fmt.Sprintf("Power W · %s", s.PowerSource)
if strings.TrimSpace(s.PowerMode) != "" {
label += fmt.Sprintf(" (%s)", s.PowerMode)
}
psuDatasets[si] = normalizePowerSeries(ds)
psuNames[si] = fmt.Sprintf("PSU %d", slot)
}
datasets = psuDatasets
names = psuNames
stacked = len(psuDatasets) > 0
yMax = autoMax120(psuStackedTotal(psuDatasets))
} else {
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(power)
}
power = normalizePowerSeries(power)
datasets = [][]float64{power}
names = []string{label}
yMin = floatPtr(0)
yMax = autoMax120(power)
case path == "server-fans":
title = "Fan RPM"

View File

@@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
}
}
func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
samples := []platform.LiveMetricSample{
{
@@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 120},
{Slot: 2, PowerW: 130},
},
PowerW: 250,
PowerW: 250,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
},
{
Timestamp: start.Add(time.Minute),
@@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
{Slot: 1, PowerW: 140},
{Slot: 2, PowerW: 135},
},
PowerW: 275,
PowerW: 275,
PowerSource: "sdr_psu_input",
PowerMode: "autotuned",
},
}
@@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
if title != "System Power" {
t.Fatalf("title=%q", title)
}
if !stacked {
t.Fatal("expected stacked PSU chart")
if stacked {
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
}
if len(datasets) != 2 || len(names) != 2 {
t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names))
if len(datasets) != 1 || len(names) != 1 {
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
}
if names[0] != "PSU 1" || names[1] != "PSU 2" {
if names[0] != "Power W · sdr_psu_input (autotuned)" {
t.Fatalf("names=%v", names)
}
}
@@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`/api/gpu/nvidia`,
`/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`,
`/api/bee-bench/nvidia/autotune/run`,
`/api/bee-bench/nvidia/autotune/status`,
`benchmark-run-nccl`,
`Run Performance Benchmark`,
`Run Power / Thermal Fit`,
`Autotune`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)

View File

@@ -34,6 +34,7 @@ var taskNames = map[string]string{
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -125,6 +126,7 @@ type taskParams struct {
Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
BenchmarkKind string `json:"benchmark_kind,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"`
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
RampStep int `json:"ramp_step,omitempty"`
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-autotune":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
}, t.params.BenchmarkKind, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")