Globalize autotuned system power source
This commit is contained in:
@@ -19,20 +19,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||||
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
|
||||||
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||||
|
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||||
|
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -125,6 +127,7 @@ type satRunner interface {
|
|||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPerfDir
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPowerDir
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -123,6 +124,7 @@ type fakeSAT struct {
|
|||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
|
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaAutotuneFn != nil {
|
||||||
|
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaTargetedStressFn != nil {
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
var auditJSON string
|
var auditJSON string
|
||||||
|
var manifest string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
auditJSON = string(body)
|
auditJSON = string(body)
|
||||||
}
|
}
|
||||||
|
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read manifest entry: %v", err)
|
||||||
|
}
|
||||||
|
manifest = string(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
}
|
}
|
||||||
|
if !contains(manifest, "files:") {
|
||||||
|
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||||
|
}
|
||||||
|
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||||
|
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||||
|
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||||
|
if strings.TrimSpace(cfg.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Fprintf(&body, "\nfiles:\n")
|
fmt.Fprintf(&body, "\nfiles:\n")
|
||||||
|
|
||||||
var files []string
|
var files []string
|
||||||
|
|||||||
@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
// ── Server Power ───────────────────────────────────────────────────────────
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server Power (IPMI)\n\n")
|
title := "## Server Power\n\n"
|
||||||
|
if sp.Source != "" {
|
||||||
|
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||||
|
}
|
||||||
|
b.WriteString(title)
|
||||||
if !sp.Available {
|
if !sp.Available {
|
||||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
b.WriteString("Server power measurement unavailable.\n\n")
|
||||||
} else {
|
} else {
|
||||||
spRows := [][]string{
|
spRows := [][]string{
|
||||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||||
|
|||||||
@@ -16,14 +16,17 @@ import (
|
|||||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
// collected for the web UI metrics page.
|
// collected for the web UI metrics page.
|
||||||
type LiveMetricSample struct {
|
type LiveMetricSample struct {
|
||||||
Timestamp time.Time `json:"ts"`
|
Timestamp time.Time `json:"ts"`
|
||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
PSUs []PSUReading `json:"psus,omitempty"`
|
PowerSource string `json:"power_source,omitempty"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
PowerMode string `json:"power_mode,omitempty"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
PowerReason string `json:"power_reason,omitempty"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// PSUReading is a per-slot power supply input power reading.
|
// PSUReading is a per-slot power supply input power reading.
|
||||||
@@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
s.PSUs = samplePSUPower()
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
// System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI.
|
// System power: use the global autotune-selected source when configured,
|
||||||
if len(s.PSUs) > 0 {
|
// otherwise fall back to the historical heuristic and mark the mode.
|
||||||
var total float64
|
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||||
for _, p := range s.PSUs {
|
s.PowerW = powerW
|
||||||
total += p.PowerW
|
s.PowerSource = decision.EffectiveSource
|
||||||
}
|
s.PowerMode = decision.Mode
|
||||||
s.PowerW = total
|
s.PowerReason = decision.Reason
|
||||||
} else {
|
|
||||||
s.PowerW = sampleSystemPower()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
|
|||||||
@@ -43,17 +43,22 @@ type GPUStressMetric struct {
|
|||||||
|
|
||||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
type FanStressRow struct {
|
type FanStressRow struct {
|
||||||
TimestampUTC string
|
TimestampUTC string
|
||||||
ElapsedSec float64
|
ElapsedSec float64
|
||||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
GPUs []GPUStressMetric
|
GPUs []GPUStressMetric
|
||||||
Fans []FanReading
|
Fans []FanReading
|
||||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64
|
||||||
|
SysPowerSource string
|
||||||
|
SysPowerMode string
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedPowerReading struct {
|
type cachedPowerReading struct {
|
||||||
Value float64
|
Value float64
|
||||||
|
Source string
|
||||||
|
Mode string
|
||||||
|
Reason string
|
||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
|||||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
row.Fans, _ = sampleFanSpeeds()
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
row.SysPowerW = sampleSystemPower()
|
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||||
return row
|
return row
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
return max
|
return max
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||||
func sampleSystemPower() float64 {
|
// falling back to the historical heuristic before autotune or when degraded.
|
||||||
|
func sampleSystemPowerResolved() (float64, string, string) {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
current := 0.0
|
current, decision, err := SampleSystemPowerResolved("")
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
||||||
if err == nil {
|
|
||||||
current = parseDCMIPowerReading(string(out))
|
|
||||||
}
|
|
||||||
systemPowerCacheMu.Lock()
|
systemPowerCacheMu.Lock()
|
||||||
defer systemPowerCacheMu.Unlock()
|
defer systemPowerCacheMu.Unlock()
|
||||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
if err != nil {
|
||||||
|
current = 0
|
||||||
|
}
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||||
systemPowerCache = updated
|
systemPowerCache = updated
|
||||||
return value
|
return value, updated.Source, updated.Mode
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||||
if current > 0 {
|
if current > 0 {
|
||||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||||
return current, cache
|
return current, cache
|
||||||
}
|
}
|
||||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||||
if got != 480 {
|
if got != 480 {
|
||||||
t.Fatalf("got=%v want cached 480", got)
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
}
|
}
|
||||||
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
t.Fatalf("updated=%+v", updated)
|
t.Fatalf("updated=%+v", updated)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||||
if got != 530 {
|
if got != 530 {
|
||||||
t.Fatalf("got=%v want 530", got)
|
t.Fatalf("got=%v want 530", got)
|
||||||
}
|
}
|
||||||
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||||
if got != 0 {
|
if got != 0 {
|
||||||
t.Fatalf("expired cache returned %v want 0", got)
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityInstallToRAM
|
return taskPriorityInstallToRAM
|
||||||
case "audit":
|
case "audit":
|
||||||
return taskPriorityAudit
|
return taskPriorityAudit
|
||||||
case "nvidia-bench-perf", "nvidia-bench-power":
|
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||||
return taskPriorityBenchmark
|
return taskPriorityBenchmark
|
||||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||||
return taskPriorityBurn
|
return taskPriorityBurn
|
||||||
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var body struct {
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind"`
|
||||||
|
SizeMB int `json:"size_mb"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
profile := strings.TrimSpace(body.Profile)
|
||||||
|
if profile == "" {
|
||||||
|
profile = "standard"
|
||||||
|
}
|
||||||
|
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||||
|
if benchmarkKind == "" {
|
||||||
|
benchmarkKind = "power-fit"
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("bee-bench-autotune"),
|
||||||
|
Name: taskName,
|
||||||
|
Target: "nvidia-bench-autotune",
|
||||||
|
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: now,
|
||||||
|
params: taskParams{
|
||||||
|
BenchmarkProfile: profile,
|
||||||
|
BenchmarkKind: benchmarkKind,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
DisplayName: taskName,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": false,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": true,
|
||||||
|
"config": cfg,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-autotune" {
|
||||||
|
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||||
|
}
|
||||||
|
if task.params.BenchmarkKind != "power-fit" {
|
||||||
|
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
|||||||
cpu_load_pct REAL,
|
cpu_load_pct REAL,
|
||||||
mem_load_pct REAL,
|
mem_load_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
power_source TEXT,
|
||||||
|
power_mode TEXT,
|
||||||
|
power_reason TEXT,
|
||||||
PRIMARY KEY (ts)
|
PRIMARY KEY (ts)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
defer func() { _ = tx.Rollback() }()
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
|
|||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadBetween returns samples in chronological order within the given time window.
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
|||||||
start, end = end, start
|
start, end = end, start
|
||||||
}
|
}
|
||||||
return m.loadSamples(
|
return m.loadSamples(
|
||||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
start.Unix(), end.Unix(),
|
start.Unix(), end.Unix(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
type sysRow struct {
|
type sysRow struct {
|
||||||
ts int64
|
ts int64
|
||||||
cpu, mem, pwr float64
|
cpu, mem, pwr float64
|
||||||
|
powerSource string
|
||||||
|
powerMode string
|
||||||
|
powerReason string
|
||||||
}
|
}
|
||||||
var sysRows []sysRow
|
var sysRows []sysRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var r sysRow
|
var r sysRow
|
||||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sysRows = append(sysRows, r)
|
sysRows = append(sysRows, r)
|
||||||
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
for i, r := range sysRows {
|
for i, r := range sysRows {
|
||||||
s := platform.LiveMetricSample{
|
s := platform.LiveMetricSample{
|
||||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
CPULoadPct: r.cpu,
|
CPULoadPct: r.cpu,
|
||||||
MemLoadPct: r.mem,
|
MemLoadPct: r.mem,
|
||||||
PowerW: r.pwr,
|
PowerW: r.pwr,
|
||||||
|
PowerSource: r.powerSource,
|
||||||
|
PowerMode: r.powerMode,
|
||||||
|
PowerReason: r.powerReason,
|
||||||
}
|
}
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||||
|
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||||
|
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
|
|||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
// Use per-PSU stacked chart when PSU SDR data is available.
|
power := make([]float64, len(samples))
|
||||||
// Collect the union of PSU slots seen across all samples.
|
label := "Power W"
|
||||||
psuSlots := psuSlotsFromSamples(samples)
|
for i, s := range samples {
|
||||||
if len(psuSlots) > 0 {
|
power[i] = s.PowerW
|
||||||
// Build one dataset per PSU slot.
|
if strings.TrimSpace(s.PowerSource) != "" {
|
||||||
psuDatasets := make([][]float64, len(psuSlots))
|
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||||
psuNames := make([]string, len(psuSlots))
|
if strings.TrimSpace(s.PowerMode) != "" {
|
||||||
for si, slot := range psuSlots {
|
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||||
ds := make([]float64, len(samples))
|
|
||||||
for i, s := range samples {
|
|
||||||
for _, psu := range s.PSUs {
|
|
||||||
if psu.Slot == slot {
|
|
||||||
ds[i] = psu.PowerW
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
psuDatasets[si] = normalizePowerSeries(ds)
|
|
||||||
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
|
||||||
}
|
}
|
||||||
datasets = psuDatasets
|
|
||||||
names = psuNames
|
|
||||||
stacked = len(psuDatasets) > 0
|
|
||||||
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
|
||||||
} else {
|
|
||||||
power := make([]float64, len(samples))
|
|
||||||
for i, s := range samples {
|
|
||||||
power[i] = s.PowerW
|
|
||||||
}
|
|
||||||
power = normalizePowerSeries(power)
|
|
||||||
datasets = [][]float64{power}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(power)
|
|
||||||
}
|
}
|
||||||
|
power = normalizePowerSeries(power)
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{label}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
|
|||||||
@@ -420,7 +420,7 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
|
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
@@ -429,7 +429,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
|
|||||||
{Slot: 1, PowerW: 120},
|
{Slot: 1, PowerW: 120},
|
||||||
{Slot: 2, PowerW: 130},
|
{Slot: 2, PowerW: 130},
|
||||||
},
|
},
|
||||||
PowerW: 250,
|
PowerW: 250,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Timestamp: start.Add(time.Minute),
|
Timestamp: start.Add(time.Minute),
|
||||||
@@ -437,7 +439,9 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
|
|||||||
{Slot: 1, PowerW: 140},
|
{Slot: 1, PowerW: 140},
|
||||||
{Slot: 2, PowerW: 135},
|
{Slot: 2, PowerW: 135},
|
||||||
},
|
},
|
||||||
PowerW: 275,
|
PowerW: 275,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -448,13 +452,13 @@ func TestChartDataFromSamplesServerPowerUsesPerPSUDatasets(t *testing.T) {
|
|||||||
if title != "System Power" {
|
if title != "System Power" {
|
||||||
t.Fatalf("title=%q", title)
|
t.Fatalf("title=%q", title)
|
||||||
}
|
}
|
||||||
if !stacked {
|
if stacked {
|
||||||
t.Fatal("expected stacked PSU chart")
|
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||||
}
|
}
|
||||||
if len(datasets) != 2 || len(names) != 2 {
|
if len(datasets) != 1 || len(names) != 1 {
|
||||||
t.Fatalf("datasets=%d names=%d want 2/2", len(datasets), len(names))
|
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||||
}
|
}
|
||||||
if names[0] != "PSU 1" || names[1] != "PSU 2" {
|
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||||
t.Fatalf("names=%v", names)
|
t.Fatalf("names=%v", names)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -689,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
`/api/gpu/nvidia`,
|
`/api/gpu/nvidia`,
|
||||||
`/api/bee-bench/nvidia/perf/run`,
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
`/api/bee-bench/nvidia/power/run`,
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/status`,
|
||||||
`benchmark-run-nccl`,
|
`benchmark-run-nccl`,
|
||||||
`Run Performance Benchmark`,
|
`Run Performance Benchmark`,
|
||||||
`Run Power / Thermal Fit`,
|
`Run Power / Thermal Fit`,
|
||||||
|
`Autotune`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||||
|
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
|
||||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||||
@@ -125,6 +126,7 @@ type taskParams struct {
|
|||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
RampStep int `json:"ramp_step,omitempty"`
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
RampTotal: t.params.RampTotal,
|
RampTotal: t.params.RampTotal,
|
||||||
RampRunID: t.params.RampRunID,
|
RampRunID: t.params.RampRunID,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
|
case "nvidia-bench-autotune":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
}, t.params.BenchmarkKind, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
Reference in New Issue
Block a user