Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -19,20 +19,22 @@ import (
)
var (
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
type App struct {
@@ -125,6 +127,7 @@ type satRunner interface {
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPerfDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchAutotuneDir
}
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
}
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
}
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
if logFunc != nil {
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
}
return *cfg, nil
}
if logFunc != nil {
logFunc("benchmark autotune: no saved power source config, running autotune first")
}
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
if err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
return *cfg, nil
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir

View File

@@ -9,6 +9,7 @@ import (
"io"
"os"
"path/filepath"
"strings"
"testing"
"bee/audit/internal/platform"
@@ -123,6 +124,7 @@ type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
if f.runNvidiaAutotuneFn != nil {
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
t.Fatal(err)
}
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
t.Fatal(err)
}
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
tr := tar.NewReader(gzr)
var names []string
var auditJSON string
var manifest string
for {
hdr, err := tr.Next()
if errors.Is(err, io.EOF) {
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
}
auditJSON = string(body)
}
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
body, err := io.ReadAll(tr)
if err != nil {
t.Fatalf("read manifest entry: %v", err)
}
manifest = string(body)
}
}
for _, want := range []string{
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
}
if !contains(manifest, "files:") {
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
}
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
}
}
func TestMainBanner(t *testing.T) {

View File

@@ -2,6 +2,7 @@ package app
import (
"archive/tar"
"bee/audit/internal/platform"
"compress/gzip"
"fmt"
"io"
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
if strings.TrimSpace(cfg.Reason) != "" {
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
}
}
fmt.Fprintf(&body, "\nfiles:\n")
var files []string