Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 |
@@ -19,20 +19,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Default on-disk locations used by the application. Every path is derived
// from DefaultExportDir, and the benchmark sub-directories are derived from
// DefaultBeeBenchBaseDir. They are variables rather than constants, so they
// can be reassigned at runtime.
var (
	// DefaultExportDir is the root directory under which all other default
	// paths in this block live.
	DefaultExportDir = "/appdata/bee/export"

	// Audit, service, and setup log/report files written directly into the
	// export directory.
	DefaultAuditJSONPath   = DefaultExportDir + "/bee-audit.json"
	DefaultAuditLogPath    = DefaultExportDir + "/bee-audit.log"
	DefaultWebLogPath      = DefaultExportDir + "/bee-web.log"
	DefaultNetworkLogPath  = DefaultExportDir + "/bee-network.log"
	DefaultNvidiaLogPath   = DefaultExportDir + "/bee-nvidia.log"
	DefaultSSHLogPath      = DefaultExportDir + "/bee-sshsetup.log"
	DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
	DefaultRuntimeLogPath  = DefaultExportDir + "/runtime-health.log"

	// Directories for tech dumps and SAT run output.
	DefaultTechDumpDir = DefaultExportDir + "/techdump"
	DefaultSATBaseDir  = DefaultExportDir + "/bee-sat"

	// Benchmark output tree: one sub-directory per benchmark kind, plus the
	// saved power-source autotune result at the top of the tree.
	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -125,6 +127,7 @@ type satRunner interface {
|
|||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPerfDir
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
|
|||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultBeeBenchPowerDir
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
}
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -123,6 +124,7 @@ type fakeSAT struct {
|
|||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
|
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaAutotuneFn != nil {
|
||||||
|
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaTargetedStressFn != nil {
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
var auditJSON string
|
var auditJSON string
|
||||||
|
var manifest string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
auditJSON = string(body)
|
auditJSON = string(body)
|
||||||
}
|
}
|
||||||
|
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read manifest entry: %v", err)
|
||||||
|
}
|
||||||
|
manifest = string(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
}
|
}
|
||||||
|
if !contains(manifest, "files:") {
|
||||||
|
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||||
|
}
|
||||||
|
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||||
|
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -424,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||||
|
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||||
|
if strings.TrimSpace(cfg.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Fprintf(&body, "\nfiles:\n")
|
fmt.Fprintf(&body, "\nfiles:\n")
|
||||||
|
|
||||||
var files []string
|
var files []string
|
||||||
|
|||||||
@@ -160,6 +160,9 @@ type psuSDR struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotPatterns = []*regexp.Regexp{
|
var psuSlotPatterns = []*regexp.Regexp{
|
||||||
|
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||||
|
// does not fire after the digit; match explicitly with underscore terminator.
|
||||||
|
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||||
|
|||||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
{name: "PWS1 Status", want: 1},
|
{name: "PWS1 Status", want: 1},
|
||||||
{name: "Power Supply Bay 8", want: 8},
|
{name: "Power Supply Bay 8", want: 8},
|
||||||
{name: "PS 6 Input Power", want: 6},
|
{name: "PS 6 Input Power", want: 6},
|
||||||
|
// MSI underscore format — \b does not fire between digit and '_'
|
||||||
|
{name: "PSU1_POWER_IN", want: 1},
|
||||||
|
{name: "PSU2_POWER_OUT", want: 2},
|
||||||
|
{name: "PSU4_STATUS", want: 4},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
raw := `
|
||||||
|
PSU1_STATUS | F1h | ok
|
||||||
|
PSU1_POWER_OUT | 928 Watts | ok
|
||||||
|
PSU1_POWER_IN | 976 Watts | ok
|
||||||
|
PSU2_STATUS | F2h | ok
|
||||||
|
PSU2_POWER_OUT | 944 Watts | ok
|
||||||
|
PSU2_POWER_IN | 992 Watts | ok
|
||||||
|
`
|
||||||
|
got := parsePSUSDR(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||||
|
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||||
|
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||||
|
}
|
||||||
|
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||||
|
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func benchmarkPowerEngine() string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||||
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
|
return BenchmarkPowerEngineTargetedPower
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerEngineDCGMProfTester
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkPowerEngineLabel(engine string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(engine)) {
|
||||||
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
|
return "dcgmi diag targeted_power"
|
||||||
|
default:
|
||||||
|
return "dcgmproftester"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
|
||||||
|
engine := benchmarkPowerEngine()
|
||||||
|
durationSec = normalizeNvidiaBurnDuration(durationSec)
|
||||||
|
switch engine {
|
||||||
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
|
||||||
|
default:
|
||||||
|
if len(gpuIndices) > 1 {
|
||||||
|
return []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(durationSec),
|
||||||
|
"--stagger-seconds", "0",
|
||||||
|
"--devices", joinIndexList(gpuIndices),
|
||||||
|
}, nil, nil
|
||||||
|
}
|
||||||
|
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
if ctx == nil {
|
if ctx == nil {
|
||||||
ctx = context.Background()
|
ctx = context.Background()
|
||||||
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
// Sample server idle power once (first GPU only — server state is global).
|
// Sample server idle power once (first GPU only — server state is global).
|
||||||
if !serverIdleOK {
|
if !serverIdleOK {
|
||||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
||||||
|
serverPowerStopCh := make(chan struct{})
|
||||||
|
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
|
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
|
||||||
|
close(serverPowerStopCh)
|
||||||
|
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
|
||||||
|
serverLoadedWSum += benchmarkMean(serverPowerSamples)
|
||||||
|
serverLoadedSamples++
|
||||||
|
serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
|
||||||
|
}
|
||||||
for _, phaseSpec := range planPhases {
|
for _, phaseSpec := range planPhases {
|
||||||
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||||
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
|
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
|
||||||
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
beforeThrottle, _ := queryThrottleCounters(idx)
|
beforeThrottle, _ := queryThrottleCounters(idx)
|
||||||
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
|
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
|
||||||
|
|
||||||
// Sample server power via IPMI in parallel with the steady phase.
|
|
||||||
// We collect readings every 5s and average them.
|
|
||||||
ipmiStopCh := make(chan struct{})
|
|
||||||
ipmiResultCh := make(chan float64, 1)
|
|
||||||
go func() {
|
|
||||||
defer close(ipmiResultCh)
|
|
||||||
var samples []float64
|
|
||||||
ticker := time.NewTicker(5 * time.Second)
|
|
||||||
defer ticker.Stop()
|
|
||||||
// First sample after a short warmup delay.
|
|
||||||
select {
|
|
||||||
case <-ipmiStopCh:
|
|
||||||
return
|
|
||||||
case <-time.After(15 * time.Second):
|
|
||||||
}
|
|
||||||
for {
|
|
||||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
|
||||||
samples = append(samples, w)
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-ipmiStopCh:
|
|
||||||
if len(samples) > 0 {
|
|
||||||
var sum float64
|
|
||||||
for _, w := range samples {
|
|
||||||
sum += w
|
|
||||||
}
|
|
||||||
ipmiResultCh <- sum / float64(len(samples))
|
|
||||||
}
|
|
||||||
return
|
|
||||||
case <-ticker.C:
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
close(ipmiStopCh)
|
|
||||||
if loadedW, ok := <-ipmiResultCh; ok {
|
|
||||||
serverLoadedWSum += loadedW
|
|
||||||
serverLoadedSamples++
|
|
||||||
serverLoadedOK = true
|
|
||||||
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
|
||||||
}
|
|
||||||
afterThrottle, _ := queryThrottleCounters(idx)
|
afterThrottle, _ := queryThrottleCounters(idx)
|
||||||
if planErr != nil {
|
if planErr != nil {
|
||||||
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
|
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
|
||||||
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
if serverLoadedSamples > 0 {
|
if serverLoadedSamples > 0 {
|
||||||
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||||
}
|
}
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
|
||||||
result.Cooling = summarizeBenchmarkCooling(metricRows)
|
result.Cooling = summarizeBenchmarkCooling(metricRows)
|
||||||
|
|
||||||
// Apply server-power penalty when IPMI reports the server delta is much
|
// Apply server-power penalty when IPMI reports the server delta is much
|
||||||
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
|
|||||||
if opts.SizeMB < 0 {
|
if opts.SizeMB < 0 {
|
||||||
opts.SizeMB = 0
|
opts.SizeMB = 0
|
||||||
}
|
}
|
||||||
|
opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
|
||||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||||
return opts
|
return opts
|
||||||
@@ -2535,10 +2544,14 @@ loop:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||||||
// IPMI samples plus the GPU-reported average power during steady state.
|
// samples plus the GPU-reported average power during steady state.
|
||||||
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
|
||||||
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
sp := &BenchmarkServerPower{
|
||||||
if !ipmiAvailable {
|
Available: available,
|
||||||
|
Source: normalizeBenchmarkPowerSource(source),
|
||||||
|
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||||
|
}
|
||||||
|
if !available {
|
||||||
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||||||
return sp
|
return sp
|
||||||
}
|
}
|
||||||
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
|
|||||||
|
|
||||||
// Sample server idle power once.
|
// Sample server idle power once.
|
||||||
if !*serverIdleOK {
|
if !*serverIdleOK {
|
||||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
*serverIdleW = w
|
*serverIdleW = w
|
||||||
*serverIdleOK = true
|
*serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
|
|||||||
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
||||||
|
serverPowerStopCh := make(chan struct{})
|
||||||
|
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
|
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
|
||||||
|
close(serverPowerStopCh)
|
||||||
|
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
|
||||||
|
*serverLoadedWSum += benchmarkMean(serverPowerSamples)
|
||||||
|
(*serverLoadedSamples)++
|
||||||
|
*serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
|
||||||
|
}
|
||||||
for _, phaseSpec := range planPhases {
|
for _, phaseSpec := range planPhases {
|
||||||
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||||
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
|
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
|
||||||
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
|
|||||||
}
|
}
|
||||||
|
|
||||||
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
|
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
|
||||||
|
|
||||||
// Sample server power via IPMI in parallel with steady phase.
|
|
||||||
ipmiStopCh := make(chan struct{})
|
|
||||||
ipmiResultCh := make(chan float64, 1)
|
|
||||||
go func() {
|
|
||||||
defer close(ipmiResultCh)
|
|
||||||
var samples []float64
|
|
||||||
ticker := time.NewTicker(5 * time.Second)
|
|
||||||
defer ticker.Stop()
|
|
||||||
select {
|
|
||||||
case <-ipmiStopCh:
|
|
||||||
return
|
|
||||||
case <-time.After(15 * time.Second):
|
|
||||||
}
|
|
||||||
for {
|
|
||||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
|
||||||
samples = append(samples, w)
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-ipmiStopCh:
|
|
||||||
if len(samples) > 0 {
|
|
||||||
var sum float64
|
|
||||||
for _, w := range samples {
|
|
||||||
sum += w
|
|
||||||
}
|
|
||||||
ipmiResultCh <- sum / float64(len(samples))
|
|
||||||
}
|
|
||||||
return
|
|
||||||
case <-ticker.C:
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
close(ipmiStopCh)
|
|
||||||
if loadedW, ok := <-ipmiResultCh; ok {
|
|
||||||
*serverLoadedWSum += loadedW
|
|
||||||
(*serverLoadedSamples)++
|
|
||||||
*serverLoadedOK = true
|
|
||||||
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
|
||||||
}
|
|
||||||
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
|||||||
return cl
|
return cl
|
||||||
}
|
}
|
||||||
|
|
||||||
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
|
// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
|
||||||
// actively watches throttle counters. seedLimits, when provided, are treated as
|
// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
|
||||||
// the starting point for this calibration pass rather than as immutable fixed
|
// the starting point for this calibration pass rather than as immutable fixed
|
||||||
// limits. This matters during cumulative ramp-up: once an additional GPU is
|
// limits. This matters during cumulative ramp-up: once an additional GPU is
|
||||||
// introduced, every already-active GPU must be revalidated under the new
|
// introduced, every already-active GPU must be revalidated under the new
|
||||||
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
|
|||||||
// doubling each retry until it would exceed the cap, at which point the
|
// doubling each retry until it would exceed the cap, at which point the
|
||||||
// next busy response fails the calibration immediately.
|
// next busy response fails the calibration immediately.
|
||||||
const dcgmResourceBusyMaxDelaySec = 300
|
const dcgmResourceBusyMaxDelaySec = 300
|
||||||
|
engine := benchmarkPowerEngine()
|
||||||
|
engineLabel := benchmarkPowerEngineLabel(engine)
|
||||||
|
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if engine == BenchmarkPowerEngineTargetedPower {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
||||||
|
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
||||||
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||||
for _, p := range killed {
|
for _, p := range killed {
|
||||||
@@ -3206,7 +3197,7 @@ calibDone:
|
|||||||
sharedAttempt++
|
sharedAttempt++
|
||||||
for _, s := range active {
|
for _, s := range active {
|
||||||
s.calib.Attempts++
|
s.calib.Attempts++
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Snapshot throttle counters for all active GPUs before the run.
|
// Snapshot throttle counters for all active GPUs before the run.
|
||||||
@@ -3215,14 +3206,22 @@ calibDone:
|
|||||||
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run targeted_power for ALL gpuIndices simultaneously so every card
|
// Run the selected power-fit load for ALL gpuIndices simultaneously so every card
|
||||||
// is under load during calibration — this reflects real server thermals.
|
// is under load during calibration — this reflects real server thermals.
|
||||||
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
||||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
for _, s := range active {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
|
||||||
|
s.converged = true
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
|
||||||
|
break calibDone
|
||||||
|
}
|
||||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||||
doneCh := make(chan sharedAttemptResult, 1)
|
doneCh := make(chan sharedAttemptResult, 1)
|
||||||
go func() {
|
go func() {
|
||||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
|
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
||||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -3245,8 +3244,8 @@ calibDone:
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Record throttle but do NOT cancel — let dcgmi finish so
|
// Record throttle but do NOT cancel — let the load command finish so
|
||||||
// nv-hostengine releases the slot cleanly before the next attempt.
|
// runtime resources release cleanly before the next attempt.
|
||||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||||||
throttleReasons[s.idx] = reason
|
throttleReasons[s.idx] = reason
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||||||
@@ -3359,9 +3358,9 @@ calibDone:
|
|||||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||||||
case ar.err != nil:
|
case ar.err != nil:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
|
||||||
default:
|
default:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3384,7 +3383,7 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3399,7 +3398,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample IPMI idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
var serverIdleOK bool
|
var serverIdleOK bool
|
||||||
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
sdrIdle := sampleIPMISDRPowerSensors()
|
sdrIdle := sampleIPMISDRPowerSensors()
|
||||||
psuBefore := psuStatusSnapshot()
|
psuBefore := psuStatusSnapshot()
|
||||||
@@ -4141,20 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
singlePowerStopCh := make(chan struct{})
|
||||||
ipmiSingleDone := make(chan float64, 1)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiSingleDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
|
||||||
ipmiSingleDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
ipmiSingleCancel()
|
close(singlePowerStopCh)
|
||||||
if w, ok := <-ipmiSingleDone; ok {
|
sdrSingle := sampleIPMISDRPowerSensors()
|
||||||
singleIPMILoadedW[idx] = w
|
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
|
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||||
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
||||||
|
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
||||||
}
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
@@ -4228,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
|
||||||
}
|
}
|
||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
if gpu.Derated {
|
if gpu.Derated {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
}
|
}
|
||||||
if gpu.CoolingWarning != "" {
|
if gpu.CoolingWarning != "" {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf(
|
result.Findings = append(result.Findings, fmt.Sprintf(
|
||||||
@@ -4249,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Phase 2: cumulative thermal ramp.
|
// Phase 2: cumulative thermal ramp.
|
||||||
// Each step introduces one new GPU into an environment where all previously
|
// Each step introduces one new GPU into an environment where all previously
|
||||||
// calibrated GPUs are already running at their fixed stable limits. The new
|
// calibrated GPUs are already running at their fixed stable limits. The new
|
||||||
// GPU's stable TDP is searched via binary search (targeted_power) under real
|
// GPU's stable TDP is searched via binary search under real
|
||||||
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
||||||
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
||||||
// power in the final full-system thermal state.
|
// power in the final full-system thermal state.
|
||||||
@@ -4262,6 +4259,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||||
var serverLoadedW float64
|
var serverLoadedW float64
|
||||||
var serverLoadedOK bool
|
var serverLoadedOK bool
|
||||||
|
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
|
||||||
|
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
|
||||||
|
// after the test when GPUs have already returned to idle.
|
||||||
|
var sdrLastStep sdrPowerSnapshot
|
||||||
|
|
||||||
// Step 1: reuse single-card calibration result directly.
|
// Step 1: reuse single-card calibration result directly.
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -4284,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
} else if firstCalib.Derated {
|
} else if firstCalib.Derated {
|
||||||
ramp.Status = "PARTIAL"
|
ramp.Status = "PARTIAL"
|
||||||
@@ -4330,23 +4331,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
stepPowerStopCh := make(chan struct{})
|
||||||
ipmiStepDone := make(chan float64, 1)
|
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiStepDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
|
||||||
ipmiStepDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
ipmiStepCancel()
|
close(stepPowerStopCh)
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
var stepIPMIOK bool
|
var stepIPMIOK bool
|
||||||
if w, ok := <-ipmiStepDone; ok {
|
if samples := <-stepPowerCh; len(samples) > 0 {
|
||||||
stepIPMILoadedW = w
|
stepIPMILoadedW = benchmarkMean(samples)
|
||||||
stepIPMIOK = true
|
stepIPMIOK = true
|
||||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
|
||||||
}
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||||
@@ -4382,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes,
|
ramp.Notes = append(ramp.Notes,
|
||||||
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
|
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4410,20 +4404,33 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
||||||
|
// source when SDR PSU sensors are available (more accurate than DCMI on
|
||||||
|
// servers where DCMI covers only a subset of installed PSUs).
|
||||||
|
sdrStep := sampleIPMISDRPowerSensors()
|
||||||
|
if len(sdrStep.PSUSlots) > 0 {
|
||||||
|
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||||
|
}
|
||||||
|
|
||||||
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||||
ramp.ServerLoadedW = stepIPMILoadedW
|
ramp.ServerLoadedW = stepIPMILoadedW
|
||||||
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
|
||||||
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||||
if step == len(result.RecommendedSlotOrder) {
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
serverLoadedW = stepIPMILoadedW
|
serverLoadedW = stepIPMILoadedW
|
||||||
serverLoadedOK = true
|
serverLoadedOK = true
|
||||||
|
sdrLastStep = sdrStep
|
||||||
|
}
|
||||||
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
||||||
|
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||||
|
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
|
||||||
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
|
serverLoadedW = sdrStep.PSUInW
|
||||||
|
serverLoadedOK = true
|
||||||
|
sdrLastStep = sdrStep
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Per-step PSU slot snapshot.
|
|
||||||
sdrStep := sampleIPMISDRPowerSensors()
|
|
||||||
if len(sdrStep.PSUSlots) > 0 {
|
|
||||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fan state at end of ramp step.
|
// Fan state at end of ramp step.
|
||||||
@@ -4480,11 +4487,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
gpuActualSumW = result.PlatformMaxTDPW
|
gpuActualSumW = result.PlatformMaxTDPW
|
||||||
}
|
}
|
||||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
|
||||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||||
if result.ServerPower != nil {
|
if result.ServerPower != nil {
|
||||||
sdrLoaded := sampleIPMISDRPowerSensors()
|
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
|
||||||
|
// than re-sampling here, which would capture post-test idle state.
|
||||||
|
sdrLoaded := sdrLastStep
|
||||||
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
||||||
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
||||||
result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW
|
result.ServerPower.PSUOutputIdleW = sdrIdle.PSUOutW
|
||||||
|
|||||||
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tunables for the benchmark power-source autotune feature.
const (
	// benchmarkPowerAutotuneVersion is the schema version written into an
	// autotune config that does not already carry a positive version.
	benchmarkPowerAutotuneVersion = 1

	// Phase lengths for the autotune run.
	// NOTE(review): presumably the idle and loaded observation windows in
	// seconds; their use is outside this part of the file, so confirm there.
	benchmarkPowerAutotuneIdleSec = 60
	benchmarkPowerAutotuneLoadSec = 90

	// benchmarkPowerAutotuneSampleInterval is the default period, in seconds,
	// between two power readings taken by the background sampler.
	benchmarkPowerAutotuneSampleInterval = 3

	// defaultBenchmarkPowerSourceConfigPath is where the autotune config lives
	// when no base directory is supplied.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
// A file that does not exist counts as already reset and is not an error.
// A blank path is rejected; any other removal failure is returned as is.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	err := os.Remove(path)
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||||
|
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||||
|
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||||
|
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: true,
|
||||||
|
SelectedSource: selected,
|
||||||
|
EffectiveSource: selected,
|
||||||
|
Mode: "autotuned",
|
||||||
|
Reason: strings.TrimSpace(cfg.Reason),
|
||||||
|
ConfiguredAt: cfg.UpdatedAt,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sources := sampleBenchmarkPowerSources()
|
||||||
|
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return <-doneCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []float64
|
||||||
|
record := func() {
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
// benchmarkPowerAutotuneSample is one telemetry row captured during an
// autotune idle or load window.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time since the start of the sampling window, in seconds.
	ElapsedSec float64

	// GPUAvgUsagePct is the mean utilisation across the sampled GPUs; it stays
	// 0 when GPU metrics could not be read for this row.
	GPUAvgUsagePct float64

	// CPUUsagePct is the CPU load reported for this row.
	CPUUsagePct float64

	// GPUSumPowerW is the sum of the per-GPU reported power draw, in watts.
	GPUSumPowerW float64

	// Sources maps a power source name to its reading in watts; sources that
	// produced no usable reading are absent.
	Sources map[string]float64
}
|
||||||
|
|
||||||
|
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var out []benchmarkPowerAutotuneSample
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
start := time.Now()
|
||||||
|
for {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
row := benchmarkPowerAutotuneSample{
|
||||||
|
ElapsedSec: time.Since(start).Seconds(),
|
||||||
|
CPUUsagePct: sampleCPULoadPct(),
|
||||||
|
Sources: sampleBenchmarkPowerSources(),
|
||||||
|
}
|
||||||
|
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||||
|
var usageSum float64
|
||||||
|
for _, gpu := range gpuRows {
|
||||||
|
row.GPUSumPowerW += gpu.PowerW
|
||||||
|
usageSum += gpu.UsagePct
|
||||||
|
}
|
||||||
|
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||||
|
}
|
||||||
|
out = append(out, row)
|
||||||
|
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return out
|
||||||
|
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||||
|
result := &BenchmarkPowerAutotuneValidation{}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
result.Reason = "no idle telemetry samples collected"
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
if sample.CPUUsagePct > 0 {
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.GPUSamples = len(gpuUsage)
|
||||||
|
result.CPUSamples = len(cpuUsage)
|
||||||
|
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||||
|
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||||
|
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||||
|
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||||
|
switch {
|
||||||
|
case result.GPUAvgUsagePct > 5:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||||
|
case result.GPUP95UsagePct > 10:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||||
|
case result.CPUAvgUsagePct > 20:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||||
|
case result.CPUP95UsagePct > 35:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||||
|
default:
|
||||||
|
result.Valid = true
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||||
|
idleBySource := map[string][]float64{}
|
||||||
|
loadBySource := map[string][]float64{}
|
||||||
|
var idleGPU []float64
|
||||||
|
var loadGPU []float64
|
||||||
|
for _, sample := range idle {
|
||||||
|
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
idleBySource[source] = append(idleBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, sample := range load {
|
||||||
|
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
loadBySource[source] = append(loadBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
idleGPUAvg := benchmarkMean(idleGPU)
|
||||||
|
loadGPUAvg := benchmarkMean(loadGPU)
|
||||||
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||||
|
if gpuDelta <= 0 {
|
||||||
|
gpuDelta = loadGPUAvg
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||||
|
}
|
||||||
|
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if candidate.Available && candidate.DeltaW > 0 {
|
||||||
|
available = append(available, candidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(available) == 0 {
|
||||||
|
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||||
|
}
|
||||||
|
sort.Slice(available, func(i, j int) bool {
|
||||||
|
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||||
|
if available[i].Source != available[j].Source {
|
||||||
|
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if available[i].RelativeError != available[j].RelativeError {
|
||||||
|
return available[i].RelativeError < available[j].RelativeError
|
||||||
|
}
|
||||||
|
return available[i].Samples > available[j].Samples
|
||||||
|
})
|
||||||
|
selected := available[0]
|
||||||
|
for idx := range candidates {
|
||||||
|
if candidates[idx].Source == selected.Source {
|
||||||
|
candidates[idx].Selected = true
|
||||||
|
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||||
|
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||||
|
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||||
|
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||||
|
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||||
|
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||||
|
if result.SelectedSource != "" {
|
||||||
|
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||||
|
}
|
||||||
|
if result.IdleValidation != nil {
|
||||||
|
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||||
|
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||||
|
if result.IdleValidation.Reason != "" {
|
||||||
|
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, candidate := range result.Candidates {
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||||
|
if candidate.Available {
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||||
|
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||||
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||||
|
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||||
|
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||||
|
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||||
|
if result.SelectedSource != "" {
|
||||||
|
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
if result.IdleValidation != nil {
|
||||||
|
b.WriteString("## Idle Validation\n\n")
|
||||||
|
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||||
|
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||||
|
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||||
|
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||||
|
if result.IdleValidation.Reason != "" {
|
||||||
|
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
if len(result.Candidates) > 0 {
|
||||||
|
b.WriteString("## Candidates\n\n")
|
||||||
|
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||||
|
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||||
|
for _, candidate := range result.Candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
selected := "no"
|
||||||
|
if candidate.Selected {
|
||||||
|
selected = "yes"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||||
|
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
for _, note := range result.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||||
|
allDevices := joinIndexList(gpuIndices)
|
||||||
|
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||||
|
case "power-fit", "power", "nvidia-bench-power":
|
||||||
|
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||||
|
if err == nil {
|
||||||
|
return cmd, "power-fit"
|
||||||
|
}
|
||||||
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||||
|
default:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
if sizeMB > 0 {
|
||||||
|
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||||
|
}
|
||||||
|
return cmd, "performance"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = "/var/log/bee-bench/autotune"
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||||
|
}
|
||||||
|
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
||||||
|
}
|
||||||
|
ts := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
|
}
|
||||||
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
|
hostname, _ := os.Hostname()
|
||||||
|
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
||||||
|
result := BenchmarkPowerAutotuneResult{
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
Hostname: hostname,
|
||||||
|
ServerModel: readServerModel(),
|
||||||
|
BenchmarkKind: normalizedKind,
|
||||||
|
Profile: opts.Profile,
|
||||||
|
Status: "FAILED",
|
||||||
|
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||||
|
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||||
|
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||||
|
}
|
||||||
|
|
||||||
|
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
||||||
|
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
||||||
|
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
||||||
|
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
||||||
|
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
||||||
|
if result.IdleValidation != nil {
|
||||||
|
result.IdleValidationError = result.IdleValidation.Reason
|
||||||
|
logFunc(result.IdleValidation.Reason)
|
||||||
|
}
|
||||||
|
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
||||||
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
||||||
|
}
|
||||||
|
|
||||||
|
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
||||||
|
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
||||||
|
go func() {
|
||||||
|
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
||||||
|
}()
|
||||||
|
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
||||||
|
loadSamples := <-loadSamplesCh
|
||||||
|
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
||||||
|
if runErr != nil {
|
||||||
|
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
||||||
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
||||||
|
result.Candidates = candidates
|
||||||
|
result.GPUPowerIdleW = idleGPUAvg
|
||||||
|
result.GPUPowerLoadW = loadGPUAvg
|
||||||
|
if chooseErr != nil {
|
||||||
|
result.Notes = append(result.Notes, chooseErr.Error())
|
||||||
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runDir, chooseErr
|
||||||
|
}
|
||||||
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||||
|
if gpuDelta <= 0 {
|
||||||
|
gpuDelta = loadGPUAvg
|
||||||
|
}
|
||||||
|
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
||||||
|
result.SelectedSource = selectedSource
|
||||||
|
result.Status = "OK"
|
||||||
|
var confidence float64
|
||||||
|
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if candidate.Selected {
|
||||||
|
confidence = candidate.Confidence
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
selectionReason = candidate.SelectionNotes
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cfg := BenchmarkPowerAutotuneConfig{
|
||||||
|
Version: benchmarkPowerAutotuneVersion,
|
||||||
|
UpdatedAt: time.Now().UTC(),
|
||||||
|
SelectedSource: selectedSource,
|
||||||
|
BenchmarkKind: normalizedKind,
|
||||||
|
Profile: opts.Profile,
|
||||||
|
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||||
|
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||||
|
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||||
|
Confidence: confidence,
|
||||||
|
Reason: selectionReason,
|
||||||
|
}
|
||||||
|
result.Config = &cfg
|
||||||
|
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
||||||
|
result.Status = "FAILED"
|
||||||
|
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
||||||
|
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
||||||
|
return "", writeErr
|
||||||
|
}
|
||||||
|
return runDir, err
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
||||||
|
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
||||||
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runDir, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||||
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshal autotune result: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune result.json: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune report.md: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// Blank reference to package exec; presumably kept so the os/exec import
// still compiles when nothing else in this file uses it — confirm before
// removing.
var _ = exec.ErrNotFound
|
||||||
@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
// ── Server Power ───────────────────────────────────────────────────────────
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server Power (IPMI)\n\n")
|
title := "## Server Power\n\n"
|
||||||
|
if sp.Source != "" {
|
||||||
|
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||||
|
}
|
||||||
|
b.WriteString(title)
|
||||||
if !sp.Available {
|
if !sp.Available {
|
||||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
b.WriteString("Server power measurement unavailable.\n\n")
|
||||||
} else {
|
} else {
|
||||||
spRows := [][]string{
|
spRows := [][]string{
|
||||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||||
|
|||||||
@@ -43,6 +43,11 @@ const (
|
|||||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Identifiers for the load engine used by the power benchmark.
// NOTE(review): the names suggest the DCGM proftester load and the DCGM
// "targeted_power" diagnostic respectively — confirm against the callers.
const (
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	BenchmarkPowerEngineTargetedPower  = "targeted_power"
)
|
||||||
|
|
||||||
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||||
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||||
// re-measure from actual task logs and update the constants here.
|
// re-measure from actual task logs and update the constants here.
|
||||||
@@ -61,7 +66,7 @@ const (
|
|||||||
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||||
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||||
|
|
||||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
|
||||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
@@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
|
ServerPowerSource string
|
||||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
RampTotal int // total number of ramp-up steps in this run
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
RampRunID string // shared identifier across all steps of the same ramp-up run
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Names of the server power sources that autotune can choose between. They
// are used as map keys for sampled readings and as the stored selection.
const (
	// BenchmarkPowerSourceDCMI is the IPMI server power reading.
	BenchmarkPowerSourceDCMI = "dcmi"
	// BenchmarkPowerSourceSDRPSUInput is the PSU input power taken from the
	// sensor data records.
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
|
||||||
|
|
||||||
|
// BenchmarkPowerAutotuneConfig is the persisted outcome of a power source
// autotune run: which source was selected, under which run parameters, and
// why.
type BenchmarkPowerAutotuneConfig struct {
	// Version is the config format version; UpdatedAt is when the selection
	// was made.
	Version   int       `json:"version"`
	UpdatedAt time.Time `json:"updated_at"`

	// SelectedSource is the chosen power source name.
	SelectedSource string `json:"selected_source"`

	// BenchmarkKind and Profile describe the run that produced the selection.
	BenchmarkKind string `json:"benchmark_kind,omitempty"`
	Profile       string `json:"profile,omitempty"`

	// Sampling windows and interval used by the run, in seconds.
	IdleDurationSec   int `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int `json:"sample_interval_sec,omitempty"`

	// Confidence is the selected candidate's confidence; Reason is the
	// human-readable explanation of the selection.
	Confidence float64 `json:"confidence,omitempty"`
	Reason     string  `json:"reason,omitempty"`
}
|
||||||
|
|
||||||
|
// SystemPowerSourceDecision describes which server power source is in effect
// and how that decision was reached.
type SystemPowerSourceDecision struct {
	// Configured reports whether a source selection has been configured.
	// NOTE(review): presumably true when a saved autotune config exists —
	// confirm against the code that fills this struct.
	Configured bool `json:"configured"`

	// SelectedSource is the configured source; EffectiveSource is the one
	// actually used.
	SelectedSource  string `json:"selected_source,omitempty"`
	EffectiveSource string `json:"effective_source,omitempty"`

	// Mode is one of: autotuned, fallback, degraded.
	Mode   string `json:"mode,omitempty"`
	Reason string `json:"reason,omitempty"`

	// ConfiguredAt is always serialised: encoding/json's omitempty never
	// treats a struct value such as time.Time as empty, so a zero time is
	// emitted as "0001-01-01T00:00:00Z".
	ConfiguredAt time.Time `json:"configured_at,omitempty"`
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneResult struct {
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||||
|
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||||
|
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||||
|
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||||
|
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkPowerAutotuneValidation is the outcome of the idle-window check
// that runs before the load stage.
type BenchmarkPowerAutotuneValidation struct {
	// Valid is true when no idle limit was exceeded.
	Valid bool `json:"valid"`

	// Utilisation statistics over the idle window, in percent.
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`

	// Number of readings behind the GPU and CPU statistics.
	GPUSamples int `json:"gpu_samples,omitempty"`
	CPUSamples int `json:"cpu_samples,omitempty"`

	// Reason explains why validation failed; empty when Valid is true.
	Reason string `json:"reason,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkPowerAutotuneCandidate describes one power source that was
// considered during autotune. Source and Available are always serialised;
// the remaining fields are omitted from the JSON when zero.
type BenchmarkPowerAutotuneCandidate struct {
	// Source names the candidate.
	Source string `json:"source"`

	// Averages for the idle and load phases and their difference.
	// NOTE(review): presumably watts, per the W suffix — confirm.
	IdleAvgW float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW float64 `json:"load_avg_w,omitempty"`
	DeltaW   float64 `json:"delta_w,omitempty"`

	// Samples is the number of readings behind the averages.
	Samples int `json:"samples,omitempty"`

	// Scoring inputs used when ranking candidates.
	RelativeError float64 `json:"relative_error,omitempty"`
	Confidence    float64 `json:"confidence,omitempty"`

	// Selected marks the chosen candidate; Available records whether the
	// source could be used at all.
	Selected  bool `json:"selected,omitempty"`
	Available bool `json:"available"`

	// SelectionNotes is a free-form remark about the selection outcome.
	SelectionNotes string `json:"selection_notes,omitempty"`
}
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
@@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct {
|
|||||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||||
type BenchmarkServerPower struct {
|
type BenchmarkServerPower struct {
|
||||||
Available bool `json:"available"`
|
Available bool `json:"available"`
|
||||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
Source string `json:"source,omitempty"`
|
||||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
Mode string `json:"mode,omitempty"`
|
||||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
Reason string `json:"reason,omitempty"`
|
||||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||||
|
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||||
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
|
|
||||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const installToRAMDir = "/dev/shm/bee-live"
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
|
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
return s.LiveMediaRAMState().InRAM
|
return s.LiveMediaRAMState().InRAM
|
||||||
@@ -319,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
defer out.Close()
|
defer out.Close()
|
||||||
total := fi.Size()
|
total := fi.Size()
|
||||||
var copied int64
|
var copied int64
|
||||||
|
var lastLogged int64
|
||||||
buf := make([]byte, 4*1024*1024)
|
buf := make([]byte, 4*1024*1024)
|
||||||
for {
|
for {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -330,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return werr
|
return werr
|
||||||
}
|
}
|
||||||
copied += int64(n)
|
copied += int64(n)
|
||||||
if logFunc != nil && total > 0 {
|
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||||
|
lastLogged = copied
|
||||||
pct := int(float64(copied) / float64(total) * 100)
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
}
|
}
|
||||||
@@ -345,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return out.Sync()
|
return out.Sync()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||||
|
if total <= 0 || copied <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if copied >= total {
|
||||||
|
return copied > lastLogged
|
||||||
|
}
|
||||||
|
if copied < copyProgressLogStep {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return copied-lastLogged >= copyProgressLogStep
|
||||||
|
}
|
||||||
|
|
||||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
|
|||||||
@@ -101,3 +101,26 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestShouldLogCopyProgress(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
total := int64(250 * 1024 * 1024)
|
||||||
|
step := int64(100 * 1024 * 1024)
|
||||||
|
|
||||||
|
if shouldLogCopyProgress(step-1, total, 0) {
|
||||||
|
t.Fatal("progress logged too early")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(step, total, 0) {
|
||||||
|
t.Fatal("expected log at first 100MB boundary")
|
||||||
|
}
|
||||||
|
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||||
|
t.Fatal("progress logged again before next 100MB")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(2*step, total, step) {
|
||||||
|
t.Fatal("expected log at second 100MB boundary")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||||
|
t.Fatal("expected final completion log")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -14,14 +16,17 @@ import (
|
|||||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
// collected for the web UI metrics page.
|
// collected for the web UI metrics page.
|
||||||
type LiveMetricSample struct {
|
type LiveMetricSample struct {
|
||||||
Timestamp time.Time `json:"ts"`
|
Timestamp time.Time `json:"ts"`
|
||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
PSUs []PSUReading `json:"psus,omitempty"`
|
PowerSource string `json:"power_source,omitempty"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
PowerMode string `json:"power_mode,omitempty"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
PowerReason string `json:"power_reason,omitempty"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// PSUReading is a per-slot power supply input power reading.
|
// PSUReading is a per-slot power supply input power reading.
|
||||||
@@ -62,12 +67,18 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
|
||||||
s.PowerW = sampleSystemPower()
|
|
||||||
|
|
||||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
s.PSUs = samplePSUPower()
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
|
// System power: use the global autotune-selected source when configured,
|
||||||
|
// otherwise fall back to the historical heuristic and mark the mode.
|
||||||
|
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||||
|
s.PowerW = powerW
|
||||||
|
s.PowerSource = decision.EffectiveSource
|
||||||
|
s.PowerMode = decision.Mode
|
||||||
|
s.PowerReason = decision.Reason
|
||||||
|
}
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
|
|
||||||
@@ -339,63 +350,44 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||||
// sensors (entity ID "10.N") that report a value in Watts.
|
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
func samplePSUPower() []PSUReading {
|
func samplePSUPower() []PSUReading {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||||
if err != nil || len(out) == 0 {
|
if err != nil || len(out) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
slots := collector.PSUSlotsFromSDR(string(out))
|
||||||
type entry struct {
|
if len(slots) == 0 {
|
||||||
name string
|
|
||||||
powerW float64
|
|
||||||
}
|
|
||||||
bySlot := map[int]entry{}
|
|
||||||
for _, line := range strings.Split(string(out), "\n") {
|
|
||||||
parts := strings.Split(line, "|")
|
|
||||||
if len(parts) < 5 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
|
||||||
if !strings.HasPrefix(entityID, "10.") {
|
|
||||||
continue // not a Power Supply entity
|
|
||||||
}
|
|
||||||
slotStr := strings.TrimPrefix(entityID, "10.")
|
|
||||||
slot, err := strconv.Atoi(slotStr)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
|
||||||
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
valueFields := strings.Fields(valueField)
|
|
||||||
if len(valueFields) < 2 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
w, err := strconv.ParseFloat(valueFields[0], 64)
|
|
||||||
if err != nil || w <= 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
sensorName := strings.TrimSpace(parts[0])
|
|
||||||
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
|
||||||
bySlot[slot] = entry{name: sensorName, powerW: w}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(bySlot) == 0 {
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
slots := make([]int, 0, len(bySlot))
|
// Collect slot keys and sort for stable output.
|
||||||
for s := range bySlot {
|
keys := make([]int, 0, len(slots))
|
||||||
slots = append(slots, s)
|
for k := range slots {
|
||||||
|
n, err := strconv.Atoi(k)
|
||||||
|
if err == nil {
|
||||||
|
keys = append(keys, n)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
sort.Ints(slots)
|
sort.Ints(keys)
|
||||||
psus := make([]PSUReading, 0, len(slots))
|
psus := make([]PSUReading, 0, len(keys))
|
||||||
for _, s := range slots {
|
for _, k := range keys {
|
||||||
e := bySlot[s]
|
entry := slots[strconv.Itoa(k)]
|
||||||
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
// Prefer AC input power; fall back to DC output power.
|
||||||
|
var w float64
|
||||||
|
if entry.InputW != nil && *entry.InputW > 0 {
|
||||||
|
w = *entry.InputW
|
||||||
|
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||||
|
w = *entry.OutputW
|
||||||
|
}
|
||||||
|
if w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||||
|
}
|
||||||
|
if len(psus) == 0 {
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return psus
|
return psus
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -443,11 +443,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
|||||||
profCmd []string
|
profCmd []string
|
||||||
profEnv []string
|
profEnv []string
|
||||||
)
|
)
|
||||||
if staggerSec > 0 && len(selected) > 1 {
|
if len(selected) > 1 {
|
||||||
|
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||||
|
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||||
|
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||||
|
// of CUDA_VISIBLE_DEVICES.
|
||||||
|
stagger := staggerSec
|
||||||
|
if stagger < 0 {
|
||||||
|
stagger = 0
|
||||||
|
}
|
||||||
profCmd = []string{
|
profCmd = []string{
|
||||||
"bee-dcgmproftester-staggered",
|
"bee-dcgmproftester-staggered",
|
||||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
"--stagger-seconds", strconv.Itoa(stagger),
|
||||||
"--devices", joinIndexList(selected),
|
"--devices", joinIndexList(selected),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -43,17 +43,22 @@ type GPUStressMetric struct {
|
|||||||
|
|
||||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
type FanStressRow struct {
|
type FanStressRow struct {
|
||||||
TimestampUTC string
|
TimestampUTC string
|
||||||
ElapsedSec float64
|
ElapsedSec float64
|
||||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
GPUs []GPUStressMetric
|
GPUs []GPUStressMetric
|
||||||
Fans []FanReading
|
Fans []FanReading
|
||||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64
|
||||||
|
SysPowerSource string
|
||||||
|
SysPowerMode string
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedPowerReading struct {
|
type cachedPowerReading struct {
|
||||||
Value float64
|
Value float64
|
||||||
|
Source string
|
||||||
|
Mode string
|
||||||
|
Reason string
|
||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
|||||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
row.Fans, _ = sampleFanSpeeds()
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
row.SysPowerW = sampleSystemPower()
|
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||||
return row
|
return row
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
return max
|
return max
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||||
func sampleSystemPower() float64 {
|
// falling back to the historical heuristic before autotune or when degraded.
|
||||||
|
func sampleSystemPowerResolved() (float64, string, string) {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
current := 0.0
|
current, decision, err := SampleSystemPowerResolved("")
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
||||||
if err == nil {
|
|
||||||
current = parseDCMIPowerReading(string(out))
|
|
||||||
}
|
|
||||||
systemPowerCacheMu.Lock()
|
systemPowerCacheMu.Lock()
|
||||||
defer systemPowerCacheMu.Unlock()
|
defer systemPowerCacheMu.Unlock()
|
||||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
if err != nil {
|
||||||
|
current = 0
|
||||||
|
}
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||||
systemPowerCache = updated
|
systemPowerCache = updated
|
||||||
return value
|
return value, updated.Source, updated.Mode
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||||
if current > 0 {
|
if current > 0 {
|
||||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||||
return current, cache
|
return current, cache
|
||||||
}
|
}
|
||||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||||
if got != 480 {
|
if got != 480 {
|
||||||
t.Fatalf("got=%v want cached 480", got)
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
}
|
}
|
||||||
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
t.Fatalf("updated=%+v", updated)
|
t.Fatalf("updated=%+v", updated)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||||
if got != 530 {
|
if got != 530 {
|
||||||
t.Fatalf("got=%v want 530", got)
|
t.Fatalf("got=%v want 530", got)
|
||||||
}
|
}
|
||||||
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||||
if got != 0 {
|
if got != 0 {
|
||||||
t.Fatalf("expired cache returned %v want 0", got)
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityInstallToRAM
|
return taskPriorityInstallToRAM
|
||||||
case "audit":
|
case "audit":
|
||||||
return taskPriorityAudit
|
return taskPriorityAudit
|
||||||
case "nvidia-bench-perf", "nvidia-bench-power":
|
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||||
return taskPriorityBenchmark
|
return taskPriorityBenchmark
|
||||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||||
return taskPriorityBurn
|
return taskPriorityBurn
|
||||||
@@ -701,6 +701,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var body struct {
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind"`
|
||||||
|
SizeMB int `json:"size_mb"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
profile := strings.TrimSpace(body.Profile)
|
||||||
|
if profile == "" {
|
||||||
|
profile = "standard"
|
||||||
|
}
|
||||||
|
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||||
|
if benchmarkKind == "" {
|
||||||
|
benchmarkKind = "power-fit"
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("bee-bench-autotune"),
|
||||||
|
Name: taskName,
|
||||||
|
Target: "nvidia-bench-autotune",
|
||||||
|
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: now,
|
||||||
|
params: taskParams{
|
||||||
|
BenchmarkProfile: profile,
|
||||||
|
BenchmarkKind: benchmarkKind,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
DisplayName: taskName,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeTaskRunResponse(w, []*Task{t})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": false,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
writeJSON(w, map[string]any{
|
||||||
|
"configured": true,
|
||||||
|
"config": cfg,
|
||||||
|
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -195,6 +195,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-autotune" {
|
||||||
|
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||||
|
}
|
||||||
|
if task.params.BenchmarkKind != "power-fit" {
|
||||||
|
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func layoutHead(title string) string {
|
||||||
|
return `<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
<title>` + html.EscapeString(title) + `</title>
|
||||||
|
<style>
|
||||||
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
|
a{color:var(--accent);text-decoration:none}
|
||||||
|
/* Sidebar */
|
||||||
|
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||||
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
|
.nav{flex:1}
|
||||||
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
|
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
|
||||||
|
/* Content */
|
||||||
|
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
|
||||||
|
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
|
||||||
|
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
|
||||||
|
.content{padding:24px;flex:1}
|
||||||
|
/* Cards */
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||||
|
.card-head-actions{justify-content:space-between}
|
||||||
|
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
/* Buttons */
|
||||||
|
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||||
|
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
|
||||||
|
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
|
||||||
|
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
|
||||||
|
.btn-sm{padding:5px 10px;font-size:12px}
|
||||||
|
/* Tables */
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
|
||||||
|
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
|
||||||
|
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
|
||||||
|
tr:first-child td{border-top:0}
|
||||||
|
tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||||
|
/* Status badges */
|
||||||
|
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
|
||||||
|
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Component chips — one small square per device */
|
||||||
|
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||||
|
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||||
|
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Output terminal */
|
||||||
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
|
/* Forms */
|
||||||
|
.form-row{margin-bottom:14px}
|
||||||
|
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||||
|
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
|
||||||
|
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
|
||||||
|
/* Grid */
|
||||||
|
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||||
|
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||||
|
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||||
|
/* iframe viewer */
|
||||||
|
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||||
|
/* Alerts */
|
||||||
|
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
|
||||||
|
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
|
||||||
|
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
|
// layoutNav renders the sidebar markup: logo, build version, an optional
// NVIDIA GSP badge and the main navigation, with the entry whose id equals
// active marked as "active".
//
// A blank buildLabel is displayed as "dev". The badge is driven by the content
// of /run/bee-nvidia-mode; when the file cannot be read or holds any value
// other than "gsp-off" / "gsp-stuck", no badge is emitted.
func layoutNav(active string, buildLabel string) string {
	type navEntry struct{ id, label, href, onclick string }
	entries := []navEntry{
		{id: "dashboard", label: "Dashboard", href: "/"},
		{id: "audit", label: "Audit", href: "/audit"},
		{id: "validate", label: "Validate", href: "/validate"},
		{id: "burn", label: "Burn", href: "/burn"},
		{id: "benchmark", label: "Benchmark", href: "/benchmark"},
		{id: "tasks", label: "Tasks", href: "/tasks"},
		{id: "tools", label: "Tools", href: "/tools"},
	}

	// Only a whitespace-only label falls back to "dev"; anything else is
	// shown verbatim (HTML-escaped, not trimmed).
	version := buildLabel
	if strings.TrimSpace(version) == "" {
		version = "dev"
	}

	var out strings.Builder
	out.WriteString(`<aside class="sidebar">`)
	out.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
	out.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(version) + `</div>`)

	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
		switch strings.TrimSpace(string(raw)) {
		case "gsp-off":
			out.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
		case "gsp-stuck":
			out.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
		}
	}

	out.WriteString(`<nav class="nav">`)
	for _, entry := range entries {
		class := "nav-item"
		if entry.id == active {
			class += " active"
		}
		if entry.onclick == "" {
			fmt.Fprintf(&out, `<a class="%s" href="%s">%s</a>`, class, entry.href, entry.label)
			continue
		}
		fmt.Fprintf(&out, `<a class="%s" href="%s" onclick="%s">%s</a>`,
			class, entry.href, entry.onclick, entry.label)
	}
	out.WriteString(`</nav>`)
	out.WriteString(`</aside>`)
	return out.String()
}
|
||||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
|||||||
cpu_load_pct REAL,
|
cpu_load_pct REAL,
|
||||||
mem_load_pct REAL,
|
mem_load_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
power_source TEXT,
|
||||||
|
power_mode TEXT,
|
||||||
|
power_reason TEXT,
|
||||||
PRIMARY KEY (ts)
|
PRIMARY KEY (ts)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
defer func() { _ = tx.Rollback() }()
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
|
|||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadBetween returns samples in chronological order within the given time window.
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
|||||||
start, end = end, start
|
start, end = end, start
|
||||||
}
|
}
|
||||||
return m.loadSamples(
|
return m.loadSamples(
|
||||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
start.Unix(), end.Unix(),
|
start.Unix(), end.Unix(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
type sysRow struct {
|
type sysRow struct {
|
||||||
ts int64
|
ts int64
|
||||||
cpu, mem, pwr float64
|
cpu, mem, pwr float64
|
||||||
|
powerSource string
|
||||||
|
powerMode string
|
||||||
|
powerReason string
|
||||||
}
|
}
|
||||||
var sysRows []sysRow
|
var sysRows []sysRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var r sysRow
|
var r sysRow
|
||||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sysRows = append(sysRows, r)
|
sysRows = append(sysRows, r)
|
||||||
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
for i, r := range sysRows {
|
for i, r := range sysRows {
|
||||||
s := platform.LiveMetricSample{
|
s := platform.LiveMetricSample{
|
||||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
CPULoadPct: r.cpu,
|
CPULoadPct: r.cpu,
|
||||||
MemLoadPct: r.mem,
|
MemLoadPct: r.mem,
|
||||||
PowerW: r.pwr,
|
PowerW: r.pwr,
|
||||||
|
PowerSource: r.powerSource,
|
||||||
|
PowerMode: r.powerMode,
|
||||||
|
PowerReason: r.powerReason,
|
||||||
}
|
}
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
|||||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// benchmarkHistoryRun is one saved benchmark result reduced to the values the
// results table displays.
type benchmarkHistoryRun struct {
	// generatedAt is the result's generation timestamp; history is ordered by it.
	generatedAt time.Time
	// displayTime is generatedAt pre-formatted for the "Time" column.
	displayTime string
	// gpuScores maps a GPU index to the score shown in that GPU's column.
	gpuScores map[int]float64
	// gpuStatuses maps a GPU index to its status string (e.g. "OK", "FAILED").
	gpuStatuses map[int]string
	// overallStatus is the run-level status; an empty value is rendered as "OK".
	overallStatus string
}
|
||||||
|
|
||||||
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="grid2">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Benchmark Setup</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="form-row">
|
||||||
|
<label>Profile</label>
|
||||||
|
<select id="benchmark-profile">
|
||||||
|
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||||
|
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||||
|
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label>GPU Selection</label>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<label class="benchmark-cb-row">
|
||||||
|
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Sequential — one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||||
|
</label>
|
||||||
|
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||||
|
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||||
|
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||||
|
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||||
|
</div>
|
||||||
|
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||||
|
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||||
|
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||||
|
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Method Split</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||||
|
<table>
|
||||||
|
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||||
|
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||||
|
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||||
|
</table>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||||
|
|
||||||
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
|
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let benchmarkES = null;
|
||||||
|
function benchmarkTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
function benchmarkSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function benchmarkMode() {
|
||||||
|
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||||
|
return el ? el.value : 'sequential';
|
||||||
|
}
|
||||||
|
function benchmarkUpdateSelectionNote() {
|
||||||
|
const selected = benchmarkSelectedGPUIndices();
|
||||||
|
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||||
|
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||||
|
const note = document.getElementById('benchmark-selection-note');
|
||||||
|
if (!selected.length) {
|
||||||
|
perfBtn.disabled = true;
|
||||||
|
fitBtn.disabled = true;
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
perfBtn.disabled = false;
|
||||||
|
fitBtn.disabled = false;
|
||||||
|
const mode = benchmarkMode();
|
||||||
|
if (mode === 'ramp-up') {
|
||||||
|
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||||
|
} else if (mode === 'parallel') {
|
||||||
|
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||||
|
} else {
|
||||||
|
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function benchmarkRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('benchmark-gpu-list');
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="benchmark-gpu-row">'
|
||||||
|
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
benchmarkApplyMultiGPUState(gpus.length);
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||||
|
var multiValues = ['parallel', 'ramp-up'];
|
||||||
|
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||||
|
radios.forEach(function(el) {
|
||||||
|
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||||
|
if (gpuCount < 2 && isMulti) {
|
||||||
|
el.disabled = true;
|
||||||
|
if (el.checked) {
|
||||||
|
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||||
|
if (seq) seq.checked = true;
|
||||||
|
}
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '0.4';
|
||||||
|
} else {
|
||||||
|
el.disabled = false;
|
||||||
|
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkLoadGPUs() {
|
||||||
|
const status = document.getElementById('benchmark-run-status');
|
||||||
|
status.textContent = '';
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
benchmarkRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function benchmarkSelectAll() {
|
||||||
|
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkSelectNone() {
|
||||||
|
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function runNvidiaBenchmark(kind) {
|
||||||
|
const selected = benchmarkSelectedGPUIndices();
|
||||||
|
const status = document.getElementById('benchmark-run-status');
|
||||||
|
if (!selected.length) {
|
||||||
|
status.textContent = 'Select at least one GPU.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||||
|
const mode = benchmarkMode();
|
||||||
|
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||||
|
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||||
|
if (kind === 'power-fit' && mode === 'parallel') {
|
||||||
|
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const body = {
|
||||||
|
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||||
|
gpu_indices: selected,
|
||||||
|
run_nccl: kind === 'performance' && selected.length > 1,
|
||||||
|
parallel_gpus: parallelGPUs,
|
||||||
|
ramp_up: rampUp,
|
||||||
|
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||||
|
};
|
||||||
|
document.getElementById('benchmark-output').style.display = 'block';
|
||||||
|
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||||
|
const term = document.getElementById('benchmark-terminal');
|
||||||
|
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||||
|
status.textContent = 'Queueing...';
|
||||||
|
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||||
|
fetch(endpoint, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(body)
|
||||||
|
}).then(function(r) {
|
||||||
|
return r.json().then(function(payload) {
|
||||||
|
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||||
|
return payload;
|
||||||
|
});
|
||||||
|
}).then(function(d) {
|
||||||
|
const taskIds = benchmarkTaskIDs(d);
|
||||||
|
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||||
|
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||||
|
const streamNext = function(idx, failures) {
|
||||||
|
if (idx >= taskIds.length) {
|
||||||
|
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const taskId = taskIds[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
benchmarkES.addEventListener('done', function(e) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
if (e.data) failures += 1;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
const isLast = (idx + 1 >= taskIds.length);
|
||||||
|
streamNext(idx + 1, failures);
|
||||||
|
if (isLast) { benchmarkRefreshResults(); }
|
||||||
|
});
|
||||||
|
benchmarkES.onerror = function() {
|
||||||
|
if (benchmarkES) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
streamNext(idx + 1, failures + 1);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
streamNext(0, 0);
|
||||||
|
}).catch(function(err) {
|
||||||
|
status.textContent = 'Error.';
|
||||||
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function benchmarkRenderAutotuneStatus(payload) {
|
||||||
|
const el = document.getElementById('benchmark-autotune-status');
|
||||||
|
if (!el) return;
|
||||||
|
if (!payload || !payload.configured || !payload.config) {
|
||||||
|
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const cfg = payload.config || {};
|
||||||
|
const decision = payload.decision || {};
|
||||||
|
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||||||
|
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||||||
|
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||||||
|
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||||||
|
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||||||
|
}
|
||||||
|
function loadBenchmarkAutotuneStatus() {
|
||||||
|
fetch('/api/bee-bench/nvidia/autotune/status')
|
||||||
|
.then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
})
|
||||||
|
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||||||
|
.catch(function(err) {
|
||||||
|
const el = document.getElementById('benchmark-autotune-status');
|
||||||
|
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runBenchmarkAutotune() {
|
||||||
|
const selected = benchmarkSelectedGPUIndices();
|
||||||
|
const status = document.getElementById('benchmark-run-status');
|
||||||
|
const term = document.getElementById('benchmark-terminal');
|
||||||
|
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||||
|
document.getElementById('benchmark-output').style.display = 'block';
|
||||||
|
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||||||
|
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||||||
|
status.textContent = 'Queueing autotune...';
|
||||||
|
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify({
|
||||||
|
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||||
|
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||||||
|
gpu_indices: selected
|
||||||
|
})
|
||||||
|
}).then(function(r) {
|
||||||
|
return r.json().then(function(payload) {
|
||||||
|
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||||
|
return payload;
|
||||||
|
});
|
||||||
|
}).then(function(d) {
|
||||||
|
const taskIds = benchmarkTaskIDs(d);
|
||||||
|
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||||||
|
const taskId = taskIds[0];
|
||||||
|
status.textContent = 'Autotune queued: ' + taskId;
|
||||||
|
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
benchmarkES.addEventListener('done', function(e) {
|
||||||
|
if (benchmarkES) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
}
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||||||
|
loadBenchmarkAutotuneStatus();
|
||||||
|
});
|
||||||
|
}).catch(function(err) {
|
||||||
|
status.textContent = 'Autotune error.';
|
||||||
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
benchmarkLoadGPUs();
|
||||||
|
loadBenchmarkAutotuneStatus();
|
||||||
|
function benchmarkRefreshResults() {
|
||||||
|
fetch('/api/benchmark/results')
|
||||||
|
.then(function(r) { return r.text(); })
|
||||||
|
.then(function(html) {
|
||||||
|
const el = document.getElementById('benchmark-results-section');
|
||||||
|
if (el) el.innerHTML = html;
|
||||||
|
})
|
||||||
|
.catch(function() {});
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved performance benchmark runs yet.",
|
||||||
|
maxIdx,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||||
|
if strings.TrimSpace(description) != "" {
|
||||||
|
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<div style="overflow-x:auto">`)
|
||||||
|
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||||
|
for i := 0; i <= maxGPUIndex; i++ {
|
||||||
|
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
overallColor := "var(--ok)"
|
||||||
|
overallLabel := run.overallStatus
|
||||||
|
if overallLabel == "" {
|
||||||
|
overallLabel = "OK"
|
||||||
|
}
|
||||||
|
if overallLabel == "FAILED" {
|
||||||
|
overallColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if overallLabel != "OK" {
|
||||||
|
overallColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||||
|
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||||
|
score, ok := run.gpuScores[idx]
|
||||||
|
if !ok {
|
||||||
|
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuStatus := run.gpuStatuses[idx]
|
||||||
|
scoreColor := ""
|
||||||
|
switch gpuStatus {
|
||||||
|
case "FAILED":
|
||||||
|
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||||
|
case "WARNING", "PARTIAL":
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
case "", "OK":
|
||||||
|
default:
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
}
|
||||||
|
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||||
|
baseDir := app.DefaultBeeBenchPerfDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return -1, nil
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||||
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
|
maxGPUIndex := -1
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var result platform.NvidiaBenchmarkResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
run := benchmarkHistoryRun{
|
||||||
|
generatedAt: result.GeneratedAt,
|
||||||
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
|
if gpu.Index > maxGPUIndex {
|
||||||
|
maxGPUIndex = gpu.Index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runs = append(runs, run)
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
return maxGPUIndex, runs
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
finalStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
singleStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderBurn returns the static HTML, CSS and JavaScript fragment for the
// Burn page: the profile selector, the NVIDIA GPU selection card, the
// "GPU Max Load" and "Compute Stress" cards, the output terminal, and the
// client-side script that enqueues tasks and streams their output.
//
// Notes on the embedded script:
//   - Values taken from API responses (GPU index, name, memory size) and
//     error messages are passed through burnEscapeHTML before being placed
//     into innerHTML, so they cannot inject markup.
//   - runBurnTaskSet handles a failure exactly once, at the top of the
//     sequential chain; the error is written to the status element and the
//     terminal, and the returned promise then settles without rejecting.
//   - runPlatformStress reports enqueue failures in the burn-all-status
//     element instead of dropping them.
func renderBurn() string {
	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

<div class="card" style="margin-bottom:16px">
<div class="card-head">Burn Profile</div>
<div class="card-body burn-profile-body">
<div class="burn-profile-col">
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
</div>
</div>
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
</div>
</div>

<div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
</div>
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
<label class="cb-row">
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
<span>Sequential — selected GPUs one at a time</span>
</label>
<label class="cb-row" id="burn-parallel-label">
<input type="radio" name="burn-nvidia-mode" value="parallel">
<span>Parallel — all selected GPUs simultaneously</span>
</label>
<label class="cb-row" id="burn-ramp-label">
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
<span>Ramp-up — add one GPU at a time</span>
</label>
</div>
</div>
</div>

<div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card burn-card">
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
</div>
</div>

<div class="card burn-card">
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
</div>
</div>
</div>

<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>

<style>
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.burn-profile-col { min-width:0; }
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
.burn-grid { align-items:stretch; }
.burn-card { height:100%; display:flex; flex-direction:column; }
.burn-card-body { flex:1; display:flex; flex-direction:column; }
.card-head-actions { justify-content:space-between; }
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
</style>

<script>
let biES = null;
function burnEscapeHTML(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
function burnTaskIDs(payload) {
  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
  if (payload && payload.task_id) return [payload.task_id];
  return [];
}
function burnProfile() {
  const selected = document.querySelector('input[name="burn-profile"]:checked');
  return selected ? selected.value : 'smoke';
}
function burnSelectedGPUIndices() {
  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
    .filter(function(el) { return el.checked && !el.disabled; })
    .map(function(el) { return parseInt(el.value, 10); })
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
}
function burnNvidiaMode() {
  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  return el ? el.value : 'sequential';
}
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      var label = el.closest('label');
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      var label = el.closest('label');
      if (label) label.style.opacity = '';
    }
  });
}
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (!selected.length) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
    return;
  }
  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
}
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  root.innerHTML = gpus.map(function(gpu) {
    const idx = burnEscapeHTML(gpu.index);
    const mem = gpu.memory_mb > 0 ? ' · ' + burnEscapeHTML(gpu.memory_mb) + ' MiB' : '';
    return '<label class="burn-gpu-row">'
      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + idx + '" checked onchange="burnUpdateSelectionNote()">'
      + '<span><strong>GPU ' + idx + '</strong> — ' + burnEscapeHTML(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
function burnSelectAll() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
  burnUpdateSelectionNote();
}
function burnSelectNone() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
  burnUpdateSelectionNote();
}
function burnLoadGPUs() {
  fetch('/api/gpu/nvidia').then(function(r) {
    return r.json().then(function(body) {
      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
      return body;
    });
  }).then(function(gpus) {
    burnRenderGPUList(gpus);
  }).catch(function(err) {
    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEscapeHTML(err.message) + '</p>';
    burnUpdateSelectionNote();
  });
}
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
  if (useSelectedNvidia) {
    const selected = burnSelectedGPUIndices();
    if (!selected.length) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
    const bMode = burnNvidiaMode();
    if (bMode === 'ramp-up' && selected.length > 1) {
      body.stagger_gpu_start = true;
    } else if (bMode === 'parallel' && selected.length > 1) {
      body.parallel_gpus = true;
    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(body)
  }).then(function(r) {
    return r.json().then(function(payload) {
      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
      return payload;
    });
  });
}
function streamTask(taskId, label) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
  biES = new EventSource('/api/tasks/' + taskId + '/stream');
  biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
  biES.addEventListener('done', function(e) {
    biES.close();
    biES = null;
    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
    term.scrollTop = term.scrollHeight;
  });
}
function streamBurnTask(taskId, label, resetTerminal) {
  return streamBurnTaskSet([taskId], label, resetTerminal);
}
function streamBurnTaskSet(taskIds, label, resetTerminal) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  if (resetTerminal) {
    term.textContent = '';
  }
  if (!Array.isArray(taskIds) || !taskIds.length) {
    term.textContent += 'ERROR: no tasks queued.\n';
    return Promise.resolve({ok:false, error:'no tasks queued'});
  }
  const streamNext = function(idx, failures) {
    if (idx >= taskIds.length) {
      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
    }
    const taskId = taskIds[idx];
    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
    return new Promise(function(resolve) {
      biES = new EventSource('/api/tasks/' + taskId + '/stream');
      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
      biES.addEventListener('done', function(e) {
        biES.close();
        biES = null;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + (e.data ? 1 : 0));
      });
      biES.onerror = function() {
        if (biES) {
          biES.close();
          biES = null;
        }
        term.textContent += '\nERROR: stream disconnected.\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + 1);
      };
    }).then(function(nextFailures) {
      return streamNext(idx + 1, nextFailures);
    });
  };
  return streamNext(0, 0);
}
function runBurnTaskSet(tasks, statusElId) {
  const enabled = tasks.filter(function(t) {
    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
  });
  const status = statusElId ? document.getElementById(statusElId) : null;
  if (status) status.textContent = '';
  if (!enabled.length) {
    if (status) status.textContent = 'No tasks selected.';
    return;
  }
  const term = document.getElementById('bi-terminal');
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
  term.textContent = '';
  const runNext = function(idx) {
    if (idx >= enabled.length) {
      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
      return Promise.resolve();
    }
    const t = enabled[idx];
    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
      .then(function(d) {
        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function() {
        return runNext(idx + 1);
      });
  };
  // Handle a failure once for the whole chain so it is reported a single time.
  return runNext(0).catch(function(err) {
    if (status) status.textContent = 'Error: ' + err.message;
    document.getElementById('bi-output').style.display = 'block';
    term.textContent += 'ERROR: ' + err.message + '\n';
  });
}
function runPlatformStress() {
  const status = document.getElementById('burn-all-status');
  const comps = [];
  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
  const hasChecked = function(ids) {
    return ids.some(function(id) {
      const el = document.getElementById(id);
      return el && el.checked && !el.disabled;
    });
  };
  if (hasChecked(computeIDs)) comps.push('cpu');
  if (hasChecked(gpuIDs)) comps.push('gpu');
  if (!comps.length) {
    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
    return;
  }
  const extra = comps.length > 0 ? {platform_components: comps} : {};
  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
    streamTask(d.task_id, 'Platform Thermal Cycling');
  }).catch(function(err) {
    if (status) status.textContent = 'Error: ' + err.message;
  });
}
function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
  ];
  status.textContent = 'Enqueuing...';
  runBurnTaskSet(all, 'burn-all-status');
}
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
  };
  tools.forEach(function(t) {
    const spec = map[t.id];
    if (!spec) return;
    const cb = document.getElementById(spec.cb);
    const note = document.getElementById(spec.note);
    if (!cb) return;
    if (t.available) {
      cb.disabled = false;
    } else if (note) {
      note.textContent = '— ' + spec.reason;
    }
  });
}).catch(function() {});
burnLoadGPUs();
</script>`
}
|
||||||
434
audit/internal/webui/page_export_tools.go
Normal file
434
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,434 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func renderExport(exportDir string) string {
|
||||||
|
entries, _ := listExportFiles(exportDir)
|
||||||
|
var rows strings.Builder
|
||||||
|
for _, e := range entries {
|
||||||
|
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||||
|
url.QueryEscape(e), html.EscapeString(e)))
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||||
|
}
|
||||||
|
return `<div class="grid2">
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
` + renderSupportBundleInline() + `
|
||||||
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
|
</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + renderUSBExportCard()
|
||||||
|
}
|
||||||
|
|
||||||
|
// listExportFiles returns the sorted, exportDir-relative paths of every
// non-directory entry found by walking exportDir recursively.
//
// Surrounding whitespace in exportDir is ignored. The same trimmed root is
// used both for the walk and for computing relative paths, so a padded
// directory argument no longer fails inside filepath.Rel.
//
// A missing directory is not an error: the function returns whatever was
// collected (normally nothing) with a nil error. Any other walk error aborts
// the listing and is returned with a nil slice.
func listExportFiles(exportDir string) ([]string, error) {
	root := strings.TrimSpace(exportDir)

	var entries []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		entries = append(entries, rel)
		return nil
	})
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}

	sort.Strings(entries)
	return entries, nil
}
|
||||||
|
|
||||||
|
// renderSupportBundleInline returns the "Download Support Bundle" button, its
// status line, and the script defining window.supportBundleDownload.
//
// The script fetches /export/support.tar.gz as a blob, takes the download
// file name from the Content-Disposition header when one is present, and
// triggers the download through a temporary anchor element. While the request
// is in flight the button is disabled; afterwards its original label is
// restored (previously it was reset to a different arrow glyph than the one
// it started with).
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
  var btn = document.getElementById('support-bundle-btn');
  var status = document.getElementById('support-bundle-status');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Building...';
  status.textContent = 'Collecting logs and export data\u2026';
  status.style.color = 'var(--muted)';
  var filename = 'bee-support.tar.gz';
  fetch('/export/support.tar.gz')
    .then(function(r) {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      var cd = r.headers.get('Content-Disposition') || '';
      var m = cd.match(/filename="?([^";]+)"?/);
      if (m) filename = m[1];
      return r.blob();
    })
    .then(function(blob) {
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      status.textContent = 'Download started.';
      status.style.color = 'var(--ok-fg)';
    })
    .catch(function(e) {
      status.textContent = 'Error: ' + e.message;
      status.style.color = 'var(--crit-fg)';
    })
    .finally(function() {
      btn.disabled = false;
      btn.textContent = original;
    });
};
</script>`
}
|
||||||
|
|
||||||
|
func renderUSBExportCard() string {
|
||||||
|
return `<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">Export to USB
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderUSBExportInline returns the USB export widget: a status line, a
// container for the device table, a message line, and the script that drives
// them.
//
// The script exposes window.usbRefresh (reloads the device list from
// /api/export/usb) and window.usbExport (POSTs the selected target to
// /api/export/usb/<type>), and performs an initial refresh on load.
//
// Device, filesystem, size, label and model strings come from the attached
// hardware, so they are HTML-escaped before being placed into innerHTML. A
// non-array response is treated as an empty device list, and non-2xx
// responses are reported as errors.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
  // Escape a value before concatenating it into innerHTML.
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }
  function usbRefresh() {
    document.getElementById('usb-status').textContent = 'Scanning...';
    document.getElementById('usb-targets').innerHTML = '';
    document.getElementById('usb-msg').textContent = '';
    fetch('/api/export/usb').then(async r => {
      const d = await r.json();
      if (!r.ok) throw new Error((d && d.error) || ('HTTP ' + r.status));
      return d;
    }).then(data => {
      const targets = Array.isArray(data) ? data : [];
      window._usbTargets = targets;
      const st = document.getElementById('usb-status');
      const ct = document.getElementById('usb-targets');
      if (targets.length === 0) {
        st.textContent = 'No removable USB devices found.';
        return;
      }
      st.textContent = targets.length + ' device(s) found:';
      ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
        targets.map((t, idx) => {
          const dev = t.device || '';
          const label = t.label || '';
          const model = t.model || '';
          return '<tr>' +
            '<td style="font-family:monospace">'+esc(dev)+'</td>' +
            '<td>'+esc(t.fs_type)+'</td>' +
            '<td>'+esc(t.size)+'</td>' +
            '<td>'+esc(label)+'</td>' +
            '<td style="font-size:12px;color:var(--muted)">'+esc(model)+'</td>' +
            '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
            '</td></tr>';
        }).join('') + '</table>';
    }).catch(e => {
      document.getElementById('usb-status').textContent = 'Error: ' + e;
    });
  }
  window.usbExport = function(type, targetIndex, btn) {
    const msg = document.getElementById('usb-msg');
    const target = (window._usbTargets || [])[targetIndex];
    if (!target) {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: USB target not found. Refresh and try again.';
      return;
    }
    const row = btn ? btn.closest('td') : null;
    const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
    const originalText = btn ? btn.textContent : '';
    if (btn) {
      btn.disabled = true;
      btn.textContent = 'Exporting...';
    }
    if (rowMsg) {
      rowMsg.style.color = 'var(--muted)';
      rowMsg.textContent = 'Working...';
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
    fetch('/api/export/usb/'+type, {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify(target)
    }).then(async r => {
      const d = await r.json();
      if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
      return d;
    }).then(d => {
      msg.style.color = 'var(--ok,green)';
      msg.textContent = d.message || 'Done.';
      if (rowMsg) {
        rowMsg.style.color = 'var(--ok,green)';
        rowMsg.textContent = d.message || 'Done.';
      }
    }).catch(e => {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: '+e;
      if (rowMsg) {
        rowMsg.style.color = 'var(--err,red)';
        rowMsg.textContent = 'Error: ' + e;
      }
    }).finally(() => {
      if (btn) {
        btn.disabled = false;
        btn.textContent = originalText;
      }
    });
  };
  window.usbRefresh = usbRefresh;
  usbRefresh();
})();
</script>`
}
|
||||||
|
|
||||||
|
// renderNvidiaSelfHealInline returns the NVIDIA self-heal widget: a
// driver-restart button, a refresh button, a GPU status table and an output
// terminal, plus the script that drives them.
//
// The script defines:
//   - nvidiaSelfHealShowResult: shows a labelled result in the terminal area;
//   - nvidiaRestartDrivers: POSTs a restart of the bee-nvidia service to
//     /api/services/action;
//   - nvidiaResetGPU: POSTs a per-GPU reset to /api/gpu/nvidia-reset;
//   - loadNvidiaSelfHeal: rebuilds the table from /api/gpu/nvidia-status.
//
// Changes from the previous version: in-flight requests are shown as
// "running..." rather than "✓ done"; GPU-supplied strings are HTML-escaped
// before going into innerHTML; the reset path reports d.error when there is
// no output; loadServices() is only called when the page defines it.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escape a value before concatenating it into innerHTML.
function nvidiaEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
function nvidiaSelfHealShowResult(label, status, output) {
  var out = document.getElementById('nvidia-self-heal-out');
  var term = document.getElementById('nvidia-self-heal-terminal');
  var statusEl = document.getElementById('nvidia-self-heal-out-status');
  var labelEl = document.getElementById('nvidia-self-heal-out-label');
  out.style.display = 'block';
  labelEl.textContent = label;
  term.textContent = output || '(no output)';
  term.scrollTop = term.scrollHeight;
  if (status === 'ok') {
    statusEl.textContent = '✓ done';
    statusEl.style.color = 'var(--ok-fg, #2c662d)';
  } else if (status === 'running') {
    // Request still in flight: do not claim success yet.
    statusEl.textContent = 'running...';
    statusEl.style.color = 'var(--muted)';
  } else {
    statusEl.textContent = '✗ failed';
    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
  }
}
function nvidiaRestartDrivers() {
  var btn = document.getElementById('nvidia-restart-btn');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Restarting...';
  nvidiaSelfHealShowResult('restart bee-nvidia', 'running', 'Running...');
  fetch('/api/services/action', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(function() {
      // loadServices belongs to the services widget, which may be absent.
      if (typeof loadServices === 'function') loadServices();
      loadNvidiaSelfHeal();
    }, 800);
  }).catch(e => {
    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function nvidiaResetGPU(index, btn) {
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Resetting...';
  nvidiaSelfHealShowResult('reset gpu ' + index, 'running', 'Running...');
  fetch('/api/gpu/nvidia-reset', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({index:index})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(loadNvidiaSelfHeal, 1000);
  }).catch(e => {
    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function loadNvidiaSelfHeal() {
  var status = document.getElementById('nvidia-self-heal-status');
  var table = document.getElementById('nvidia-self-heal-table');
  status.textContent = 'Loading NVIDIA GPU status...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
    if (!Array.isArray(gpus) || gpus.length === 0) {
      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
      table.innerHTML = '';
      return;
    }
    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
    const rows = gpus.map(g => {
      const serial = g.serial || '';
      const bdf = g.bdf || '';
      const id = serial || bdf || ('gpu-' + g.index);
      // Coerce to a number so the value is safe inside the onclick handler.
      const resetIndex = Number(g.index);
      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
      const details = [];
      if (serial) details.push('serial ' + serial);
      if (bdf) details.push('bdf ' + bdf);
      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
      return '<tr>'
        + '<td style="white-space:nowrap">' + nvidiaEsc(g.index) + '</td>'
        + '<td>' + nvidiaEsc(g.name || 'unknown') + '</td>'
        + '<td style="font-family:monospace">' + nvidiaEsc(id) + '</td>'
        + '<td><span class="badge ' + badge + '">' + nvidiaEsc(g.status || 'UNKNOWN') + '</span>'
        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.map(function(d){ return nvidiaEsc(d); }).join(' | ') + '</div>' : '')
        + '</td>'
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + resetIndex + ', this)">Reset GPU</button></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(e => {
    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
    status.style.color = 'var(--crit-fg, #9f3a38)';
    table.innerHTML = '';
  });
}
loadNvidiaSelfHeal();
</script>`
}
|
||||||
|
|
||||||
|
func renderTools() string {
|
||||||
|
return `<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">System Install</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div style="margin-bottom:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||||
|
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||||
|
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||||
|
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||||
|
</div>
|
||||||
|
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||||
|
renderInstallInline() + `
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||||
|
const boot = document.getElementById('boot-source-text');
|
||||||
|
const txt = document.getElementById('ram-status-text');
|
||||||
|
const btn = document.getElementById('ram-install-btn');
|
||||||
|
let source = d.device || d.source || 'unknown source';
|
||||||
|
let kind = d.kind || 'unknown';
|
||||||
|
let label = source;
|
||||||
|
if (kind === 'ram') label = 'RAM';
|
||||||
|
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||||
|
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||||
|
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||||
|
else label = source;
|
||||||
|
boot.textContent = 'Current boot source: ' + label + '.';
|
||||||
|
txt.textContent = d.message || 'Checking...';
|
||||||
|
if (d.status === 'ok' || d.in_ram) {
|
||||||
|
txt.style.color = 'var(--ok, green)';
|
||||||
|
} else if (d.status === 'failed') {
|
||||||
|
txt.style.color = 'var(--err, #b91c1c)';
|
||||||
|
} else {
|
||||||
|
txt.style.color = 'var(--muted)';
|
||||||
|
}
|
||||||
|
if (d.can_start_task) {
|
||||||
|
btn.style.display = '';
|
||||||
|
btn.disabled = false;
|
||||||
|
} else {
|
||||||
|
btn.style.display = 'none';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
function installToRAM() {
|
||||||
|
document.getElementById('ram-install-btn').disabled = true;
|
||||||
|
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||||
|
window.location.href = '/tasks#' + d.task_id;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
` + renderSupportBundleInline() + `
|
||||||
|
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||||
|
` + renderUSBExportInline() + `
|
||||||
|
</div>
|
||||||
|
</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||||
|
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||||
|
renderNvidiaSelfHealInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
|
|
||||||
|
<script>
|
||||||
|
function checkTools() {
|
||||||
|
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||||
|
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||||
|
const rows = tools.map(t =>
|
||||||
|
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||||
|
).join('');
|
||||||
|
document.getElementById('tools-table').innerHTML =
|
||||||
|
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
checkTools();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
entries, err := listExportFiles(exportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||||
|
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||||
|
for _, entry := range entries {
|
||||||
|
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
body.WriteString(`<li>No export files found.</li>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</ul></body></html>`)
|
||||||
|
return body.String(), nil
|
||||||
|
}
|
||||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderInstallInline returns the "install live system to disk" widget: a
// destructive-action warning, the disk selection table, a type-to-confirm
// section, a progress terminal, and the script that drives them.
//
// The script loads candidate disks from /api/install/disks, requires the user
// to type the exact device name before enabling the start button, starts the
// install via POST /api/install/run, and streams the task log from
// /api/tasks/<id>/stream. On success it offers a "Reboot now" button.
//
// Disk device names, models, sizes and warning texts are HTML-escaped before
// being placed into innerHTML. This also repairs the warning tooltip, whose
// quote-escaping replace was a no-op.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

// Escape a value before concatenating it into innerHTML or an attribute.
function installEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

function installRefreshDisks() {
  document.getElementById('install-loading').style.display = '';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-confirm-section').style.display = 'none';
  _installSelected = null;
  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
    document.getElementById('install-loading').style.display = 'none';
    var tbody = document.getElementById('install-disk-tbody');
    tbody.innerHTML = '';
    if (!disks || disks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
    } else {
      disks.forEach(function(d) {
        var warnings = (d.warnings || []);
        var statusHtml;
        if (warnings.length === 0) {
          statusHtml = '<span class="badge badge-ok">OK</span>';
        } else {
          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
          var cls = hasSmall ? 'badge-err' : 'badge-warn';
          statusHtml = warnings.map(function(w){
            return '<span class="badge ' + cls + '" title="' + installEsc(w) + '">' +
              installEsc(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
          }).join(' ');
        }
        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
        var tr = document.createElement('tr');
        tr.dataset.device = d.device;
        tr.dataset.model = d.model || 'Unknown';
        tr.dataset.size = d.size;
        tr.dataset.warnings = JSON.stringify(warnings);
        tr.innerHTML =
          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
          '<td>' + installEsc(d.model || '—') + '</td>' +
          '<td>' + installEsc(d.size) + '</td>' +
          '<td>' + statusHtml + '</td>';
        tr.addEventListener('click', function(){ installSelectDisk(this); });
        tbody.appendChild(tr);
      });
    }
    document.getElementById('install-disk-section').style.display = '';
  }).catch(function(e){
    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
  });
}

function installSelectDisk(tr) {
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  tr.classList.add('selected');
  var radio = tr.querySelector('input[type=radio]');
  if (radio) radio.checked = true;
  _installSelected = {
    device: tr.dataset.device,
    model: tr.dataset.model,
    size: tr.dataset.size,
    warnings: JSON.parse(tr.dataset.warnings || '[]')
  };
  var warnBox = document.getElementById('install-confirm-warn');
  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
  if (_installSelected.warnings.length > 0) {
    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
  }
  warnBox.innerHTML = warnLines;
  document.getElementById('install-confirm-input').value = '';
  document.getElementById('install-start-btn').disabled = true;
  document.getElementById('install-confirm-section').style.display = '';
  document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
  _installSelected = null;
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
  document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
  var val = document.getElementById('install-confirm-input').value.trim();
  var ok = _installSelected && val === _installSelected.device;
  document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
  if (!_installSelected) return;
  document.getElementById('install-confirm-section').style.display = 'none';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-loading').style.display = 'none';
  var prog = document.getElementById('install-progress-section');
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  prog.style.display = '';
  term.textContent = '';
  status.textContent = 'Starting installation…';
  status.style.color = 'var(--muted)';

  fetch('/api/install/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({device: _installSelected.device})
  }).then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error(j.error || r.statusText);
      return j;
    });
  }).then(function(j){
    if (!j.task_id) throw new Error('missing task id');
    installStreamLog(j.task_id);
  }).catch(function(e){
    status.textContent = 'Error: ' + e;
    status.style.color = 'var(--crit-fg)';
  });
}

function installStreamLog(taskId) {
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  var es = new EventSource('/api/tasks/' + taskId + '/stream');
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.addEventListener('done', function(e) {
    es.close();
    if (!e.data) {
      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
      var rebootBtn = document.createElement('button');
      rebootBtn.className = 'btn btn-primary btn-sm';
      rebootBtn.style.marginLeft = '12px';
      rebootBtn.textContent = 'Reboot now';
      rebootBtn.onclick = function(){
        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
          body: JSON.stringify({name:'', action:'reboot'})});
      };
      status.appendChild(rebootBtn);
    } else {
      status.textContent = '✗ Installation failed: ' + e.data;
      status.style.color = 'var(--crit-fg)';
    }
  });
  es.onerror = function() {
    es.close();
    status.textContent = '✗ Stream disconnected.';
    status.style.color = 'var(--crit-fg)';
  };
}

installRefreshDisks();
</script>
`
}
|
||||||
|
|
||||||
|
func renderInstall() string {
|
||||||
|
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||||
|
renderInstallInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTasks returns the Tasks page body: the "Cancel All" / "Kill Workers"
// toolbar, an empty table placeholder, and the inline script that polls
// /api/tasks every 2 seconds and renders a client-side paginated task table.
//
// Everything that comes from the API response and ends up in innerHTML is
// passed through escHtml. Task ids are carried in a data-task-id attribute and
// read back via this.dataset instead of being concatenated into inline
// JavaScript, and are URL-encoded when used in request paths.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
  <button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
  <button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
  <span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
  <span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
  <div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
    _tasksAll = Array.isArray(tasks) ? tasks : [];
    if (_tasksAll.length === 0) {
      _taskPage = 1;
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
      return;
    }
    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
    if (_taskPage > totalPages) _taskPage = totalPages;
    if (_taskPage < 1) _taskPage = 1;
    const start = (_taskPage - 1) * _taskPageSize;
    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
    const rows = pageTasks.map(t => {
      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      // Known labels are static markup; an unknown status is API data and must be escaped.
      const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||escHtml(t.status);
      // The id travels as an escaped attribute value and is read via dataset, never as inline JS source.
      const idAttr = ' data-task-id="'+escHtml(t.id)+'"';
      let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
      if (t.status === 'running' || t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-danger"'+idAttr+' onclick="cancelTask(this.dataset.taskId)">Cancel</button>';
      }
      if (t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,1)" title="Increase priority">⇧</button>';
        actions += ' <button class="btn btn-sm btn-secondary"'+idAttr+' onclick="setPriority(this.dataset.taskId,-1)" title="Decrease priority">⇩</button>';
      }
      return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
        '<td style="font-size:12px;color:var(--muted)">'+escHtml(fmtTime(t.created_at))+'</td>' +
        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
        '<td>'+escHtml(t.priority)+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
    const showingFrom = start + 1;
    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
    const pager =
      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
      '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
      '<div style="display:flex;align-items:center;gap:8px">' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
      '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
      '</div>' +
      '</div>';
    document.getElementById('tasks-table').innerHTML =
      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
  }).catch(function() {}); // a failed poll keeps the last rendered table; the next tick retries
}

// escHtml converts any value to a string that is safe inside HTML text and quoted attributes.
function escHtml(s) { return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;'); }
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDurSec(sec) {
  sec = Math.max(0, Math.round(sec||0));
  if (sec < 60) return sec+'s';
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
}
function setTaskPage(page) {
  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
  _taskPage = Math.min(totalPages, Math.max(1, page));
  loadTasks();
}

function cancelTask(id) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
  if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
  fetch('/api/tasks/kill-workers',{method:'POST'})
    .then(r=>r.json())
    .then(d=>{
      loadTasks();
      var toast = document.getElementById('kill-toast');
      var parts = [];
      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
      toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
      toast.style.display = '';
      setTimeout(()=>{ toast.style.display='none'; }, 5000);
    });
}
function setPriority(id, delta) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderMetrics returns the live-metrics page body: server charts (load, CPU
// and ambient temperature, power, fans), a GPU section that is hidden until
// /api/metrics/latest reports GPUs, and the inline script driving them.
//
// The script re-requests every visible <img data-chart-refresh="1"> every
// 3 seconds with a cache-busting query string, and re-reads
// /api/metrics/latest every 5 seconds to show or hide the fan card and GPU
// section. The per-metric / per-GPU toggle state is kept in sessionStorage.
//
// GPU display labels include the name returned by /api/gpu/nvidia; they are
// HTML-escaped before being written into innerHTML and the alt attribute.
func renderMetrics() string {
	return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Load</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — CPU</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Temperature — Ambient Sensors</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
  </div>
</div>

<div class="card" style="margin-bottom:16px">
  <div class="card-head">Server — Power</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
  </div>
</div>

<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
  <div class="card-head">Server — Fan RPM</div>
  <div class="card-body" style="padding:8px">
    <img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
  </div>
</div>

<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
  <div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
    <div>
      <div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
      <div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
    </div>
    <label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
      <input id="gpu-chart-toggle" type="checkbox">
      <span>One chart per GPU</span>
    </label>
  </div>

  <div id="gpu-metrics-by-metric">
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Compute Load</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Memory Load</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Core Clock</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Power</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
      </div>
    </div>
    <div class="card" style="margin-bottom:16px">
      <div class="card-head">GPU — Temperature</div>
      <div class="card-body" style="padding:8px">
        <img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
      </div>
    </div>
  </div>

  <div id="gpu-metrics-by-gpu" style="display:none"></div>
</section>

<script>
let gpuChartKey = '';
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
let metricsNvidiaGPUsPromise = null;

// metricsEscHtml makes API-provided text safe for HTML text nodes and quoted attributes.
function metricsEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

// The GPU list is fetched once per page load; failures resolve to an empty list.
function loadMetricsNvidiaGPUs() {
  if (!metricsNvidiaGPUsPromise) {
    metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
      .then(function(r) {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(function(list) { return Array.isArray(list) ? list : []; })
      .catch(function() { return []; });
  }
  return metricsNvidiaGPUsPromise;
}

function metricsGPUNameMap(list) {
  const out = {};
  (list || []).forEach(function(gpu) {
    const idx = Number(gpu.index);
    if (!Number.isFinite(idx) || !gpu.name) return;
    out[idx] = gpu.name;
  });
  return out;
}

function metricsGPUDisplayLabel(idx, names) {
  const name = names && names[idx];
  return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
}

function loadGPUChartModePreference() {
  try {
    return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
  } catch (_) {
    return false;
  }
}

function saveGPUChartModePreference(perGPU) {
  try {
    sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
  } catch (_) {}
}

// Loads the next frame into a detached Image first so the visible chart only
// swaps once the new SVG has loaded; hidden charts and in-flight loads are skipped.
function refreshChartImage(el) {
  if (!el || el.dataset.loading === '1') return;
  if (el.offsetParent === null) return;
  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  const probe = new Image();
  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';
  probe.onload = function() {
    el.src = nextSrc;
    el.dataset.loading = '0';
  };
  probe.onerror = function() {
    el.dataset.loading = '0';
  };
  probe.src = nextSrc;
}

function refreshCharts() {
  document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
}

function gpuIndices(rows) {
  const seen = {};
  const out = [];
  (rows || []).forEach(function(row) {
    const idx = Number(row.index);
    if (!Number.isFinite(idx) || seen[idx]) return;
    seen[idx] = true;
    out.push(idx);
  });
  return out.sort(function(a, b) { return a - b; });
}

function renderGPUOverviewCards(indices, names) {
  const host = document.getElementById('gpu-metrics-by-gpu');
  if (!host) return;
  host.innerHTML = indices.map(function(idx) {
    // idx is a finite number (see gpuIndices); the label contains the API-provided GPU name.
    const label = metricsEscHtml(metricsGPUDisplayLabel(idx, names));
    return '<div class="card" style="margin-bottom:16px">' +
      '<div class="card-head">' + label + ' — Overview</div>' +
      '<div class="card-body" style="padding:8px">' +
      '<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
      '</div></div>';
  }).join('');
}

function applyGPUChartMode() {
  const perMetric = document.getElementById('gpu-metrics-by-metric');
  const perGPU = document.getElementById('gpu-metrics-by-gpu');
  const toggle = document.getElementById('gpu-chart-toggle');
  const gpuModePerGPU = !!(toggle && toggle.checked);
  if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
  if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
}

function syncMetricsLayout(d) {
  const fanCard = document.getElementById('card-server-fans');
  if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
  const section = document.getElementById('gpu-metrics-section');
  const summary = document.getElementById('gpu-metrics-summary');
  const indices = gpuIndices(d.gpus);
  loadMetricsNvidiaGPUs().then(function(gpus) {
    const names = metricsGPUNameMap(gpus);
    if (section) section.style.display = indices.length > 0 ? '' : 'none';
    if (summary) {
      summary.textContent = indices.length > 0
        ? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
        : 'No GPUs detected in live metrics.';
    }
    // Per-GPU cards are rebuilt only when the set of indices or names changes.
    const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
    if (nextKey !== gpuChartKey) {
      renderGPUOverviewCards(indices, names);
      gpuChartKey = nextKey;
    }
    applyGPUChartMode();
  });
}

function loadMetricsLayout() {
  fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
}

const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
}
applyGPUChartMode();

if (gpuChartToggle) {
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}

loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
</script>`
}
|
||||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import "html"
|
||||||
|
|
||||||
|
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
//
// The markup contains a hidden "pending change" banner with a rollback
// countdown, the interface table placeholder, and DHCP / static IPv4 forms.
// The inline script polls /api/network every 5 seconds and posts to
// /api/network/{toggle,dhcp,static,confirm,rollback}.
//
// Interface names, states, addresses and the default route come from the API
// response and are HTML-escaped before being written into innerHTML. Click
// handlers read the interface name and state from data-* attributes rather
// than from values concatenated into inline JavaScript.
func renderNetworkInline() string {
	return `<div id="net-pending" style="display:none" class="alert alert-warn">
  <strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
  <button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
  <button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
</div>
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div class="grid2" style="margin-top:16px">
  <div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
    <div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
    <button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
    <div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
  </div>
  <div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
    <div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
    <div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
    <div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
    <div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
    <div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
    <button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
    <div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
  </div>
</div>
<script>
var _netCountdownTimer = null;
var _netRefreshTimer = null;
const NET_ROLLBACK_SECS = 60;

// netEscHtml makes API-provided text safe for HTML text nodes and quoted attributes.
function netEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

function loadNetwork() {
  fetch('/api/network').then(r=>r.json()).then(d => {
    const rows = (d.interfaces||[]).map(i => {
      const name = netEscHtml(i.Name);
      const state = netEscHtml(i.State);
      return '<tr><td style="cursor:pointer" data-iface="'+name+'" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">'+name+'</span></td>' +
        '<td style="cursor:pointer" data-iface="'+name+'" data-state="'+state+'" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+state+'</span></td>' +
        '<td>'+(i.IPv4||[]).map(function(a) { return netEscHtml(a); }).join(', ')+'</td></tr>';
    }).join('');
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+netEscHtml(d.default_route)+'</p>' : '');
    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    else hideNetPending();
  }).catch(function() {});
}

function selectIface(iface) {
  document.getElementById('dhcp-iface').value = iface;
  document.getElementById('st-iface').value = iface;
}

function toggleIface(iface, currentState) {
  // The banner is shown optimistically before the request is sent.
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
    .then(r=>r.json()).then(d => {
      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}

function hideNetPending() {
  const el = document.getElementById('net-pending');
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  el.style.display = 'none';
}

function showNetPending(secs) {
  if (!secs || secs < 1) { hideNetPending(); return; }
  const el = document.getElementById('net-pending');
  el.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  let remaining = secs;
  document.getElementById('net-countdown').textContent = remaining;
  _netCountdownTimer = setInterval(function() {
    remaining--;
    document.getElementById('net-countdown').textContent = remaining;
    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
  }, 1000);
}

function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}

function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}

function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
    .then(r=>r.json()).then(d => {
      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
      if (d.error) { hideNetPending(); return; }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}

function setStatic() {
  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
    interface: document.getElementById('st-iface').value,
    address: document.getElementById('st-addr').value,
    prefix: document.getElementById('st-prefix').value,
    gateway: document.getElementById('st-gw').value,
    dns: dns,
  })}).then(r=>r.json()).then(d => {
    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}

loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
</script>`
}
|
||||||
|
|
||||||
|
func renderNetwork() string {
|
||||||
|
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderServicesInline returns the services UI without a wrapping card: a note
// about bee-selfheal, a refresh button, the service table placeholder, an
// output terminal, and the inline script that loads /api/services and posts
// start/stop/restart requests to /api/services/action.
//
// Unit names, states and status bodies come from the API response and are
// HTML-escaped before being written into innerHTML. The unit name reaches
// svcAction through a data-svc attribute rather than through a value
// concatenated into inline JavaScript.
func renderServicesInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:12px">
  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
    <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
    <span id="svc-out-status" style="font-size:12px"></span>
  </div>
  <div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// svcEscHtml makes API-provided text safe for HTML text nodes and quoted attributes.
function svcEscHtml(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function loadServices() {
  fetch('/api/services').then(r=>r.json()).then(svcs => {
    const rows = (Array.isArray(svcs) ? svcs : []).map(s => {
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      // Every character outside [a-z0-9] becomes '-', so this id is safe to embed as-is.
      const id = 'svc-body-'+String(s.name||'').replace(/[^a-z0-9]/g,'-');
      const name = svcEscHtml(s.name);
      const body = svcEscHtml(s.body);
      return '<tr>' +
        '<td style="white-space:nowrap">'+name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+svcEscHtml(st)+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-start" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-stop" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-restart" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  });
}
function toggleBody(id) {
  const el = document.getElementById(id);
  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
}
function svcAction(btn, name, action) {
  var label = btn.textContent;
  btn.disabled = true;
  btn.textContent = '...';
  var out = document.getElementById('svc-out');
  var term = document.getElementById('svc-terminal');
  var statusEl = document.getElementById('svc-out-status');
  var labelEl = document.getElementById('svc-out-label');
  out.style.display = 'block';
  labelEl.textContent = action + ' ' + name;
  term.textContent = 'Running...';
  statusEl.textContent = '';
  statusEl.style.color = '';
  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
    .then(r=>r.json()).then(d => {
      term.textContent = d.output || d.error || '(no output)';
      term.scrollTop = term.scrollHeight;
      if (d.status === 'ok') {
        statusEl.textContent = '✓ done';
        statusEl.style.color = 'var(--ok-fg, #2c662d)';
      } else {
        statusEl.textContent = '✗ failed';
        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      }
      btn.textContent = label;
      btn.disabled = false;
      setTimeout(loadServices, 800);
    }).catch(e => {
      term.textContent = 'Request failed: ' + e;
      statusEl.textContent = '✗ error';
      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      btn.textContent = label;
      btn.disabled = false;
    });
}
loadServices();
</script>`
}
|
||||||
|
|
||||||
|
func renderServices() string {
|
||||||
|
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
716
audit/internal/webui/page_validate.go
Normal file
716
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,716 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// validateInventory is the hardware summary used by the Validate page.
// NOTE(review): the string fields look like per-component display summaries
// filled in by loadValidateInventory — confirm against that function.
type validateInventory struct {
	// Per-component text fields.
	CPU, Memory, Storage, NVIDIA, AMD string

	// Per-vendor GPU counts; renderValidate feeds NvidiaGPUCount into the
	// duration estimates.
	NvidiaGPUCount, AMDGPUCount int
}
|
||||||
|
|
||||||
|
// validateFmtDur formats an estimated duration for display. Values below 120
// seconds are shown as "~N s"; anything longer is shown as "~N min", where the
// minute count is (secs+29)/60, i.e. a remainder of 31 seconds or more rounds
// up to the next minute.
func validateFmtDur(secs int) string {
	const minuteThresholdSec = 120
	if secs >= minuteThresholdSec {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||||
|
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidate(opts HandlerOptions) string {
|
||||||
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Validate Profile</div>
|
||||||
|
<div class="card-body validate-profile-body">
|
||||||
|
<div class="validate-profile-col">
|
||||||
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="validate-profile-col validate-profile-action">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
|
<div style="margin-top:12px">
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
|
inv.CPU,
|
||||||
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
|
inv.Memory,
|
||||||
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
|
inv.Storage,
|
||||||
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
func() string {
|
||||||
|
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||||
|
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||||
|
if n > 0 {
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||||
|
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||||
|
validateFmtDur(perV), validateFmtDur(perS))
|
||||||
|
}(),
|
||||||
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
func() string {
|
||||||
|
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||||
|
s := "Skipped in Validate. "
|
||||||
|
if n > 0 {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||||
|
} else {
|
||||||
|
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||||
|
}
|
||||||
|
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||||
|
}(),
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
|
inv.AMD,
|
||||||
|
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||||
|
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||||
|
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
<style>
|
||||||
|
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||||
|
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||||
|
.validate-card-body { padding:0; }
|
||||||
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
let satES = null;
|
||||||
|
function satStressMode() {
|
||||||
|
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||||
|
}
|
||||||
|
function satModeChanged() {
|
||||||
|
const stress = satStressMode();
|
||||||
|
[
|
||||||
|
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
|
].forEach(function(item) {
|
||||||
|
const card = document.getElementById(item.card);
|
||||||
|
if (card) {
|
||||||
|
card.style.opacity = stress ? '1' : '0.5';
|
||||||
|
const hint = document.getElementById(item.hint);
|
||||||
|
if (hint) hint.style.display = stress ? 'none' : '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function satLabels() {
|
||||||
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
|
}
|
||||||
|
let satNvidiaGPUsPromise = null;
|
||||||
|
function loadSatNvidiaGPUs() {
|
||||||
|
if (!satNvidiaGPUsPromise) {
|
||||||
|
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||||
|
.then(r => {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
|
.then(list => Array.isArray(list) ? list : []);
|
||||||
|
}
|
||||||
|
return satNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
function satSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function satUpdateGPUSelectionNote() {
|
||||||
|
const note = document.getElementById('sat-gpu-selection-note');
|
||||||
|
if (!note) return;
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||||
|
}
|
||||||
|
function satRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (!root) return;
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="sat-gpu-row">'
|
||||||
|
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satSelectAllGPUs() {
|
||||||
|
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satSelectNoGPUs() {
|
||||||
|
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satLoadGPUs() {
|
||||||
|
loadSatNvidiaGPUs().then(function(gpus) {
|
||||||
|
satRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (root) {
|
||||||
|
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
}
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function satGPUDisplayName(gpu) {
|
||||||
|
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||||
|
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||||
|
return 'GPU ' + idx + ' — ' + name;
|
||||||
|
}
|
||||||
|
function satRequestBody(target, overrides) {
|
||||||
|
const body = {};
|
||||||
|
const labels = satLabels();
|
||||||
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
|
body.stress_mode = satStressMode();
|
||||||
|
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||||
|
if (overrides) {
|
||||||
|
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||||
|
}
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
function enqueueSATTarget(target, overrides) {
|
||||||
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
|
.then(r => r.json());
|
||||||
|
}
|
||||||
|
function streamSATTask(taskId, title, resetTerminal) {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
satES.addEventListener('done', function(e) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
satES.onerror = function() {
|
||||||
|
if (satES) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function selectedAMDValidateTargets() {
|
||||||
|
const targets = [];
|
||||||
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
|
const mem = document.getElementById('sat-amd-mem-target');
|
||||||
|
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||||
|
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||||
|
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||||
|
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||||
|
return targets;
|
||||||
|
}
|
||||||
|
function runSAT(target) {
|
||||||
|
return runSATWithOverrides(target, null);
|
||||||
|
}
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
|
const title = (overrides && overrides.display_name) || target;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
|
return enqueueSATTarget(target, overrides)
|
||||||
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
|
}
|
||||||
|
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||||
|
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
function satAllGPUIndicesForMulti() {
|
||||||
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
|
}
|
||||||
|
function expandSATTarget(target) {
|
||||||
|
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||||
|
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||||
|
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
|
return Promise.resolve([{target: target}]);
|
||||||
|
}
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||||
|
target: target,
|
||||||
|
overrides: {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||||
|
},
|
||||||
|
label: satGPUDisplayName(gpu),
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runNvidiaValidateSet(target) {
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => {
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||||||
|
if (!picked.length) {
|
||||||
|
throw new Error('Select at least one NVIDIA GPU.');
|
||||||
|
}
|
||||||
|
if (picked.length === 1) {
|
||||||
|
const gpu = picked[0];
|
||||||
|
return runSATWithOverrides(target, {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + target;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||||||
|
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= picked.length) return Promise.resolve();
|
||||||
|
const gpu = picked[idx];
|
||||||
|
const gpuLabel = satGPUDisplayName(gpu);
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||||||
|
return enqueueSATTarget(target, {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: labelBase + ' (' + gpuLabel + ')',
|
||||||
|
}).then(d => {
|
||||||
|
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||||||
|
}).then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runAMDValidateSet() {
|
||||||
|
const targets = selectedAMDValidateTargets();
|
||||||
|
if (!targets.length) return;
|
||||||
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Running AMD validate set one by one...\n';
|
||||||
|
const labels = satLabels();
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
|
const target = targets[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||||
|
return enqueueSATTarget(target)
|
||||||
|
.then(d => {
|
||||||
|
return streamSATTask(d.task_id, labels[target], false);
|
||||||
|
}).then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runAllSAT() {
|
||||||
|
const cycles = 1;
|
||||||
|
const status = document.getElementById('sat-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
|
return !(btn && btn.disabled);
|
||||||
|
});
|
||||||
|
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||||
|
const expanded = [];
|
||||||
|
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||||
|
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||||
|
}
|
||||||
|
const total = expanded.length;
|
||||||
|
let enqueued = 0;
|
||||||
|
if (!total) {
|
||||||
|
status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
|
const item = expanded[idx];
|
||||||
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides)
|
||||||
|
.then(() => {
|
||||||
|
enqueued++;
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}).catch(err => {
|
||||||
|
status.textContent = 'Error: ' + err.message;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
<script>
|
||||||
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
|
});
|
||||||
|
satLoadGPUs();
|
||||||
|
function disableSATAMDOptions(reason) {
|
||||||
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||||
|
const cb = document.getElementById(id);
|
||||||
|
if (!cb) return;
|
||||||
|
cb.disabled = true;
|
||||||
|
cb.checked = false;
|
||||||
|
cb.title = reason;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function disableSATCard(id, reason) {
|
||||||
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
|
if (!btn) return;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.title = reason;
|
||||||
|
btn.style.opacity = '0.4';
|
||||||
|
const card = btn.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
let note = card.querySelector('.sat-unavail');
|
||||||
|
if (!note) {
|
||||||
|
note = document.createElement('p');
|
||||||
|
note.className = 'sat-unavail';
|
||||||
|
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
if (body) body.insertBefore(note, body.firstChild);
|
||||||
|
}
|
||||||
|
note.textContent = reason;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||||
|
unknown := "Audit snapshot not loaded."
|
||||||
|
out := validateInventory{
|
||||||
|
CPU: unknown,
|
||||||
|
Memory: unknown,
|
||||||
|
Storage: unknown,
|
||||||
|
NVIDIA: unknown,
|
||||||
|
AMD: unknown,
|
||||||
|
}
|
||||||
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var snap schema.HardwareIngestRequest
|
||||||
|
if err := json.Unmarshal(data, &snap); err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuCounts := map[string]int{}
|
||||||
|
cpuTotal := 0
|
||||||
|
for _, cpu := range snap.Hardware.CPUs {
|
||||||
|
if cpu.Present != nil && !*cpu.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cpuTotal++
|
||||||
|
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
memCounts := map[string]int{}
|
||||||
|
memTotal := 0
|
||||||
|
for _, dimm := range snap.Hardware.Memory {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
memTotal++
|
||||||
|
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
storageCounts := map[string]int{}
|
||||||
|
storageTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.Storage {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
storageTotal++
|
||||||
|
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
nvidiaCounts := map[string]int{}
|
||||||
|
nvidiaTotal := 0
|
||||||
|
amdCounts := map[string]int{}
|
||||||
|
amdTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.PCIeDevices {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "nvidia") {
|
||||||
|
nvidiaTotal++
|
||||||
|
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "amd") {
|
||||||
|
amdTotal++
|
||||||
|
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||||
|
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||||
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderValidateCardBody builds the inner HTML of a validate card: four
// stacked "validate-card-section" blocks holding, in order, the device
// summary, the description, the command list and the settings.
//
// The device and settings sections use the muted text colour; the other two
// use the default colour. All four arguments are inserted verbatim (no HTML
// escaping is applied here), so callers must pass markup-safe strings.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		plainStyle = `font-size:13px`
		mutedStyle = plainStyle + `;color:var(--muted)`
	)
	// section wraps one piece of content in the shared section markup.
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(mutedStyle, devices) +
		section(plainStyle, description) +
		section(plainStyle, commands) +
		section(mutedStyle, settings)
}
|
||||||
|
|
||||||
|
// formatValidateDeviceSummary renders a one-line, HTML-safe summary of a
// device group.
//
// total is the number of devices, models maps a model name to how many of
// that model were seen, and unit is the singular noun ("GPU", "module", ...);
// the plural is formed by appending "s".
//
// Output shapes:
//   - total == 0:          "0 <unit>s detected."
//   - exactly one model:   "<count> x <model> <unit(s)>"
//   - otherwise:           "<total> <unit(s)>: <count> x <model>, ..."
//
// Model names are listed in sorted order and HTML-escaped.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	// Map iteration order is random; sort the names for stable output.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = unit + "s"
	}

	if len(entries) == 1 {
		return entries[0] + " " + noun
	}
	return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
}
|
||||||
|
|
||||||
|
// addValidateModel bumps the tally for a device model in counts.
//
// The name is trimmed of surrounding whitespace first; a blank name is
// tallied under "unknown". counts must be non-nil.
func addValidateModel(counts map[string]int, name string) {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		counts[trimmed]++
		return
	}
	counts["unknown"]++
}
|
||||||
|
|
||||||
|
// validateTrimPtr dereferences an optional string and strips surrounding
// whitespace. A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||||||
|
|
||||||
|
// validateFirstNonEmpty returns the first argument that is not blank, with
// surrounding whitespace removed. It returns "" when every argument is blank
// or when no arguments are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
|
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||||
|
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||||
|
case "amd":
|
||||||
|
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||||
|
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||||
|
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||||
|
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderSATCard renders one complete card: a header with the label and a
// button row, followed by the card body.
//
// The button row always starts with a primary "Run" button whose element id
// is "sat-btn-<id>" and whose onclick handler is runAction. headerActions,
// when not blank, is appended after that button exactly as given. None of
// the arguments are HTML-escaped here.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if extra := strings.TrimSpace(headerActions); extra != "" {
		// Append the untrimmed markup; trimming is only used for the emptiness test.
		buttons += headerActions
	}
	return `<div class="card"><div class="card-head card-head-actions"><span>` + label +
		`</span><div class="card-head-buttons">` + buttons +
		`</div></div><div class="card-body validate-card-body">` + body + `</div></div>`
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||||
|
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
@@ -687,41 +689,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
|
|||||||
|
|
||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
// Use per-PSU stacked chart when PSU SDR data is available.
|
power := make([]float64, len(samples))
|
||||||
// Collect the union of PSU slots seen across all samples.
|
label := "Power W"
|
||||||
psuSlots := psuSlotsFromSamples(samples)
|
for i, s := range samples {
|
||||||
if len(psuSlots) > 1 {
|
power[i] = s.PowerW
|
||||||
// Build one dataset per PSU slot.
|
if strings.TrimSpace(s.PowerSource) != "" {
|
||||||
psuDatasets := make([][]float64, len(psuSlots))
|
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||||
psuNames := make([]string, len(psuSlots))
|
if strings.TrimSpace(s.PowerMode) != "" {
|
||||||
for si, slot := range psuSlots {
|
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||||
ds := make([]float64, len(samples))
|
|
||||||
for i, s := range samples {
|
|
||||||
for _, psu := range s.PSUs {
|
|
||||||
if psu.Slot == slot {
|
|
||||||
ds[i] = psu.PowerW
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
psuDatasets[si] = normalizePowerSeries(ds)
|
|
||||||
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
|
||||||
}
|
}
|
||||||
datasets = psuDatasets
|
|
||||||
names = psuNames
|
|
||||||
stacked = true
|
|
||||||
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
|
||||||
} else {
|
|
||||||
power := make([]float64, len(samples))
|
|
||||||
for i, s := range samples {
|
|
||||||
power[i] = s.PowerW
|
|
||||||
}
|
|
||||||
power = normalizePowerSeries(power)
|
|
||||||
datasets = [][]float64{power}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(power)
|
|
||||||
}
|
}
|
||||||
|
power = normalizePowerSeries(power)
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{label}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
case path == "server-fans":
|
case path == "server-fans":
|
||||||
title = "Fan RPM"
|
title = "Fan RPM"
|
||||||
|
|||||||
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: start,
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 120},
|
||||||
|
{Slot: 2, PowerW: 130},
|
||||||
|
},
|
||||||
|
PowerW: 250,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: start.Add(time.Minute),
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 140},
|
||||||
|
{Slot: 2, PowerW: 135},
|
||||||
|
},
|
||||||
|
PowerW: 275,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected server-power chart data")
|
||||||
|
}
|
||||||
|
if title != "System Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if stacked {
|
||||||
|
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(names) != 1 {
|
||||||
|
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||||
|
}
|
||||||
|
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
@@ -650,9 +693,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
`/api/gpu/nvidia`,
|
`/api/gpu/nvidia`,
|
||||||
`/api/bee-bench/nvidia/perf/run`,
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
`/api/bee-bench/nvidia/power/run`,
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/status`,
|
||||||
`benchmark-run-nccl`,
|
`benchmark-run-nccl`,
|
||||||
`Run Performance Benchmark`,
|
`Run Performance Benchmark`,
|
||||||
`Run Power / Thermal Fit`,
|
`Run Power / Thermal Fit`,
|
||||||
|
`Autotune`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||||
|
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
|
||||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||||
@@ -125,6 +126,7 @@ type taskParams struct {
|
|||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
RampStep int `json:"ramp_step,omitempty"`
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
@@ -686,6 +688,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
RampTotal: t.params.RampTotal,
|
RampTotal: t.params.RampTotal,
|
||||||
RampRunID: t.params.RampRunID,
|
RampRunID: t.params.RampRunID,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
|
case "nvidia-bench-autotune":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
}, t.params.BenchmarkKind, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--debootstrap-options "--include=ca-certificates" \
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
|
|||||||
@@ -542,6 +542,186 @@ label memtest
|
|||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# extract_live_grub_entry CFG
#
# Finds the first "linux /live/..." and "initrd /live/..." lines in the GRUB
# config CFG and splits them into their parts.
#
# Sets (as globals):
#   live_linux, live_initrd  - the raw matched lines
#   grub_kernel              - kernel path (2nd field of the linux line)
#   grub_append              - everything after the kernel path
#   grub_initrd              - initrd path (2nd field of the initrd line)
#
# Returns 0 when every value above is non-empty, 1 otherwise.
extract_live_grub_entry() {
    cfg="$1"
    live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
    live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
    [ -n "$live_linux" ] || return 1
    [ -n "$live_initrd" ] || return 1

    grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
    # Strip the leading "linux <kernel>" using the same whitespace rules as
    # the awk matcher above. Counting single-space fields (cut -d' ' -f3-)
    # gives the wrong result when the line is indented with spaces or uses
    # tabs between "linux" and the kernel path.
    grub_append="$(printf '%s\n' "$live_linux" | sed 's/^[[:space:]]*linux[[:space:]][[:space:]]*[^[:space:]][^[:space:]]*[[:space:]]*//')"
    grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
    [ -n "$grub_kernel" ] || return 1
    [ -n "$grub_append" ] || return 1
    [ -n "$grub_initrd" ] || return 1
    return 0
}
|
||||||
|
|
||||||
|
# extract_live_isolinux_entry CFG
#
# Reads the isolinux/syslinux config CFG and picks out the first
# "linux /live/..." line, the first "initrd /live/..." line and the first
# "append ..." line (with the "append" keyword removed).
#
# Sets (as globals): isolinux_linux, isolinux_initrd, isolinux_append,
# isolinux_kernel and isolinux_initrd_path (the 2nd field of the linux and
# initrd lines respectively).
#
# Returns 0 when all of them are non-empty, 1 otherwise.
extract_live_isolinux_entry() {
    cfg="$1"

    isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
    isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
    isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
    if [ -z "$isolinux_linux" ] || [ -z "$isolinux_initrd" ] || [ -z "$isolinux_append" ]; then
        return 1
    fi

    isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
    isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
    if [ -z "$isolinux_kernel" ] || [ -z "$isolinux_initrd_path" ]; then
        return 1
    fi
    return 0
}
|
||||||
|
|
||||||
|
# write_canonical_grub_cfg CFG KERNEL APPEND_LIVE INITRD
#
# Overwrites the file CFG with the fixed EASY-BEE GRUB menu: a banner, the
# default entry, a "load to RAM" entry, an advanced-options submenu
# (GSP=off, KMS, KMS + GSP=off, fail-safe), a memtest86+ entry and, on EFI
# only, a firmware-settings entry.
#
#   CFG          path of the grub.cfg to write (truncated first)
#   KERNEL       kernel path placed after "linux" in every entry
#   APPEND_LIVE  boot parameters placed right after the kernel path
#   INITRD       initrd path placed after "initrd" in every entry
#
# Note: cfg, kernel, append_live and initrd are assigned as globals.
write_canonical_grub_cfg() {
cfg="$1"
kernel="$2"
append_live="$3"
initrd="$4"

# The heredoc delimiter is unquoted on purpose: ${kernel}, ${append_live} and
# ${initrd} are expanded now, while \${grub_platform} is escaped so that the
# literal text ${grub_platform} ends up in the generated file.
cat > "$cfg" <<EOF
source /boot/grub/config.cfg

echo ""
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
echo " Hardware Audit LiveCD"
echo ""

menuentry "EASY-BEE" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}

menuentry "EASY-BEE — load to RAM (toram)" {
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}

submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}

menuentry "EASY-BEE — KMS (no nomodeset)" {
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}

menuentry "EASY-BEE — KMS + GSP=off" {
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
initrd ${initrd}
}

menuentry "EASY-BEE — fail-safe" {
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd ${initrd}
}
}

if [ "\${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi
}
else
menuentry "Memory Test (memtest86+)" {
linux16 /boot/memtest86+x64.bin
}
fi

if [ "\${grub_platform}" = "efi" ]; then
menuentry "UEFI Firmware Settings" {
fwsetup
}
fi
EOF
}
|
||||||
|
|
||||||
|
# write_canonical_isolinux_cfg CFG KERNEL INITRD APPEND_LIVE
#
# Overwrites the file CFG with the fixed EASY-BEE isolinux menu: the default
# entry, "load to RAM", GSP=off, KMS, KMS + GSP=off, fail-safe and a
# memtest86+ entry.
#
#   CFG          path of the live.cfg to write (truncated first)
#   KERNEL       kernel path placed after "linux" in every live entry
#   INITRD       initrd path placed after "initrd" in every live entry
#   APPEND_LIVE  boot parameters placed first on every "append" line
#
# Note the argument order differs from write_canonical_grub_cfg (INITRD comes
# before APPEND_LIVE here). The label names contain the text @FLAVOUR@, which
# this function writes out literally. cfg, kernel, initrd and append_live are
# assigned as globals.
write_canonical_isolinux_cfg() {
cfg="$1"
kernel="$2"
initrd="$3"
append_live="$4"

# Unquoted heredoc delimiter: ${kernel}, ${initrd} and ${append_live} are
# expanded while the file is written.
cat > "$cfg" <<EOF
label live-@FLAVOUR@-normal
menu label ^EASY-BEE
menu default
linux ${kernel}
initrd ${initrd}
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

label live-@FLAVOUR@-toram
menu label EASY-BEE (^load to RAM)
linux ${kernel}
initrd ${initrd}
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

label live-@FLAVOUR@-gsp-off
menu label EASY-BEE (^NVIDIA GSP=off)
linux ${kernel}
initrd ${initrd}
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

label live-@FLAVOUR@-kms
menu label EASY-BEE (^KMS, no nomodeset)
linux ${kernel}
initrd ${initrd}
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

label live-@FLAVOUR@-kms-gsp-off
menu label EASY-BEE (KMS, ^GSP=off)
linux ${kernel}
initrd ${initrd}
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup

label live-@FLAVOUR@-failsafe
menu label EASY-BEE (^fail-safe)
linux ${kernel}
initrd ${initrd}
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0

label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin
EOF
}
|
||||||
|
|
||||||
|
# enforce_live_build_bootloader_assets LB_DIR
#
# Replaces the bootloader menus found under LB_DIR/binary with the canonical
# EASY-BEE ones, reusing the kernel, initrd and boot parameters found in the
# existing files.
#
# GRUB (binary/boot/grub/grub.cfg): when the file exists and a live entry can
# be extracted, config.cfg, theme.cfg and the live-theme directory are copied
# from ${BUILDER_DIR}/config/bootloaders/grub-efi and grub.cfg is rewritten.
# isolinux (binary/isolinux/live.cfg): when the file exists and a live entry
# can be extracted, live.cfg is rewritten.
#
# A missing file is skipped silently; a file without an extractable live
# entry only produces a warning on stderr.
enforce_live_build_bootloader_assets() {
    lb_dir="$1"
    grub_dir="$lb_dir/binary/boot/grub"
    grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
    isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"

    if [ ! -f "$grub_cfg" ]; then
        :
    elif extract_live_grub_entry "$grub_cfg"; then
        mkdir -p "$grub_dir/live-theme"
        cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
        cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
        cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
        write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
        echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
    else
        echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
    fi

    if [ ! -f "$isolinux_cfg" ]; then
        :
    elif extract_live_isolinux_entry "$isolinux_cfg"; then
        write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
        echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
    else
        echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
    fi
}
|
||||||
|
|
||||||
copy_memtest_from_deb() {
|
copy_memtest_from_deb() {
|
||||||
deb="$1"
|
deb="$1"
|
||||||
dst_boot="$2"
|
dst_boot="$2"
|
||||||
@@ -1229,6 +1409,11 @@ run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
|||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
|
||||||
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
# This allows the second variant to reuse all downloaded packages.
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
set color_normal=light-gray/black
|
set color_normal=light-gray/black
|
||||||
set color_highlight=yellow/black
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
else
|
else
|
||||||
set menu_color_normal=yellow/black
|
set menu_color_normal=yellow/black
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ RestartSec=3
|
|||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
MemoryMax=3G
|
# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
|
||||||
|
# that legitimately use several GB; a cgroup limit kills them via OOM.
|
||||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||||
Nice=0
|
Nice=0
|
||||||
|
|||||||
64
scripts/deploy.sh
Executable file
64
scripts/deploy.sh
Executable file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Build the bee binary for linux/amd64 and deploy it to a remote host:
# copy it over SSH, install it as REMOTE_BIN, restart SERVICES and print
# their status.
#
# Usage: scripts/deploy.sh [HOST]
#   HOST  address of the target machine; prompted for when omitted.
#
# Paths (audit/, audit/bee) are relative, so run this from the directory
# that contains audit/.
set -euo pipefail

REMOTE_USER="bee"
REMOTE_BIN="/usr/local/bin/bee"
LOCAL_BIN="audit/bee"
SERVICES="bee-audit bee-web"

# --- Host ---
if [[ $# -ge 1 ]]; then
    HOST="$1"
else
    read -rp "IP адрес хоста: " HOST
fi
if [[ -z "$HOST" ]]; then
    echo "Ошибка: IP не указан" >&2
    exit 1
fi

# --- SSH options ---
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

# Default to plain ssh/scp (key-based login, or an interactive password
# prompt from ssh itself).
SSH_CMD=(ssh "${SSH_OPTS[@]}")
SCP_CMD=(scp "${SSH_OPTS[@]}")

# Probe whether a password is needed: BatchMode makes ssh fail instead of
# prompting when key-based login is not possible.
if ! ssh "${SSH_OPTS[@]}" -o BatchMode=yes "${REMOTE_USER}@${HOST}" true 2>/dev/null; then
    if command -v sshpass &>/dev/null; then
        read -rsp "Пароль для ${REMOTE_USER}@${HOST}: " SSHPASS
        echo
        # Hand the password to sshpass through the environment (-e reads
        # $SSHPASS) rather than -p, which would expose it in the process
        # list for the lifetime of every ssh/scp call.
        export SSHPASS
        SSH_CMD=(sshpass -e ssh "${SSH_OPTS[@]}")
        SCP_CMD=(sshpass -e scp "${SSH_OPTS[@]}")
    else
        echo "sshpass не установлен. Введите пароль вручную при запросе (или установите SSH-ключ)."
    fi
fi

REMOTE="${REMOTE_USER}@${HOST}"

# --- Build ---
echo "==> Сборка бинарника..."
(
    cd audit
    # Fall back to "dev" when the version cannot be resolved.
    VERSION=$(sh ./scripts/resolve-version.sh 2>/dev/null || echo "dev")
    CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
        go build -ldflags "-X main.Version=${VERSION}" -o bee ./cmd/bee
)
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"

# --- Deploy ---
echo "==> Копирование на ${REMOTE}..."
"${SCP_CMD[@]}" "${LOCAL_BIN}" "${REMOTE}:/tmp/bee-new"

echo "==> Замена бинарника и перезапуск сервисов..."
# The heredoc is unquoted so REMOTE_BIN and SERVICES are expanded locally
# before the script is sent to the remote shell.
"${SSH_CMD[@]}" "$REMOTE" bash -s <<EOF
set -e
sudo mv /tmp/bee-new ${REMOTE_BIN}
sudo chmod +x ${REMOTE_BIN}
sudo systemctl restart ${SERVICES}
sleep 2
systemctl status ${SERVICES} --no-pager -l
EOF

echo "==> Готово."
|
||||||
Reference in New Issue
Block a user