Compare commits
72 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 75c33e073e | |||
| 7b4bcc745a | |||
| 42774d44a6 | |||
| 5dc022ddf8 | |||
| 6623e159f5 | |||
| bbd6d009f8 | |||
| 6c2b188ec9 | |||
| 14505ef24a | |||
| 4f20c9246d | |||
| eed157c2db | |||
| a2c8aea0df | |||
| b21f03cd26 | |||
| cac5b9c86e | |||
| b5d04ef045 | |||
| fcd64438ea | |||
| 0e39e7d960 | |||
| 58d6da0e4f | |||
| 7ce73e34a4 | |||
| 8a21809ade | |||
| 626763e31d | |||
| 0b8a2ff83f | |||
| 2c22b01fe3 | |||
| ec89616585 | |||
| c0dbbf96ad | |||
| 76484b123c | |||
| 8901596152 | |||
| 7c504e5056 | |||
| 333c44f3ba | |||
| 3bca821d3e | |||
| 3648e37a1e | |||
| d109e08fab | |||
| 11d00b9442 | |||
| 6defa5ae15 | |||
| c76658ed00 | |||
| 2163017a98 | |||
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
| 7a8f884664 | |||
| 8bf8dfa45b | |||
| 6a22199aff | |||
| ddb2bb5d1c | |||
| aa284ae754 | |||
| 8512098174 | |||
| 6b5d22c194 | |||
| a35e90a93e | |||
| 1ced81707f | |||
| 679aeb9947 | |||
| 647e99b697 | |||
| 4af997f436 | |||
| 6caace0cc0 | |||
| 5f0103635b | |||
| 84a2551dc0 | |||
| 1cfabc9230 | |||
| 5dc711de23 | |||
| ab802719f8 | |||
| a94e8007f8 | |||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 |
@@ -3,3 +3,4 @@
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -63,14 +64,20 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
return runPreflight(args[1:], stdout, stderr)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM(args[1:], stdout, stderr)
|
||||
case "support-bundle":
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "blackbox":
|
||||
return runBlackbox(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -85,11 +92,14 @@ func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee install-to-ram
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -102,14 +112,20 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM([]string{"--help"}, stdout, stdout)
|
||||
case "support-bundle":
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "blackbox":
|
||||
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -241,6 +257,32 @@ func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runInstallToRAM(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("install-to-ram", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee install-to-ram")
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(stdout, s) }
|
||||
if err := application.RunInstallToRAM(context.Background(), logLine); err != nil {
|
||||
slog.Error("run install-to-ram", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -335,6 +377,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||
slog.Error("run blackbox", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
@@ -462,6 +531,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
|
||||
+60
-14
@@ -19,20 +19,22 @@ import (
|
||||
)
|
||||
|
||||
// Default on-disk locations used by bee. Every path is derived from
// DefaultExportDir, so overriding that single variable before the others are
// read is not sufficient: the derived values are computed at package
// initialisation.
//
// The block previously listed each name twice (the pre-change and post-change
// sets of the same hunk), which is a redeclaration error in Go. Only the
// post-change set is kept; it is a strict superset of the earlier one.
var (
	DefaultExportDir                     = "/appdata/bee/export"
	DefaultAuditJSONPath                 = DefaultExportDir + "/bee-audit.json"
	DefaultAuditLogPath                  = DefaultExportDir + "/bee-audit.log"
	DefaultWebLogPath                    = DefaultExportDir + "/bee-web.log"
	DefaultNetworkLogPath                = DefaultExportDir + "/bee-network.log"
	DefaultNvidiaLogPath                 = DefaultExportDir + "/bee-nvidia.log"
	DefaultSSHLogPath                    = DefaultExportDir + "/bee-sshsetup.log"
	DefaultRuntimeJSONPath               = DefaultExportDir + "/runtime-health.json"
	DefaultRuntimeLogPath                = DefaultExportDir + "/runtime-health.log"
	DefaultTechDumpDir                   = DefaultExportDir + "/techdump"
	DefaultSATBaseDir                    = DefaultExportDir + "/bee-sat"
	DefaultBeeBenchBaseDir               = DefaultExportDir + "/bee-bench"
	DefaultBeeBenchAutotuneDir           = DefaultBeeBenchBaseDir + "/autotune"
	DefaultBeeBenchPerfDir               = DefaultBeeBenchBaseDir + "/perf"
	DefaultBeeBenchPowerDir              = DefaultBeeBenchBaseDir + "/power"
	DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
)
|
||||
|
||||
type App struct {
|
||||
@@ -125,6 +127,7 @@ type satRunner interface {
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -572,6 +575,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
@@ -579,9 +587,47 @@ func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts p
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -123,6 +124,7 @@ type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
@@ -163,6 +165,13 @@ func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts pla
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||
if f.runNvidiaAutotuneFn != nil {
|
||||
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
@@ -809,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -836,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
var manifest string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -852,6 +868,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read manifest entry: %v", err)
|
||||
}
|
||||
manifest = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
@@ -895,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(manifest, "files:") {
|
||||
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||
}
|
||||
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,779 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Tunables for the blackbox recorder.
const (
|
||||
// blackboxMarkerName is the marker file name, joined onto a target's
// mountpoint when the marker is removed.
blackboxMarkerName = ".bee-blackbox"
|
||||
// blackboxDiscoverInterval is the ticker period between reconcile passes
// in RunBlackbox.
blackboxDiscoverInterval = 2 * time.Second
|
||||
// blackboxMinFlushPeriod is the lower clamp for a worker's flush period
// and the period a new worker starts with.
blackboxMinFlushPeriod = 1 * time.Second
|
||||
// blackboxMaxFlushPeriod is the upper clamp for a worker's flush period.
blackboxMaxFlushPeriod = 30 * time.Second
|
||||
// blackboxRecoveryFastCount is how many consecutive fast cycles (duration
// at most half the flush period) are needed before the period is shortened.
blackboxRecoveryFastCount = 5
|
||||
)
|
||||
|
||||
// DefaultBlackboxStatePath is the state file used when callers pass a blank
// path to RunBlackbox or ReadBlackboxState.
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||
|
||||
// Package-level indirections for process execution and the clock.
// NOTE(review): presumably these exist so tests can substitute them — confirm.
var (
|
||||
blackboxExecCommand = exec.Command
|
||||
// blackboxNow returns the current time in UTC.
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||
)
|
||||
|
||||
// BlackboxMarker is the JSON document stored on an enrolled removable target.
// EnableBlackboxTarget creates one with Version 1, a fresh enrollment ID, an
// RFC 3339 creation timestamp, and the local host name.
type BlackboxMarker struct {
|
||||
Version int `json:"version"`
|
||||
// EnrollmentID identifies the target; an empty value is treated as "no marker".
EnrollmentID string `json:"enrollment_id"`
|
||||
CreatedAtUTC string `json:"created_at_utc"`
|
||||
Host string `json:"host,omitempty"`
|
||||
}
|
||||
|
||||
// BlackboxTargetStatus is the per-target entry in the persisted blackbox
// state. It is filled from a worker's fields when the state file is written.
type BlackboxTargetStatus struct {
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
Device string `json:"device"`
|
||||
// FS carries the full removable-target description the worker is using.
FS platform.RemovableTarget `json:"fs"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
// Status is "running" or "degraded", as set by the worker after each cycle.
Status string `json:"status"`
|
||||
// LastSyncAtUTC is RFC 3339 and omitted until a cycle has succeeded.
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||
// LastCycleDuration is a time.Duration string, omitted while it is zero.
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||
// FlushPeriod is the worker's current flush period as a time.Duration string.
FlushPeriod string `json:"flush_period"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
Mountpoint string `json:"mountpoint,omitempty"`
|
||||
}
|
||||
|
||||
// BlackboxState is the JSON document written to the blackbox state file and
// decoded by ReadBlackboxState.
type BlackboxState struct {
|
||||
// Status is "disabled" with no workers, "running" with at least one, and
// "degraded" if any worker is degraded.
Status string `json:"status"`
|
||||
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||
// Targets is sorted by enrollment ID when persisted.
Targets []BlackboxTargetStatus `json:"targets"`
|
||||
}
|
||||
|
||||
// blackboxRuntime is the state shared by one RunBlackbox invocation: fixed
// configuration plus the set of active per-target workers.
type blackboxRuntime struct {
|
||||
exportDir string
|
||||
statePath string
|
||||
system *platform.System
|
||||
bootStarted time.Time
|
||||
// bootFolder is the per-boot directory name created on each target.
bootFolder string
|
||||
|
||||
// mu is held while workers is read or modified and while state is persisted.
mu sync.Mutex
|
||||
// workers is keyed by enrollment ID.
workers map[string]*blackboxWorker
|
||||
}
|
||||
|
||||
// discoveredBlackboxTarget is one removable target found carrying a valid
// marker during a discovery pass.
type discoveredBlackboxTarget struct {
|
||||
marker BlackboxMarker
|
||||
target platform.RemovableTarget
|
||||
// seenMount is the mountpoint used while probing; if bee mounted it for
// the probe it has already been unmounted again.
seenMount string
|
||||
// mountedByBee reports whether the probe mount was created by bee.
mountedByBee bool
|
||||
}
|
||||
|
||||
// blackboxWorker periodically syncs data to a single enrolled target. One
// goroutine per worker executes run until stop is called.
type blackboxWorker struct {
|
||||
runtime *blackboxRuntime
|
||||
enrollmentID string
|
||||
|
||||
// mu is held by the worker's methods and by state persistence when they
// read or write the mutable fields from target through fastCycles.
mu sync.Mutex
|
||||
target platform.RemovableTarget
|
||||
marker BlackboxMarker
|
||||
mountpoint string
|
||||
mountedByBee bool
|
||||
status string
|
||||
lastSyncAt time.Time
|
||||
lastDuration time.Duration
|
||||
flushPeriod time.Duration
|
||||
lastError string
|
||||
// fastCycles counts consecutive successful cycles that finished within
// half the flush period.
fastCycles int
|
||||
// stopCh is closed to request shutdown.
stopCh chan struct{}
|
||||
// stoppedCh is closed by run when the worker goroutine exits.
stoppedCh chan struct{}
|
||||
}
|
||||
|
||||
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
exportDir = DefaultExportDir
|
||||
}
|
||||
statePath = strings.TrimSpace(statePath)
|
||||
if statePath == "" {
|
||||
statePath = DefaultBlackboxStatePath
|
||||
}
|
||||
if system == nil {
|
||||
system = platform.New()
|
||||
}
|
||||
bootStarted, err := bootStartedAtUTC()
|
||||
if err != nil {
|
||||
bootStarted = blackboxNow()
|
||||
}
|
||||
rt := &blackboxRuntime{
|
||||
exportDir: exportDir,
|
||||
statePath: statePath,
|
||||
system: system,
|
||||
bootStarted: bootStarted,
|
||||
bootFolder: SupportBundleBaseName(bootStarted),
|
||||
workers: make(map[string]*blackboxWorker),
|
||||
}
|
||||
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||
rt.persistState()
|
||||
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
rt.reconcile()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
rt.stopAll()
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
path = DefaultBlackboxStatePath
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
var state BlackboxState
|
||||
if err := json.Unmarshal(raw, &state); err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
return state, nil
|
||||
}
|
||||
|
||||
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Device == "" {
|
||||
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||
if err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
defer func() {
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
if marker.EnrollmentID == "" {
|
||||
marker = BlackboxMarker{
|
||||
Version: 1,
|
||||
EnrollmentID: newBlackboxEnrollmentID(),
|
||||
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Host: hostnameOr("unknown"),
|
||||
}
|
||||
}
|
||||
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
return marker, nil
|
||||
}
|
||||
|
||||
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||
device = strings.TrimSpace(device)
|
||||
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||
if device == "" && enrollmentID == "" {
|
||||
return fmt.Errorf("device or enrollment_id is required")
|
||||
}
|
||||
system := platform.New()
|
||||
targets, err := system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, target := range targets {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||
if mountErr != nil {
|
||||
continue
|
||||
}
|
||||
remove := false
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err == nil {
|
||||
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||
remove = true
|
||||
}
|
||||
if device != "" && target.Device == device {
|
||||
remove = true
|
||||
}
|
||||
}
|
||||
if remove {
|
||||
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if remove {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return os.ErrNotExist
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) reconcile() {
|
||||
discovered, _ := rt.discoverMarkedTargets()
|
||||
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
|
||||
seen := make(map[string]struct{}, len(discovered))
|
||||
for _, found := range discovered {
|
||||
seen[found.marker.EnrollmentID] = struct{}{}
|
||||
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||
if !ok {
|
||||
worker = newBlackboxWorker(rt, found)
|
||||
rt.workers[found.marker.EnrollmentID] = worker
|
||||
go worker.run()
|
||||
continue
|
||||
}
|
||||
worker.update(found)
|
||||
}
|
||||
for id, worker := range rt.workers {
|
||||
if _, ok := seen[id]; ok {
|
||||
continue
|
||||
}
|
||||
worker.stop()
|
||||
delete(rt.workers, id)
|
||||
}
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) stopAll() {
|
||||
rt.mu.Lock()
|
||||
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||
for _, worker := range rt.workers {
|
||||
workers = append(workers, worker)
|
||||
}
|
||||
rt.workers = map[string]*blackboxWorker{}
|
||||
rt.persistStateLocked()
|
||||
rt.mu.Unlock()
|
||||
for _, worker := range workers {
|
||||
worker.stop()
|
||||
}
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||
targets, err := rt.system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []discoveredBlackboxTarget
|
||||
for _, rawTarget := range targets {
|
||||
target := sanitizeRemovableTarget(rawTarget)
|
||||
if target.Device == "" {
|
||||
continue
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||
if mountedByBee && !ok {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||
continue
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
out = append(out, discoveredBlackboxTarget{
|
||||
marker: marker,
|
||||
target: target,
|
||||
seenMount: mountpoint,
|
||||
mountedByBee: mountedByBee,
|
||||
})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||
})
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||
return &blackboxWorker{
|
||||
runtime: rt,
|
||||
enrollmentID: found.marker.EnrollmentID,
|
||||
target: found.target,
|
||||
marker: found.marker,
|
||||
flushPeriod: blackboxMinFlushPeriod,
|
||||
status: "running",
|
||||
stopCh: make(chan struct{}),
|
||||
stoppedCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) run() {
|
||||
defer close(w.stoppedCh)
|
||||
for {
|
||||
start := time.Now()
|
||||
err := w.syncCycle()
|
||||
duration := time.Since(start)
|
||||
w.finishCycle(duration, err)
|
||||
|
||||
wait := w.currentFlushPeriod()
|
||||
timer := time.NewTimer(wait)
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
timer.Stop()
|
||||
w.cleanup()
|
||||
return
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.target = found.target
|
||||
w.marker = found.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) stop() {
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
default:
|
||||
close(w.stopCh)
|
||||
}
|
||||
<-w.stoppedCh
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.flushPeriod
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
w.lastError = err.Error()
|
||||
w.fastCycles = 0
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||
} else {
|
||||
w.status = "running"
|
||||
w.lastSyncAt = blackboxNow()
|
||||
w.lastError = ""
|
||||
if duration <= w.flushPeriod/2 {
|
||||
w.fastCycles++
|
||||
} else {
|
||||
w.fastCycles = 0
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||
if current <= 0 {
|
||||
current = blackboxMinFlushPeriod
|
||||
}
|
||||
if duration <= 0 {
|
||||
duration = current
|
||||
}
|
||||
next := current
|
||||
if duration > current {
|
||||
growA := time.Duration(float64(current) * 1.25)
|
||||
growB := time.Duration(float64(duration) * 1.25)
|
||||
if growB > growA {
|
||||
next = growB
|
||||
} else {
|
||||
next = growA
|
||||
}
|
||||
}
|
||||
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||
next = time.Duration(float64(current) * 0.9)
|
||||
}
|
||||
if next < blackboxMinFlushPeriod {
|
||||
next = blackboxMinFlushPeriod
|
||||
}
|
||||
if next > blackboxMaxFlushPeriod {
|
||||
next = blackboxMaxFlushPeriod
|
||||
}
|
||||
return next
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) syncCycle() error {
|
||||
target, marker := w.snapshotTarget()
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
w.recordMountpoint(mountpoint, mountedByBee)
|
||||
|
||||
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.captureSnapshots(root); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(root)
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) cleanup() {
|
||||
w.mu.Lock()
|
||||
mountpoint := w.mountpoint
|
||||
mountedByBee := w.mountedByBee
|
||||
w.mu.Unlock()
|
||||
if mountedByBee && mountpoint != "" {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.target, w.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.mountpoint = mountpoint
|
||||
w.mountedByBee = mountedByBee
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, svc := range supportBundleServices {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistState() {
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistStateLocked() {
|
||||
state := BlackboxState{
|
||||
Status: "disabled",
|
||||
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||
BootFolder: rt.bootFolder,
|
||||
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||
}
|
||||
if len(rt.workers) > 0 {
|
||||
state.Status = "running"
|
||||
}
|
||||
for _, worker := range rt.workers {
|
||||
worker.mu.Lock()
|
||||
targetState := BlackboxTargetStatus{
|
||||
EnrollmentID: worker.enrollmentID,
|
||||
Device: worker.target.Device,
|
||||
FS: worker.target,
|
||||
BootFolder: rt.bootFolder,
|
||||
Status: worker.status,
|
||||
FlushPeriod: worker.flushPeriod.String(),
|
||||
LastError: worker.lastError,
|
||||
Mountpoint: worker.mountpoint,
|
||||
}
|
||||
if !worker.lastSyncAt.IsZero() {
|
||||
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||
}
|
||||
if worker.lastDuration > 0 {
|
||||
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||
}
|
||||
if worker.status == "degraded" {
|
||||
state.Status = "degraded"
|
||||
}
|
||||
worker.mu.Unlock()
|
||||
state.Targets = append(state.Targets, targetState)
|
||||
}
|
||||
sort.Slice(state.Targets, func(i, j int) bool {
|
||||
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||
})
|
||||
_ = writeJSONAtomic(rt.statePath, state)
|
||||
}
|
||||
|
||||
// bootStartedAtUTC reads /proc/stat and returns the instant given by its
// "btime" line (seconds since the Unix epoch) as a UTC time.
//
// It returns the read error when /proc/stat cannot be read, and a
// "boot time not found" error when there is no btime line or the first one
// found is malformed.
func bootStartedAtUTC() (time.Time, error) {
	raw, err := os.ReadFile("/proc/stat")
	if err != nil {
		return time.Time{}, err
	}

	for _, entry := range strings.Split(string(raw), "\n") {
		entry = strings.TrimSpace(entry)
		if !strings.HasPrefix(entry, "btime ") {
			continue
		}
		// Only the first btime line is considered; a malformed one ends the
		// search and falls through to the "not found" error.
		if fields := strings.Fields(entry); len(fields) == 2 {
			// The seconds value is parsed by treating it as an "<n>s" duration.
			if since, parseErr := time.ParseDuration(fields[1] + "s"); parseErr == nil {
				return time.Unix(int64(since/time.Second), 0).UTC(), nil
			}
		}
		break
	}
	return time.Time{}, fmt.Errorf("boot time not found")
}
|
||||
|
||||
// newBlackboxEnrollmentID returns an identifier of the form "bb-" followed by
// 16 hex characters taken from 8 random bytes. If the random source fails it
// falls back to "bb-<unix nanoseconds>".
func newBlackboxEnrollmentID() string {
	entropy := make([]byte, 8)
	if _, err := rand.Read(entropy); err != nil {
		return fmt.Sprintf("bb-%d", time.Now().UnixNano())
	}
	return "bb-" + hex.EncodeToString(entropy)
}
|
||||
|
||||
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||
target.Device = strings.TrimSpace(target.Device)
|
||||
target.FSType = strings.TrimSpace(target.FSType)
|
||||
target.Size = strings.TrimSpace(target.Size)
|
||||
target.Label = strings.TrimSpace(target.Label)
|
||||
target.Model = strings.TrimSpace(target.Model)
|
||||
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||
return target
|
||||
}
|
||||
|
||||
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Mountpoint != "" {
|
||||
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||
return target.Mountpoint, false, nil
|
||||
}
|
||||
}
|
||||
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", false, err
|
||||
}
|
||||
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||
}
|
||||
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||
_ = unmountTarget(mountpoint)
|
||||
return "", false, err
|
||||
}
|
||||
return mountpoint, true, nil
|
||||
}
|
||||
|
||||
func unmountTarget(mountpoint string) error {
|
||||
_ = blackboxExecCommand("sync").Run()
|
||||
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, false, os.ErrNotExist
|
||||
}
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
var marker BlackboxMarker
|
||||
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
return marker, true, nil
|
||||
}
|
||||
|
||||
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||
if marker.Version == 0 {
|
||||
marker.Version = 1
|
||||
}
|
||||
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||
}
|
||||
|
||||
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||
seen := make(map[string]struct{})
|
||||
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(srcDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel = filepath.Clean(rel)
|
||||
if rel == "." {
|
||||
seen["."] = struct{}{}
|
||||
return os.MkdirAll(dstDir, 0755)
|
||||
}
|
||||
seen[rel] = struct{}{}
|
||||
dstPath := filepath.Join(dstDir, rel)
|
||||
if d.IsDir() {
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||
}
|
||||
return copyFileIfChanged(path, dstPath)
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return removeMissingPaths(dstDir, seen)
|
||||
}
|
||||
|
||||
// removeMissingPaths deletes every entry below dstDir whose path relative to
// dstDir is not a key of seen. dstDir itself is never removed.
//
// It returns the first walk or removal error encountered.
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
	return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(dstDir, path)
		if err != nil {
			return err
		}
		rel = filepath.Clean(rel)
		if rel == "." {
			return nil
		}
		if _, ok := seen[rel]; ok {
			return nil
		}
		if err := os.RemoveAll(path); err != nil {
			return err
		}
		// The directory no longer exists; tell WalkDir not to descend into
		// it, otherwise its ReadDir fails with ENOENT and aborts the walk.
		if d.IsDir() {
			return filepath.SkipDir
		}
		return nil
	})
}
|
||||
|
||||
func copyFileIfChanged(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return os.MkdirAll(dst, info.Mode().Perm())
|
||||
}
|
||||
srcData, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||
return nil
|
||||
}
|
||||
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||
}
|
||||
|
||||
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||
if len(raw) == 0 {
|
||||
if err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
} else {
|
||||
raw = []byte("no output\n")
|
||||
}
|
||||
}
|
||||
return writeFileAtomic(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeJSONAtomic(path string, v any) error {
|
||||
raw, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
raw = append(raw, '\n')
|
||||
return writeFileAtomic(path, raw, 0644)
|
||||
}
|
||||
|
||||
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||
return nil
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(filepath.Dir(path))
|
||||
}
|
||||
|
||||
func syncFilesystem(path string) error {
|
||||
return blackboxExecCommand("sync").Run()
|
||||
}
|
||||
|
||||
// ensureWritableBlackboxMountpoint checks that files can be created in
// mountpoint by creating, closing and deleting a temporary probe file.
// A creation failure is wrapped as "target filesystem is not writable";
// close and remove failures are returned as-is (close takes precedence).
func ensureWritableBlackboxMountpoint(mountpoint string) error {
	f, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
	if err != nil {
		return fmt.Errorf("target filesystem is not writable: %w", err)
	}

	probePath := f.Name()
	closeErr := f.Close()
	// The probe is removed regardless of whether Close succeeded.
	removeErr := os.Remove(probePath)
	if closeErr != nil {
		return closeErr
	}
	return removeErr
}
|
||||
|
||||
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
|
||||
current := 2 * time.Second
|
||||
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
|
||||
if got <= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
|
||||
current := 10 * time.Second
|
||||
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
|
||||
if got >= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
|
||||
}
|
||||
if got < blackboxMinFlushPeriod {
|
||||
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadBlackboxState(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "blackbox-state.json")
|
||||
want := BlackboxState{
|
||||
Status: "running",
|
||||
BootStartedAtUTC: "2026-04-24T00:00:00Z",
|
||||
BootFolder: "boot-folder",
|
||||
UpdatedAtUTC: "2026-04-24T00:00:01Z",
|
||||
Targets: []BlackboxTargetStatus{{
|
||||
EnrollmentID: "bb-1",
|
||||
Device: "/dev/sdb1",
|
||||
Status: "running",
|
||||
FlushPeriod: "1s",
|
||||
}},
|
||||
}
|
||||
if err := writeJSONAtomic(path, want); err != nil {
|
||||
t.Fatalf("writeJSONAtomic: %v", err)
|
||||
}
|
||||
got, err := ReadBlackboxState(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadBlackboxState: %v", err)
|
||||
}
|
||||
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
|
||||
t.Fatalf("state=%+v", got)
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bee/audit/internal/platform"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -14,6 +15,7 @@ import (
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-blackbox.service",
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
@@ -22,6 +24,8 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"display-manager.service",
|
||||
"lightdm.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
@@ -42,12 +46,128 @@ var supportBundleCommands = []struct {
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/dmesg-gui-video-input.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'nvidia|drm|fb|framebuffer|vesa|efi|lightdm|Xorg|input|hid|usb|keyboard|mouse|virtual keyboard|virtual mouse|ami|aspeed|ast' || echo "no GUI/video/input kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-sessions.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-sessions 2>&1 || true
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-seats.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-seats 2>&1 || true
|
||||
echo
|
||||
for seat in $(loginctl list-seats --no-legend 2>/dev/null | awk '{print $1}'); do
|
||||
echo "=== $seat ==="
|
||||
loginctl seat-status "$seat" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ps-gui.txt", cmd: []string{"sh", "-c", `
|
||||
ps -ef | grep -iE 'lightdm|Xorg|X$|openbox|chromium|chrome|xinit|xsession' | grep -v grep || echo "no GUI processes found"
|
||||
`}},
|
||||
{name: "system/lspci-video-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
lspci -s "$dev" -vv 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no display-class PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/proc-fb.txt", cmd: []string{"cat", "/proc/fb"}},
|
||||
{name: "system/drm-cards.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -d /sys/class/drm ]; then
|
||||
for path in /sys/class/drm/card*; do
|
||||
[ -e "$path" ] || continue
|
||||
card=$(basename "$path")
|
||||
echo "=== $card ==="
|
||||
for f in status enabled dpms modes; do
|
||||
[ -r "$path/$f" ] && printf " %-8s %s\n" "$f" "$(cat "$path/$f" 2>/dev/null)"
|
||||
done
|
||||
device=$(readlink -f "$path/device" 2>/dev/null || true)
|
||||
[ -n "$device" ] && echo " device ${device##*/}"
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "/sys/class/drm not present"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/input-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -r /proc/bus/input/devices ]; then
|
||||
cat /proc/bus/input/devices
|
||||
else
|
||||
echo "/proc/bus/input/devices not readable"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/udevadm-input.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v udevadm >/dev/null 2>&1; then
|
||||
echo "udevadm not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in /dev/input/event*; do
|
||||
[ -e "$dev" ] || continue
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
udevadm info --query=all --name="$dev" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no /dev/input/event* devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/xinput-list.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v xinput >/dev/null 2>&1; then
|
||||
DISPLAY=:0 xinput --list 2>&1 || true
|
||||
else
|
||||
echo "xinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/libinput-list-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v libinput >/dev/null 2>&1; then
|
||||
libinput list-devices 2>&1 || true
|
||||
else
|
||||
echo "libinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-gui-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'lightdm|display-manager|Xorg' || echo "no failed GUI units"
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
@@ -234,6 +354,13 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/Xorg.0.log", src: "/var/log/Xorg.0.log"},
|
||||
{name: "system/Xorg.0.log.old", src: "/var/log/Xorg.0.log.old"},
|
||||
{name: "system/lightdm/lightdm.log", src: "/var/log/lightdm/lightdm.log"},
|
||||
{name: "system/lightdm/x-0.log", src: "/var/log/lightdm/x-0.log"},
|
||||
{name: "system/lightdm/x-0-greeter.log", src: "/var/log/lightdm/x-0-greeter.log"},
|
||||
{name: "system/home-bee-xsession-errors.log", src: "/home/bee/.xsession-errors"},
|
||||
{name: "system/home-bee-chromium-debug.log", src: "/tmp/bee-chrome/chrome_debug.log"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
@@ -255,11 +382,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
@@ -293,7 +415,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
@@ -301,6 +423,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func SupportBundleBaseName(at time.Time) string {
|
||||
at = at.UTC()
|
||||
date := at.Format("2006-01-02")
|
||||
tod := at.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
@@ -424,6 +556,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||
if strings.TrimSpace(cfg.Reason) != "" {
|
||||
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
|
||||
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -34,6 +34,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
}
|
||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
bestEffortRescanHotplugStorage()
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
@@ -44,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile holds tuning parameters for one or more board manufacturers.
type ipmiProfile struct {
	// name is shown in log messages.
	name string
	// manufacturers is a list of lowercase substrings matched against the
	// board manufacturer string from dmidecode type 1.
	manufacturers []string
	// fruTimeout is the hard deadline for the entire `ipmitool fru print`
	// command. Zero means no timeout (not recommended).
	fruTimeout time.Duration
	// sdrTimeout is the hard deadline for `ipmitool sdr`.
	sdrTimeout time.Duration
	// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
	mcInfoTimeout time.Duration
	// fruEarlyExit instructs the streaming FRU parser to stop reading
	// after it has found at least one PSU entry and the current block is
	// complete. Useful on servers with many non-PSU FRU devices.
	fruEarlyExit bool
}

// ipmiProfiles is the ordered list of profiles. First match wins.
var ipmiProfiles = []ipmiProfile{
	{
		// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
		// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
		// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
		// exit so collection stops once PSU records are found.
		name:          "lenovo",
		manufacturers: []string{"lenovo"},
		fruTimeout:    90 * time.Second,
		sdrTimeout:    45 * time.Second,
		mcInfoTimeout: 15 * time.Second,
		fruEarlyExit:  true,
	},
	{
		// HPE iLO-based servers (ProLiant DL/ML/BL).
		// "hewlett-packard" covers the hyphenated manufacturer spelling,
		// which contains neither "hp" nor "hewlett packard".
		name:          "hpe",
		manufacturers: []string{"hp", "hewlett packard", "hewlett-packard"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
		fruEarlyExit:  false,
	},
	{
		// Dell iDRAC-based servers.
		name:          "dell",
		manufacturers: []string{"dell"},
		fruTimeout:    60 * time.Second,
		sdrTimeout:    30 * time.Second,
		mcInfoTimeout: 10 * time.Second,
		fruEarlyExit:  false,
	},
}

// defaultIPMIProfile is used when no vendor profile matches.
var defaultIPMIProfile = ipmiProfile{
	name:          "default",
	fruTimeout:    60 * time.Second,
	sdrTimeout:    30 * time.Second,
	mcInfoTimeout: 10 * time.Second,
	fruEarlyExit:  false,
}

// selectIPMIProfile returns the profile for the given board manufacturer.
//
// The manufacturer is trimmed and lowercased, then compared by substring
// against each profile's manufacturers list in declaration order; the first
// profile with a matching entry is returned. When nothing matches (including
// an empty manufacturer) defaultIPMIProfile is returned.
func selectIPMIProfile(manufacturer string) ipmiProfile {
	mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
	for _, p := range ipmiProfiles {
		for _, m := range p.manufacturers {
			if strings.Contains(mfgLower, m) {
				return p
			}
		}
	}
	return defaultIPMIProfile
}
|
||||
@@ -4,7 +4,9 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -140,6 +142,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
||||
dev.IOMMUGroup = &group
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
@@ -179,6 +184,21 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
return dev
|
||||
}
|
||||
|
||||
// readPCIIOMMUGroup looks up the IOMMU group of the PCI device with the given
// BDF. It reads the sysfs symlink
// /sys/bus/pci/devices/<bdf>/iommu_group and parses the last path element of
// its target (.../kernel/iommu_groups/<N>) as the group number.
// The boolean is false when the link cannot be read or does not end in a
// number.
func readPCIIOMMUGroup(bdf string) (int, bool) {
	groupDir, err := os.Readlink("/sys/bus/pci/devices/" + bdf + "/iommu_group")
	if err != nil {
		return 0, false
	}
	if group, convErr := strconv.Atoi(filepath.Base(groupDir)); convErr == nil {
		return group, true
	}
	return 0, false
}
|
||||
|
||||
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
||||
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
||||
base := "/sys/bus/pci/devices/" + bdf
|
||||
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -160,6 +234,9 @@ type psuSDR struct {
|
||||
}
|
||||
|
||||
var psuSlotPatterns = []*regexp.Regexp{
|
||||
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||
// does not fire after the digit; match explicitly with underscore terminator.
|
||||
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||
|
||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
// MSI underscore format — \b does not fire between digit and '_'
|
||||
{name: "PSU1_POWER_IN", want: 1},
|
||||
{name: "PSU2_POWER_OUT", want: 2},
|
||||
{name: "PSU4_STATUS", want: 4},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||
t.Parallel()
|
||||
raw := `
|
||||
PSU1_STATUS | F1h | ok
|
||||
PSU1_POWER_OUT | 928 Watts | ok
|
||||
PSU1_POWER_IN | 976 Watts | ok
|
||||
PSU2_STATUS | F2h | ok
|
||||
PSU2_POWER_OUT | 944 Watts | ok
|
||||
PSU2_POWER_IN | 992 Watts | ok
|
||||
`
|
||||
got := parsePSUSDR(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d want 2", len(got))
|
||||
}
|
||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||
}
|
||||
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||
}
|
||||
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
// parseMDAdmPlatformLicense scans raw line by line for the first line that
// starts (case-insensitively, after trimming) with "license" and has a
// non-empty value after its first ':'. That value is returned trimmed and
// lower-cased. It returns nil when no such line exists.
func parseMDAdmPlatformLicense(raw string) *string {
	for _, rawLine := range strings.Split(raw, "\n") {
		entry := strings.TrimSpace(rawLine)
		if !strings.HasPrefix(strings.ToLower(entry), "license") {
			continue
		}
		_, after, found := strings.Cut(entry, ":")
		if !found {
			continue
		}
		// Lines with an empty value are skipped so a later line can still match.
		if license := strings.ToLower(strings.TrimSpace(after)); license != "" {
			return &license
		}
	}
	return nil
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -4,12 +4,52 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Package-level hooks and patterns used by the storage collector. The path
// and function variables are plain variables (not constants) so that tests
// can point them at temporary files and stubs.
var (
|
||||
// pciRescanPath is the file bestEffortRescanHotplugStorage writes "1\n" to.
pciRescanPath = "/sys/bus/pci/rescan"
|
||||
// scsiHostScanGlob matches the per-host scan files that receive "- - -\n".
scsiHostScanGlob = "/sys/class/scsi_host/host*/scan"
|
||||
// hotplugWriteFile, hotplugExecCommand and hotplugGlob default to the
// standard-library implementations and are called through these variables.
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
// nvmeLBAFCompactRE matches an "lbaf N : ms:<m> lbads:<d> ... (in use)" line,
// capturing the metadata size and the LBA data-size exponent.
nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
|
||||
// nvmeLBAFVerboseRE matches the human-readable "LBA Format N : Metadata Size:
// <m> bytes - Data Size: <d> bytes ... (in use)" line, capturing both sizes.
nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
|
||||
// sgReadcapBlockRE captures the number in "logical block length=<n> bytes".
sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
|
||||
// sgReadcapProtRE reports whether the text contains "prot_en=1".
sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
|
||||
)
|
||||
|
||||
func bestEffortRescanHotplugStorage() {
|
||||
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
|
||||
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
|
||||
} else {
|
||||
slog.Info("storage: triggered pci rescan for hotplug discovery")
|
||||
}
|
||||
|
||||
hostPaths, err := hotplugGlob(scsiHostScanGlob)
|
||||
if err != nil {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
slog.Info("storage: triggered scsi host scan", "path", path)
|
||||
}
|
||||
}
|
||||
|
||||
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
|
||||
if err != nil {
|
||||
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
@@ -35,6 +75,8 @@ type lsblkDevice struct {
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec string `json:"log-sec"`
|
||||
PhySec string `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -101,7 +143,7 @@ func isVirtualHDiskModel(model string) bool {
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
|
||||
if err != nil {
|
||||
slog.Warn("storage: lsblk failed", "err", err)
|
||||
return nil
|
||||
@@ -208,6 +250,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -250,6 +293,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var raw map[string]any
|
||||
_ = json.Unmarshal(out, &raw)
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -302,8 +347,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeRemainingPct = &value
|
||||
case 241:
|
||||
value := attr.Raw.Value
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.WrittenBytes = &value
|
||||
case 242:
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.ReadBytes = &value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
s.CurrentPendingSectors = &pending
|
||||
@@ -321,6 +369,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
applySCSISmartctlTelemetry(&s, raw, &status)
|
||||
applySCSIProtectionBlockGeometry(&s, devPath)
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
@@ -368,6 +418,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
@@ -402,6 +453,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
}
|
||||
}
|
||||
applyNVMeBlockGeometry(&s, devPath)
|
||||
|
||||
// smart-log: wear telemetry
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
@@ -477,6 +529,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
return units * 512000
|
||||
}
|
||||
|
||||
// smartLBAsToBytes converts a SMART LBA counter to bytes using a fixed
// 512 bytes per LBA. Zero and negative counts yield 0.
func smartLBAsToBytes(lbas int64) int64 {
	const bytesPerLBA = 512
	if lbas > 0 {
		return lbas * bytesPerLBA
	}
	return 0
}
|
||||
|
||||
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
|
||||
if s == nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_on_time.hours",
|
||||
"path:accumulated_power_on_time.hours",
|
||||
"path:power_on_time.hour",
|
||||
"path:accumulated_power_on_time.hour",
|
||||
); ok && v > 0 && s.PowerOnHours == nil {
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_cycle_count",
|
||||
"path:start_stop_cycle_count",
|
||||
"path:accumulated_start_stop_cycles",
|
||||
); ok && v > 0 && s.PowerCycles == nil {
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:scsi_grown_defect_list",
|
||||
"path:grown_defect_list",
|
||||
); ok && v > 0 && s.ReallocatedSectors == nil {
|
||||
s.ReallocatedSectors = &v
|
||||
if status != nil && status.reallocatedSectors == 0 {
|
||||
status.reallocatedSectors = v
|
||||
}
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:percentage_used_endurance_indicator",
|
||||
"path:scsi_percentage_used_endurance_indicator",
|
||||
); ok && v > 0 {
|
||||
if s.LifeUsedPct == nil {
|
||||
fv := float64(v)
|
||||
s.LifeUsedPct = &fv
|
||||
}
|
||||
if s.LifeRemainingPct == nil && v <= 100 {
|
||||
remaining := float64(100 - v)
|
||||
s.LifeRemainingPct = &remaining
|
||||
if status != nil && status.lifeRemainingPct == 0 {
|
||||
status.lifeRemainingPct = int64(remaining)
|
||||
}
|
||||
}
|
||||
}
|
||||
blockSize, hasBlockSize := firstInt64(raw,
|
||||
"path:logical_block_size",
|
||||
"path:block_size",
|
||||
"path:user_capacity.block_size",
|
||||
)
|
||||
if hasBlockSize && blockSize > 0 {
|
||||
if s.LogicalBlockSizeBytes == nil {
|
||||
s.LogicalBlockSizeBytes = &blockSize
|
||||
}
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_written",
|
||||
"path:total_lbas_written",
|
||||
); ok && v > 0 && s.WrittenBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.WrittenBytes = &bytes
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_read",
|
||||
"path:total_lbas_read",
|
||||
); ok && v > 0 && s.ReadBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.ReadBytes = &bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
logical := parseStorageBytes(dev.LogSec)
|
||||
physical := parseStorageBytes(dev.PhySec)
|
||||
if logical <= 0 && physical <= 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
if logical > 0 {
|
||||
s.LogicalBlockSizeBytes = &logical
|
||||
s.Telemetry["logical_block_size_bytes"] = logical
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
s.Telemetry["metadata_bytes_per_block"] = zero
|
||||
}
|
||||
}
|
||||
if physical > 0 {
|
||||
s.PhysicalBlockSizeBytes = &physical
|
||||
s.Telemetry["physical_block_size_bytes"] = physical
|
||||
}
|
||||
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
}
|
||||
}
|
||||
|
||||
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
|
||||
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.LogicalBlockSizeBytes = &dataBytes
|
||||
s.MetadataBytesPerBlock = &metadataBytes
|
||||
s.Telemetry["logical_block_size_bytes"] = dataBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
|
||||
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
// formatBlockFormat renders a block layout as "<data>+<metadata>" in base 10,
// for example "512+8".
func formatBlockFormat(dataBytes, metadataBytes int64) string {
	layout := strconv.AppendInt(nil, dataBytes, 10)
	layout = append(layout, '+')
	layout = strconv.AppendInt(layout, metadataBytes, 10)
	return string(layout)
}
|
||||
|
||||
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
|
||||
return 1 << lbads, ms, true
|
||||
}
|
||||
}
|
||||
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
ds, errDS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errDS == nil && ds > 0 {
|
||||
return ds, ms, true
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
m := sgReadcapBlockRE.FindStringSubmatch(raw)
|
||||
if len(m) != 2 {
|
||||
return 0, 0, false
|
||||
}
|
||||
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
|
||||
if err != nil || blockBytes <= 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
if sgReadcapProtRE.MatchString(raw) {
|
||||
return blockBytes, 8, true
|
||||
}
|
||||
return blockBytes, 0, true
|
||||
}
|
||||
|
||||
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
|
||||
for _, candidate := range candidates {
|
||||
if !strings.HasPrefix(candidate, "path:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimPrefix(candidate, "path:")
|
||||
if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
|
||||
return v, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// nestedInt64 walks root along path, where every key except the last must
// resolve to a nested map[string]any, and converts the final value to int64.
//
// Supported value types are the Go numeric types listed in the switch,
// json.Number, and decimal strings (surrounding whitespace is ignored).
// Floats are truncated toward zero. NaN, ±Inf and floats outside the int64
// range are rejected, because converting them to an integer is
// implementation-defined in Go and would otherwise produce a bogus counter.
//
// The second result is false when a key is missing, an intermediate value is
// not an object, or the final value cannot be represented as int64.
func nestedInt64(root map[string]any, path []string) (int64, bool) {
	var current any = root
	for _, key := range path {
		obj, ok := current.(map[string]any)
		if !ok {
			return 0, false
		}
		current, ok = obj[key]
		if !ok {
			return 0, false
		}
	}

	// fromFloat converts only finite, in-range values. Both comparisons are
	// false for NaN, so NaN is rejected as well.
	fromFloat := func(f float64) (int64, bool) {
		const limit = float64(1 << 63) // 2^63, the first value above MaxInt64
		if f >= -limit && f < limit {
			return int64(f), true
		}
		return 0, false
	}

	switch v := current.(type) {
	case float64:
		return fromFloat(v)
	case float32:
		return fromFloat(float64(v))
	case int:
		return int64(v), true
	case int64:
		return v, true
	case int32:
		return int64(v), true
	case json.Number:
		n, err := v.Int64()
		return n, err == nil
	case string:
		n, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
		return n, err == nil
	default:
		return 0, false
	}
}
|
||||
|
||||
type storageHealthStatus struct {
|
||||
hasOverall bool
|
||||
overallPassed bool
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseNVMeBlockFormatCompact(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
|
||||
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=1, p_type=1, p_i_exponent=0
|
||||
Logical block length=512 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=0, p_type=0, p_i_exponent=0
|
||||
Logical block length=4096 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 4096 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,12 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -31,3 +37,82 @@ func TestParseStorageBytes(t *testing.T) {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
rescanPath := filepath.Join(tmp, "pci-rescan")
|
||||
scanDir := filepath.Join(tmp, "scsi_host")
|
||||
host0Path := filepath.Join(scanDir, "host0", "scan")
|
||||
host1Path := filepath.Join(scanDir, "host1", "scan")
|
||||
argsPath := filepath.Join(tmp, "udevadm-args")
|
||||
toolPath := filepath.Join(tmp, "udevadm")
|
||||
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host0: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host1: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host0 scan: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host1 scan: %v", err)
|
||||
}
|
||||
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
||||
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
||||
t.Fatalf("write udevadm stub: %v", err)
|
||||
}
|
||||
|
||||
oldPath := os.Getenv("PATH")
|
||||
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
||||
t.Fatalf("set PATH: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
||||
|
||||
oldRescanPath := pciRescanPath
|
||||
oldSCSIGlob := scsiHostScanGlob
|
||||
oldWriteFile := hotplugWriteFile
|
||||
oldExecCommand := hotplugExecCommand
|
||||
oldGlob := hotplugGlob
|
||||
pciRescanPath = rescanPath
|
||||
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
defer func() {
|
||||
pciRescanPath = oldRescanPath
|
||||
scsiHostScanGlob = oldSCSIGlob
|
||||
hotplugWriteFile = oldWriteFile
|
||||
hotplugExecCommand = oldExecCommand
|
||||
hotplugGlob = oldGlob
|
||||
}()
|
||||
|
||||
bestEffortRescanHotplugStorage()
|
||||
|
||||
raw, err := os.ReadFile(rescanPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read rescan file: %v", err)
|
||||
}
|
||||
if string(raw) != "1\n" {
|
||||
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
||||
}
|
||||
for _, path := range []string{host0Path, host1Path} {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read scsi scan file %s: %v", path, err)
|
||||
}
|
||||
if string(raw) != "- - -\n" {
|
||||
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
||||
}
|
||||
}
|
||||
|
||||
args, err := os.ReadFile(argsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read udevadm args: %v", err)
|
||||
}
|
||||
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
||||
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplySCSISmartctlTelemetry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{
|
||||
"hours": float64(32123),
|
||||
},
|
||||
"accumulated_start_stop_cycles": float64(17),
|
||||
"scsi_grown_defect_list": float64(4),
|
||||
"percentage_used_endurance_indicator": float64(12),
|
||||
"logical_block_size": float64(4096),
|
||||
"logical_blocks_written": float64(1000),
|
||||
"logical_blocks_read": float64(2000),
|
||||
}
|
||||
|
||||
var disk schema.HardwareStorage
|
||||
status := storageHealthStatus{}
|
||||
applySCSISmartctlTelemetry(&disk, raw, &status)
|
||||
|
||||
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
|
||||
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
|
||||
}
|
||||
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
|
||||
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
|
||||
}
|
||||
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
|
||||
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
|
||||
}
|
||||
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
|
||||
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
|
||||
}
|
||||
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
|
||||
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
|
||||
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
|
||||
}
|
||||
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
|
||||
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
|
||||
}
|
||||
if status.reallocatedSectors != 4 {
|
||||
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
|
||||
}
|
||||
if status.lifeRemainingPct != 88 {
|
||||
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
powerOnHours := int64(10)
|
||||
writtenBytes := int64(20)
|
||||
lifeRemaining := 30.0
|
||||
disk := schema.HardwareStorage{
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
LifeRemainingPct: &lifeRemaining,
|
||||
}
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{"hours": float64(999)},
|
||||
"logical_block_size": float64(512),
|
||||
"logical_blocks_written": float64(999),
|
||||
"percentage_used_endurance_indicator": float64(50),
|
||||
}
|
||||
|
||||
applySCSISmartctlTelemetry(&disk, raw, nil)
|
||||
|
||||
if *disk.PowerOnHours != 10 {
|
||||
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
|
||||
}
|
||||
if *disk.WrittenBytes != 20 {
|
||||
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if *disk.LifeRemainingPct != 30 {
|
||||
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
|
||||
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSmartLBAsToBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lbas int64
|
||||
want int64
|
||||
}{
|
||||
{name: "zero", lbas: 0, want: 0},
|
||||
{name: "single lba", lbas: 1, want: 512},
|
||||
{name: "multiple lbas", lbas: 2048, want: 1048576},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
|
||||
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,735 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Settings for the benchmark power-source autotune feature.
const (
|
||||
// benchmarkPowerAutotuneVersion is stamped into a config by
// SaveBenchmarkPowerAutotuneConfig when the config carries no positive version.
benchmarkPowerAutotuneVersion = 1
|
||||
// NOTE(review): the three values below are presumably durations in seconds
// (idle phase, load phase, sampling interval) — confirm against their users.
benchmarkPowerAutotuneIdleSec = 60
|
||||
benchmarkPowerAutotuneLoadSec = 90
|
||||
benchmarkPowerAutotuneSampleInterval = 3
|
||||
// defaultBenchmarkPowerSourceConfigPath is returned by
// BenchmarkPowerSourceConfigPath when no base directory is given.
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||
)
|
||||
|
||||
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||
baseDir = strings.TrimSpace(baseDir)
|
||||
if baseDir == "" {
|
||||
return defaultBenchmarkPowerSourceConfigPath
|
||||
}
|
||||
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||
}
|
||||
|
||||
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg BenchmarkPowerAutotuneConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return fmt.Errorf("empty autotune config path")
|
||||
}
|
||||
if cfg.Version <= 0 {
|
||||
cfg.Version = benchmarkPowerAutotuneVersion
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||
}
|
||||
|
||||
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
// A file that does not exist is not an error. It returns an error for a blank
// path or when the removal fails for any other reason.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	switch err := os.Remove(path); {
	case err == nil, os.IsNotExist(err):
		return nil
	default:
		return err
	}
}
|
||||
|
||||
func normalizeBenchmarkPowerSource(source string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
return BenchmarkPowerSourceSDRPSUInput
|
||||
default:
|
||||
return BenchmarkPowerSourceDCMI
|
||||
}
|
||||
}
|
||||
|
||||
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: true,
|
||||
SelectedSource: selected,
|
||||
EffectiveSource: selected,
|
||||
Mode: "autotuned",
|
||||
Reason: strings.TrimSpace(cfg.Reason),
|
||||
ConfiguredAt: cfg.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
sources := sampleBenchmarkPowerSources()
|
||||
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||
}
|
||||
}
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||
}
|
||||
}
|
||||
|
||||
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||
decision := ResolveSystemPowerDecision(exportDir)
|
||||
if decision.EffectiveSource != "" {
|
||||
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||
return value, decision, nil
|
||||
} else if decision.Configured {
|
||||
fallback := BenchmarkPowerSourceDCMI
|
||||
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||
decision.EffectiveSource = fallback
|
||||
return fallbackValue, decision, nil
|
||||
}
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||
return 0, decision, err
|
||||
}
|
||||
}
|
||||
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||
}
|
||||
|
||||
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||
switch normalizeBenchmarkPowerSource(source) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
sdr := sampleIPMISDRPowerSensors()
|
||||
if sdr.PSUInW > 0 {
|
||||
return sdr.PSUInW, nil
|
||||
}
|
||||
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||
default:
|
||||
return queryIPMIServerPowerW()
|
||||
}
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceDCMI] = w
|
||||
}
|
||||
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||
if durationSec <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||
}
|
||||
close(stopCh)
|
||||
return <-doneCh
|
||||
}
|
||||
|
||||
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var samples []float64
|
||||
record := func() {
|
||||
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// benchmarkPowerAutotuneSample is one telemetry row captured during an
// autotune phase.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time, in seconds, since the phase started.
	ElapsedSec float64
	// GPUAvgUsagePct is the mean utilization across the sampled GPUs.
	GPUAvgUsagePct float64
	// CPUUsagePct is the CPU load reading taken with this row.
	CPUUsagePct float64
	// GPUSumPowerW is the summed power reported by the sampled GPUs.
	GPUSumPowerW float64
	// Sources maps a power source name to its reading in watts.
	Sources map[string]float64
}
|
||||
|
||||
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
var out []benchmarkPowerAutotuneSample
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
start := time.Now()
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return out
|
||||
}
|
||||
row := benchmarkPowerAutotuneSample{
|
||||
ElapsedSec: time.Since(start).Seconds(),
|
||||
CPUUsagePct: sampleCPULoadPct(),
|
||||
Sources: sampleBenchmarkPowerSources(),
|
||||
}
|
||||
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||
var usageSum float64
|
||||
for _, gpu := range gpuRows {
|
||||
row.GPUSumPowerW += gpu.PowerW
|
||||
usageSum += gpu.UsagePct
|
||||
}
|
||||
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||
}
|
||||
out = append(out, row)
|
||||
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||
if time.Now().After(deadline) {
|
||||
return out
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return out
|
||||
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||
} else {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||
}
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||
phase,
|
||||
sample.ElapsedSec,
|
||||
sample.GPUAvgUsagePct,
|
||||
sample.GPUSumPowerW,
|
||||
sample.CPUUsagePct,
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil || len(samples) == 0 {
|
||||
return
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
var gpuPower []float64
|
||||
sourceBuckets := map[string][]float64{}
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
values := sourceBuckets[source]
|
||||
if len(values) == 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||
continue
|
||||
}
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||
phase,
|
||||
len(samples),
|
||||
benchmarkMean(gpuUsage),
|
||||
benchmarkPercentile(gpuUsage, 95),
|
||||
benchmarkMean(gpuPower),
|
||||
benchmarkMean(cpuUsage),
|
||||
benchmarkPercentile(cpuUsage, 95),
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if !candidate.Available {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||
continue
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||
candidate.Source,
|
||||
candidate.IdleAvgW,
|
||||
candidate.LoadAvgW,
|
||||
candidate.DeltaW,
|
||||
gpuDelta,
|
||||
candidate.RelativeError,
|
||||
candidate.Confidence*100,
|
||||
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||
))
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||
result := &BenchmarkPowerAutotuneValidation{}
|
||||
if len(samples) == 0 {
|
||||
result.Reason = "no idle telemetry samples collected"
|
||||
return result
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
if sample.CPUUsagePct > 0 {
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
}
|
||||
}
|
||||
result.GPUSamples = len(gpuUsage)
|
||||
result.CPUSamples = len(cpuUsage)
|
||||
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||
switch {
|
||||
case result.GPUAvgUsagePct > 5:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||
case result.GPUP95UsagePct > 10:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||
case result.CPUAvgUsagePct > 20:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||
case result.CPUP95UsagePct > 35:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||
default:
|
||||
result.Valid = true
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||
idleBySource := map[string][]float64{}
|
||||
loadBySource := map[string][]float64{}
|
||||
var idleGPU []float64
|
||||
var loadGPU []float64
|
||||
for _, sample := range idle {
|
||||
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
idleBySource[source] = append(idleBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, sample := range load {
|
||||
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
loadBySource[source] = append(loadBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
idleGPUAvg := benchmarkMean(idleGPU)
|
||||
loadGPUAvg := benchmarkMean(loadGPU)
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
|
||||
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||
}
|
||||
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Available && candidate.DeltaW > 0 {
|
||||
available = append(available, candidate)
|
||||
}
|
||||
}
|
||||
if len(available) == 0 {
|
||||
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||
}
|
||||
sort.Slice(available, func(i, j int) bool {
|
||||
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||
if available[i].Source != available[j].Source {
|
||||
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
}
|
||||
if available[i].RelativeError != available[j].RelativeError {
|
||||
return available[i].RelativeError < available[j].RelativeError
|
||||
}
|
||||
return available[i].Samples > available[j].Samples
|
||||
})
|
||||
selected := available[0]
|
||||
for idx := range candidates {
|
||||
if candidates[idx].Source == selected.Source {
|
||||
candidates[idx].Selected = true
|
||||
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||
}
|
||||
}
|
||||
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||
}
|
||||
|
||||
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||
candidate := BenchmarkPowerAutotuneCandidate{
|
||||
Source: source,
|
||||
Available: len(idle) > 0 && len(load) > 0,
|
||||
Samples: minInt(len(idle), len(load)),
|
||||
}
|
||||
if !candidate.Available {
|
||||
return candidate
|
||||
}
|
||||
candidate.IdleAvgW = benchmarkMean(idle)
|
||||
candidate.LoadAvgW = benchmarkMean(load)
|
||||
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||
if gpuDelta > 0 {
|
||||
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||
}
|
||||
if result.IdleValidation != nil {
|
||||
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
}
|
||||
for _, candidate := range result.Candidates {
|
||||
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||
if candidate.Available {
|
||||
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if result.IdleValidation != nil {
|
||||
b.WriteString("## Idle Validation\n\n")
|
||||
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.Candidates) > 0 {
|
||||
b.WriteString("## Candidates\n\n")
|
||||
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||
for _, candidate := range result.Candidates {
|
||||
if !candidate.Available {
|
||||
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||
continue
|
||||
}
|
||||
selected := "no"
|
||||
if candidate.Selected {
|
||||
selected = "yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||
allDevices := joinIndexList(gpuIndices)
|
||||
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||
case "power-fit", "power", "nvidia-bench-power":
|
||||
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||
if err == nil {
|
||||
return cmd, "power-fit"
|
||||
}
|
||||
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||
default:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
if sizeMB > 0 {
|
||||
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||
}
|
||||
return cmd, "performance"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = "/var/log/bee-bench/autotune"
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
hostname, _ := os.Hostname()
|
||||
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
||||
result := BenchmarkPowerAutotuneResult{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Hostname: hostname,
|
||||
ServerModel: readServerModel(),
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
Status: "FAILED",
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
||||
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
||||
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
||||
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
||||
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
||||
if result.IdleValidation != nil {
|
||||
result.IdleValidationError = result.IdleValidation.Reason
|
||||
logFunc(result.IdleValidation.Reason)
|
||||
}
|
||||
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
||||
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
||||
go func() {
|
||||
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
||||
}()
|
||||
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
||||
loadSamples := <-loadSamplesCh
|
||||
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
||||
if runErr != nil {
|
||||
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
||||
}
|
||||
|
||||
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
||||
result.Candidates = candidates
|
||||
result.GPUPowerIdleW = idleGPUAvg
|
||||
result.GPUPowerLoadW = loadGPUAvg
|
||||
if chooseErr != nil {
|
||||
result.Notes = append(result.Notes, chooseErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, chooseErr
|
||||
}
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
||||
result.SelectedSource = selectedSource
|
||||
result.Status = "OK"
|
||||
var confidence float64
|
||||
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Selected {
|
||||
confidence = candidate.Confidence
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
selectionReason = candidate.SelectionNotes
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
cfg := BenchmarkPowerAutotuneConfig{
|
||||
Version: benchmarkPowerAutotuneVersion,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
SelectedSource: selectedSource,
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
Confidence: confidence,
|
||||
Reason: selectionReason,
|
||||
}
|
||||
result.Config = &cfg
|
||||
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
||||
result.Status = "FAILED"
|
||||
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
||||
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
return runDir, err
|
||||
}
|
||||
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
||||
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal autotune result: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||
return fmt.Errorf("write autotune result.json: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune report.md: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// Blank reference to exec.ErrNotFound. NOTE(review): presumably this exists
// only to keep the os/exec import in use — confirm before removing.
var _ = exec.ErrNotFound
|
||||
@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
// ── Server Power ───────────────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
title := "## Server Power\n\n"
|
||||
if sp.Source != "" {
|
||||
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||
}
|
||||
b.WriteString(title)
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
b.WriteString("Server power measurement unavailable.\n\n")
|
||||
} else {
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
@@ -164,6 +169,99 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
before := BenchmarkThrottleCounters{}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||
t.Fatal("unexpected reset call")
|
||||
return "", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
var logs []string
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||
logs = append(logs, line)
|
||||
})
|
||||
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||
t.Fatalf("logs=%q want substring %q", got, want)
|
||||
}
|
||||
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||
t.Fatalf("failed=%v want [0 2]", failed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
var calls []int
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
calls = append(calls, index)
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||
t.Fatalf("calls=%v want %s", calls, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
if index == 5 {
|
||||
return "busy\n", exec.ErrNotFound
|
||||
}
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||
t.Fatalf("failed=%v want %s", failed, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -179,6 +277,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
info benchmarkGPUInfo
|
||||
want int
|
||||
}{
|
||||
{
|
||||
name: "prefers default tdp over current derated limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 600,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 600,
|
||||
},
|
||||
{
|
||||
name: "caps default tdp to reported max limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 700,
|
||||
MaxPowerLimitW: 650,
|
||||
},
|
||||
want: 650,
|
||||
},
|
||||
{
|
||||
name: "falls back to current limit when default missing",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 525,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 525,
|
||||
},
|
||||
{
|
||||
name: "falls back to max limit when only that is known",
|
||||
info: benchmarkGPUInfo{
|
||||
MaxPowerLimitW: 575,
|
||||
},
|
||||
want: 575,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -338,12 +489,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Min Power Limit : 200.00 W
|
||||
Max Power Limit : 600.00 W
|
||||
Default Power Limit : 575.00 W
|
||||
Current Power Limit : 560.00 W
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
@@ -365,7 +520,7 @@ GPU 00000000:4F:00.0
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
@@ -379,25 +534,49 @@ GPU 00000000:4F:00.0
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].PowerLimitW != 560 {
|
||||
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Min Power Limit : 100.00 W
|
||||
Max Power Limit : 900.00 W
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||
0: {
|
||||
Index: 0,
|
||||
BusID: "00000000:4E:00.0",
|
||||
MaxGraphicsClockMHz: 2430,
|
||||
MaxMemoryClockMHz: 12481,
|
||||
MinPowerLimitW: 200,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,6 +43,11 @@ const (
|
||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||
)
|
||||
|
||||
// Identifiers for the power-benchmark load engines.
const (
	// BenchmarkPowerEngineDCGMProfTester names the "dcgmproftester" engine.
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	// BenchmarkPowerEngineTargetedPower names the "targeted_power" engine.
	BenchmarkPowerEngineTargetedPower = "targeted_power"
)
|
||||
|
||||
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||
// re-measure from actual task logs and update the constants here.
|
||||
@@ -61,7 +66,7 @@ const (
|
||||
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||
|
||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
|
||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||
@@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct {
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
RunNCCL bool
|
||||
ServerPowerSource string
|
||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||
RampTotal int // total number of ramp-up steps in this run
|
||||
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||
}
|
||||
|
||||
// Identifiers for the server power measurement sources.
const (
	// BenchmarkPowerSourceDCMI names the "dcmi" source.
	BenchmarkPowerSourceDCMI = "dcmi"
	// BenchmarkPowerSourceSDRPSUInput names the "sdr_psu_input" source.
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
|
||||
|
||||
// BenchmarkPowerAutotuneConfig is the JSON-serializable record of a power
// source selection. Version, UpdatedAt and SelectedSource are always emitted;
// every other field is dropped from the JSON when it holds its zero value.
type BenchmarkPowerAutotuneConfig struct {
	Version        int       `json:"version"`
	UpdatedAt      time.Time `json:"updated_at"`
	SelectedSource string    `json:"selected_source"`

	// Optional context describing the run the selection came from.
	BenchmarkKind     string  `json:"benchmark_kind,omitempty"`
	Profile           string  `json:"profile,omitempty"`
	IdleDurationSec   int     `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int     `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
	Confidence        float64 `json:"confidence,omitempty"`
	Reason            string  `json:"reason,omitempty"`
}
|
||||
|
||||
// SystemPowerSourceDecision is the JSON-serializable description of which
// system power source was chosen and why.
//
// NOTE(review): encoding/json's omitempty never omits struct values, so a
// zero ConfiguredAt is still serialized as "0001-01-01T00:00:00Z" despite the
// tag. Confirm whether consumers rely on that before changing it.
type SystemPowerSourceDecision struct {
	Configured      bool   `json:"configured"`
	SelectedSource  string `json:"selected_source,omitempty"`
	EffectiveSource string `json:"effective_source,omitempty"`
	// Mode is one of: autotuned, fallback, degraded.
	Mode         string    `json:"mode,omitempty"`
	Reason       string    `json:"reason,omitempty"`
	ConfiguredAt time.Time `json:"configured_at,omitempty"`
}
|
||||
|
||||
type BenchmarkPowerAutotuneResult struct {
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
Profile string `json:"profile,omitempty"`
|
||||
Status string `json:"status"`
|
||||
IdleDurationSec int `json:"idle_duration_sec"`
|
||||
LoadDurationSec int `json:"load_duration_sec"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||
SelectedSource string `json:"selected_source,omitempty"`
|
||||
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPowerAutotuneValidation is the JSON-serializable outcome of a
// validation step: a Valid flag (always emitted) plus optional GPU/CPU usage
// statistics, sample counts and a free-form reason.
type BenchmarkPowerAutotuneValidation struct {
	Valid bool `json:"valid"`

	// Average and 95th-percentile usage, in percent.
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`

	GPUSamples int    `json:"gpu_samples,omitempty"`
	CPUSamples int    `json:"cpu_samples,omitempty"`
	Reason     string `json:"reason,omitempty"`
}
|
||||
|
||||
// BenchmarkPowerAutotuneCandidate is the JSON-serializable record of one
// candidate power source. Source and Available are always emitted; the
// measurements and selection details are omitted when zero.
type BenchmarkPowerAutotuneCandidate struct {
	Source string `json:"source"`

	// Power readings in watts.
	IdleAvgW float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW float64 `json:"load_avg_w,omitempty"`
	DeltaW   float64 `json:"delta_w,omitempty"`

	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
@@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct {
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Available bool `json:"available"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Mode string `json:"mode,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -18,7 +19,7 @@ type InstallDisk struct {
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
const squashfsGlob = "/run/live/medium/live/*.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
files, err := filepath.Glob(squashfsGlob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
var total int64
|
||||
for _, path := range files {
|
||||
fi, statErr := os.Stat(path)
|
||||
if statErr != nil {
|
||||
continue
|
||||
}
|
||||
total += fi.Size()
|
||||
}
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
return total * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
free := freeMemBytes()
|
||||
min := MinInstallBytes()
|
||||
if free > 0 && min > 0 && free < (min*4/3) {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
return w
|
||||
|
||||
@@ -12,6 +12,23 @@ import (
|
||||
)
|
||||
|
||||
const (
	// installToRAMDir is the directory RunInstallToRAM uses as its copy
	// destination.
	installToRAMDir = "/dev/shm/bee-live"

	// copyProgressLogStep is the byte interval (100 MiB) that
	// shouldLogCopyProgress uses to space out progress lines.
	copyProgressLogStep int64 = 100 * 1024 * 1024
)

// Command and filesystem hooks. They are package-level variables so tests can
// replace them with stubs.
var (
	// liveMediumSquashfsGlob lists the *.squashfs files on the live medium.
	liveMediumSquashfsGlob = func() ([]string, error) {
		return filepath.Glob("/run/live/medium/live/*.squashfs")
	}

	// runRemountMedium runs bee-remount-medium and returns its combined
	// stdout/stderr.
	runRemountMedium = func() ([]byte, error) {
		return exec.Command("bee-remount-medium").CombinedOutput()
	}

	// umountLiveMedium unmounts /run/live/medium.
	umountLiveMedium = func() error {
		return exec.Command("umount", "/run/live/medium").Run()
	}

	// ejectDevice runs eject on the given device path.
	ejectDevice = func(device string) error {
		return exec.Command("eject", device).Run()
	}
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
@@ -139,8 +156,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
squashfsFiles, sourceAvailable := ensureLiveMediumAvailable(log)
|
||||
|
||||
dstDir := installToRAMDir
|
||||
|
||||
@@ -170,7 +186,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry (or run bee-remount-medium as root)", dstDir)
|
||||
}
|
||||
|
||||
{
|
||||
@@ -253,10 +269,83 @@ bindMedium:
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
detachInstallMedium(status, log)
|
||||
log("Done. Squashfs files are in RAM. Installation media has been detached when possible.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func tryRemountLiveMedium(log func(string)) error {
|
||||
output, err := runRemountMedium()
|
||||
trimmed := strings.TrimSpace(string(output))
|
||||
if err != nil {
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ensureLiveMediumAvailable(log func(string)) ([]string, bool) {
|
||||
squashfsFiles, err := liveMediumSquashfsGlob()
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable {
|
||||
return squashfsFiles, true
|
||||
}
|
||||
|
||||
if log != nil {
|
||||
log("Live medium not mounted at /run/live/medium — attempting automatic remount scan...")
|
||||
}
|
||||
if remountErr := tryRemountLiveMedium(log); remountErr != nil {
|
||||
if log != nil {
|
||||
log(fmt.Sprintf("Automatic remount did not restore the live medium: %v", remountErr))
|
||||
}
|
||||
return squashfsFiles, false
|
||||
}
|
||||
|
||||
squashfsFiles, err = liveMediumSquashfsGlob()
|
||||
sourceAvailable = err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable && log != nil {
|
||||
log("Live medium restored after remount scan.")
|
||||
}
|
||||
return squashfsFiles, sourceAvailable
|
||||
}
|
||||
|
||||
func detachInstallMedium(status LiveBootSource, log func(string)) {
|
||||
if log == nil {
|
||||
log = func(string) {}
|
||||
}
|
||||
|
||||
log("Detaching original installation medium...")
|
||||
if err := umountLiveMedium(); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not unmount /run/live/medium: %v", err))
|
||||
} else {
|
||||
log("Unmounted /run/live/medium.")
|
||||
}
|
||||
|
||||
device := strings.TrimSpace(status.Device)
|
||||
if device == "" {
|
||||
device = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if device == "" || !strings.HasPrefix(device, "/dev/") {
|
||||
log("No block device identified for eject; skipping media eject.")
|
||||
return
|
||||
}
|
||||
|
||||
if err := ejectDevice(device); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not eject %s: %v", device, err))
|
||||
return
|
||||
}
|
||||
log(fmt.Sprintf("Ejected %s.", device))
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
@@ -319,6 +408,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
var lastLogged int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -330,7 +420,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||
lastLogged = copied
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
@@ -345,6 +436,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||
if total <= 0 || copied <= 0 {
|
||||
return false
|
||||
}
|
||||
if copied >= total {
|
||||
return copied > lastLogged
|
||||
}
|
||||
if copied < copyProgressLogStep {
|
||||
return false
|
||||
}
|
||||
return copied-lastLogged >= copyProgressLogStep
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -101,3 +104,179 @@ func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestShouldLogCopyProgress(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
total := int64(250 * 1024 * 1024)
|
||||
step := int64(100 * 1024 * 1024)
|
||||
|
||||
if shouldLogCopyProgress(step-1, total, 0) {
|
||||
t.Fatal("progress logged too early")
|
||||
}
|
||||
if !shouldLogCopyProgress(step, total, 0) {
|
||||
t.Fatal("expected log at first 100MB boundary")
|
||||
}
|
||||
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||
t.Fatal("progress logged again before next 100MB")
|
||||
}
|
||||
if !shouldLogCopyProgress(2*step, total, step) {
|
||||
t.Fatal("expected log at second 100MB boundary")
|
||||
}
|
||||
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||
t.Fatal("expected final completion log")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTryRemountLiveMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
orig := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
runRemountMedium = orig
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("[10:57:31] Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
var logs []string
|
||||
if err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) }); err != nil {
|
||||
t.Fatalf("tryRemountLiveMedium() error = %v", err)
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: [10:57:31] Mounted /dev/sr1 on /run/live/medium" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("failure", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("must be run as root\n"), fmt.Errorf("exit status 1")
|
||||
}
|
||||
var logs []string
|
||||
err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) })
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: must be run as root" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestEnsureLiveMediumAvailableRemountsSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origGlob := liveMediumSquashfsGlob
|
||||
origRemount := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
liveMediumSquashfsGlob = origGlob
|
||||
runRemountMedium = origRemount
|
||||
})
|
||||
|
||||
callCount := 0
|
||||
liveMediumSquashfsGlob = func() ([]string, error) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
return nil, nil
|
||||
}
|
||||
return []string{"/run/live/medium/live/filesystem.squashfs"}, nil
|
||||
}
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
|
||||
var logs []string
|
||||
files, ok := ensureLiveMediumAvailable(func(msg string) { logs = append(logs, msg) })
|
||||
if !ok {
|
||||
t.Fatal("expected live medium to become available after remount")
|
||||
}
|
||||
if callCount < 2 {
|
||||
t.Fatalf("liveMediumSquashfsGlob called %d times, want at least 2", callCount)
|
||||
}
|
||||
if len(files) != 1 || files[0] != "/run/live/medium/live/filesystem.squashfs" {
|
||||
t.Fatalf("files=%v", files)
|
||||
}
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Live medium restored after remount scan." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("expected remount success log, logs=%v", logs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetachInstallMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origUmount := umountLiveMedium
|
||||
origEject := ejectDevice
|
||||
t.Cleanup(func() {
|
||||
umountLiveMedium = origUmount
|
||||
ejectDevice = origEject
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
var umountCalled bool
|
||||
var ejected string
|
||||
umountLiveMedium = func() error {
|
||||
umountCalled = true
|
||||
return nil
|
||||
}
|
||||
ejectDevice = func(device string) error {
|
||||
ejected = device
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "cdrom", Device: "/dev/sr1"}, func(msg string) { logs = append(logs, msg) })
|
||||
if !umountCalled {
|
||||
t.Fatal("expected umountLiveMedium to be called")
|
||||
}
|
||||
if ejected != "/dev/sr1" {
|
||||
t.Fatalf("ejected=%q want /dev/sr1", ejected)
|
||||
}
|
||||
if len(logs) < 3 {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("no device", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error {
|
||||
t.Fatalf("unexpected eject for %q", device)
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "ram", Source: "tmpfs"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "No block device identified for eject; skipping media eject." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("eject failure is warning only", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error { return fmt.Errorf("exit status 1") }
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "usb", Device: "/dev/sdb1"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Warning: could not eject /dev/sdb1: exit status 1" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ var workerPatterns = []string{
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
"nvbandwidth",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
if shouldKillWorkerProcess(exe, base) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
|
||||
func shouldKillWorkerProcess(exe, base string) bool {
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
exe string
|
||||
base string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "nvbandwidth executable",
|
||||
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||
base: "nvbandwidth",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "dcgmi executable",
|
||||
exe: "/usr/bin/dcgmi",
|
||||
base: "dcgmi",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "unrelated process",
|
||||
exe: "/usr/bin/bash",
|
||||
base: "bash",
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
@@ -14,14 +16,17 @@ import (
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PowerSource string `json:"power_source,omitempty"`
|
||||
PowerMode string `json:"power_mode,omitempty"`
|
||||
PowerReason string `json:"power_reason,omitempty"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is a per-slot power supply input power reading.
|
||||
@@ -62,12 +67,18 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// System power: use the global autotune-selected source when configured,
|
||||
// otherwise fall back to the historical heuristic and mark the mode.
|
||||
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||
s.PowerW = powerW
|
||||
s.PowerSource = decision.EffectiveSource
|
||||
s.PowerMode = decision.Mode
|
||||
s.PowerReason = decision.Reason
|
||||
}
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
|
||||
@@ -339,63 +350,44 @@ func compactAmbientTempName(chip, name string) string {
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||
// sensors (entity ID "10.N") that report a value in Watts.
|
||||
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||
type entry struct {
|
||||
name string
|
||||
powerW float64
|
||||
}
|
||||
bySlot := map[int]entry{}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||
if !strings.HasPrefix(entityID, "10.") {
|
||||
continue // not a Power Supply entity
|
||||
}
|
||||
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||
slot, err := strconv.Atoi(slotStr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||
continue
|
||||
}
|
||||
valueFields := strings.Fields(valueField)
|
||||
if len(valueFields) < 2 {
|
||||
continue
|
||||
}
|
||||
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||
if err != nil || w <= 0 {
|
||||
continue
|
||||
}
|
||||
sensorName := strings.TrimSpace(parts[0])
|
||||
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||
}
|
||||
}
|
||||
if len(bySlot) == 0 {
|
||||
slots := collector.PSUSlotsFromSDR(string(out))
|
||||
if len(slots) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := make([]int, 0, len(bySlot))
|
||||
for s := range bySlot {
|
||||
slots = append(slots, s)
|
||||
// Collect slot keys and sort for stable output.
|
||||
keys := make([]int, 0, len(slots))
|
||||
for k := range slots {
|
||||
n, err := strconv.Atoi(k)
|
||||
if err == nil {
|
||||
keys = append(keys, n)
|
||||
}
|
||||
}
|
||||
sort.Ints(slots)
|
||||
psus := make([]PSUReading, 0, len(slots))
|
||||
for _, s := range slots {
|
||||
e := bySlot[s]
|
||||
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||
sort.Ints(keys)
|
||||
psus := make([]PSUReading, 0, len(keys))
|
||||
for _, k := range keys {
|
||||
entry := slots[strconv.Itoa(k)]
|
||||
// Prefer AC input power; fall back to DC output power.
|
||||
var w float64
|
||||
if entry.InputW != nil && *entry.InputW > 0 {
|
||||
w = *entry.InputW
|
||||
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||
w = *entry.OutputW
|
||||
}
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||
}
|
||||
if len(psus) == 0 {
|
||||
return nil
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||
|
||||
func runNvidiaRecover(args ...string) (string, error) {
|
||||
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||
cmdArgs := []string{
|
||||
"systemd-run",
|
||||
"--quiet",
|
||||
"--pipe",
|
||||
"--wait",
|
||||
"--collect",
|
||||
"--service-type=oneshot",
|
||||
"--unit", unit,
|
||||
}
|
||||
cmdArgs = append(cmdArgs, helperArgs...)
|
||||
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
func resetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
func restartNvidiaDrivers() (string, error) {
|
||||
out, err := runNvidiaRecover("restart-drivers")
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "NVIDIA drivers restarted.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
@@ -30,10 +30,10 @@ import (
|
||||
// Sources:
|
||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||
@@ -48,15 +48,15 @@ const (
|
||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||
SATEstimatedMemoryStressSec = 140
|
||||
|
||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
||||
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||
SATEstimatedNvidiaGPUValidateSec = 85
|
||||
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||
SATEstimatedNvidiaGPUStressSec = 450
|
||||
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaTargetedStressSec = 350
|
||||
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||
|
||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||
SATEstimatedNvidiaPulseTestSec = 5000
|
||||
@@ -404,14 +404,7 @@ func normalizeNvidiaBusID(v string) string {
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
}
|
||||
return string(raw), err
|
||||
return resetNvidiaGPU(index)
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
@@ -443,11 +436,19 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
if len(selected) > 1 {
|
||||
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||
// of CUDA_VISIBLE_DEVICES.
|
||||
stagger := staggerSec
|
||||
if stagger < 0 {
|
||||
stagger = 0
|
||||
}
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--stagger-seconds", strconv.Itoa(stagger),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -43,17 +43,22 @@ type GPUStressMetric struct {
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64
|
||||
SysPowerSource string
|
||||
SysPowerMode string
|
||||
}
|
||||
|
||||
// cachedPowerReading is a remembered system power sample: the reading itself,
// the source/mode/reason recorded with it, and the moment it was stored.
type cachedPowerReading struct {
	// Value is the cached power reading.
	Value float64
	// Source, Mode and Reason are the provenance strings stored with Value.
	Source, Mode, Reason string
	// UpdatedAt is the time at which the reading was cached.
	UpdatedAt time.Time
}
|
||||
|
||||
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||
return row
|
||||
}
|
||||
|
||||
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||
// falling back to the historical heuristic before autotune or when degraded.
|
||||
func sampleSystemPowerResolved() (float64, string, string) {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
current, decision, err := SampleSystemPowerResolved("")
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
if err != nil {
|
||||
current = 0
|
||||
}
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
return value, updated.Source, updated.Mode
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
|
||||
@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
|
||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return restartNvidiaDrivers()
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
|
||||
@@ -66,6 +66,7 @@ type HardwareSnapshot struct {
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -143,30 +144,33 @@ type HardwareMemory struct {
|
||||
|
||||
type HardwareStorage struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
|
||||
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
|
||||
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePCIeDevice struct {
|
||||
@@ -211,6 +215,7 @@ type HardwarePCIeDevice struct {
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
IOMMUGroup *int `json:"iommu_group,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
|
||||
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
|
||||
powerOnHours := int64(12450)
|
||||
writtenBytes := int64(9876543210)
|
||||
readBytes := int64(1234567890)
|
||||
lifeRemainingPct := 91.0
|
||||
logicalBlockSizeBytes := int64(512)
|
||||
physicalBlockSizeBytes := int64(4096)
|
||||
metadataBytesPerBlock := int64(8)
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
Storage: []HardwareStorage{
|
||||
{
|
||||
SerialNumber: stringPtr("DISK-001"),
|
||||
Model: stringPtr("TestDisk"),
|
||||
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
|
||||
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
|
||||
MetadataBytesPerBlock: &metadataBytesPerBlock,
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
ReadBytes: &readBytes,
|
||||
LifeRemainingPct: &lifeRemainingPct,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
for _, needle := range []string{
|
||||
`"storage":[{`,
|
||||
`"logical_block_size_bytes":512`,
|
||||
`"physical_block_size_bytes":4096`,
|
||||
`"metadata_bytes_per_block":8`,
|
||||
`"power_on_hours":12450`,
|
||||
`"written_bytes":9876543210`,
|
||||
`"read_bytes":1234567890`,
|
||||
`"life_remaining_pct":91`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("missing %q in payload: %s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// stringPtr returns a pointer to a fresh copy of v, for filling optional
// *string fields in test fixtures.
func stringPtr(v string) *string {
	p := new(string)
	*p = v
	return p
}
|
||||
|
||||
+158
-10
@@ -125,9 +125,11 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "nvme-format":
|
||||
return taskPriorityInstall
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power":
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
return taskPriorityBenchmark
|
||||
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
|
||||
return taskPriorityBurn
|
||||
@@ -701,6 +703,78 @@ func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFun
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneRun() http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
BenchmarkKind string `json:"benchmark_kind"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
profile := strings.TrimSpace(body.Profile)
|
||||
if profile == "" {
|
||||
profile = "standard"
|
||||
}
|
||||
benchmarkKind := strings.TrimSpace(body.BenchmarkKind)
|
||||
if benchmarkKind == "" {
|
||||
benchmarkKind = "power-fit"
|
||||
}
|
||||
now := time.Now()
|
||||
taskName := fmt.Sprintf("NVIDIA Benchmark Autotune · %s · %s", profile, benchmarkKind)
|
||||
t := &Task{
|
||||
ID: newJobID("bee-bench-autotune"),
|
||||
Name: taskName,
|
||||
Target: "nvidia-bench-autotune",
|
||||
Priority: defaultTaskPriority("nvidia-bench-autotune", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: now,
|
||||
params: taskParams{
|
||||
BenchmarkProfile: profile,
|
||||
BenchmarkKind: benchmarkKind,
|
||||
SizeMB: body.SizeMB,
|
||||
DisplayName: taskName,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeTaskRunResponse(w, []*Task{t})
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkAutotuneStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
cfg, err := h.opts.App.LoadBenchmarkPowerAutotune()
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": false,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
writeJSON(w, map[string]any{
|
||||
"configured": true,
|
||||
"config": cfg,
|
||||
"decision": platform.ResolveSystemPowerDecision(h.opts.ExportDir),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
|
||||
}
|
||||
@@ -734,15 +808,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
return
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
@@ -967,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if state.Targets == nil {
|
||||
state.Targets = []app.BlackboxTargetStatus{}
|
||||
}
|
||||
writeJSON(w, state)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
allowed := false
|
||||
for _, candidate := range targets {
|
||||
if candidate.Device == target.Device {
|
||||
target = candidate
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||
return
|
||||
}
|
||||
marker, err := app.EnableBlackboxTarget(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"message": "Black-box marker written.",
|
||||
"enrollment_id": marker.EnrollmentID,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
@@ -1149,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -3,6 +3,8 @@ package webui
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var state app.BlackboxState
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||
t.Fatalf("decode state: %v", err)
|
||||
}
|
||||
if state.Status != "disabled" {
|
||||
t.Fatalf("status=%q want disabled", state.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||
exportDir := t.TempDir()
|
||||
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write state: %v", err)
|
||||
}
|
||||
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||
t.Fatalf("body=%s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeFormatModes(t *testing.T) {
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
lbaf 2 : ms:0 lbads:12 rp:0
|
||||
`
|
||||
modes := parseNVMeFormatModes(raw)
|
||||
if len(modes) != 3 {
|
||||
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||
}
|
||||
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||
t.Fatalf("mode 0=%#v", modes[0])
|
||||
}
|
||||
if modes[1].Label != "MODE 1 (512+8)" {
|
||||
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||
}
|
||||
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||
t.Fatalf("mode 2=%#v", modes[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
@@ -195,6 +257,40 @@ func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-autotune" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||
}
|
||||
if task.params.BenchmarkKind != "power-fit" {
|
||||
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
@@ -20,7 +20,7 @@ type jobState struct {
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.appendWithOptions(line, true, true)
|
||||
}
|
||||
|
||||
func (j *jobState) appendFromLog(line string) {
|
||||
j.appendWithOptions(line, false, false)
|
||||
}
|
||||
|
||||
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
if persistLog && j.logPath != "" {
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
if serialMirror && j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
@@ -83,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
|
||||
@@ -0,0 +1,137 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// layoutHead returns the opening part of a page: the doctype, the <head>
// element with the inline stylesheet, and the opening <body> tag. The title
// is HTML-escaped before it is placed inside <title>.
func layoutHead(title string) string {
	safeTitle := html.EscapeString(title)
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>` + safeTitle + `</title>
<style>
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
*{box-sizing:border-box;margin:0;padding:0}
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
a{color:var(--accent);text-decoration:none}
/* Sidebar */
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
.nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
/* Content */
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
.content{padding:24px;flex:1}
/* Cards */
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
.card-head-actions{justify-content:space-between}
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
.card-body{padding:16px}
/* Buttons */
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
.btn-sm{padding:5px 10px;font-size:12px}
/* Tables */
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
tr:first-child td{border-top:0}
tbody tr:hover td{background:rgba(0,0,0,.03)}
/* Status badges */
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Component chips — one small square per device */
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Output terminal */
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
/* Forms */
.form-row{margin-bottom:14px}
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
/* Grid */
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
/* iframe viewer */
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
/* Alerts */
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
</style>
</head>
<body>
`
}
|
||||
|
||||
// layoutNav renders the sidebar: logo, version line, an optional NVIDIA GSP
// badge and the navigation links. The link whose id equals active gets the
// "active" class. A blank buildLabel is shown as "dev".
func layoutNav(active string, buildLabel string) string {
	type navEntry struct{ id, label, href, onclick string }
	entries := []navEntry{
		{"dashboard", "Dashboard", "/", ""},
		{"audit", "Audit", "/audit", ""},
		{"validate", "Validate", "/validate", ""},
		{"burn", "Burn", "/burn", ""},
		{"benchmark", "Benchmark", "/benchmark", ""},
		{"tasks", "Tasks", "/tasks", ""},
		{"tools", "Tools", "/tools", ""},
	}

	version := buildLabel
	if strings.TrimSpace(version) == "" {
		version = "dev"
	}

	var sb strings.Builder
	sb.WriteString(`<aside class="sidebar">`)
	sb.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
	sb.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(version) + `</div>`)

	// The badge is driven by the content of /run/bee-nvidia-mode; when the
	// file cannot be read, or holds any other value, no badge is written.
	if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
		mode := strings.TrimSpace(string(raw))
		if mode == "gsp-off" {
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
		} else if mode == "gsp-stuck" {
			sb.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
		}
	}

	sb.WriteString(`<nav class="nav">`)
	for _, entry := range entries {
		class := "nav-item"
		if entry.id == active {
			class += " active"
		}
		if entry.onclick == "" {
			fmt.Fprintf(&sb, `<a class="%s" href="%s">%s</a>`, class, entry.href, entry.label)
			continue
		}
		fmt.Fprintf(&sb, `<a class="%s" href="%s" onclick="%s">%s</a>`,
			class, entry.href, entry.onclick, entry.label)
	}
	sb.WriteString(`</nav>`)
	sb.WriteString(`</aside>`)
	return sb.String()
}
|
||||
@@ -53,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
power_source TEXT,
|
||||
power_mode TEXT,
|
||||
power_reason TEXT,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
@@ -86,7 +89,16 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||
}
|
||||
|
||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||
@@ -125,8 +137,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -213,12 +225,12 @@ func (m *MetricsDB) Prune(before time.Time) error {
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
@@ -233,7 +245,7 @@ func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSamp
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
@@ -249,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
powerSource string
|
||||
powerMode string
|
||||
powerReason string
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
@@ -363,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
PowerSource: r.powerSource,
|
||||
PowerMode: r.powerMode,
|
||||
PowerReason: r.powerReason,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
|
||||
@@ -0,0 +1,368 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// nvmeFormatMode describes one LBA format entry of an NVMe namespace as
// produced by parseNVMeFormatModes.
type nvmeFormatMode struct {
	// Mode is the LBA format index taken from the parsed line.
	Mode int `json:"mode"`
	// DataBytes is the data size of one block in bytes.
	DataBytes int64 `json:"data_bytes"`
	// MetadataBytes is the metadata size of one block in bytes.
	MetadataBytes int64 `json:"metadata_bytes"`
	// InUse is true when the parsed line contains "in use".
	InUse bool `json:"in_use"`
	// Label is display text of the form "MODE <n> (<data>+<metadata>)".
	Label string `json:"label"`
}
|
||||
|
||||
// nvmeFormatDisk is the per-namespace record returned by listNVMeFormatDisks
// and serialized to JSON for the web UI.
type nvmeFormatDisk struct {
	// Device is the namespace device path, e.g. /dev/nvme0n1.
	Device string `json:"device"`
	// Model is the trimmed model number reported by `nvme list`.
	Model string `json:"model,omitempty"`
	// Serial is the trimmed serial number reported by `nvme list`.
	Serial string `json:"serial,omitempty"`
	// Size is the physical size rendered by formatNVMeBytes.
	Size string `json:"size,omitempty"`
	// CurrentMode is the index of the mode marked in use, or -1 when none is.
	CurrentMode int `json:"current_mode"`
	// CurrentFormat is the "<data>+<metadata>" text of the mode in use.
	CurrentFormat string `json:"current_format"`
	// Modes lists the LBA formats read for this namespace.
	Modes []nvmeFormatMode `json:"modes"`
	// Error holds the message from a failed attempt to read the modes.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
// nvmeListJSON is the subset of the `nvme list -o json` document that
// listNVMeFormatDisks decodes.
type nvmeListJSON struct {
	// Devices holds one entry per device in the listing.
	Devices []struct {
		DevicePath string `json:"DevicePath"`
		ModelNumber string `json:"ModelNumber"`
		SerialNumber string `json:"SerialNumber"`
		PhysicalSize int64 `json:"PhysicalSize"`
	} `json:"Devices"`
}
|
||||
|
||||
var (
	// nvmeFormatDeviceRE accepts only namespace device paths of the form
	// /dev/nvme<N>n<M>; every entry point validates device arguments with it.
	nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
	// nvmeLBAFCompactLineRE matches "lbaf <n> : ms:<bytes> lbads:<shift> ..."
	// lines and captures the index, metadata size and data-size shift.
	nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
	// nvmeLBAFVerboseLineRE matches "LBA Format <n> : Metadata Size: <bytes>
	// bytes - Data Size: <bytes> bytes ..." lines and captures the index,
	// metadata size and data size.
	nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
	// nvmeCommandContext builds every nvme command in this file. It is a
	// variable rather than a direct call, presumably so tests can replace
	// it — confirm against the test files.
	nvmeCommandContext = exec.CommandContext
	// nvmeListFormatsTimeout bounds one whole listNVMeFormatDisks call,
	// including the per-device id-ns queries.
	nvmeListFormatsTimeout = 20 * time.Second
)
|
||||
|
||||
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||
defer cancel()
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var root nvmeListJSON
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||
seen := map[string]struct{}{}
|
||||
for _, dev := range root.Devices {
|
||||
path := strings.TrimSpace(dev.DevicePath)
|
||||
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[path]; ok {
|
||||
continue
|
||||
}
|
||||
seen[path] = struct{}{}
|
||||
disk := nvmeFormatDisk{
|
||||
Device: path,
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||
CurrentMode: -1,
|
||||
}
|
||||
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||
if parseErr != nil {
|
||||
disk.Error = parseErr.Error()
|
||||
}
|
||||
disk.Modes = modes
|
||||
for _, mode := range modes {
|
||||
if mode.InUse {
|
||||
disk.CurrentMode = mode.Mode
|
||||
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||
break
|
||||
}
|
||||
}
|
||||
disks = append(disks, disk)
|
||||
}
|
||||
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return nil, fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
return nil, fmt.Errorf("%s", msg)
|
||||
}
|
||||
modes := parseNVMeFormatModes(string(out))
|
||||
if len(modes) == 0 {
|
||||
return nil, fmt.Errorf("no LBA format modes found")
|
||||
}
|
||||
return modes, nil
|
||||
}
|
||||
|
||||
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||
byMode := map[int]nvmeFormatMode{}
|
||||
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
lbads, errLBADS := strconv.Atoi(m[3])
|
||||
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||
continue
|
||||
}
|
||||
data := int64(1) << lbads
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||
continue
|
||||
}
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||
for _, mode := range byMode {
|
||||
modes = append(modes, mode)
|
||||
}
|
||||
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||
return modes
|
||||
}
|
||||
|
||||
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
modes, err := readNVMeFormatModes(ctx, device)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var selected nvmeFormatMode
|
||||
found := false
|
||||
for _, mode := range modes {
|
||||
if mode.Mode == lbaf {
|
||||
selected = mode
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||
}
|
||||
ms := 0
|
||||
if selected.MetadataBytes > 0 {
|
||||
ms = 1
|
||||
}
|
||||
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, disks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
LBAF int `json:"lbaf"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||
return
|
||||
}
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var label string
|
||||
allowed := false
|
||||
for _, disk := range disks {
|
||||
if disk.Device != req.Device {
|
||||
continue
|
||||
}
|
||||
for _, mode := range disk.Modes {
|
||||
if mode.Mode == req.LBAF {
|
||||
allowed = true
|
||||
label = mode.Label
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||
return
|
||||
}
|
||||
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||
t := &Task{
|
||||
ID: newJobID("nvme-format"),
|
||||
Name: name,
|
||||
Target: "nvme-format",
|
||||
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
LBAF: req.LBAF,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
// formatNVMeBlock renders a block layout as "<data>+<metadata>", both in
// decimal bytes, e.g. "4096+64".
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
	return fmt.Sprintf("%d+%d", dataBytes, metadataBytes)
}
|
||||
|
||||
// formatNVMeBytes renders a byte count with decimal (power-of-1000) units.
// Values of zero or less yield the empty string, values below 1000 are shown
// as whole bytes ("512 B"), and larger values use one decimal place with the
// largest unit up to PB ("1.5 KB", "2.0 TB").
func formatNVMeBytes(n int64) string {
	if n <= 0 {
		return ""
	}
	if n < 1000 {
		return fmt.Sprintf("%d B", n)
	}

	suffixes := [...]string{"KB", "MB", "GB", "TB", "PB"}
	scaled := float64(n) / 1000
	idx := 0
	// Keep dividing until the value drops below 1000 or the units run out.
	for scaled >= 1000 && idx < len(suffixes)-1 {
		scaled /= 1000
		idx++
	}
	return fmt.Sprintf("%.1f %s", scaled, suffixes[idx])
}
|
||||
|
||||
// renderNVMeFormatInline returns the HTML and inline JavaScript for the NVMe
// block-format tool: a status line, a table placeholder, and the script that
// loads /api/tools/nvme-formats, renders one row per disk, posts format
// requests to /api/tools/nvme-format/run and polls /api/tasks until the
// queued task finishes.
//
// nvmeFormatEsc must map the five HTML-special characters to their entities:
// every device-reported string (model, serial, error text, labels) passes
// through it before being assigned to innerHTML.
func renderNVMeFormatInline() string {
	return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<script>
function nvmeFormatEsc(s) {
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
});
}
function loadNVMeFormats() {
var status = document.getElementById('nvme-format-status');
var table = document.getElementById('nvme-format-table');
status.textContent = 'Loading NVMe disks...';
status.style.color = 'var(--muted)';
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
if (!window._nvmeFormatDisks.length) {
status.textContent = 'No NVMe disks found.';
table.innerHTML = '';
return;
}
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
var rows = window._nvmeFormatDisks.map(function(d, idx) {
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
var options = (d.modes || []).map(function(m) {
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
}).join('');
var disabled = options ? '' : ' disabled';
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
return '<tr>'
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
+ '</tr>';
}).join('');
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
}).catch(function(e) {
status.textContent = 'Error loading NVMe disks: ' + e.message;
status.style.color = 'var(--crit-fg,#9f3a38)';
table.innerHTML = '';
});
}
function nvmeWaitTaskDone(taskID, rowMsg) {
var timer = setInterval(function() {
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
var task = (tasks || []).find(function(t) { return t.id === taskID; });
if (!task) return;
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
clearInterval(timer);
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
loadNVMeFormats();
}
}).catch(function(){});
}, 1500);
}
function nvmeFormatRun(idx, btn) {
var disk = (window._nvmeFormatDisks || [])[idx];
var select = document.getElementById('nvme-format-select-' + idx);
var row = btn.closest('td');
var rowMsg = row.querySelector('.nvme-format-row-msg');
if (!disk || !select) return;
var lbaf = parseInt(select.value, 10);
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
if (!mode) return;
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
btn.disabled = true;
rowMsg.style.color = 'var(--muted)';
rowMsg.textContent = 'Queued...';
fetch('/api/tools/nvme-format/run', {
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({device: disk.device, lbaf: lbaf})
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
nvmeWaitTaskDone(d.task_id, rowMsg);
}).catch(function(e) {
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
rowMsg.textContent = 'Error: ' + e.message;
}).finally(function() {
btn.disabled = false;
});
}
loadNVMeFormats();
</script>`
}
|
||||
|
||||
func renderNVMeFormatCard() string {
|
||||
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||
renderNVMeFormatInline() + `</div></div>`
|
||||
}
|
||||
@@ -0,0 +1,613 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// benchmarkHistoryRun holds the data kept for one past benchmark run.
// NOTE(review): the field meanings below are inferred from the names only;
// no use of this type is visible in this part of the file — confirm against
// the code that fills it.
type benchmarkHistoryRun struct {
	// generatedAt is presumably the time the run's result was produced.
	generatedAt time.Time
	// displayTime is presumably generatedAt formatted for display.
	displayTime string
	// gpuScores presumably maps a GPU index to its score.
	gpuScores map[int]float64
	// gpuStatuses presumably maps a GPU index to its status text.
	gpuStatuses map[int]string
	// overallStatus is presumably the status of the run as a whole.
	overallStatus string
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">Benchmark Setup</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label>GPU Selection</label>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Sequential — one GPU at a time</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||
</div>
|
||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method Split</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||
<table>
|
||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||
</table>
|
||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
function benchmarkTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function benchmarkMode() {
|
||||
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||
return el ? el.value : 'sequential';
|
||||
}
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
if (!selected.length) {
|
||||
perfBtn.disabled = true;
|
||||
fitBtn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
perfBtn.disabled = false;
|
||||
fitBtn.disabled = false;
|
||||
const mode = benchmarkMode();
|
||||
if (mode === 'ramp-up') {
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||
} else if (mode === 'parallel') {
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||
} else {
|
||||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||
}
|
||||
}
|
||||
function benchmarkRenderGPUList(gpus) {
|
||||
const root = document.getElementById('benchmark-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="benchmark-gpu-row">'
|
||||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
benchmarkApplyMultiGPUState(gpus.length);
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||
var multiValues = ['parallel', 'ramp-up'];
|
||||
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||
radios.forEach(function(el) {
|
||||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||
if (gpuCount < 2 && isMulti) {
|
||||
el.disabled = true;
|
||||
if (el.checked) {
|
||||
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||
if (seq) seq.checked = true;
|
||||
}
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '0.4';
|
||||
} else {
|
||||
el.disabled = false;
|
||||
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '';
|
||||
}
|
||||
});
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkLoadGPUs() {
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
status.textContent = '';
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
benchmarkRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
function benchmarkSelectAll() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkSelectNone() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function runNvidiaBenchmark(kind) {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
status.textContent = 'Select at least one GPU.';
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const mode = benchmarkMode();
|
||||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||
if (kind === 'power-fit' && mode === 'parallel') {
|
||||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||
return;
|
||||
}
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: kind === 'performance' && selected.length > 1,
|
||||
parallel_gpus: parallelGPUs,
|
||||
ramp_up: rampUp,
|
||||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||
fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||
return;
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
const isLast = (idx + 1 >= taskIds.length);
|
||||
streamNext(idx + 1, failures);
|
||||
if (isLast) { benchmarkRefreshResults(); }
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures + 1);
|
||||
};
|
||||
};
|
||||
streamNext(0, 0);
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
function benchmarkRenderAutotuneStatus(payload) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (!el) return;
|
||||
if (!payload || !payload.configured || !payload.config) {
|
||||
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||||
return;
|
||||
}
|
||||
const cfg = payload.config || {};
|
||||
const decision = payload.decision || {};
|
||||
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||||
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||||
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||||
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||||
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||||
}
|
||||
function loadBenchmarkAutotuneStatus() {
|
||||
fetch('/api/bee-bench/nvidia/autotune/status')
|
||||
.then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
})
|
||||
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||||
.catch(function(err) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||||
});
|
||||
}
|
||||
function runBenchmarkAutotune() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||||
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||||
status.textContent = 'Queueing autotune...';
|
||||
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify({
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||||
gpu_indices: selected
|
||||
})
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||||
const taskId = taskIds[0];
|
||||
status.textContent = 'Autotune queued: ' + taskId;
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||||
loadBenchmarkAutotuneStatus();
|
||||
});
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Autotune error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
benchmarkLoadGPUs();
|
||||
loadBenchmarkAutotuneStatus();
|
||||
function benchmarkRefreshResults() {
|
||||
fetch('/api/benchmark/results')
|
||||
.then(function(r) { return r.text(); })
|
||||
.then(function(html) {
|
||||
const el = document.getElementById('benchmark-results-section');
|
||||
if (el) el.innerHTML = html;
|
||||
})
|
||||
.catch(function() {});
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||
perf := renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved performance benchmark runs yet.",
|
||||
maxIdx,
|
||||
runs,
|
||||
)
|
||||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||
return perf + "\n" + power
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBeeBenchPerfDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return -1, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
return loadBenchmarkHistoryFromPaths(paths)
|
||||
}
|
||||
|
||||
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
maxGPUIndex := -1
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var result platform.NvidiaBenchmarkResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
return maxGPUIndex, runs
|
||||
}
|
||||
|
||||
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
baseDir := app.DefaultBeeBenchPowerDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
type powerRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
result platform.NvidiaPowerBenchResult
|
||||
}
|
||||
var runs []powerRun
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var r platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
runs = append(runs, powerRun{
|
||||
generatedAt: r.GeneratedAt,
|
||||
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
result: r,
|
||||
})
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||
|
||||
latest := runs[0].result
|
||||
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||
if latest.Hostname != "" {
|
||||
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||
}
|
||||
if latest.OverallStatus != "" {
|
||||
statusColor := "var(--ok)"
|
||||
if latest.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||
}
|
||||
b.WriteString(`</p>`)
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||
}
|
||||
b.WriteString(`<tr` + rowStyle + `>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div>`)
|
||||
}
|
||||
|
||||
if len(runs) > 1 {
|
||||
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
statusColor := "var(--ok)"
|
||||
if run.result.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></details>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
@@ -0,0 +1,383 @@
|
||||
package webui
|
||||
|
||||
// renderBurn returns the static markup for the Burn page: the warning and
// scope banners, the burn-profile card, the NVIDIA GPU selection card, the
// "GPU Max Load" and "Compute Stress" cards, the output terminal, and the
// page-local CSS and JavaScript.
//
// The embedded script fetches /api/gpu/nvidia and /api/gpu/tools on load,
// enqueues tasks through POST /api/sat/<target>/run and streams their output
// from /api/tasks/<id>/stream via EventSource.
//
// Values that come back from the API (GPU name, error messages) are passed
// through burnEscapeHTML before being placed into innerHTML, and a failing
// task sequence is reported exactly once.
func renderBurn() string {
	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

<div class="card" style="margin-bottom:16px">
<div class="card-head">Burn Profile</div>
<div class="card-body burn-profile-body">
<div class="burn-profile-col">
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
</div>
</div>
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
</div>
</div>

<div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
</div>
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
<label class="cb-row">
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
<span>Sequential — selected GPUs one at a time</span>
</label>
<label class="cb-row" id="burn-parallel-label">
<input type="radio" name="burn-nvidia-mode" value="parallel">
<span>Parallel — all selected GPUs simultaneously</span>
</label>
<label class="cb-row" id="burn-ramp-label">
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
<span>Ramp-up — add one GPU at a time</span>
</label>
</div>
</div>
</div>

<div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card burn-card">
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
</div>
</div>

<div class="card burn-card">
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
</div>
</div>
</div>

<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>

<style>
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.burn-profile-col { min-width:0; }
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
.burn-grid { align-items:stretch; }
.burn-card { height:100%; display:flex; flex-direction:column; }
.burn-card-body { flex:1; display:flex; flex-direction:column; }
.card-head-actions { justify-content:space-between; }
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
</style>

<script>
let biES = null;
// Escapes a value for safe interpolation into innerHTML markup.
function burnEscapeHTML(value) {
  return String(value == null ? '' : value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
function burnTaskIDs(payload) {
  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
  if (payload && payload.task_id) return [payload.task_id];
  return [];
}
function burnProfile() {
  const selected = document.querySelector('input[name="burn-profile"]:checked');
  return selected ? selected.value : 'smoke';
}
function burnSelectedGPUIndices() {
  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
    .filter(function(el) { return el.checked && !el.disabled; })
    .map(function(el) { return parseInt(el.value, 10); })
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
}
function burnNvidiaMode() {
  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  return el ? el.value : 'sequential';
}
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      var label = el.closest('label');
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      var label = el.closest('label');
      if (label) label.style.opacity = '';
    }
  });
}
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (!selected.length) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
    return;
  }
  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
}
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  // API-provided fields are escaped because the row is assigned via innerHTML.
  root.innerHTML = gpus.map(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + burnEscapeHTML(gpu.memory_mb) + ' MiB' : '';
    return '<label class="burn-gpu-row">'
      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + burnEscapeHTML(gpu.index) + '" checked onchange="burnUpdateSelectionNote()">'
      + '<span><strong>GPU ' + burnEscapeHTML(gpu.index) + '</strong> — ' + burnEscapeHTML(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
function burnSelectAll() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
  burnUpdateSelectionNote();
}
function burnSelectNone() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
  burnUpdateSelectionNote();
}
function burnLoadGPUs() {
  fetch('/api/gpu/nvidia').then(function(r) {
    return r.json().then(function(body) {
      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
      return body;
    });
  }).then(function(gpus) {
    burnRenderGPUList(gpus);
  }).catch(function(err) {
    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEscapeHTML(err.message) + '</p>';
    burnUpdateSelectionNote();
  });
}
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
  if (useSelectedNvidia) {
    const selected = burnSelectedGPUIndices();
    if (!selected.length) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
    const bMode = burnNvidiaMode();
    if (bMode === 'ramp-up' && selected.length > 1) {
      body.stagger_gpu_start = true;
    } else if (bMode === 'parallel' && selected.length > 1) {
      body.parallel_gpus = true;
    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(body)
  }).then(function(r) {
    return r.json().then(function(payload) {
      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
      return payload;
    });
  });
}
function streamTask(taskId, label) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
  biES = new EventSource('/api/tasks/' + taskId + '/stream');
  biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
  biES.addEventListener('done', function(e) {
    biES.close();
    biES = null;
    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
    term.scrollTop = term.scrollHeight;
  });
}
function streamBurnTask(taskId, label, resetTerminal) {
  return streamBurnTaskSet([taskId], label, resetTerminal);
}
function streamBurnTaskSet(taskIds, label, resetTerminal) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  if (resetTerminal) {
    term.textContent = '';
  }
  if (!Array.isArray(taskIds) || !taskIds.length) {
    term.textContent += 'ERROR: no tasks queued.\n';
    return Promise.resolve({ok:false, error:'no tasks queued'});
  }
  const streamNext = function(idx, failures) {
    if (idx >= taskIds.length) {
      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
    }
    const taskId = taskIds[idx];
    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
    return new Promise(function(resolve) {
      biES = new EventSource('/api/tasks/' + taskId + '/stream');
      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
      biES.addEventListener('done', function(e) {
        biES.close();
        biES = null;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + (e.data ? 1 : 0));
      });
      biES.onerror = function() {
        if (biES) {
          biES.close();
          biES = null;
        }
        term.textContent += '\nERROR: stream disconnected.\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + 1);
      };
    }).then(function(nextFailures) {
      return streamNext(idx + 1, nextFailures);
    });
  };
  return streamNext(0, 0);
}
function runBurnTaskSet(tasks, statusElId) {
  const enabled = tasks.filter(function(t) {
    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
  });
  const status = statusElId ? document.getElementById(statusElId) : null;
  if (status) status.textContent = '';
  if (!enabled.length) {
    if (status) status.textContent = 'No tasks selected.';
    return;
  }
  const term = document.getElementById('bi-terminal');
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
  term.textContent = '';
  const runNext = function(idx) {
    if (idx >= enabled.length) {
      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
      return Promise.resolve();
    }
    const t = enabled[idx];
    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
      .then(function(d) {
        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function() {
        return runNext(idx + 1);
      });
  };
  // The handler is attached once to the whole chain; attaching it inside
  // runNext would log the same failure once per already-completed step.
  return runNext(0).catch(function(err) {
    if (status) status.textContent = 'Error: ' + err.message;
    document.getElementById('bi-output').style.display = 'block';
    term.textContent += 'ERROR: ' + err.message + '\n';
    return Promise.reject(err);
  });
}
function runPlatformStress() {
  const status = document.getElementById('burn-all-status');
  const comps = [];
  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
  const hasChecked = function(ids) {
    return ids.some(function(id) {
      const el = document.getElementById(id);
      return el && el.checked && !el.disabled;
    });
  };
  if (hasChecked(computeIDs)) comps.push('cpu');
  if (hasChecked(gpuIDs)) comps.push('gpu');
  if (!comps.length) {
    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
    return;
  }
  if (status) status.textContent = '';
  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false).then(function(d) {
    streamTask(d.task_id, 'Platform Thermal Cycling');
  }).catch(function(err) {
    // Surface enqueue failures instead of leaving an unhandled rejection.
    if (status) status.textContent = 'Error: ' + err.message;
  });
}
function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
  ];
  status.textContent = 'Enqueuing...';
  runBurnTaskSet(all, 'burn-all-status');
}
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
  };
  tools.forEach(function(t) {
    const spec = map[t.id];
    if (!spec) return;
    const cb = document.getElementById(spec.cb);
    const note = document.getElementById(spec.note);
    if (!cb) return;
    if (t.available) {
      cb.disabled = false;
    } else if (note) {
      note.textContent = '— ' + spec.reason;
    }
  });
}).catch(function() {});
burnLoadGPUs();
</script>`
}
|
||||
<input type="radio" name="burn-nvidia-mode" value="parallel">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="cb-row" id="burn-ramp-label">
|
||||
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
|
||||
<span>Ramp-up — add one GPU at a time</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">Core Burn Paths</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.burn-profile-col { min-width:0; }
|
||||
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
|
||||
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
|
||||
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
|
||||
.burn-grid { align-items:stretch; }
|
||||
.burn-card { height:100%; display:flex; flex-direction:column; }
|
||||
.burn-card-body { flex:1; display:flex; flex-direction:column; }
|
||||
.card-head-actions { justify-content:space-between; }
|
||||
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
|
||||
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let biES = null;
|
||||
function burnTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
function burnProfile() {
|
||||
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
||||
return selected ? selected.value : 'smoke';
|
||||
}
|
||||
function burnSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function burnNvidiaMode() {
|
||||
const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
|
||||
return el ? el.value : 'sequential';
|
||||
}
|
||||
function burnApplyMultiGPUState(gpuCount) {
|
||||
var multiValues = ['parallel', 'ramp-up'];
|
||||
var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
|
||||
radios.forEach(function(el) {
|
||||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||
if (gpuCount < 2 && isMulti) {
|
||||
el.disabled = true;
|
||||
if (el.checked) {
|
||||
var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
|
||||
if (seq) seq.checked = true;
|
||||
}
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '0.4';
|
||||
} else {
|
||||
el.disabled = false;
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function burnUpdateSelectionNote() {
|
||||
const note = document.getElementById('burn-selection-note');
|
||||
const selected = burnSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
|
||||
}
|
||||
function burnRenderGPUList(gpus) {
|
||||
const root = document.getElementById('burn-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
burnUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="burn-gpu-row">'
|
||||
+ '<input class="burn-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="burnUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
burnApplyMultiGPUState(gpus.length);
|
||||
burnUpdateSelectionNote();
|
||||
}
|
||||
function burnSelectAll() {
|
||||
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
burnUpdateSelectionNote();
|
||||
}
|
||||
function burnSelectNone() {
|
||||
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
burnUpdateSelectionNote();
|
||||
}
|
||||
function burnLoadGPUs() {
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
burnRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
burnUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||
const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
|
||||
if (useSelectedNvidia) {
|
||||
const selected = burnSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
body.gpu_indices = selected;
|
||||
const bMode = burnNvidiaMode();
|
||||
if (bMode === 'ramp-up' && selected.length > 1) {
|
||||
body.stagger_gpu_start = true;
|
||||
} else if (bMode === 'parallel' && selected.length > 1) {
|
||||
body.parallel_gpus = true;
|
||||
}
|
||||
}
|
||||
return fetch('/api/sat/' + target + '/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
});
|
||||
}
|
||||
function streamTask(taskId, label) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||
const term = document.getElementById('bi-terminal');
|
||||
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', function(e) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
});
|
||||
}
|
||||
function streamBurnTask(taskId, label, resetTerminal) {
|
||||
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||
}
|
||||
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||
const term = document.getElementById('bi-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||
term.textContent += 'ERROR: no tasks queued.\n';
|
||||
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||
}
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||
return new Promise(function(resolve) {
|
||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', function(e) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve(failures + (e.data ? 1 : 0));
|
||||
});
|
||||
biES.onerror = function() {
|
||||
if (biES) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve(failures + 1);
|
||||
};
|
||||
}).then(function(nextFailures) {
|
||||
return streamNext(idx + 1, nextFailures);
|
||||
});
|
||||
};
|
||||
return streamNext(0, 0);
|
||||
}
|
||||
function runBurnTaskSet(tasks, statusElId) {
|
||||
const enabled = tasks.filter(function(t) {
|
||||
const el = document.getElementById(t.id);
|
||||
return el && el.checked && !el.disabled;
|
||||
});
|
||||
const status = statusElId ? document.getElementById(statusElId) : null;
|
||||
if (status) status.textContent = '';
|
||||
if (!enabled.length) {
|
||||
if (status) status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const term = document.getElementById('bi-terminal');
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||
term.textContent = '';
|
||||
const runNext = function(idx) {
|
||||
if (idx >= enabled.length) {
|
||||
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||
return Promise.resolve();
|
||||
}
|
||||
const t = enabled[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||
.then(function(d) {
|
||||
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||
})
|
||||
.then(function() {
|
||||
return runNext(idx + 1);
|
||||
})
|
||||
.catch(function(err) {
|
||||
if (status) status.textContent = 'Error: ' + err.message;
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
return Promise.reject(err);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runPlatformStress() {
|
||||
const comps = [];
|
||||
const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
|
||||
const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
|
||||
const hasChecked = function(ids) {
|
||||
return ids.some(function(id) {
|
||||
const el = document.getElementById(id);
|
||||
return el && el.checked && !el.disabled;
|
||||
});
|
||||
};
|
||||
if (hasChecked(computeIDs)) comps.push('cpu');
|
||||
if (hasChecked(gpuIDs)) comps.push('gpu');
|
||||
if (!comps.length) {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
|
||||
return;
|
||||
}
|
||||
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||
enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
|
||||
streamTask(d.task_id, 'Platform Thermal Cycling');
|
||||
});
|
||||
}
|
||||
function runAllBurnTasks() {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
const all = [
|
||||
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||
{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
|
||||
{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
|
||||
{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
|
||||
];
|
||||
status.textContent = 'Enqueuing...';
|
||||
runBurnTaskSet(all, 'burn-all-status');
|
||||
}
|
||||
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||
const map = {
|
||||
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||
};
|
||||
tools.forEach(function(t) {
|
||||
const spec = map[t.id];
|
||||
if (!spec) return;
|
||||
const cb = document.getElementById(spec.cb);
|
||||
const note = document.getElementById(spec.note);
|
||||
if (!cb) return;
|
||||
if (t.available) {
|
||||
cb.disabled = false;
|
||||
} else if (note) {
|
||||
note.textContent = '— ' + spec.reason;
|
||||
}
|
||||
});
|
||||
}).catch(function() {});
|
||||
burnLoadGPUs();
|
||||
</script>`
|
||||
}
|
||||
@@ -0,0 +1,511 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func renderExport(exportDir string) string {
|
||||
entries, _ := listExportFiles(exportDir)
|
||||
var rows strings.Builder
|
||||
for _, e := range entries {
|
||||
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||
url.QueryEscape(e), html.EscapeString(e)))
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||
}
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
` + renderUSBExportCard()
|
||||
}
|
||||
|
||||
// listExportFiles walks exportDir recursively and returns the path of every
// non-directory entry, relative to exportDir, sorted lexically.
//
// Surrounding whitespace in exportDir is trimmed once and the trimmed value
// is used both as the walk root and as the base for the relative paths; using
// the untrimmed value as the base makes filepath.Rel fail or produce
// "../"-style results. A directory that does not exist yields an empty list
// and a nil error. Any other error reported during the walk aborts the
// listing and is returned.
func listExportFiles(exportDir string) ([]string, error) {
	root := strings.TrimSpace(exportDir)
	var entries []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			return err
		}
		entries = append(entries, rel)
		return nil
	})
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	sort.Strings(entries)
	return entries, nil
}
|
||||
|
||||
// renderSupportBundleInline returns the "Download Support Bundle" button, its
// status line and the script that drives them.
//
// The script fetches /export/support.tar.gz, takes the file name from the
// Content-Disposition header when present (falling back to
// "bee-support.tar.gz"), and triggers a download through a temporary object
// URL. While the request is in flight the button is disabled and reads
// "Building..."; afterwards it is restored to its initial label, including
// the same down arrow (U+2193) used in the markup.
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
  var btn = document.getElementById('support-bundle-btn');
  var status = document.getElementById('support-bundle-status');
  btn.disabled = true;
  btn.textContent = 'Building...';
  status.textContent = 'Collecting logs and export data\u2026';
  status.style.color = 'var(--muted)';
  var filename = 'bee-support.tar.gz';
  fetch('/export/support.tar.gz')
    .then(function(r) {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      var cd = r.headers.get('Content-Disposition') || '';
      var m = cd.match(/filename="?([^";]+)"?/);
      if (m) filename = m[1];
      return r.blob();
    })
    .then(function(blob) {
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      status.textContent = 'Download started.';
      status.style.color = 'var(--ok-fg)';
    })
    .catch(function(e) {
      status.textContent = 'Error: ' + e.message;
      status.style.color = 'var(--crit-fg)';
    })
    .finally(function() {
      btn.disabled = false;
      btn.textContent = '\u2193 Download Support Bundle';
    });
};
</script>`
}
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">USB Black-Box
|
||||
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
// renderUSBExportInline returns the body of the USB black-box card: the
// description, status placeholders and the script that manages enrollment.
//
// The script loads /api/export/usb and /api/blackbox/status, renders one
// table row per detected device, and exposes window.blackboxEnable,
// window.blackboxDisable and window.blackboxRefresh. Enable posts the device
// object to /api/blackbox/enable; Disable posts the device and its
// enrollment_id to /api/blackbox/disable; both trigger a refresh 300 ms after
// the request settles.
//
// Every device-derived value (device path, filesystem, size, label, model,
// status, last error) is HTML-escaped before being written through
// innerHTML, because fields such as the volume label come from whatever
// media is plugged in.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
// Escapes a value for safe interpolation into innerHTML markup; null and
// undefined become an empty string.
function esc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
function blackboxRefresh() {
  document.getElementById('usb-status').textContent = 'Scanning...';
  document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
  document.getElementById('usb-targets').innerHTML = '';
  document.getElementById('usb-msg').textContent = '';
  Promise.all([
    fetch('/api/export/usb').then(r=>r.json()),
    fetch('/api/blackbox/status').then(r=>r.json())
  ]).then(function(values) {
    const targets = Array.isArray(values[0]) ? values[0] : [];
    const state = values[1] || {};
    const active = Array.isArray(state.targets) ? state.targets : [];
    window._usbTargets = targets;
    window._blackboxTargets = active;
    const st = document.getElementById('usb-status');
    const ct = document.getElementById('usb-targets');
    const summary = document.getElementById('blackbox-summary');
    if (state.boot_folder) {
      summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
    } else {
      summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
    }
    if (!targets || targets.length === 0) {
      st.textContent = 'No removable USB devices found.';
    } else {
      st.textContent = targets.length + ' device(s) found:';
    }
    const byDevice = {};
    active.forEach(function(item) { byDevice[item.device] = item; });
    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
      targets.map((t, idx) => {
        const dev = t.device || '';
        const label = t.label || '';
        const model = t.model || '';
        const state = byDevice[dev];
        const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
        const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+esc(state.last_error)+'</div>') : '';
        return '<tr>' +
          '<td style="font-family:monospace">'+esc(dev)+'</td>' +
          '<td>'+esc(t.fs_type)+'</td>' +
          '<td>'+esc(t.size)+'</td>' +
          '<td>'+esc(label)+'</td>' +
          '<td style="font-size:12px;color:var(--muted)">'+esc(model)+'</td>' +
          '<td style="font-size:12px">'+esc(status)+detail+'</td>' +
          '<td style="white-space:nowrap">' +
          (state
            ? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
            : '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
          '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
          '</td></tr>';
      }).join('') + '</table>';
  }).catch(e => {
    document.getElementById('usb-status').textContent = 'Error: ' + e;
  });
}
window.blackboxEnable = function(targetIndex, btn) {
  const target = (window._usbTargets || [])[targetIndex];
  if (!target) {
    const msg = document.getElementById('usb-msg');
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: USB target not found. Refresh and try again.';
    return;
  }
  const msg = document.getElementById('usb-msg');
  const row = btn ? btn.closest('td') : null;
  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
  const originalText = btn ? btn.textContent : '';
  if (btn) {
    btn.disabled = true;
    btn.textContent = 'Enabling...';
  }
  if (rowMsg) {
    rowMsg.style.color = 'var(--muted)';
    rowMsg.textContent = 'Working...';
  }
  msg.style.color = 'var(--muted)';
  msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
  fetch('/api/blackbox/enable', {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(target)
  }).then(async r => {
    const d = await r.json();
    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
    return d;
  }).then(d => {
    msg.style.color = 'var(--ok,green)';
    msg.textContent = d.message || 'Done.';
    if (rowMsg) {
      rowMsg.style.color = 'var(--ok,green)';
      rowMsg.textContent = d.message || 'Done.';
    }
  }).catch(e => {
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: '+e;
    if (rowMsg) {
      rowMsg.style.color = 'var(--err,red)';
      rowMsg.textContent = 'Error: ' + e;
    }
  }).finally(() => {
    if (btn) {
      btn.disabled = false;
      btn.textContent = originalText;
    }
    setTimeout(blackboxRefresh, 300);
  });
};
window.blackboxDisable = function(targetIndex, btn) {
  const target = (window._usbTargets || [])[targetIndex];
  const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
  if (!target || !active) {
    const msg = document.getElementById('usb-msg');
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: black-box target not found. Refresh and try again.';
    return;
  }
  const msg = document.getElementById('usb-msg');
  const row = btn ? btn.closest('td') : null;
  const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
  const originalText = btn ? btn.textContent : '';
  if (btn) {
    btn.disabled = true;
    btn.textContent = 'Disabling...';
  }
  if (rowMsg) {
    rowMsg.style.color = 'var(--muted)';
    rowMsg.textContent = 'Working...';
  }
  msg.style.color = 'var(--muted)';
  msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
  fetch('/api/blackbox/disable', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
  }).then(async r => {
    const d = await r.json();
    if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
    return d;
  }).then(d => {
    msg.style.color = 'var(--ok,green)';
    msg.textContent = d.message || 'Done.';
    if (rowMsg) {
      rowMsg.style.color = 'var(--ok,green)';
      rowMsg.textContent = d.message || 'Done.';
    }
  }).catch(e => {
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: '+e;
    if (rowMsg) {
      rowMsg.style.color = 'var(--err,red)';
      rowMsg.textContent = 'Error: ' + e;
    }
  }).finally(() => {
    if (btn) {
      btn.disabled = false;
      btn.textContent = originalText;
    }
    setTimeout(blackboxRefresh, 300);
  });
};
window.blackboxRefresh = blackboxRefresh;
blackboxRefresh();
})();
</script>`
}
|
||||
|
||||
// renderNvidiaSelfHealInline returns the HTML fragment and inline script for
// the NVIDIA self-heal panel. The script lists GPUs from
// /api/gpu/nvidia-status, restarts the bee-nvidia service through
// /api/services/action, and resets a single GPU through /api/gpu/nvidia-reset.
//
// Every API-supplied string is HTML-escaped before it is written through
// innerHTML, so unusual device names or raw status lines cannot break or
// inject markup.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
  <button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
  <button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
  <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
    <span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
    <span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
  </div>
  <div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escapes a value for safe use in HTML text and double-quoted attributes.
function nvidiaSelfHealEsc(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}
// Shows the output box with a label, the command output and an ok/failed marker.
function nvidiaSelfHealShowResult(label, status, output) {
  var out = document.getElementById('nvidia-self-heal-out');
  var term = document.getElementById('nvidia-self-heal-terminal');
  var statusEl = document.getElementById('nvidia-self-heal-out-status');
  var labelEl = document.getElementById('nvidia-self-heal-out-label');
  out.style.display = 'block';
  labelEl.textContent = label;
  term.textContent = output || '(no output)';
  term.scrollTop = term.scrollHeight;
  if (status === 'ok') {
    statusEl.textContent = '✓ done';
    statusEl.style.color = 'var(--ok-fg, #2c662d)';
  } else {
    statusEl.textContent = '✗ failed';
    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
  }
}
// Restarts the bee-nvidia service and refreshes the dependent panels.
function nvidiaRestartDrivers() {
  var btn = document.getElementById('nvidia-restart-btn');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Restarting...';
  nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
  fetch('/api/services/action', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(function() {
      // The services panel is rendered by a different fragment and may be
      // absent from the page embedding this one.
      if (typeof loadServices === 'function') loadServices();
      loadNvidiaSelfHeal();
    }, 800);
  }).catch(e => {
    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
// Requests a reset of the GPU with the given index and reloads the table.
function nvidiaResetGPU(index, btn) {
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Resetting...';
  nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
  fetch('/api/gpu/nvidia-reset', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({index:index})
  }).then(r=>r.json()).then(d => {
    // Fall back to d.error like the restart handler, so a failure reason is
    // not replaced by "(no output)".
    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(loadNvidiaSelfHeal, 1000);
  }).catch(e => {
    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
// Loads the GPU list and renders one table row per GPU.
function loadNvidiaSelfHeal() {
  var status = document.getElementById('nvidia-self-heal-status');
  var table = document.getElementById('nvidia-self-heal-table');
  status.textContent = 'Loading NVIDIA GPU status...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
    if (!Array.isArray(gpus) || gpus.length === 0) {
      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
      table.innerHTML = '';
      return;
    }
    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
    const rows = gpus.map(g => {
      const serial = g.serial || '';
      const bdf = g.bdf || '';
      const id = serial || bdf || ('gpu-' + g.index);
      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
      const details = [];
      if (serial) details.push('serial ' + serial);
      if (bdf) details.push('bdf ' + bdf);
      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
      return '<tr>'
        + '<td style="white-space:nowrap">' + nvidiaSelfHealEsc(g.index) + '</td>'
        + '<td>' + nvidiaSelfHealEsc(g.name || 'unknown') + '</td>'
        + '<td style="font-family:monospace">' + nvidiaSelfHealEsc(id) + '</td>'
        + '<td><span class="badge ' + badge + '">' + nvidiaSelfHealEsc(g.status || 'UNKNOWN') + '</span>'
        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.map(nvidiaSelfHealEsc).join(' | ') + '</div>' : '')
        + '</td>'
        // Number() keeps the inline handler argument a numeric literal.
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + Number(g.index) + ', this)">Reset GPU</button></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(e => {
    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
    status.style.color = 'var(--crit-fg, #9f3a38)';
    table.innerHTML = '';
  });
}
loadNvidiaSelfHeal();
</script>`
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||
for _, entry := range entries {
|
||||
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
body.WriteString(`<li>No export files found.</li>`)
|
||||
}
|
||||
body.WriteString(`</ul></body></html>`)
|
||||
return body.String(), nil
|
||||
}
|
||||
@@ -0,0 +1,314 @@
|
||||
package webui
|
||||
|
||||
// renderInstallInline returns the HTML, CSS and inline script for the
// "install to disk" workflow: a disk table loaded from /api/install/disks, a
// type-the-device-name confirmation step, and a progress view that starts the
// install through /api/install/run and follows the task's event stream.
//
// Device, model, size and warning strings come from the API and are
// HTML-escaped before they are placed in markup.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
  <strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
  disk and write the live system onto it. All existing data on the target disk will be lost.
  This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
  <div class="card" style="margin-bottom:0">
    <table id="install-disk-table">
      <thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
      <tbody id="install-disk-tbody"></tbody>
    </table>
  </div>
  <div style="margin-top:12px">
    <button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
  </div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
  <div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
  <div class="form-row" style="max-width:360px">
    <label>Type the device name to confirm (e.g. /dev/sda)</label>
    <input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
  </div>
  <button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
  <button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
  <div class="card-head" style="margin-bottom:8px">Installation Progress</div>
  <div id="install-terminal" class="terminal" style="max-height:500px"></div>
  <div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

// Escapes a value for safe use in HTML text and double-quoted attributes.
function installEsc(s) {
  return String(s == null ? '' : s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}

// Reloads the disk list and resets any current selection.
function installRefreshDisks() {
  document.getElementById('install-loading').style.display = '';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-confirm-section').style.display = 'none';
  _installSelected = null;
  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
    document.getElementById('install-loading').style.display = 'none';
    var tbody = document.getElementById('install-disk-tbody');
    tbody.innerHTML = '';
    if (!disks || disks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
    } else {
      disks.forEach(function(d) {
        var warnings = (d.warnings || []);
        var statusHtml;
        if (warnings.length === 0) {
          statusHtml = '<span class="badge badge-ok">OK</span>';
        } else {
          // A single "too small" warning marks every badge of the disk as an error.
          var hasSmall = warnings.some(function(w){ return String(w).indexOf('too small') >= 0; });
          statusHtml = warnings.map(function(w){
            var cls = hasSmall ? 'badge-err' : 'badge-warn';
            var text = String(w);
            var shown = text.length > 40 ? text.substring(0,38)+'…' : text;
            return '<span class="badge ' + cls + '" title="' + installEsc(text) + '">' +
              installEsc(shown) + '</span>';
          }).join(' ');
        }
        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
        var tr = document.createElement('tr');
        // dataset values are stored raw; they are escaped where rendered.
        tr.dataset.device = d.device;
        tr.dataset.model = d.model || 'Unknown';
        tr.dataset.size = d.size;
        tr.dataset.warnings = JSON.stringify(warnings);
        tr.innerHTML =
          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
          '<td>' + installEsc(d.model || '—') + '</td>' +
          '<td>' + installEsc(d.size) + '</td>' +
          '<td>' + statusHtml + '</td>';
        tr.addEventListener('click', function(){ installSelectDisk(this); });
        tbody.appendChild(tr);
      });
    }
    document.getElementById('install-disk-section').style.display = '';
  }).catch(function(e){
    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
  });
}

// Marks a table row as the install target and shows the confirmation step.
function installSelectDisk(tr) {
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  tr.classList.add('selected');
  var radio = tr.querySelector('input[type=radio]');
  if (radio) radio.checked = true;
  _installSelected = {
    device: tr.dataset.device,
    model: tr.dataset.model,
    size: tr.dataset.size,
    warnings: JSON.parse(tr.dataset.warnings || '[]')
  };
  var warnBox = document.getElementById('install-confirm-warn');
  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
  if (_installSelected.warnings.length > 0) {
    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
  }
  warnBox.innerHTML = warnLines;
  document.getElementById('install-confirm-input').value = '';
  document.getElementById('install-start-btn').disabled = true;
  document.getElementById('install-confirm-section').style.display = '';
  document.getElementById('install-progress-section').style.display = 'none';
}

// Clears the selection and hides the confirmation step.
function installDeselect() {
  _installSelected = null;
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
  document.getElementById('install-confirm-section').style.display = 'none';
}

// Enables the start button only when the typed name equals the selected device.
function installCheckConfirm() {
  var val = document.getElementById('install-confirm-input').value.trim();
  var ok = _installSelected && val === _installSelected.device;
  document.getElementById('install-start-btn').disabled = !ok;
}

// Starts the installation for the selected device and follows its task log.
function installStart() {
  if (!_installSelected) return;
  document.getElementById('install-confirm-section').style.display = 'none';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-loading').style.display = 'none';
  var prog = document.getElementById('install-progress-section');
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  prog.style.display = '';
  term.textContent = '';
  status.textContent = 'Starting installation…';
  status.style.color = 'var(--muted)';

  fetch('/api/install/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({device: _installSelected.device})
  }).then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error(j.error || r.statusText);
      return j;
    });
  }).then(function(j){
    if (!j.task_id) throw new Error('missing task id');
    installStreamLog(j.task_id);
  }).catch(function(e){
    status.textContent = 'Error: ' + e;
    status.style.color = 'var(--crit-fg)';
  });
}

// Streams task output into the terminal until a "done" event arrives.
function installStreamLog(taskId) {
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  var es = new EventSource('/api/tasks/' + taskId + '/stream');
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  es.addEventListener('done', function(e) {
    es.close();
    // An empty "done" payload means success; otherwise it is the failure text.
    if (!e.data) {
      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
      var rebootBtn = document.createElement('button');
      rebootBtn.className = 'btn btn-primary btn-sm';
      rebootBtn.style.marginLeft = '12px';
      rebootBtn.textContent = 'Reboot now';
      rebootBtn.onclick = function(){
        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
          body: JSON.stringify({name:'', action:'reboot'})});
      };
      status.appendChild(rebootBtn);
    } else {
      status.textContent = '✗ Installation failed: ' + e.data;
      status.style.color = 'var(--crit-fg)';
    }
  });
  es.onerror = function() {
    es.close();
    status.textContent = '✗ Stream disconnected.';
    status.style.color = 'var(--crit-fg)';
  };
}

installRefreshDisks();
</script>
`
}
|
||||
|
||||
func renderInstall() string {
|
||||
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// renderTasks returns the body of the Tasks page: the bulk-action toolbar and
// a paginated task table that the inline script reloads from /api/tasks every
// two seconds. Per-row actions call the cancel and priority endpoints; the
// toolbar calls cancel-all and kill-workers.
//
// Task names and unknown status values are HTML-escaped before insertion,
// and task ids are URL-encoded in request paths as they already are in links.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
  <button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
  <button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
  <span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
  <span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
  <div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

// Fetches all tasks and renders the current page of the table plus the pager.
function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
    _tasksAll = Array.isArray(tasks) ? tasks : [];
    if (_tasksAll.length === 0) {
      _taskPage = 1;
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
      return;
    }
    // Clamp the page in case the list shrank since the last refresh.
    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
    if (_taskPage > totalPages) _taskPage = totalPages;
    if (_taskPage < 1) _taskPage = 1;
    const start = (_taskPage - 1) * _taskPageSize;
    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
    const rows = pageTasks.map(t => {
      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||escHtml(t.status);
      let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
      if (t.status === 'running' || t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
      }
      if (t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
      }
      return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
        '<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
        '<td>'+t.priority+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
    const showingFrom = start + 1;
    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
    const pager =
      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
        '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
        '<div style="display:flex;align-items:center;gap:8px">' +
          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
          '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
        '</div>' +
      '</div>';
    document.getElementById('tasks-table').innerHTML =
      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
  });
}

// Escapes a value for safe use in HTML text and double-quoted attributes.
function escHtml(s) { return String(s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
// Formats a timestamp as a local time of day; empty input yields ''.
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
// Formats a duration in seconds as "Ns" or "Mm Ss".
function formatDurSec(sec) {
  sec = Math.max(0, Math.round(sec||0));
  if (sec < 60) return sec+'s';
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
}
// Switches to the given page (clamped to the valid range) and reloads.
function setTaskPage(page) {
  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
  _taskPage = Math.min(totalPages, Math.max(1, page));
  loadTasks();
}

function cancelTask(id) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
// Aborts all tasks and kills orphaned workers after confirmation, then shows
// a short-lived summary next to the toolbar.
function killWorkers() {
  if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
  var toast = document.getElementById('kill-toast');
  function showToast(text) {
    toast.textContent = text;
    toast.style.display = '';
    setTimeout(()=>{ toast.style.display='none'; }, 5000);
  }
  fetch('/api/tasks/kill-workers',{method:'POST'})
    .then(r=>r.json())
    .then(d=>{
      loadTasks();
      var parts = [];
      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
      showToast(parts.length ? parts.join(', ')+'.' : 'No processes found.');
    })
    .catch(e=>{
      // Make a failed request visible instead of silently doing nothing.
      showToast('Request failed: ' + ((e && e.message) || e));
    });
}
function setPriority(id, delta) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
func renderMetrics() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — CPU</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||
<div class="card-head">Server — Fan RPM</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||
<div>
|
||||
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||
</div>
|
||||
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||
<input id="gpu-chart-toggle" type="checkbox">
|
||||
<span>One chart per GPU</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-metric">
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Compute Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Memory Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Core Clock</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Temperature</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||
</section>
|
||||
|
||||
<script>
|
||||
let gpuChartKey = '';
|
||||
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||
let metricsNvidiaGPUsPromise = null;
|
||||
|
||||
function loadMetricsNvidiaGPUs() {
|
||||
if (!metricsNvidiaGPUsPromise) {
|
||||
metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||
.then(function(r) {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
})
|
||||
.then(function(list) { return Array.isArray(list) ? list : []; })
|
||||
.catch(function() { return []; });
|
||||
}
|
||||
return metricsNvidiaGPUsPromise;
|
||||
}
|
||||
|
||||
function metricsGPUNameMap(list) {
|
||||
const out = {};
|
||||
(list || []).forEach(function(gpu) {
|
||||
const idx = Number(gpu.index);
|
||||
if (!Number.isFinite(idx) || !gpu.name) return;
|
||||
out[idx] = gpu.name;
|
||||
});
|
||||
return out;
|
||||
}
|
||||
|
||||
function metricsGPUDisplayLabel(idx, names) {
|
||||
const name = names && names[idx];
|
||||
return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
|
||||
}
|
||||
|
||||
function loadGPUChartModePreference() {
|
||||
try {
|
||||
return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
|
||||
} catch (_) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function saveGPUChartModePreference(perGPU) {
|
||||
try {
|
||||
sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
|
||||
} catch (_) {}
|
||||
}
|
||||
|
||||
function refreshChartImage(el) {
|
||||
if (!el || el.dataset.loading === '1') return;
|
||||
if (el.offsetParent === null) return;
|
||||
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||
const probe = new Image();
|
||||
el.dataset.baseSrc = baseSrc;
|
||||
el.dataset.loading = '1';
|
||||
probe.onload = function() {
|
||||
el.src = nextSrc;
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.onerror = function() {
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.src = nextSrc;
|
||||
}
|
||||
|
||||
function refreshCharts() {
|
||||
document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
|
||||
}
|
||||
|
||||
function gpuIndices(rows) {
|
||||
const seen = {};
|
||||
const out = [];
|
||||
(rows || []).forEach(function(row) {
|
||||
const idx = Number(row.index);
|
||||
if (!Number.isFinite(idx) || seen[idx]) return;
|
||||
seen[idx] = true;
|
||||
out.push(idx);
|
||||
});
|
||||
return out.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
|
||||
function renderGPUOverviewCards(indices, names) {
|
||||
const host = document.getElementById('gpu-metrics-by-gpu');
|
||||
if (!host) return;
|
||||
host.innerHTML = indices.map(function(idx) {
|
||||
const label = metricsGPUDisplayLabel(idx, names);
|
||||
return '<div class="card" style="margin-bottom:16px">' +
|
||||
'<div class="card-head">' + label + ' — Overview</div>' +
|
||||
'<div class="card-body" style="padding:8px">' +
|
||||
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
|
||||
'</div></div>';
|
||||
}).join('');
|
||||
}
|
||||
|
||||
function applyGPUChartMode() {
|
||||
const perMetric = document.getElementById('gpu-metrics-by-metric');
|
||||
const perGPU = document.getElementById('gpu-metrics-by-gpu');
|
||||
const toggle = document.getElementById('gpu-chart-toggle');
|
||||
const gpuModePerGPU = !!(toggle && toggle.checked);
|
||||
if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
|
||||
if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
|
||||
}
|
||||
|
||||
function syncMetricsLayout(d) {
|
||||
const fanCard = document.getElementById('card-server-fans');
|
||||
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
|
||||
const section = document.getElementById('gpu-metrics-section');
|
||||
const summary = document.getElementById('gpu-metrics-summary');
|
||||
const indices = gpuIndices(d.gpus);
|
||||
loadMetricsNvidiaGPUs().then(function(gpus) {
|
||||
const names = metricsGPUNameMap(gpus);
|
||||
if (section) section.style.display = indices.length > 0 ? '' : 'none';
|
||||
if (summary) {
|
||||
summary.textContent = indices.length > 0
|
||||
? ('Detected GPUs: ' + indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); }).join(', '))
|
||||
: 'No GPUs detected in live metrics.';
|
||||
}
|
||||
const nextKey = indices.join(',') + '|' + indices.map(function(idx) { return names[idx] || ''; }).join(',');
|
||||
if (nextKey !== gpuChartKey) {
|
||||
renderGPUOverviewCards(indices, names);
|
||||
gpuChartKey = nextKey;
|
||||
}
|
||||
applyGPUChartMode();
|
||||
});
|
||||
}
|
||||
|
||||
function loadMetricsLayout() {
|
||||
fetch('/api/metrics/latest').then(function(r) { return r.json(); }).then(syncMetricsLayout).catch(function() {});
|
||||
}
|
||||
|
||||
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
|
||||
if (gpuChartToggle) {
|
||||
gpuChartToggle.checked = loadGPUChartModePreference();
|
||||
}
|
||||
applyGPUChartMode();
|
||||
|
||||
if (gpuChartToggle) {
|
||||
gpuChartToggle.addEventListener('change', function() {
|
||||
saveGPUChartModePreference(!!gpuChartToggle.checked);
|
||||
applyGPUChartMode();
|
||||
refreshCharts();
|
||||
});
|
||||
}
|
||||
|
||||
loadMetricsLayout();
|
||||
setInterval(refreshCharts, 3000);
|
||||
setInterval(loadMetricsLayout, 5000);
|
||||
</script>`
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
//
// The result is a static HTML fragment followed by an inline <script>. The
// script loads the interface list from /api/network (once on load and then
// every 5 seconds), and posts to /api/network/toggle, /api/network/dhcp,
// /api/network/static, /api/network/confirm and /api/network/rollback.
// After a change it shows a countdown banner until the change is confirmed
// or rolled back.
//
// Every value that originates from the API (interface name, state,
// addresses, default route) is HTML-escaped in the browser before it is
// written via innerHTML, and is handed to the click handlers through data-*
// attributes rather than being spliced into inline JavaScript string
// literals, so unusual interface names cannot break out of the markup.
func renderNetworkInline() string {
	return `<div id="net-pending" style="display:none" class="alert alert-warn">
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
</div>
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div class="grid2" style="margin-top:16px">
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
</div>
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
</div>
</div>
<script>
var _netCountdownTimer = null;
var _netRefreshTimer = null;
const NET_ROLLBACK_SECS = 60;
// Escape API-supplied text before it is interpolated into innerHTML.
function netEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function loadNetwork() {
  fetch('/api/network').then(r=>r.json()).then(d => {
    const rows = (d.interfaces||[]).map(i => {
      const name = netEsc(i.Name);
      const state = netEsc(i.State);
      return '<tr><td style="cursor:pointer" data-iface="'+name+'" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">'+name+'</span></td>' +
        '<td style="cursor:pointer" data-iface="'+name+'" data-state="'+state+'" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+state+'</span></td>' +
        '<td>'+(i.IPv4||[]).map(netEsc).join(', ')+'</td></tr>';
    }).join('');
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+netEsc(d.default_route)+'</p>' : '');
    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    else hideNetPending();
  }).catch(function() {});
}
function selectIface(iface) {
  document.getElementById('dhcp-iface').value = iface;
  document.getElementById('st-iface').value = iface;
}
function toggleIface(iface, currentState) {
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
    .then(r=>r.json()).then(d => {
      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
function hideNetPending() {
  const el = document.getElementById('net-pending');
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  el.style.display = 'none';
}
function showNetPending(secs) {
  if (!secs || secs < 1) { hideNetPending(); return; }
  const el = document.getElementById('net-pending');
  el.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  let remaining = secs;
  document.getElementById('net-countdown').textContent = remaining;
  _netCountdownTimer = setInterval(function() {
    remaining--;
    document.getElementById('net-countdown').textContent = remaining;
    if (remaining <= 0) { hideNetPending(); loadNetwork(); }
  }, 1000);
}
function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}
function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
}
function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
    .then(r=>r.json()).then(d => {
      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
      if (d.error) { hideNetPending(); return; }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    }).catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
function setStatic() {
  const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
    interface: document.getElementById('st-iface').value,
    address: document.getElementById('st-addr').value,
    prefix: document.getElementById('st-prefix').value,
    gateway: document.getElementById('st-gw').value,
    dns: dns,
  })}).then(r=>r.json()).then(d => {
    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}
loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
</script>`
}
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||
renderNetworkInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// renderServicesInline returns the services UI without a wrapping card.
//
// The result is a static HTML fragment (a note about bee-selfheal.timer, a
// refresh button, a table placeholder and an output terminal) followed by an
// inline <script>. The script fetches the unit list from /api/services and
// posts start/stop/restart requests to /api/services/action.
//
// Unit names, states and status bodies returned by the API are HTML-escaped
// in the browser before being written via innerHTML, and unit names reach the
// action handler through a data-svc attribute instead of an inline JavaScript
// string literal. A failed or malformed list response replaces the
// "Loading..." placeholder with an error message.
func renderServicesInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="svc-out-status" style="font-size:12px"></span>
</div>
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escape API-supplied text before it is interpolated into innerHTML.
function svcEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function loadServices() {
  fetch('/api/services').then(r=>r.json()).then(svcs => {
    if (!Array.isArray(svcs)) throw new Error('unexpected response');
    const rows = svcs.map(s => {
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      const rawName = String(s.name||'');
      const name = svcEsc(rawName);
      const id = 'svc-body-'+rawName.replace(/[^a-z0-9]/g,'-');
      const body = svcEsc(s.body||'');
      return '<tr>' +
        '<td style="white-space:nowrap">'+name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+svcEsc(st)+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-start" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-stop" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-restart" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  }).catch(function() {
    document.getElementById('svc-table').innerHTML =
      '<p style="color:var(--crit-fg, #9f3a38);font-size:13px">Failed to load services.</p>';
  });
}
function toggleBody(id) {
  const el = document.getElementById(id);
  if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
}
function svcAction(btn, name, action) {
  var label = btn.textContent;
  btn.disabled = true;
  btn.textContent = '...';
  var out = document.getElementById('svc-out');
  var term = document.getElementById('svc-terminal');
  var statusEl = document.getElementById('svc-out-status');
  var labelEl = document.getElementById('svc-out-label');
  out.style.display = 'block';
  labelEl.textContent = action + ' ' + name;
  term.textContent = 'Running...';
  statusEl.textContent = '';
  statusEl.style.color = '';
  fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
    .then(r=>r.json()).then(d => {
      term.textContent = d.output || d.error || '(no output)';
      term.scrollTop = term.scrollHeight;
      if (d.status === 'ok') {
        statusEl.textContent = '✓ done';
        statusEl.style.color = 'var(--ok-fg, #2c662d)';
      } else {
        statusEl.textContent = '✗ failed';
        statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      }
      btn.textContent = label;
      btn.disabled = false;
      setTimeout(loadServices, 800);
    }).catch(e => {
      term.textContent = 'Request failed: ' + e;
      statusEl.textContent = '✗ error';
      statusEl.style.color = 'var(--crit-fg, #9f3a38)';
      btn.textContent = label;
      btn.disabled = false;
    });
}
loadServices();
</script>`
}
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||
renderServicesInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
@@ -0,0 +1,663 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// validateInventory carries the per-component text and GPU counts that the
// Validate page renders.
type validateInventory struct {
	// Text passed to the matching validate cards; NVIDIA is also written
	// into the GPU selection card.
	CPU, Memory, Storage, NVIDIA, AMD string

	// NvidiaGPUCount is the value renderValidate feeds into the duration
	// estimates. AMDGPUCount is the AMD counterpart.
	// NOTE(review): no reader of AMDGPUCount is visible here; confirm usage.
	NvidiaGPUCount, AMDGPUCount int
}
|
||||
|
||||
// validateFmtDur formats an estimated duration for display.
//
// Values below 120 seconds are shown in seconds ("~90 s"). Anything from 120
// seconds up is shown in whole minutes ("~5 min"), computed as (secs+29)/60
// with integer division, so an exact half minute rounds down.
func validateFmtDur(secs int) string {
	const minuteThreshold = 120
	if secs >= minuteThreshold {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||
|
||||
func validateTotalValidateSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func validateTotalStressSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
if n > 0 {
|
||||
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||
.then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
})
|
||||
.then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||
}
|
||||
// satRenderGPUList replaces the contents of #sat-gpu-list with one checked
// checkbox row per GPU (or a "none detected" message for an empty list) and
// then refreshes the selection note.
//
// Rows are built with DOM nodes and textContent rather than string-built
// innerHTML, so a GPU name or index containing markup characters is shown
// literally instead of being parsed as HTML.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  root.textContent = '';
  gpus.forEach(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';

    const box = document.createElement('input');
    box.className = 'sat-nvidia-checkbox';
    box.type = 'checkbox';
    box.value = String(gpu.index);
    box.checked = true;
    box.addEventListener('change', function() { satUpdateGPUSelectionNote(); });

    const title = document.createElement('strong');
    title.textContent = 'GPU ' + gpu.index;

    const text = document.createElement('span');
    text.appendChild(title);
    text.appendChild(document.createTextNode(' — ' + gpu.name + mem));

    const row = document.createElement('label');
    row.className = 'sat-gpu-row';
    row.appendChild(box);
    row.appendChild(text);
    root.appendChild(row);
  });
  satUpdateGPUSelectionNote();
}
|
||||
function satSelectAllGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectNoGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
// satLoadGPUs loads the NVIDIA GPU list and renders it. On failure it shows
// the error inside #sat-gpu-list and refreshes the selection note.
//
// The error text is assigned through textContent, not concatenated into
// innerHTML, so characters such as "<" in the message are displayed as text.
function satLoadGPUs() {
  loadSatNvidiaGPUs().then(function(gpus) {
    satRenderGPUList(gpus);
  }).catch(function(err) {
    const root = document.getElementById('sat-gpu-list');
    if (root) {
      const msg = document.createElement('p');
      msg.style.cssText = 'color:var(--crit-fg);font-size:13px';
      msg.textContent = 'Error: ' + err.message;
      root.textContent = '';
      root.appendChild(msg);
    }
    satUpdateGPUSelectionNote();
  });
}
|
||||
function satGPUDisplayName(gpu) {
|
||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||
return 'GPU ' + idx + ' — ' + name;
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
return body;
|
||||
}
|
||||
// enqueueSATTarget POSTs a run request for the given SAT target and resolves
// to the decoded JSON response.
//
// A non-2xx response now rejects with an Error instead of being decoded as if
// it were a successful enqueue; previously callers would read an undefined
// task_id from the error payload and try to stream a non-existent task.
function enqueueSATTarget(target, overrides) {
  return fetch('/api/sat/'+target+'/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(satRequestBody(target, overrides))
  }).then(function(r) {
    if (!r.ok) throw new Error('Failed to enqueue ' + target + ' (HTTP ' + r.status + ').');
    return r.json();
  });
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(function(resolve) {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', function(e) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = function() {
|
||||
if (satES) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) {
|
||||
return runSATWithOverrides(target, null);
|
||||
}
|
||||
// runSATWithOverrides enqueues one SAT target (with optional request-body
// overrides) and streams its log into the SAT terminal.
//
// Returns a promise that always resolves to {ok, error}, the same shape
// streamSATTask resolves with. Enqueue failures (network error, rejected
// request, response without a task_id) are written to the terminal and
// reported as {ok: false} instead of surfacing as an unhandled rejection.
function runSATWithOverrides(target, overrides) {
  const title = (overrides && overrides.display_name) || target;
  const term = document.getElementById('sat-terminal');
  document.getElementById('sat-output').style.display='block';
  document.getElementById('sat-title').textContent = '— ' + title;
  term.textContent = 'Enqueuing ' + title + ' test...\n';
  return enqueueSATTarget(target, overrides)
    .then(function(d) {
      // Without a task id there is nothing to stream.
      if (!d || !d.task_id) throw new Error('server did not return a task id');
      return streamSATTask(d.task_id, title, false);
    })
    .catch(function(err) {
      const message = (err && err.message) ? err.message : String(err);
      term.textContent += '\nERROR: ' + message + '\n';
      term.scrollTop = term.scrollHeight;
      return {ok: false, error: message};
    });
}
|
||||
const nvidiaPerGPUTargets = [];
|
||||
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
return Promise.resolve(satSelectedGPUIndices());
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||
target: target,
|
||||
overrides: {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||
},
|
||||
label: satGPUDisplayName(gpu),
|
||||
})));
|
||||
}
|
||||
// runNvidiaFabricValidate runs a multi-GPU NVIDIA target across every
// currently selected GPU. It alerts and does nothing when no GPU is selected.
//
// The promise chain is returned (matching runNvidiaValidateSet) so callers
// can wait for the run or observe its failure; previously it was dropped.
function runNvidiaFabricValidate(target) {
  return satAllGPUIndicesForMulti().then(function(indices) {
    if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
    return runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
  });
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running AMD validate set one by one...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = (idx) => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const target = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
return streamSATTask(d.task_id, labels[target], false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||
const expanded = [];
|
||||
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||
}
|
||||
const total = expanded.length;
|
||||
let enqueued = 0;
|
||||
if (!total) {
|
||||
status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const runNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides)
|
||||
.then(() => {
|
||||
enqueued++;
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}).catch(err => {
|
||||
status.textContent = 'Error: ' + err.message;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
satLoadGPUs();
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
if (!cb) return;
|
||||
cb.disabled = true;
|
||||
cb.checked = false;
|
||||
cb.title = reason;
|
||||
});
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
unknown := "Audit snapshot not loaded."
|
||||
out := validateInventory{
|
||||
CPU: unknown,
|
||||
Memory: unknown,
|
||||
Storage: unknown,
|
||||
NVIDIA: unknown,
|
||||
AMD: unknown,
|
||||
}
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &snap); err != nil {
|
||||
return out
|
||||
}
|
||||
|
||||
cpuCounts := map[string]int{}
|
||||
cpuTotal := 0
|
||||
for _, cpu := range snap.Hardware.CPUs {
|
||||
if cpu.Present != nil && !*cpu.Present {
|
||||
continue
|
||||
}
|
||||
cpuTotal++
|
||||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
memCounts := map[string]int{}
|
||||
memTotal := 0
|
||||
for _, dimm := range snap.Hardware.Memory {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
memTotal++
|
||||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
storageCounts := map[string]int{}
|
||||
storageTotal := 0
|
||||
for _, dev := range snap.Hardware.Storage {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
storageTotal++
|
||||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
nvidiaCounts := map[string]int{}
|
||||
nvidiaTotal := 0
|
||||
amdCounts := map[string]int{}
|
||||
amdTotal := 0
|
||||
for _, dev := range snap.Hardware.PCIeDevices {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
if validateIsVendorGPU(dev, "nvidia") {
|
||||
nvidiaTotal++
|
||||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
if validateIsVendorGPU(dev, "amd") {
|
||||
amdTotal++
|
||||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
}
|
||||
|
||||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
out.NvidiaGPUCount = nvidiaTotal
|
||||
out.AMDGPUCount = amdTotal
|
||||
return out
|
||||
}
|
||||
|
||||
// renderValidateCardBody returns the HTML for the four stacked sections of a
// validate card, in order: devices (muted), description, commands, and
// settings (muted). The arguments are inserted verbatim, so callers must pass
// markup that is already safe to embed.
func renderValidateCardBody(devices, description, commands, settings string) string {
	// section wraps content in one card section; muted adds the muted color.
	section := func(content string, muted bool) string {
		style := "font-size:13px"
		if muted {
			style += ";color:var(--muted)"
		}
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(devices, true) +
		section(description, false) +
		section(commands, false) +
		section(settings, true)
}
|
||||
|
||||
// formatValidateDeviceSummary renders a one-line, HTML-escaped summary of a
// device class.
//
// total is the number of devices, models maps model name to its count, and
// unit is the singular noun ("CPU", "GPU", ...). With no devices the result
// is "0 <unit>s detected."; with exactly one distinct model it is
// "<n> x <model> <unit(s)>"; otherwise it is
// "<total> <unit(s)>: <n> x <model>, ..." with models in sorted order.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	// Sort model names so the output does not depend on map iteration order.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = unit + "s"
	}

	switch len(entries) {
	case 1:
		return entries[0] + " " + noun
	default:
		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
	}
}
|
||||
|
||||
// addValidateModel increments the tally for a model name in counts. The name
// is trimmed first; a blank name is recorded under "unknown".
func addValidateModel(counts map[string]int, name string) {
	key := strings.TrimSpace(name)
	if len(key) == 0 {
		key = "unknown"
	}
	counts[key] = counts[key] + 1
}
|
||||
|
||||
// validateTrimPtr dereferences an optional string and strips surrounding
// whitespace. A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	var raw string
	if value != nil {
		raw = *value
	}
	return strings.TrimSpace(raw)
}
|
||||
|
||||
// validateFirstNonEmpty returns the first argument that is not blank after
// trimming whitespace, in its trimmed form. It returns "" when every argument
// is blank or when called with no arguments.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
return false
|
||||
}
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderSATCard returns the HTML for one validate card: a header holding the
// label and a "Run" button (id "sat-btn-<id>", onclick runAction), optionally
// followed by extra header markup, and a body section.
//
// headerActions is appended only when it is not blank. All arguments are
// inserted verbatim, so callers must pass markup that is safe to embed.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if extra := strings.TrimSpace(headerActions); extra != "" {
		// Append the caller's markup untrimmed, exactly as given.
		buttons += headerActions
	}
	return `<div class="card"><div class="card-head card-head-actions"><span>` + label +
		`</span><div class="card-head-buttons">` + buttons +
		`</div></div><div class="card-body validate-card-body">` + body + `</div></div>`
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -271,6 +271,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||
|
||||
// Tasks
|
||||
@@ -299,11 +301,14 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -569,6 +574,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
|
||||
snapshot, _ := loadSnapshot(h.opts.AuditPath)
|
||||
snapshot = enrichSnapshotForViewer(snapshot)
|
||||
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
@@ -687,41 +693,22 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (dat
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
// Use per-PSU stacked chart when PSU SDR data is available.
|
||||
// Collect the union of PSU slots seen across all samples.
|
||||
psuSlots := psuSlotsFromSamples(samples)
|
||||
if len(psuSlots) > 1 {
|
||||
// Build one dataset per PSU slot.
|
||||
psuDatasets := make([][]float64, len(psuSlots))
|
||||
psuNames := make([]string, len(psuSlots))
|
||||
for si, slot := range psuSlots {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, psu := range s.PSUs {
|
||||
if psu.Slot == slot {
|
||||
ds[i] = psu.PowerW
|
||||
break
|
||||
}
|
||||
}
|
||||
power := make([]float64, len(samples))
|
||||
label := "Power W"
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
if strings.TrimSpace(s.PowerSource) != "" {
|
||||
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||
if strings.TrimSpace(s.PowerMode) != "" {
|
||||
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||
}
|
||||
psuDatasets[si] = normalizePowerSeries(ds)
|
||||
psuNames[si] = fmt.Sprintf("PSU %d", slot)
|
||||
}
|
||||
datasets = psuDatasets
|
||||
names = psuNames
|
||||
stacked = true
|
||||
yMax = autoMax120(psuStackedTotal(psuDatasets))
|
||||
} else {
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{label}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
@@ -1307,8 +1294,8 @@ const loadingPageHTML = `<!DOCTYPE html>
|
||||
*{margin:0;padding:0;box-sizing:border-box}
|
||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||
.wrap{text-align:center;width:420px}
|
||||
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||
.brand{font-size:22px;letter-spacing:.18em;color:#f6c90e;margin-bottom:6px;text-align:left}
|
||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px}
|
||||
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||
.spinner.hidden{display:none}
|
||||
@keyframes spin{to{transform:rotate(360deg)}}
|
||||
@@ -1326,12 +1313,7 @@ td:first-child{color:#718096;width:55%}
|
||||
</head>
|
||||
<body>
|
||||
<div class="wrap">
|
||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||
<div class="brand">EASY BEE</div>
|
||||
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||
<div class="spinner" id="spin"></div>
|
||||
<div class="status" id="st">Connecting to bee-web...</div>
|
||||
@@ -1341,8 +1323,20 @@ td:first-child{color:#718096;width:55%}
|
||||
<script>
|
||||
(function(){
|
||||
var gone = false;
|
||||
var pollStarted = false;
|
||||
var fallbackOpenTimer = null;
|
||||
var AUTO_OPEN_DELAY_MS = 15000;
|
||||
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||
|
||||
function scheduleFallbackOpen(){
|
||||
if(fallbackOpenTimer!==null) return;
|
||||
fallbackOpenTimer=setTimeout(function(){
|
||||
document.getElementById('spin').className='spinner hidden';
|
||||
document.getElementById('st').textContent='Startup checks are taking too long — opening app...';
|
||||
go();
|
||||
},AUTO_OPEN_DELAY_MS);
|
||||
}
|
||||
|
||||
function icon(s){
|
||||
if(s==='active') return '<span class="ok">● active</span>';
|
||||
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||
@@ -1374,6 +1368,7 @@ function pollServices(){
|
||||
tbl.innerHTML=html;
|
||||
if(allSettled(svcs)){
|
||||
clearInterval(pollTimer);
|
||||
if(fallbackOpenTimer!==null) clearTimeout(fallbackOpenTimer);
|
||||
document.getElementById('spin').className='spinner hidden';
|
||||
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||
setTimeout(go,800);
|
||||
@@ -1388,8 +1383,12 @@ function probe(){
|
||||
if(r.ok){
|
||||
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||
document.getElementById('btn').style.display='';
|
||||
pollServices();
|
||||
pollTimer=setInterval(pollServices,1500);
|
||||
scheduleFallbackOpen();
|
||||
if(!pollStarted){
|
||||
pollStarted=true;
|
||||
pollServices();
|
||||
pollTimer=setInterval(pollServices,1500);
|
||||
}
|
||||
} else {
|
||||
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||
setTimeout(probe,500);
|
||||
|
||||
@@ -420,6 +420,49 @@ func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: start,
|
||||
PSUs: []platform.PSUReading{
|
||||
{Slot: 1, PowerW: 120},
|
||||
{Slot: 2, PowerW: 130},
|
||||
},
|
||||
PowerW: 250,
|
||||
PowerSource: "sdr_psu_input",
|
||||
PowerMode: "autotuned",
|
||||
},
|
||||
{
|
||||
Timestamp: start.Add(time.Minute),
|
||||
PSUs: []platform.PSUReading{
|
||||
{Slot: 1, PowerW: 140},
|
||||
{Slot: 2, PowerW: 135},
|
||||
},
|
||||
PowerW: 275,
|
||||
PowerSource: "sdr_psu_input",
|
||||
PowerMode: "autotuned",
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("expected server-power chart data")
|
||||
}
|
||||
if title != "System Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if stacked {
|
||||
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||
}
|
||||
if len(datasets) != 1 || len(names) != 1 {
|
||||
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||
}
|
||||
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
@@ -561,6 +604,25 @@ func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadingPageHasFallbackAutoOpen(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/loading", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`var AUTO_OPEN_DELAY_MS = 15000;`,
|
||||
`function scheduleFallbackOpen(){`,
|
||||
`Startup checks are taking too long — opening app...`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("loading page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -628,11 +690,17 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
if !strings.Contains(body, `USB Black-Box`) {
|
||||
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -650,9 +718,12 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/bee-bench/nvidia/perf/run`,
|
||||
`/api/bee-bench/nvidia/power/run`,
|
||||
`/api/bee-bench/nvidia/autotune/run`,
|
||||
`/api/bee-bench/nvidia/autotune/status`,
|
||||
`benchmark-run-nccl`,
|
||||
`Run Performance Benchmark`,
|
||||
`Run Power / Thermal Fit`,
|
||||
`Autotune`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
@@ -970,6 +1041,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"collected_at":"2026-04-29T00:05:00Z",
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-NEW"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"model":"Test NVMe",
|
||||
"logical_block_size_bytes":512,
|
||||
"physical_block_size_bytes":4096,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), "512+8") {
|
||||
t.Fatalf("viewer body missing derived block format: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -992,6 +1096,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-API"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"logical_block_size_bytes":512,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if strings.Contains(rec.Body.String(), "block_format") {
|
||||
t.Fatalf("audit.json should remain contract-only: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
@@ -0,0 +1,511 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
)
|
||||
|
||||
// taskRunnerState is the JSON document exchanged through a task's
// runner-state.json file: writeTaskRunnerState serialises it and
// readTaskRunnerState decodes it again.
type taskRunnerState struct {
	// PID is the process ID recorded by the writer of the state file.
	PID int `json:"pid"`
	// Status is the task status string recorded with this state.
	Status string `json:"status"`
	// Error is the task error message; it is omitted from JSON when empty.
	Error string `json:"error,omitempty"`
	// UpdatedAt is the time at which the state was recorded.
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
func taskRunnerStatePath(t *Task) string {
|
||||
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||
}
|
||||
|
||||
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil || len(data) == 0 {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
var state taskRunnerState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
return state, true
|
||||
}
|
||||
|
||||
// processAlive reports whether a process with the given PID appears to exist.
//
// Non-positive PIDs are never considered alive. Otherwise signal 0 is sent
// via syscall.Kill; both a nil result and EPERM are reported as alive.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	switch syscall.Kill(pid, 0) {
	case nil, syscall.EPERM:
		return true
	default:
		return false
	}
}
|
||||
|
||||
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
switch {
|
||||
case cancelled:
|
||||
t.Status = TaskCancelled
|
||||
t.ErrMsg = "aborted"
|
||||
case strings.TrimSpace(errMsg) != "":
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = errMsg
|
||||
default:
|
||||
t.Status = TaskDone
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
}
|
||||
|
||||
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
|
||||
if opts == nil {
|
||||
j.append("ERROR: handler options not configured")
|
||||
j.finish("handler options not configured")
|
||||
return
|
||||
}
|
||||
a := opts.App
|
||||
|
||||
recovered := len(j.lines) > 0
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if recovered {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := 2
|
||||
if t.params.StressMode {
|
||||
diagLevel = 3
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-targeted-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if dur <= 0 {
|
||||
dur = 300
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-bench-perf":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-autotune":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
}, t.params.BenchmarkKind, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-pulse":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-interconnect":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
if t.params.StressMode {
|
||||
dur = 1800
|
||||
} else {
|
||||
dur = 60
|
||||
}
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "memory-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "sat-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
runOpts.Components = t.params.PlatformComponents
|
||||
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
|
||||
case "audit":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
result, e := a.RunAuditNow(opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
for _, line := range splitLines(result.Body) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "support-bundle":
|
||||
j.append("Building support bundle...")
|
||||
archive, err = buildSupportBundle(opts.ExportDir)
|
||||
case "install":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||
j.append("Install log: " + installLogPath)
|
||||
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||
case "install-to-ram":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
case "nvme-format":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
return
|
||||
}
|
||||
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
if opts.App != nil && opts.App.StatusDB != nil {
|
||||
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
if archive != "" {
|
||||
j.append("Archive: " + archive)
|
||||
}
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||
data, err := os.ReadFile(statePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var persisted []persistedTask
|
||||
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, pt := range persisted {
|
||||
if pt.ID != taskID {
|
||||
continue
|
||||
}
|
||||
t := &Task{
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
ArtifactsDir: pt.ArtifactsDir,
|
||||
ReportJSONPath: pt.ReportJSONPath,
|
||||
ReportHTMLPath: pt.ReportHTMLPath,
|
||||
params: pt.Params,
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
return t, nil
|
||||
}
|
||||
return nil, fmt.Errorf("task %s not found", taskID)
|
||||
}
|
||||
|
||||
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
|
||||
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
|
||||
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for task-run", "err", err)
|
||||
}
|
||||
opts := &HandlerOptions{
|
||||
ExportDir: exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}
|
||||
statePath := filepath.Join(exportDir, "tasks-state.json")
|
||||
task, err := loadPersistedTask(statePath, taskID)
|
||||
if err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
if task.StartedAt == nil || task.StartedAt.IsZero() {
|
||||
now := time.Now()
|
||||
task.StartedAt = &now
|
||||
}
|
||||
if task.Status == "" {
|
||||
task.Status = TaskRunning
|
||||
}
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: TaskRunning,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
executeTaskWithOptions(opts, task, j, ctx)
|
||||
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||
}
|
||||
j.closeLog()
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: task.Status,
|
||||
Error: task.ErrMsg,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
}
|
||||
if task.ErrMsg != "" {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
+271
-32
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -13,6 +14,7 @@ import (
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -34,6 +36,7 @@ var taskNames = map[string]string{
|
||||
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
|
||||
"nvidia-bench-power": "NVIDIA Bee Bench Power",
|
||||
"nvidia-bench-autotune": "NVIDIA Bee Bench Power Source Autotune",
|
||||
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||
@@ -54,6 +57,7 @@ var taskNames = map[string]string{
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"nvme-format": "NVMe Block Format Change",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
@@ -109,8 +113,9 @@ type Task struct {
|
||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
job *jobState
|
||||
runnerPID int
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
@@ -125,6 +130,7 @@ type taskParams struct {
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
@@ -132,6 +138,7 @@ type taskParams struct {
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
@@ -326,6 +333,13 @@ var (
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||
}
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
@@ -363,6 +377,11 @@ func (q *taskQueue) prune() {
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status == TaskRunning {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
@@ -482,6 +501,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
q.resumeRunningTasksLocked()
|
||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
@@ -515,15 +535,12 @@ func (q *taskQueue) worker() {
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
q.executeTask(t, j, taskCtx)
|
||||
taskCancel()
|
||||
q.runTaskExternal(t, j)
|
||||
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
@@ -535,6 +552,218 @@ func (q *taskQueue) worker() {
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) resumeRunningTasksLocked() {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskRunning {
|
||||
continue
|
||||
}
|
||||
if t.job == nil {
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
}
|
||||
q.attachExternalTaskControlsLocked(t, t.job)
|
||||
q.startRecoveredTaskMonitorLocked(t, t.job)
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
|
||||
if t == nil || j == nil {
|
||||
return
|
||||
}
|
||||
j.cancel = func() {
|
||||
pid := t.runnerPID
|
||||
if pid <= 0 {
|
||||
if state, ok := readTaskRunnerState(t); ok {
|
||||
pid = state.PID
|
||||
}
|
||||
}
|
||||
if pid > 0 {
|
||||
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
||||
if t == nil || j == nil || t.runnerPID <= 0 {
|
||||
return
|
||||
}
|
||||
goRecoverOnce("task runner monitor", func() {
|
||||
stopTail := make(chan struct{})
|
||||
doneTail := make(chan struct{})
|
||||
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||
for processAlive(t.runnerPID) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
}
|
||||
close(stopTail)
|
||||
<-doneTail
|
||||
q.finishExternalTask(t, j, nil)
|
||||
})
|
||||
}
|
||||
|
||||
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||
startedKmsgWatch := false
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
startedKmsgWatch = true
|
||||
}
|
||||
defer func() {
|
||||
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
}()
|
||||
|
||||
stopTail := make(chan struct{})
|
||||
doneTail := make(chan struct{})
|
||||
defer func() {
|
||||
close(stopTail)
|
||||
<-doneTail
|
||||
}()
|
||||
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||
|
||||
cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
|
||||
if err != nil {
|
||||
j.appendFromLog("ERROR: " + err.Error())
|
||||
q.finishExternalTask(t, j, err)
|
||||
return
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
j.appendFromLog("ERROR: " + err.Error())
|
||||
q.finishExternalTask(t, j, err)
|
||||
return
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
t.runnerPID = cmd.Process.Pid
|
||||
q.attachExternalTaskControlsLocked(t, j)
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
waitErr := cmd.Wait()
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
q.finishExternalTask(t, j, waitErr)
|
||||
}
|
||||
|
||||
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
|
||||
defer close(done)
|
||||
path := ""
|
||||
if t != nil {
|
||||
path = t.LogPath
|
||||
}
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return
|
||||
}
|
||||
offset := int64(0)
|
||||
if info, err := os.Stat(path); err == nil {
|
||||
offset = info.Size()
|
||||
}
|
||||
var partial string
|
||||
ticker := time.NewTicker(250 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
flush := func() {
|
||||
data, newOffset, err := readTaskLogDelta(path, offset)
|
||||
if err != nil || len(data) == 0 {
|
||||
offset = newOffset
|
||||
return
|
||||
}
|
||||
offset = newOffset
|
||||
text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
|
||||
lines := strings.Split(text, "\n")
|
||||
partial = lines[len(lines)-1]
|
||||
for _, line := range lines[:len(lines)-1] {
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
j.appendFromLog(line)
|
||||
}
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
flush()
|
||||
case <-stop:
|
||||
flush()
|
||||
if strings.TrimSpace(partial) != "" {
|
||||
j.appendFromLog(partial)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// readTaskLogDelta reads the bytes of the file at path that follow offset,
// returning at most 1 MiB per call together with the offset to resume from.
//
// When the file is shorter than offset, reading restarts from the beginning
// of the file. If the file cannot be opened or inspected, the data is nil
// and the offset is returned unchanged. The error from the read itself is
// returned alongside whatever data was obtained.
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
	logFile, openErr := os.Open(path)
	if openErr != nil {
		return nil, offset, openErr
	}
	defer logFile.Close()

	stat, statErr := logFile.Stat()
	if statErr != nil {
		return nil, offset, statErr
	}

	start := offset
	if stat.Size() < start {
		// The file shrank below the last position; start over.
		start = 0
	}
	if _, seekErr := logFile.Seek(start, io.SeekStart); seekErr != nil {
		return nil, start, seekErr
	}

	chunk, readErr := io.ReadAll(io.LimitReader(logFile, 1<<20))
	return chunk, start + int64(len(chunk)), readErr
}
|
||||
|
||||
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
|
||||
if j != nil && !j.isDone() {
|
||||
j.finish(t.ErrMsg)
|
||||
j.closeLog()
|
||||
}
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
state, ok := readTaskRunnerState(t)
|
||||
switch {
|
||||
case ok && state.Status != TaskRunning:
|
||||
t.Status = state.Status
|
||||
t.ErrMsg = state.Error
|
||||
now := state.UpdatedAt
|
||||
if now.IsZero() {
|
||||
now = time.Now()
|
||||
}
|
||||
t.DoneAt = &now
|
||||
case waitErr != nil:
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = waitErr.Error()
|
||||
t.DoneAt = &now
|
||||
default:
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = "task runner exited without final state"
|
||||
t.DoneAt = &now
|
||||
}
|
||||
t.runnerPID = 0
|
||||
q.finalizeTaskArtifactPathsLocked(t)
|
||||
q.persistLocked()
|
||||
|
||||
if j != nil && !j.isDone() {
|
||||
j.finish(t.ErrMsg)
|
||||
j.closeLog()
|
||||
}
|
||||
if t.ErrMsg != "" {
|
||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||
} else {
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
}
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||
startedKmsgWatch := false
|
||||
defer q.finalizeTaskRun(t, j)
|
||||
@@ -686,6 +915,15 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-autotune":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
}, t.params.BenchmarkKind, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -974,15 +1212,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
writeError(w, http.StatusConflict, "task is not cancellable")
|
||||
return
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
@@ -1028,12 +1262,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
n++
|
||||
}
|
||||
}
|
||||
@@ -1164,18 +1392,29 @@ func (q *taskQueue) loadLocked() {
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||
// their own process groups. Kill any matching stale workers before
|
||||
// marking the task failed so the next GPU test does not inherit a
|
||||
// busy DCGM slot or duplicate workers.
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
state, ok := readTaskRunnerState(t)
|
||||
switch {
|
||||
case ok && state.Status == TaskRunning && processAlive(state.PID):
|
||||
t.runnerPID = state.PID
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
case ok && state.Status != TaskRunning:
|
||||
t.runnerPID = state.PID
|
||||
t.Status = state.Status
|
||||
t.ErrMsg = state.Error
|
||||
now := state.UpdatedAt
|
||||
if now.IsZero() {
|
||||
now = time.Now()
|
||||
}
|
||||
t.DoneAt = &now
|
||||
default:
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
|
||||
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "task.log")
|
||||
j := newTaskJobState(path)
|
||||
|
||||
j.append("live-line")
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if string(data) != "live-line\n" {
|
||||
t.Fatalf("log=%q want live-line newline", string(data))
|
||||
}
|
||||
j.closeLog()
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
releasePath := filepath.Join(dir, "release")
|
||||
readyPath := filepath.Join(dir, "ready")
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
kmsgWatcher: newKmsgWatcher(nil),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-external-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
q.assignTaskLogPathLocked(tk)
|
||||
j := newTaskJobState(tk.LogPath)
|
||||
|
||||
orig := externalTaskRunnerCommand
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
|
||||
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
|
||||
}
|
||||
defer func() { externalTaskRunnerCommand = orig }()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
q.runTaskExternal(tk, j)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if _, err := os.Stat(readyPath); err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
if _, err := os.Stat(readyPath); err != nil {
|
||||
t.Fatalf("external runner did not start: %v", err)
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount := q.kmsgWatcher.activeCount
|
||||
window := q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 1 {
|
||||
t.Fatalf("activeCount while running=%d want 1", activeCount)
|
||||
}
|
||||
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
|
||||
t.Fatalf("window while running=%+v", window)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("runTaskExternal did not return")
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount = q.kmsgWatcher.activeCount
|
||||
window = q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("activeCount after finish=%d want 0", activeCount)
|
||||
}
|
||||
if window != nil {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func enrichSnapshotForViewer(snapshot []byte) []byte {
|
||||
if len(snapshot) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
var root map[string]any
|
||||
if err := json.Unmarshal(snapshot, &root); err != nil {
|
||||
return snapshot
|
||||
}
|
||||
hardware, _ := root["hardware"].(map[string]any)
|
||||
if len(hardware) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
storage, _ := hardware["storage"].([]any)
|
||||
if len(storage) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
changed := false
|
||||
for _, item := range storage {
|
||||
row, _ := item.(map[string]any)
|
||||
if len(row) == 0 {
|
||||
continue
|
||||
}
|
||||
if _, exists := row["block_format"]; exists {
|
||||
continue
|
||||
}
|
||||
logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
|
||||
metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
|
||||
if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
|
||||
continue
|
||||
}
|
||||
row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
|
||||
changed = true
|
||||
}
|
||||
if !changed {
|
||||
return snapshot
|
||||
}
|
||||
out, err := json.Marshal(root)
|
||||
if err != nil {
|
||||
return snapshot
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// jsonNumberToInt64 converts a decoded JSON scalar to int64.
//
// It accepts float64 (the type encoding/json uses for numbers decoded into
// an `any`), int64 and int. The boolean result is false for every other
// type, including nil, and for float64 values that cannot be represented
// exactly as an int64: NaN, infinities, values outside the int64 range and
// values with a fractional part. In that case the returned number is 0.
func jsonNumberToInt64(v any) (int64, bool) {
	// Bounds of int64 expressed as float64; both are exact powers of two.
	const (
		minInt64 = -9223372036854775808.0 // -2^63, inclusive
		endInt64 = 9223372036854775808.0  // 2^63, exclusive
	)
	switch x := v.(type) {
	case float64:
		// Converting an unrepresentable float to an integer gives an
		// implementation-dependent result, so check the range first. The
		// condition is written in its positive form on purpose: every
		// comparison with NaN is false, which makes NaN fail it as well.
		if !(x >= minInt64 && x < endInt64) {
			return 0, false
		}
		n := int64(x)
		// Reject fractional input instead of silently truncating it.
		if float64(n) != x {
			return 0, false
		}
		return n, true
	case int64:
		return x, true
	case int:
		return int64(x), true
	default:
		return 0, false
	}
}
|
||||
+58
-1
@@ -9,5 +9,62 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
|---|---|
|
||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `decisions/` | Architectural decision log |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
### Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
### Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
@@ -149,7 +149,6 @@ Current validation state:
|
||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||
8. output JSON → /var/log/bee-audit.json
|
||||
9. QR summary to stdout (qrencode if available)
|
||||
```
|
||||
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||
- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
|
||||
- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.
|
||||
|
||||
## Tech stack
|
||||
|
||||
@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
||||
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
||||
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
|
||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
# Decision: Treat embedded submodules as read-only
|
||||
|
||||
## Context
|
||||
|
||||
`bee` embeds external git submodules such as:
|
||||
|
||||
- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
|
||||
- `bible/` — shared engineering rules and contracts
|
||||
|
||||
These repositories are reused by other projects. A local feature request in `bee`
|
||||
must not be solved by silently changing shared submodule behavior.
|
||||
|
||||
The concrete failure mode here was attempting to add project-specific storage
|
||||
telemetry presentation by editing `internal/chart/`. That couples a shared viewer
|
||||
to one host application's needs and creates hidden cross-project regressions.
|
||||
|
||||
## Decision
|
||||
|
||||
Embedded submodules are read-only from the point of view of `bee`.
|
||||
|
||||
- Do not implement `bee`-specific behavior by editing `internal/chart/`.
|
||||
- Do not implement `bee`-specific behavior by editing `bible/`.
|
||||
- If `bee` needs new data in the report, produce it in the standard audit JSON
|
||||
emitted by `bee` itself.
|
||||
- `chart` must continue to consume the canonical snapshot as an external viewer,
|
||||
without host-specific forks.
|
||||
- Updating a submodule pointer to an upstream commit is allowed.
|
||||
- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
|
||||
|
||||
## Consequences
|
||||
|
||||
- Audit/report features must be expressed through the contract in
|
||||
`bible-local/docs/hardware-ingest-contract.md`.
|
||||
- `bee` owns collection, normalization, and serialization of storage telemetry in
|
||||
`hardware.storage[]`.
|
||||
- `chart` remains a pure visualization module that reads the snapshot it is given.
|
||||
- If a capability is genuinely missing in a shared submodule, it must be proposed
|
||||
and landed upstream as a generic change first, then pulled into `bee` via a
|
||||
normal submodule update.
|
||||
@@ -6,3 +6,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
# GPU PCIe Test Methodology
|
||||
|
||||
## Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
## Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.7"
|
||||
updated: "2026-03-15"
|
||||
version: "2.10"
|
||||
updated: "2026-04-29"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
language: ru
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.7** · Дата: **2026-03-15**
|
||||
Версия: **2.10** · Дата: **2026-04-29**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,9 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
||||
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
||||
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
||||
@@ -131,8 +134,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"storage": [ ... ],
|
||||
"pcie_devices": [ ... ],
|
||||
"power_supplies": [ ... ],
|
||||
"sensors": { ... },
|
||||
"event_logs": [ ... ]
|
||||
"sensors": { ... },
|
||||
"event_logs": [ ... ],
|
||||
"platform_config": { ... }
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -343,6 +347,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
||||
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
||||
| `size_gb` | int | нет | Размер в ГБ |
|
||||
| `logical_block_size_bytes` | int64 | нет | Логический размер пользовательского блока данных, например `512` или `4096` |
|
||||
| `physical_block_size_bytes` | int64 | нет | Физический размер блока, если известен, например `4096` |
|
||||
| `metadata_bytes_per_block` | int64 | нет | Metadata / protection bytes на логический блок, например `0` или `8` |
|
||||
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
||||
| `power_on_hours` | int64 | нет | Время работы, часы |
|
||||
| `power_cycles` | int64 | нет | Количество циклов питания |
|
||||
@@ -363,6 +370,11 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
|
||||
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
||||
|
||||
Формат вида `512+8` в контракт не добавляется отдельным строковым полем. Если источник знает такую форму, он должен передавать её как:
|
||||
- `logical_block_size_bytes = 512`
|
||||
- `metadata_bytes_per_block = 8`
|
||||
- `physical_block_size_bytes = 512` или `4096`, если известен физический размер блока
|
||||
|
||||
```json
|
||||
"storage": [
|
||||
{
|
||||
@@ -370,6 +382,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"type": "NVMe",
|
||||
"model": "INTEL SSDPF2KX076T1",
|
||||
"size_gb": 7680,
|
||||
"logical_block_size_bytes": 512,
|
||||
"physical_block_size_bytes": 4096,
|
||||
"metadata_bytes_per_block": 8,
|
||||
"temperature_c": 38.5,
|
||||
"power_on_hours": 12450,
|
||||
"unsafe_shutdowns": 3,
|
||||
@@ -592,7 +607,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `rpm` | int | нет | Обороты, RPM |
|
||||
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
||||
|
||||
@@ -601,7 +615,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `voltage_v` | float | нет | Напряжение, В |
|
||||
| `current_a` | float | нет | Ток, А |
|
||||
| `power_w` | float | нет | Мощность, Вт |
|
||||
@@ -612,7 +625,6 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `celsius` | float | нет | Температура, °C |
|
||||
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
||||
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
||||
@@ -623,29 +635,29 @@ PSU без `serial_number` игнорируется.
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `name` | string | **да** | Уникальное имя сенсора |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `value` | float | нет | Значение |
|
||||
| `unit` | string | нет | Единица измерения |
|
||||
| `status` | string | нет | Статус |
|
||||
|
||||
**Правила sensors:**
|
||||
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
||||
- `location` для сенсоров передавать не нужно и не следует: в Reanimator location/slot используется только для проверки перемещения и установки компонентов, а не для last-known-value sensor ingest.
|
||||
- Сенсоры без `name` игнорируются.
|
||||
- При каждом импорте значения перезаписываются (upsert по ключу).
|
||||
|
||||
```json
|
||||
"sensors": {
|
||||
"fans": [
|
||||
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
|
||||
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
|
||||
{ "name": "FAN1", "rpm": 4200, "status": "OK" },
|
||||
{ "name": "FAN_CPU0", "rpm": 5600, "status": "OK" }
|
||||
],
|
||||
"power": [
|
||||
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
|
||||
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" },
|
||||
{ "name": "PSU0 Input", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||
],
|
||||
"temperatures": [
|
||||
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||
{ "name": "Inlet Temp", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||
],
|
||||
"other": [
|
||||
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
||||
@@ -655,6 +667,31 @@ PSU без `serial_number` игнорируется.
|
||||
|
||||
---
|
||||
|
||||
## Секция platform_config
|
||||
|
||||
Необязательный объект с произвольными ключами — настройки платформы как есть из источника (BIOS, Redfish, IPMI).
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `platform_config` | object | нет | Произвольный объект: ключи — строки, значения — строки, числа, булевы |
|
||||
|
||||
**Правила platform_config:**
|
||||
- Содержимое объекта не валидируется: передавайте параметры как есть.
|
||||
- При каждом импорте хранится latest-snapshot per machine; история изменений по каждому ключу накапливается отдельно.
|
||||
- Если секция отсутствует или равна `null` — данные платформы не обновляются.
|
||||
|
||||
```json
|
||||
"platform_config": {
|
||||
"SecureBoot": "Enabled",
|
||||
"BiosVersion": "06.08.05",
|
||||
"TpmEnabled": true,
|
||||
"NumaEnabled": false,
|
||||
"HyperThreading": "Enabled"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Обработка статусов компонентов
|
||||
|
||||
| Статус | Поведение |
|
||||
@@ -787,6 +824,12 @@ PSU без `serial_number` игнорируется.
|
||||
"other": [
|
||||
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
||||
]
|
||||
},
|
||||
"platform_config": {
|
||||
"SecureBoot": "Enabled",
|
||||
"BiosVersion": "06.08.05",
|
||||
"TpmEnabled": true,
|
||||
"HyperThreading": "Enabled"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
# Contract: ASCII-Safe Text in Scripts and Boot Configs
|
||||
|
||||
Version: 1.0
|
||||
|
||||
## Principle
|
||||
|
||||
Shell scripts, bootloader configs, and any text rendered on serial/SOL consoles must use only printable ASCII characters. Non-ASCII Unicode — including typographic punctuation such as the em-dash (U+2014 `—`), en-dash (U+2013 `–`), curly quotes, and ellipsis (U+2026 `…`) — breaks rendering on serial terminals, GRUB text/serial mode, IPMI SOL, and tooling that assumes ASCII.
|
||||
|
||||
## Rules
|
||||
|
||||
- Never use em-dash (`—`) or en-dash (`–`) in any shell script, GRUB config, syslinux/isolinux config, or service unit file. Use ASCII double-hyphen `--` or single hyphen `-` instead.
|
||||
- Never use curly quotes (`“` `”` `‘` `’`) in shell scripts or configs. Use straight quotes `"` and `'`.
|
||||
- Never use the Unicode ellipsis (`…`). Use `...`.
|
||||
- GRUB `menuentry` and `submenu` titles must be ASCII-only — GRUB serial terminal output is ASCII; non-ASCII characters render as garbage or are dropped.
|
||||
- Comments in GRUB theme files (`.txt`) must also be ASCII-only, as GRUB may parse the entire file.
|
||||
|
||||
## Why
|
||||
|
||||
GRUB renders menus over both `gfxterm` (graphical, Unicode-capable) and `serial` (ASCII-only) simultaneously when `terminal_output gfxterm serial` is set. The serial output — used by IPMI SOL and BMC remote consoles — cannot display multi-byte UTF-8 sequences and shows raw bytes or drops characters. A menuentry title `"EASY-BEE — GSP=off"` appears as `"EASY-BEE â€" GSP=off"` or `"EASY-BEE GSP=off"` on SOL, making the menu unreadable.
|
||||
|
||||
## Anti-patterns
|
||||
|
||||
- `menuentry "EASY-BEE — GSP=off"` — em-dash in GRUB title
|
||||
- `# bee logo — centered` — em-dash in GRUB theme comment
|
||||
- `echo "done — reboot"` in a shell script displayed over serial
|
||||
|
||||
## Correct form
|
||||
|
||||
- `menuentry "EASY-BEE -- GSP=off"`
|
||||
- `# bee logo - centered`
|
||||
- `echo "done - reboot"`
|
||||
@@ -0,0 +1,134 @@
|
||||
# GRUB bitmap error: null src bitmap in grub_video_bitmap_create_scaled
|
||||
|
||||
## Symptom
|
||||
|
||||
```
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
Appears on boot before the GRUB menu renders. The menu still appears after pressing a key,
|
||||
but without the bee logo. Reproduced on real hardware (Lenovo SR650 V3, ASUS GPU servers).
|
||||
|
||||
## Root cause model
|
||||
|
||||
`grub_video_bitmap_create_scaled` receives a null `src` pointer, meaning the PNG loader
|
||||
returned null for `bee-logo.png`. GRUB calls this function even when no explicit
|
||||
`width`/`height` are set in `theme.txt` — it is invoked any time an image component is
|
||||
rendered, passing the image's natural dimensions as the target size.
|
||||
|
||||
The PNG file is referenced as `file = "bee-logo.png"` (relative to theme dir).
|
||||
GRUB resolves this to `/boot/grub/live-theme/bee-logo.png`.
|
||||
|
||||
## Attempts that did NOT fix the error
|
||||
|
||||
### Attempt 1 — add explicit `width`/`height` to image block (d52ec67)
|
||||
|
||||
**What was done:** First introduction of bee-logo.png with:
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
PNG at this point was RGBA (color_type=6).
|
||||
|
||||
**Result:** Error appeared immediately on first ISO build.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 2 — remove `width`/`height` from image block (aa284ae)
|
||||
|
||||
**Hypothesis:** Explicit scaling dimensions trigger the scale path; removing them avoids it.
|
||||
|
||||
**What was done:** Removed `width = 400` and `height = 400` from the image block.
|
||||
```
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
```
|
||||
|
||||
**Result:** Error persists. GRUB calls `grub_video_bitmap_create_scaled` regardless of whether
|
||||
`width`/`height` are specified — if the bitmap is null (loading failed), the error fires either way.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 3 — convert PNG to RGBA + strip metadata chunks (6112094)
|
||||
|
||||
**Hypothesis:** GRUB's minimal PNG parser is confused by metadata chunks (cHRM, bKGD, tIME, tEXt).
|
||||
Also re-ordered `terminal_output gfxterm` before `insmod png` / theme load.
|
||||
|
||||
**What was done:**
|
||||
- Converted PNG to RGBA color_type=6, stripped all ancillary chunks
|
||||
- Moved `terminal_output gfxterm` earlier in config.cfg
|
||||
- Removed echo ASCII art banner from grub.cfg
|
||||
|
||||
**Result:** Error persists — and this change actually confirmed RGBA does not work:
|
||||
GRUB's PNG loader does not render RGBA PNGs correctly on this platform.
|
||||
|
||||
---
|
||||
|
||||
### Attempt 4 — convert PNG from RGBA back to RGB (333c44f, most recent)
|
||||
|
||||
**Hypothesis:** GRUB does not support RGBA (color_type=6); RGB (color_type=2) is the correct format.
|
||||
Alpha channel composited onto black background (#000000) to match `desktop-color`.
|
||||
|
||||
**What was done:** Converted bee-logo.png from RGBA to RGB via ImageMagick.
|
||||
|
||||
**Current file state:**
|
||||
- 400×400 px, 8-bit/color RGB, non-interlaced
|
||||
- Only IHDR + IDAT + IEND chunks (no metadata)
|
||||
- `insmod png` is present in config.cfg
|
||||
- `terminal_output gfxterm` runs before theme is sourced
|
||||
- No explicit `width`/`height` in image block
|
||||
|
||||
**Result:** Error still occurs on real hardware. Despite the PNG being nominally correct
|
||||
(RGB, non-interlaced, minimal chunks), the bitmap load returns null.
|
||||
|
||||
## Confirmed root cause (verified on 172.16.41.94, 2026-04-30)
|
||||
|
||||
The EFI partition (`sda2`, vfat, 5 MB) contains only:
|
||||
```
|
||||
/EFI/boot/bootia32.efi
|
||||
/EFI/boot/bootx64.efi
|
||||
/EFI/boot/grubx64.efi
|
||||
/boot/grub/grub.cfg
|
||||
```
|
||||
|
||||
`config.cfg`, `theme.cfg`, and the entire `live-theme/` directory (including `bee-logo.png`)
|
||||
are **absent from the EFI image**. `live-build`'s `lb binary_grub-efi` stage is not
|
||||
copying these files. GRUB boots, sources only `grub.cfg`, then fails to load the theme
|
||||
because the file does not exist — returning a null bitmap regardless of PNG format.
|
||||
|
||||
All four fix attempts were targeting the wrong layer (PNG format/content).
|
||||
|
||||
## Fix (applied 2026-04-30)
|
||||
|
||||
Switched from PNG to TGA format:
|
||||
|
||||
1. Converted `bee-logo.png` → `bee-logo.tga` (24-bit uncompressed BGR, top-left origin,
|
||||
480018 bytes). Conversion done via Python stdlib (no external tools needed).
|
||||
2. `config.cfg`: `insmod png` → `insmod tga`
|
||||
3. `theme.txt`: `file = "bee-logo.png"` → `file = "bee-logo.tga"`
|
||||
|
||||
**Why TGA works:** GRUB's TGA reader (`tga.mod`) handles uncompressed 24-bit images
|
||||
trivially — no decompression, no complex chunk parsing. The module is present on-disk
|
||||
(`x86_64-efi/tga.mod`). PNG was failing despite a valid file; the exact GRUB bug is
|
||||
unknown but the PNG reader in Debian bookworm's grub2 is known to be fragile.
|
||||
|
||||
The old `bee-logo.png` is kept in the tree (may be useful for other tools) but is no
|
||||
longer referenced by the theme.
|
||||
|
||||
## Relevant files
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `iso/builder/config/bootloaders/grub-efi/config.cfg` | `insmod tga` (formerly `insmod png`), gfxterm init, theme source |
|
||||
| `iso/builder/config/bootloaders/grub-efi/theme.cfg` | sets `theme=` path |
|
||||
| `iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt` | image component definition |
|
||||
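| `iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png` | the original logo PNG; kept in the tree but no longer referenced by the theme, which now loads `bee-logo.tga` from the same directory |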
|
||||
+2
-2
@@ -31,10 +31,10 @@ Build with explicit SSH keys baked into the ISO:
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
Force a clean rebuild of the builder image and build caches:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
sh iso/builder/build-in-container.sh --clean-build
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
@@ -16,6 +16,12 @@ else
|
||||
LB_LINUX_PACKAGES="linux-image"
|
||||
fi
|
||||
|
||||
if [ -n "${BEE_ISO_VOLUME:-}" ]; then
|
||||
LB_ISO_VOLUME="${BEE_ISO_VOLUME}"
|
||||
else
|
||||
LB_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}"
|
||||
fi
|
||||
|
||||
lb config noauto \
|
||||
--distribution bookworm \
|
||||
--architectures amd64 \
|
||||
@@ -30,9 +36,9 @@ lb config noauto \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-volume "${LB_ISO_VOLUME}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--bootappend-live "boot=live live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--debootstrap-options "--include=ca-certificates" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
|
||||
@@ -10,7 +10,6 @@ IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
@@ -22,17 +21,12 @@ while [ $# -gt 0 ]; do
|
||||
CACHE_DIR="$2"
|
||||
shift 2
|
||||
;;
|
||||
--rebuild-image)
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--authorized-keys)
|
||||
AUTH_KEYS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--clean-build)
|
||||
CLEAN_CACHE=1
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
@@ -41,7 +35,7 @@ while [ $# -gt 0 ]; do
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -105,7 +99,7 @@ image_matches_platform() {
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
|
||||
+524
-56
@@ -69,12 +69,27 @@ mkdir -p "${CACHE_ROOT}"
|
||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||
export GOCACHE GOMODCACHE
|
||||
|
||||
resolve_audit_version() {
|
||||
resolve_project_version() {
|
||||
if [ -n "${BEE_VERSION:-}" ]; then
|
||||
echo "${BEE_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ -n "${BEE_AUDIT_VERSION:-}" ] && [ -n "${BEE_ISO_VERSION:-}" ] && [ "${BEE_AUDIT_VERSION}" != "${BEE_ISO_VERSION}" ]; then
|
||||
echo "ERROR: BEE_AUDIT_VERSION (${BEE_AUDIT_VERSION}) and BEE_ISO_VERSION (${BEE_ISO_VERSION}) differ; versioning must stay synchronized" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||
echo "${BEE_AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||
echo "${BEE_ISO_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
v*)
|
||||
@@ -97,33 +112,35 @@ resolve_audit_version() {
|
||||
date +%Y%m%d
|
||||
}
|
||||
|
||||
# ISO image versioned separately from the audit binary (iso/v* tags).
|
||||
resolve_iso_version() {
|
||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||
echo "${BEE_ISO_VERSION}"
|
||||
return 0
|
||||
sync_builder_workdir() {
|
||||
src_dir="$1"
|
||||
dst_dir="$2"
|
||||
|
||||
mkdir -p "$dst_dir"
|
||||
|
||||
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||
rm -rf "$dst_dir/config/bootloaders"
|
||||
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"$src_dir/" "$dst_dir/"
|
||||
|
||||
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||
exit 1
|
||||
fi
|
||||
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Plain v* tags (e.g. v2.7) take priority — this is the current tagging scheme
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Legacy iso/v* tags fallback
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
iso/v*)
|
||||
echo "${tag#iso/v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Fall back to audit version so the name is still meaningful
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
iso_list_files() {
|
||||
@@ -466,6 +483,119 @@ validate_iso_memtest() {
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_live_boot_entries() {
|
||||
iso_path="$1"
|
||||
echo "=== validating live boot entries in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || {
|
||||
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||
exit 1
|
||||
}
|
||||
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
grep -q 'menuentry "EASY-BEE v' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'menuentry "EASY-BEE v.* -- load to RAM (toram)"' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB toram entry is missing" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'linux .*live-media-label=EASY_BEE_' "$grub_cfg" || {
|
||||
echo "ERROR: GRUB live entry is missing live-media-label pinning" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
grep -q 'append .*live-media-label=EASY_BEE_' "$isolinux_cfg" || {
|
||||
echo "ERROR: isolinux live entry is missing live-media-label pinning" >&2
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
exit 1
|
||||
}
|
||||
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
echo "=== live boot validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_grub_assets() {
|
||||
iso_path="$1"
|
||||
echo "=== validating GRUB assets in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || {
|
||||
echo "ERROR: ISO not found for GRUB asset validation: $iso_path" >&2
|
||||
exit 1
|
||||
}
|
||||
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||
echo "ERROR: ISO reader unavailable for GRUB asset validation" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iso_files="$(mktemp)"
|
||||
iso_list_files "$iso_path" > "$iso_files" || {
|
||||
echo "ERROR: failed to list ISO files for GRUB asset validation" >&2
|
||||
rm -f "$iso_files"
|
||||
exit 1
|
||||
}
|
||||
|
||||
for required in \
|
||||
boot/grub/config.cfg \
|
||||
boot/grub/grub.cfg; do
|
||||
grep -q "^${required}$" "$iso_files" || {
|
||||
echo "ERROR: missing GRUB asset in ISO: ${required}" >&2
|
||||
rm -f "$iso_files"
|
||||
exit 1
|
||||
}
|
||||
done
|
||||
|
||||
rm -f "$iso_files"
|
||||
echo "=== GRUB asset validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_nvidia_runtime() {
|
||||
iso_path="$1"
|
||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||
@@ -478,29 +608,37 @@ validate_iso_nvidia_runtime() {
|
||||
|
||||
squashfs_tmp="$(mktemp)"
|
||||
squashfs_list="$(mktemp)"
|
||||
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
||||
}
|
||||
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
||||
iso_files="$(mktemp)"
|
||||
iso_list_files "$iso_path" > "$iso_files" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "failed to list ISO files for NVIDIA runtime validation"
|
||||
}
|
||||
grep '^live/.*\.squashfs$' "$iso_files" | while IFS= read -r squashfs_member; do
|
||||
iso_read_member "$iso_path" "$squashfs_member" "$squashfs_tmp" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "failed to extract $squashfs_member from ISO"
|
||||
}
|
||||
unsquashfs -ll "$squashfs_tmp" >> "$squashfs_list" 2>/dev/null || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "failed to inspect $squashfs_member from ISO"
|
||||
}
|
||||
: > "$squashfs_tmp"
|
||||
done
|
||||
|
||||
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||
}
|
||||
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||
}
|
||||
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||
}
|
||||
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||
echo "=== NVIDIA runtime validation OK ==="
|
||||
}
|
||||
|
||||
@@ -542,6 +680,184 @@ label memtest
|
||||
EOF
|
||||
}
|
||||
|
||||
extract_live_grub_entry() {
|
||||
cfg="$1"
|
||||
live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
[ -n "$live_linux" ] || return 1
|
||||
[ -n "$live_initrd" ] || return 1
|
||||
|
||||
grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
|
||||
grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
|
||||
grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
|
||||
[ -n "$grub_kernel" ] || return 1
|
||||
[ -n "$grub_append" ] || return 1
|
||||
[ -n "$grub_initrd" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
load_live_build_append() {
|
||||
lb_dir="$1"
|
||||
binary_cfg="$lb_dir/config/binary"
|
||||
[ -f "$binary_cfg" ] || return 1
|
||||
|
||||
# config/binary is generated by live-build and contains shell variable
|
||||
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||
# shellcheck disable=SC1090
|
||||
. "$binary_cfg"
|
||||
|
||||
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||
return 0
|
||||
}
|
||||
|
||||
extract_live_isolinux_entry() {
|
||||
cfg="$1"
|
||||
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||
isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
|
||||
[ -n "$isolinux_linux" ] || return 1
|
||||
[ -n "$isolinux_initrd" ] || return 1
|
||||
[ -n "$isolinux_append" ] || return 1
|
||||
|
||||
isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
|
||||
isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
|
||||
[ -n "$isolinux_kernel" ] || return 1
|
||||
[ -n "$isolinux_initrd_path" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
write_canonical_grub_cfg() {
|
||||
cfg="$1"
|
||||
kernel="$2"
|
||||
append_live="$3"
|
||||
initrd="$4"
|
||||
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||
|
||||
cat > "$cfg" <<EOF
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
menuentry "EASY-BEE v${version_label}" {
|
||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE v${version_label} -- load to RAM (toram)" {
|
||||
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE v${version_label} -- no GUI / no X11" {
|
||||
linux ${kernel} ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd ${initrd}
|
||||
}
|
||||
|
||||
if [ "\${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "\${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
EOF
|
||||
}
|
||||
|
||||
write_canonical_isolinux_cfg() {
|
||||
cfg="$1"
|
||||
kernel="$2"
|
||||
initrd="$3"
|
||||
append_live="$4"
|
||||
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||
|
||||
cat > "$cfg" <<EOF
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE v${version_label}
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE v${version_label} (^load to RAM)
|
||||
menu default
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-console
|
||||
menu label EASY-BEE v${version_label} (^no GUI / no X11)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^KMS, no nomodeset)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (KMS, ^GSP=off)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux ${kernel}
|
||||
initrd ${initrd}
|
||||
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
EOF
|
||||
}
|
||||
|
||||
enforce_live_build_bootloader_assets() {
|
||||
lb_dir="$1"
|
||||
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||
grub_dir="$lb_dir/binary/boot/grub"
|
||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||
|
||||
if ! load_live_build_append "$lb_dir"; then
|
||||
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||
live_build_append=""
|
||||
fi
|
||||
|
||||
if [ -f "$grub_cfg" ]; then
|
||||
if extract_live_grub_entry "$grub_cfg"; then
|
||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -f "$isolinux_cfg" ]; then
|
||||
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||
else
|
||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
copy_memtest_from_deb() {
|
||||
deb="$1"
|
||||
dst_boot="$2"
|
||||
@@ -569,6 +885,135 @@ reset_live_build_stage() {
|
||||
done
|
||||
}
|
||||
|
||||
# Marker written after every successful full lb build for this variant
|
||||
FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
||||
|
||||
# Returns 0 if full lb build is needed, 1 if fast-path is safe.
|
||||
# Fast-path is safe when only light files changed since the last full build
|
||||
# (Go source, overlay scripts/configs). Heavy changes (VERSIONS, package lists,
|
||||
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||
needs_full_build() {
|
||||
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||
_extra_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 -type f -name '*.squashfs' ! -name 'filesystem.squashfs' 2>/dev/null | head -1)
|
||||
if [ -n "$_extra_sq" ]; then
|
||||
echo "=== full build required: multi-squashfs live image present ==="
|
||||
return 0
|
||||
fi
|
||||
|
||||
_heavy=$(find \
|
||||
"${BUILDER_DIR}/VERSIONS" \
|
||||
"${BUILDER_DIR}/auto/config" \
|
||||
"${BUILDER_DIR}/Dockerfile" \
|
||||
"${BUILDER_DIR}/config/package-lists" \
|
||||
"${BUILDER_DIR}/config/hooks" \
|
||||
"${BUILDER_DIR}/config/archives" \
|
||||
"${BUILDER_DIR}/config/bootloaders" \
|
||||
-newer "${FULL_BUILD_MARKER}" 2>/dev/null | head -1)
|
||||
|
||||
if [ -n "$_heavy" ]; then
|
||||
echo "=== full build required: heavy config changed: $(basename "$_heavy") ==="
|
||||
return 0
|
||||
fi
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||
fast_path_repack_squashfs() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
||||
rm -rf "$_tmp"
|
||||
unsquashfs -d "$_tmp" "$_sq"
|
||||
echo "=== fast-path: syncing overlay stage ==="
|
||||
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||
echo "=== fast-path: repacking squashfs ==="
|
||||
_sq_new="${_sq}.new"
|
||||
rm -f "$_sq_new"
|
||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
||||
mv "$_sq_new" "$_sq"
|
||||
rm -rf "$_tmp"
|
||||
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||
}
|
||||
|
||||
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
||||
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||
fast_path_rebuild_iso() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||
rm -f "$_new"
|
||||
xorriso \
|
||||
-indev "$_prior" \
|
||||
-outdev "$_new" \
|
||||
-map "$_sq" /live/filesystem.squashfs \
|
||||
-boot_image any replay \
|
||||
-commit
|
||||
mv "$_new" "$_prior"
|
||||
echo "=== fast-path: ISO rebuilt ==="
|
||||
}
|
||||
|
||||
dir_has_entries() {
|
||||
_dir="$1"
|
||||
[ -d "$_dir" ] || return 1
|
||||
find "$_dir" -mindepth 1 -print -quit 2>/dev/null | grep -q .
|
||||
}
|
||||
|
||||
move_tree_to_layer() {
|
||||
_src_root="$1"
|
||||
_rel="$2"
|
||||
_dst_root="$3"
|
||||
[ -e "${_src_root}/${_rel}" ] || return 0
|
||||
mkdir -p "${_dst_root}/$(dirname "$_rel")"
|
||||
mv "${_src_root}/${_rel}" "${_dst_root}/${_rel}"
|
||||
}
|
||||
|
||||
split_live_squashfs_layers() {
|
||||
lb_dir="$1"
|
||||
live_dir="${lb_dir}/binary/live"
|
||||
base_sq="${live_dir}/filesystem.squashfs"
|
||||
usr_sq="${live_dir}/10-usr.squashfs"
|
||||
fw_sq="${live_dir}/20-firmware.squashfs"
|
||||
|
||||
[ -f "$base_sq" ] || return 0
|
||||
command -v unsquashfs >/dev/null 2>&1 || return 0
|
||||
command -v mksquashfs >/dev/null 2>&1 || return 0
|
||||
|
||||
tmp_root="$(mktemp -d)"
|
||||
tmp_usr="$(mktemp -d)"
|
||||
tmp_fw="$(mktemp -d)"
|
||||
|
||||
echo "=== splitting live squashfs into smaller layers ==="
|
||||
unsquashfs -d "$tmp_root/root" "$base_sq" >/dev/null
|
||||
mkdir -p "$tmp_usr/root" "$tmp_fw/root"
|
||||
|
||||
move_tree_to_layer "$tmp_root/root" "usr" "$tmp_usr/root"
|
||||
move_tree_to_layer "$tmp_root/root" "lib/firmware" "$tmp_fw/root"
|
||||
move_tree_to_layer "$tmp_root/root" "usr/lib/firmware" "$tmp_fw/root"
|
||||
move_tree_to_layer "$tmp_root/root" "boot/firmware" "$tmp_fw/root"
|
||||
|
||||
rm -f "$usr_sq" "$fw_sq"
|
||||
mksquashfs "$tmp_root/root" "${base_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mv "${base_sq}.new" "$base_sq"
|
||||
|
||||
if dir_has_entries "$tmp_usr/root"; then
|
||||
mksquashfs "$tmp_usr/root" "${usr_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mv "${usr_sq}.new" "$usr_sq"
|
||||
fi
|
||||
if dir_has_entries "$tmp_fw/root"; then
|
||||
mksquashfs "$tmp_fw/root" "${fw_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mv "${fw_sq}.new" "$fw_sq"
|
||||
fi
|
||||
|
||||
echo "=== live squashfs layers ==="
|
||||
find "$live_dir" -maxdepth 1 -type f -name '*.squashfs' -exec du -sh {} \; | sort
|
||||
rm -rf "$tmp_root" "$tmp_usr" "$tmp_fw"
|
||||
}
|
||||
|
||||
recover_iso_memtest() {
|
||||
lb_dir="$1"
|
||||
iso_path="$2"
|
||||
@@ -646,11 +1091,11 @@ recover_iso_memtest() {
|
||||
fi
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
|
||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
|
||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
||||
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
|
||||
mkdir -p "${OUT_DIR}"
|
||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
@@ -826,7 +1271,7 @@ fi
|
||||
|
||||
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo "Project version: ${PROJECT_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
run_step "sync git submodules" "05-git-submodules" \
|
||||
@@ -846,7 +1291,7 @@ if [ "$NEED_BUILD" = "1" ]; then
|
||||
"cd '${REPO_ROOT}/audit' && \
|
||||
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags '-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}' \
|
||||
-ldflags '-s -w -X main.Version=${PROJECT_VERSION_EFFECTIVE}' \
|
||||
-o '${BEE_BIN}' \
|
||||
./cmd/bee"
|
||||
echo "binary: $BEE_BIN"
|
||||
@@ -932,15 +1377,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
@@ -1129,8 +1566,10 @@ else
|
||||
fi
|
||||
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||
export BEE_VERSION
|
||||
BEE_ISO_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
@@ -1216,19 +1655,45 @@ if [ -f "${LB_INCLUDES}/root/.ssh/authorized_keys" ]; then
|
||||
chmod 600 "${LB_INCLUDES}/root/.ssh/authorized_keys"
|
||||
fi
|
||||
|
||||
# --- auto fast-path: squashfs surgery if only light files changed ---
|
||||
if ! needs_full_build; then
|
||||
echo "=== fast-path build (no heavy config changes since last full build) ==="
|
||||
fast_path_repack_squashfs
|
||||
fast_path_rebuild_iso
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_grub_assets "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BUILD_VARIANT}, fast-path) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
BEE_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V${ISO_VERSION_LABEL_TOKEN}"
|
||||
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
|
||||
|
||||
cd "${LB_DIR}"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
split_live_squashfs_layers "${LB_DIR}"
|
||||
echo "=== enforcing canonical bootloader assets ==="
|
||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
@@ -1253,8 +1718,11 @@ if [ -f "$ISO_RAW" ]; then
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
validate_iso_live_boot_entries "$ISO_RAW"
|
||||
validate_iso_grub_assets "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
touch "${FULL_BUILD_MARKER}"
|
||||
echo ""
|
||||
echo "=== done (${BUILD_VARIANT}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
set default=0
|
||||
set timeout=5
|
||||
set default=1
|
||||
set timeout=10
|
||||
set color_normal=yellow/black
|
||||
set color_highlight=white/brown
|
||||
|
||||
if [ x$feature_default_font_path = xy ] ; then
|
||||
font=unicode
|
||||
@@ -8,7 +10,7 @@ else
|
||||
fi
|
||||
|
||||
if loadfont $font ; then
|
||||
set gfxmode=1920x1080,1280x1024,auto
|
||||
set gfxmode=1280x1024,auto
|
||||
set gfxpayload=keep
|
||||
insmod efi_gop
|
||||
insmod efi_uga
|
||||
@@ -23,9 +25,6 @@ insmod serial
|
||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||
|
||||
insmod gfxterm
|
||||
insmod png
|
||||
|
||||
source /boot/grub/theme.cfg
|
||||
|
||||
terminal_input console serial
|
||||
terminal_output gfxterm serial
|
||||
|
||||
@@ -1,47 +1,21 @@
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
menuentry "EASY-BEE v@VERSION@" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
menuentry "EASY-BEE v@VERSION@ -- load to RAM (toram)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE v@VERSION@ -- no GUI / no X11" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 77 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 469 KiB |
@@ -5,15 +5,6 @@ title-text: ""
|
||||
message-font: "Unifont Regular 16"
|
||||
terminal-font: "Unifont Regular 16"
|
||||
|
||||
#bee logo — centered, upper third of screen
|
||||
+ image {
|
||||
top = 4%
|
||||
left = 50%-200
|
||||
width = 400
|
||||
height = 400
|
||||
file = "bee-logo.png"
|
||||
}
|
||||
|
||||
#help bar at the bottom
|
||||
+ label {
|
||||
top = 100%-50
|
||||
@@ -36,11 +27,11 @@ terminal-font: "Unifont Regular 16"
|
||||
item_font = "Unifont Regular 16"
|
||||
selected_item_color= "#f5a800"
|
||||
selected_item_font = "Unifont Regular 16"
|
||||
item_height = 16
|
||||
item_padding = 0
|
||||
item_height = 20
|
||||
item_padding = 2
|
||||
item_spacing = 4
|
||||
icon_width = 0
|
||||
icon_heigh = 0
|
||||
icon_height = 0
|
||||
item_icon_space = 0
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
set color_normal=light-gray/black
|
||||
set color_highlight=yellow/black
|
||||
|
||||
if [ -e /boot/grub/splash.png ]; then
|
||||
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||
set theme=/boot/grub/live-theme/theme.txt
|
||||
else
|
||||
set menu_color_normal=yellow/black
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
menu label ^EASY-BEE v@VERSION@
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
menu label EASY-BEE v@VERSION@ (^load to RAM)
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-console
|
||||
menu label EASY-BEE v@VERSION@ (^no GUI / no X11)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
|
||||
@@ -31,6 +31,7 @@ systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-blackbox.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
@@ -66,6 +67,7 @@ chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
|
||||
@@ -47,18 +47,27 @@ vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
btop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
memtester
|
||||
stress-ng
|
||||
stressapptest
|
||||
|
||||
# QR codes (for displaying audit results)
|
||||
qrencode
|
||||
fio
|
||||
iperf3
|
||||
iotop
|
||||
nload
|
||||
tcpdump
|
||||
hdparm
|
||||
sysstat
|
||||
lsscsi
|
||||
sg3-utils
|
||||
jq
|
||||
curl
|
||||
net-tools
|
||||
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
gparted
|
||||
openbox
|
||||
tint2
|
||||
feh
|
||||
|
||||
@@ -1,11 +1,4 @@
|
||||
|
||||
███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝
|
||||
|
||||
EASY BEE
|
||||
Hardware Audit LiveCD
|
||||
Build: %%BUILD_INFO%%
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||
After=bee-preflight.service bee-nvidia.service bee-blackbox.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=Bee: USB black-box log mirror
|
||||
After=local-fs.target
|
||||
Before=bee-network.service bee-nvidia.service bee-preflight.service bee-audit.service bee-web.service
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-blackbox.log /usr/local/bin/bee blackbox --export-dir /appdata/bee/export --state-file /appdata/bee/export/blackbox-state.json
|
||||
Restart=always
|
||||
RestartSec=1
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
OOMScoreAdjust=-900
|
||||
Nice=0
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -1,7 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: bring up network interfaces via DHCP
|
||||
After=local-fs.target
|
||||
Before=network-online.target bee-audit.service
|
||||
After=bee-web.service bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||
After=local-fs.target udev.service
|
||||
After=local-fs.target udev.service bee-blackbox.service
|
||||
Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: runtime preflight self-check
|
||||
After=bee-network.service bee-nvidia.service
|
||||
After=bee-nvidia.service bee-blackbox.service
|
||||
Before=bee-audit.service
|
||||
|
||||
[Service]
|
||||
|
||||
@@ -3,7 +3,7 @@ Description=Bee: run self-heal checks periodically
|
||||
|
||||
[Timer]
|
||||
OnBootSec=45sec
|
||||
OnUnitActiveSec=60sec
|
||||
OnUnitActiveSec=3min
|
||||
AccuracySec=15sec
|
||||
Unit=bee-selfheal.service
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
After=bee-blackbox.service
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
@@ -10,7 +11,8 @@ RestartSec=3
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
LimitMEMLOCK=infinity
|
||||
MemoryMax=3G
|
||||
# No MemoryMax: bee-web spawns GPU test subprocesses (dcgmproftester etc.)
|
||||
# that legitimately use several GB; a cgroup limit kills them via OOM.
|
||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||
Nice=0
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
[Service]
|
||||
ExecCondition=/usr/local/bin/bee-gui-gate
|
||||
@@ -51,12 +51,7 @@ while true; do
|
||||
printf '\033[H\033[2J'
|
||||
|
||||
printf '\n'
|
||||
printf ' \033[33m███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗\033[0m\n'
|
||||
printf ' \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝\033[0m\n'
|
||||
printf ' \033[33m█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗\033[0m\n'
|
||||
printf ' \033[33m██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝\033[0m\n'
|
||||
printf ' \033[33m███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗\033[0m\n'
|
||||
printf ' \033[33m╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
|
||||
printf ' \033[33mEASY BEE\033[0m\n'
|
||||
printf ' Hardware Audit LiveCD\n'
|
||||
printf '\n'
|
||||
|
||||
|
||||
Executable
+27
@@ -0,0 +1,27 @@
|
||||
#!/bin/sh
|
||||
# bee-gui-gate — skip starting the local GUI when bee.gui=off is set.
|
||||
|
||||
set -eu
|
||||
|
||||
# cmdline_param KEY
# Print the value of the first KEY=value token found in /proc/cmdline and
# return 0; return 1 (printing nothing) when no such token exists.
cmdline_param() {
    key="$1"
    # Unquoted on purpose: the kernel command line is split on whitespace.
    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            "$key"=*)
                # Strip everything up to and including the first '='.
                echo "${token#*=}"
                return 0
                ;;
            *)
                ;;
        esac
    done
    return 1
}
|
||||
|
||||
# Read bee.gui from the kernel command line; a missing parameter yields an
# empty mode (the "|| true" keeps "set -e" from aborting on the lookup miss).
mode="$(cmdline_param bee.gui || true)"

# Any of the "no GUI" spellings makes the gate exit 1; everything else
# (including an empty or unknown value) exits 0.
case "${mode}" in
    off|false|0|tty|console|text|nogui)
        echo "bee-gui-gate: bee.gui=${mode}; skipping lightdm"
        exit 1
        ;;
    *)
        exit 0
        ;;
esac
|
||||
@@ -8,7 +8,7 @@
|
||||
# Layout (UEFI): GPT, /dev/sdX1=EFI 512MB vfat, /dev/sdX2=root ext4
|
||||
# Layout (BIOS): MBR, /dev/sdX1=root ext4
|
||||
#
|
||||
# Squashfs source: /run/live/medium/live/filesystem.squashfs
|
||||
# Squashfs sources: /run/live/medium/live/*.squashfs
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -62,9 +62,9 @@ for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
||||
fi
|
||||
done
|
||||
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
||||
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||
echo "ERROR: no squashfs files found under /run/live/medium/live" >&2
|
||||
echo " The live medium may have been disconnected." >&2
|
||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||
echo " Then re-run bee-install." >&2
|
||||
@@ -106,7 +106,10 @@ log "=== BEE DISK INSTALLER ==="
|
||||
log "Target device : $DEVICE"
|
||||
log "Root partition: $PART_ROOT"
|
||||
[ "$UEFI" = "1" ] && log "EFI partition : $PART_EFI"
|
||||
log "Squashfs : $SQUASHFS ($(du -sh "$SQUASHFS" | cut -f1))"
|
||||
log "Squashfs : ${#SQUASHFS_FILES[@]} layer(s)"
|
||||
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||
log " - $sf ($(du -sh "$sf" | cut -f1))"
|
||||
done
|
||||
log "Log : $LOGFILE"
|
||||
log ""
|
||||
|
||||
@@ -163,7 +166,9 @@ log " Mounted."
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||
log " Source: $SQUASHFS"
|
||||
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||
log " Source: $sf"
|
||||
done
|
||||
log " Target: $MOUNT_ROOT"
|
||||
|
||||
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||
@@ -177,9 +182,9 @@ while true; do
|
||||
fi
|
||||
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||
|
||||
# Re-check squashfs is reachable before each attempt
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
log " SOURCE LOST: $SQUASHFS not found."
|
||||
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||
log " SOURCE LOST: no squashfs files found under /run/live/medium/live."
|
||||
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||
log " then press Enter here to retry."
|
||||
read -r _
|
||||
@@ -194,12 +199,17 @@ while true; do
|
||||
fi
|
||||
|
||||
UNPACK_OK=0
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||
log " Unpacking $(basename "$sf") ..."
|
||||
unsquashfs -f -d "$MOUNT_ROOT" "$sf" 2>&1 | \
|
||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||
[ "$UNPACK_OK" -eq 0 ] || break
|
||||
done
|
||||
|
||||
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||
read -r _
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
#!/bin/sh
|
||||
# bee-network.sh — bring up all physical network interfaces via DHCP
|
||||
# Unattended: runs silently, logs results, never blocks.
|
||||
# Unattended: starts later in boot, runs quietly, and gives up after a bounded timeout.
|
||||
|
||||
LOG_PREFIX="bee-network"
|
||||
DHCP_TIMEOUT_SECS=300
|
||||
|
||||
log() { echo "[$LOG_PREFIX] $*"; }
|
||||
|
||||
@@ -19,9 +20,50 @@ if command -v udevadm >/dev/null 2>&1; then
|
||||
udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
|
||||
fi
|
||||
|
||||
# start_dhcp IFACE
# Bring IFACE up and try to obtain an IPv4 lease with a single bounded
# dhclient run (limit: DHCP_TIMEOUT_SECS seconds, enforced by timeout(1)).
# Returns 0 when dhclient reports success, 1 otherwise.
start_dhcp() {
    iface="$1"
    if ! ip link set "$iface" up; then
        log "WARN: could not bring up $iface"
        return 1
    fi

    # Carrier state is only logged; DHCP is attempted either way.
    carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
    if [ "$carrier" = "1" ]; then
        log "carrier detected on $iface"
    else
        log "carrier not detected on $iface"
    fi

    # Drop any stale lease/client before starting a fresh attempt.
    dhclient -r "$iface" >/dev/null 2>&1 || true

    # Capture the exit status on the command itself. Reading $? after a closed
    # "if ...; fi" whose condition failed always yields 0, which would hide
    # both the real dhclient status and timeout's 124.
    rc=0
    timeout "${DHCP_TIMEOUT_SECS}" dhclient -4 -q -1 "$iface" >/dev/null 2>&1 || rc=$?

    if [ "$rc" -eq 0 ]; then
        addr="$(ip -4 -o addr show dev "$iface" scope global 2>/dev/null | awk '{print $4}' | head -1)"
        if [ -n "$addr" ]; then
            log "DHCP lease acquired on $iface ($addr)"
        else
            log "DHCP lease acquired on $iface"
        fi
        return 0
    fi

    case "$rc" in
        124)
            # 124 is the status timeout(1) uses when the time limit expired.
            log "DHCP timed out on $iface after ${DHCP_TIMEOUT_SECS}s"
            ;;
        *)
            log "DHCP failed on $iface (exit $rc)"
            ;;
    esac
    # Release again so no half-configured client state is left behind.
    dhclient -r "$iface" >/dev/null 2>&1 || true
    return 1
}
|
||||
|
||||
started_ifaces=""
|
||||
started_count=0
|
||||
scan_pass=1
|
||||
pids=""
|
||||
pid_ifaces=""
|
||||
|
||||
# Some server NICs appear a bit later after module/firmware init. Do a small
|
||||
# bounded rescan window without turning network bring-up into a boot blocker.
|
||||
@@ -34,22 +76,11 @@ while [ "$scan_pass" -le 3 ]; do
|
||||
*" $iface "*) continue ;;
|
||||
esac
|
||||
|
||||
log "bringing up $iface"
|
||||
if ! ip link set "$iface" up; then
|
||||
log "WARN: could not bring up $iface"
|
||||
continue
|
||||
fi
|
||||
|
||||
carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
|
||||
if [ "$carrier" = "1" ]; then
|
||||
log "carrier detected on $iface"
|
||||
else
|
||||
log "carrier not detected yet on $iface"
|
||||
fi
|
||||
|
||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
||||
dhclient -4 -v -nw "$iface" &
|
||||
log "DHCP started for $iface (pid $!)"
|
||||
log "starting DHCP on $iface (timeout ${DHCP_TIMEOUT_SECS}s)"
|
||||
start_dhcp "$iface" &
|
||||
pid="$!"
|
||||
pids="$pids $pid"
|
||||
pid_ifaces="$pid_ifaces $pid:$iface"
|
||||
|
||||
started_ifaces="$started_ifaces $iface"
|
||||
started_count=$((started_count + 1))
|
||||
@@ -68,4 +99,15 @@ if [ "$started_count" -eq 0 ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log "done (interfaces started: $started_count)"
|
||||
success_count=0
|
||||
for pid_iface in $pid_ifaces; do
|
||||
pid="${pid_iface%%:*}"
|
||||
iface="${pid_iface#*:}"
|
||||
if wait "$pid"; then
|
||||
success_count=$((success_count + 1))
|
||||
else
|
||||
log "DHCP did not complete successfully on $iface"
|
||||
fi
|
||||
done
|
||||
|
||||
log "done (interfaces scanned: $started_count, leases acquired: $success_count)"
|
||||
|
||||
Executable
+326
@@ -0,0 +1,326 @@
|
||||
#!/bin/sh
|
||||
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||
|
||||
set -u
|
||||
|
||||
# log MESSAGE...
# Write the arguments as one line on stdout, tagged with the script name.
log() {
    echo "[bee-nvidia-recover] $*"
}
|
||||
|
||||
# log_blocker MESSAGE...
# Like log, but marks the line as describing something that holds the GPU.
log_blocker() {
    echo "[bee-nvidia-recover] blocker: $*"
}
|
||||
|
||||
# usage — print the command-line synopsis on stdout (callers redirect to stderr).
usage() {
|
||||
cat <<'EOF'
|
||||
usage:
|
||||
bee-nvidia-recover restart-drivers
|
||||
bee-nvidia-recover reset-gpu <index>
|
||||
EOF
|
||||
}
|
||||
|
||||
# unit_exists UNIT
# Succeeds when "systemctl cat UNIT" succeeds; all output is discarded.
unit_exists() {
    systemctl cat "$1" >/dev/null 2>&1
}
|
||||
|
||||
# unit_is_active UNIT
# Succeeds when "systemctl is-active" reports UNIT as active.
unit_is_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
}
|
||||
|
||||
# stop_unit_if_active UNIT
# Stop UNIT if it is currently active.
# Returns 0 when a stop was issued, 1 when the unit was not active.
stop_unit_if_active() {
    unit="$1"
    # Nothing to do for an inactive unit.
    unit_is_active "$unit" || return 1

    log "stopping $unit"
    systemctl stop "$unit"
    return 0
}
|
||||
|
||||
# start_unit_if_marked UNIT MARKER
# Start UNIT only when MARKER is "1" and the unit exists. Returns 0 when
# nothing was started, otherwise the status of "systemctl start".
start_unit_if_marked() {
    unit="$1"
    marker="$2"
    [ "$marker" = "1" ] || return 0
    unit_exists "$unit" || return 0

    log "starting $unit"
    systemctl start "$unit"
}
|
||||
|
||||
# wait_for_process_exit NAME
# Poll once per second until no process named exactly NAME remains.
# Returns 0 once it is gone, 1 (with a warning) after the 15th check that
# still finds it running.
wait_for_process_exit() {
    name="$1"
    tries=1
    while pgrep -x "$name" >/dev/null 2>&1; do
        if [ "$tries" -ge 15 ]; then
            log "WARN: $name is still running after stop request"
            return 1
        fi
        tries=$((tries + 1))
        sleep 1
    done
    return 0
}
|
||||
|
||||
# log_pid_details PID
# Log PID as a blocker, with its command name and arguments when ps can still
# see the process, or just the bare PID otherwise.
log_pid_details() {
    pid="$1"
    line=$(ps -p "$pid" -o pid=,comm=,args= 2>/dev/null | sed 's/^[[:space:]]*//')
    # Fall back to the plain PID when ps produced nothing.
    log_blocker "${line:-pid $pid}"
}
|
||||
|
||||
# collect_gpu_compute_pids INDEX
# Print, one per line, the PIDs nvidia-smi lists as compute apps on GPU INDEX.
# Prints nothing (and succeeds) when nvidia-smi is unavailable or finds none.
collect_gpu_compute_pids() {
    index="$1"
    command -v nvidia-smi >/dev/null 2>&1 || return 0

    # Trim surrounding whitespace and keep purely numeric lines only.
    nvidia-smi --id="$index" \
        --query-compute-apps=pid \
        --format=csv,noheader,nounits 2>/dev/null \
        | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
        | grep -E '^[0-9]+$' || true
}
|
||||
|
||||
# collect_gpu_device_pids INDEX
# Print, one per line, the PIDs that fuser reports for /dev/nvidiaINDEX.
# Prints nothing (and succeeds) when the node or fuser is missing.
collect_gpu_device_pids() {
    index="$1"
    dev="/dev/nvidia$index"
    [ -e "$dev" ] || return 0
    command -v fuser >/dev/null 2>&1 || return 0

    # One token per line; drop any non-digit suffix; keep numeric entries only.
    fuser "$dev" 2>/dev/null \
        | tr ' ' '\n' \
        | sed 's/[^0-9].*$//' \
        | grep -E '^[0-9]+$' || true
}
|
||||
|
||||
# collect_gpu_holder_pids INDEX
# Print the sorted, de-duplicated union of compute-app PIDs and device-node
# holder PIDs for GPU INDEX, one per line.
collect_gpu_holder_pids() {
    index="$1"
    (
        collect_gpu_compute_pids "$index"
        collect_gpu_device_pids "$index"
    ) | awk 'NF' | sort -u
}
|
||||
|
||||
# kill_pid_list PIDS
# PIDS is a whitespace/newline separated list. Each PID is logged as a
# blocker, sent SIGTERM, and — if still alive one second later — SIGKILL.
# An empty list is a no-op.
kill_pid_list() {
    pids="$1"
    if [ -z "$pids" ]; then
        return 0
    fi

    # Record who is being killed before any signal is sent.
    for pid in $pids; do
        log_pid_details "$pid"
    done
    log "terminating GPU holder PIDs: $(echo "$pids" | tr '\n' ' ' | sed 's/[[:space:]]*$//')"

    for pid in $pids; do
        kill -TERM "$pid" >/dev/null 2>&1 || true
    done

    # Grace period, then force whatever ignored SIGTERM.
    sleep 1
    for pid in $pids; do
        kill -0 "$pid" >/dev/null 2>&1 || continue
        log "forcing GPU holder PID $pid to exit"
        kill -KILL "$pid" >/dev/null 2>&1 || true
    done
}
|
||||
|
||||
# gpu_has_display_holders INDEX
# Returns 0 when at least one process holding /dev/nvidiaINDEX is named
# Xorg, Xwayland, X or gnome-shell; returns 1 otherwise.
gpu_has_display_holders() {
    index="$1"
    holders=$(collect_gpu_device_pids "$index")
    if [ -z "$holders" ]; then
        return 1
    fi

    for pid in $holders; do
        comm=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
        case "$comm" in
            Xorg|Xwayland|X|gnome-shell) return 0 ;;
        esac
    done
    return 1
}
|
||||
|
||||
# stop_nv_hostengine_if_running
# If an nv-hostengine process exists: log it as a blocker, send SIGTERM, wait
# for it to exit (SIGKILL as a fallback) and set hostengine_was_active=1 so
# restore_gpu_clients can start it again.
# Returns 0 when a stop was performed, 1 when nothing was running.
stop_nv_hostengine_if_running() {
    pgrep -x nv-hostengine >/dev/null 2>&1 || return 1

    # List with the same matcher as the guard (-x on the process name).
    # An anchored -f pattern is compared against the full command line and
    # misses a daemon started with a path or arguments, so it would be killed
    # without ever appearing in the blocker log.
    pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
        [ -n "$line" ] || continue
        log_blocker "$line"
    done

    log "stopping nv-hostengine"
    pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
    wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    hostengine_was_active=1
    return 0
}
|
||||
|
||||
# stop_fabricmanager_if_active
# Stop nvidia-fabricmanager.service when it exists and is active, log it as a
# blocker and set fabric_was_active=1.
# Returns 0 when it was stopped, 1 otherwise.
stop_fabricmanager_if_active() {
    unit_exists nvidia-fabricmanager.service || return 1
    stop_unit_if_active nvidia-fabricmanager.service || return 1

    log_blocker "service nvidia-fabricmanager.service"
    fabric_was_active=1
    return 0
}
|
||||
|
||||
# stop_display_stack_if_active
# Stop display-manager.service and lightdm.service where they exist and are
# active; each stopped unit is logged as a blocker and display_was_active is
# set to 1. Returns 0 when at least one unit was stopped, 1 otherwise.
stop_display_stack_if_active() {
    stopped=1
    for unit in display-manager.service lightdm.service; do
        unit_exists "$unit" || continue
        stop_unit_if_active "$unit" || continue
        log_blocker "service $unit"
        display_was_active=1
        stopped=0
    done
    return "$stopped"
}
|
||||
|
||||
# try_gpu_reset INDEX
# Run "nvidia-smi -r" against GPU INDEX; the exit status is nvidia-smi's.
try_gpu_reset() {
    index="$1"
    log "resetting GPU $index"
    nvidia-smi -r -i "$index"
}
|
||||
|
||||
# drain_gpu_clients
# Stop everything that may hold an NVIDIA device: nv-hostengine, the fabric
# manager, the display stack, and finally any process that still has a
# /dev/nvidiaN node open. The *_was_active markers are reset first and then
# set by the stop helpers, so restore_gpu_clients knows what to bring back.
drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0
    hostengine_was_active=0

    # Reuse the dedicated helpers (the same ones reset_gpu calls) instead of
    # carrying a second copy of their bodies here. Each returns 1 when there
    # was nothing to stop, which is not an error for a full drain.
    stop_nv_hostengine_if_running || true
    stop_fabricmanager_if_active || true
    stop_display_stack_if_active || true

    for dev in /dev/nvidia[0-9]*; do
        # An unmatched glob stays literal; skip it.
        [ -e "$dev" ] || continue
        holders=$(collect_gpu_device_pids "${dev#/dev/nvidia}")
        kill_pid_list "$holders"
    done
}
|
||||
|
||||
# restore_gpu_clients
# Re-enable persistence mode (when nvidia-smi is available) and restart the
# services whose *_was_active marker is "1": nv-hostengine, the fabric
# manager, display-manager and — if still inactive afterwards — lightdm.
restore_gpu_clients() {
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        :
    elif nvidia-smi -pm 1 >/dev/null 2>&1; then
        log "enabled NVIDIA persistence mode"
    else
        log "WARN: failed to enable NVIDIA persistence mode"
    fi

    # Only relaunch the host engine if it was stopped earlier and is not
    # already running again.
    if [ "${hostengine_was_active:-0}" = "1" ] && command -v nv-hostengine >/dev/null 2>&1; then
        if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
            log "starting nv-hostengine"
            nv-hostengine
        fi
    fi

    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
    start_unit_if_marked display-manager.service "${display_was_active:-0}"

    # lightdm is started separately when display-manager did not bring it up.
    if [ "${display_was_active:-0}" = "1" ]; then
        if unit_exists lightdm.service && ! unit_is_active lightdm.service; then
            start_unit_if_marked lightdm.service "1"
        fi
    fi
}
|
||||
|
||||
# restart_drivers
# Drain all GPU clients, unload the NVIDIA kernel modules that are loaded,
# remove the device nodes, run bee-nvidia-load, then restore the clients that
# were stopped. Returns the exit status of bee-nvidia-load so a failed reload
# is not reported as success; clients are restored either way.
restart_drivers() {
    drain_gpu_clients

    # Unload in this order: nvidia_uvm, nvidia_drm, nvidia_modeset, nvidia.
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            if ! rmmod "$mod"; then
                # Keep going (best effort), but leave a trace in the log.
                log "WARN: failed to unload module $mod"
            fi
        fi
    done

    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true

    log "reloading NVIDIA driver stack"
    rc=0
    /usr/local/bin/bee-nvidia-load || rc=$?
    if [ "$rc" -ne 0 ]; then
        log "WARN: bee-nvidia-load failed (exit $rc)"
    fi

    restore_gpu_clients
    return "$rc"
}
|
||||
|
||||
# reset_gpu_after_kill INDEX
# Private helper for reset_gpu: kill whatever currently holds GPU INDEX, then
# attempt the reset. The exit status is that of try_gpu_reset.
reset_gpu_after_kill() {
    holders=$(collect_gpu_holder_pids "$1")
    if [ -n "$holders" ]; then
        kill_pid_list "$holders"
    fi
    try_gpu_reset "$1"
}

# reset_gpu INDEX
# Reset a single GPU, escalating only as far as needed. After each stage the
# remaining holders are killed and the reset is retried:
#   1. no services stopped
#   2. nv-hostengine stopped
#   3. fabric manager stopped
#   4. display stack stopped (only if a display process holds the GPU)
#   5. one final kill + reset attempt
# Stopped clients are restored before returning. Returns 0 on a successful
# reset, otherwise the status of the last nvidia-smi reset attempt.
reset_gpu() {
    index="$1"
    display_was_active=0
    fabric_was_active=0
    hostengine_was_active=0

    if reset_gpu_after_kill "$index"; then
        restore_gpu_clients
        return 0
    fi

    stop_nv_hostengine_if_running || true
    if reset_gpu_after_kill "$index"; then
        restore_gpu_clients
        return 0
    fi

    stop_fabricmanager_if_active || true
    if reset_gpu_after_kill "$index"; then
        restore_gpu_clients
        return 0
    fi

    if gpu_has_display_holders "$index"; then
        stop_display_stack_if_active || true
        if reset_gpu_after_kill "$index"; then
            restore_gpu_clients
            return 0
        fi
    fi

    # Last resort: report the stragglers, kill them and try once more.
    holders=$(collect_gpu_holder_pids "$index")
    if [ -n "$holders" ]; then
        log "GPU $index still has holders after targeted drain"
        kill_pid_list "$holders"
    fi
    try_gpu_reset "$index"
    rc=$?
    restore_gpu_clients
    return "$rc"
}
|
||||
|
||||
# Command dispatch. An unknown or missing command, or a wrong argument count
# for reset-gpu, prints the usage text on stderr and exits with status 2.
cmd="${1:-}"
case "$cmd" in
    restart-drivers)
        restart_drivers
        ;;
    reset-gpu)
        # Exactly one argument (the GPU index) must follow the command.
        [ "$#" -eq 2 ] || { usage >&2; exit 2; }
        reset_gpu "$2"
        ;;
    *)
        usage >&2
        exit 2
        ;;
esac
|
||||
@@ -2,7 +2,7 @@
|
||||
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||
#
|
||||
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
||||
# was lost and /run/live/medium/live/*.squashfs are missing.
|
||||
#
|
||||
# Usage: bee-remount-medium [--wait]
|
||||
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||
@@ -11,7 +11,7 @@
|
||||
set -euo pipefail
|
||||
|
||||
MEDIUM_DIR="/run/live/medium"
|
||||
SQUASHFS_REL="live/filesystem.squashfs"
|
||||
SQUASHFS_GLOB="live/*.squashfs"
|
||||
WAIT_MODE=0
|
||||
|
||||
for arg in "$@"; do
|
||||
@@ -28,6 +28,10 @@ done
|
||||
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||
die() { log "ERROR: $*" >&2; exit 1; }
|
||||
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
die "bee-remount-medium must be run as root (use sudo or a root shell)"
|
||||
fi
|
||||
|
||||
# Return all candidate block devices (optical + removable USB mass storage)
|
||||
find_candidates() {
|
||||
# CD/DVD drives
|
||||
@@ -52,7 +56,7 @@ try_mount() {
|
||||
local tmpdir
|
||||
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
||||
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
||||
if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
|
||||
if find "${tmpdir}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | grep -q .; then
|
||||
# Unmount probe mount and mount properly onto live path
|
||||
umount "$tmpdir" 2>/dev/null || true
|
||||
rmdir "$tmpdir" 2>/dev/null || true
|
||||
@@ -78,8 +82,9 @@ attempt() {
|
||||
for dev in $(find_candidates); do
|
||||
log " Trying $dev ..."
|
||||
if try_mount "$dev"; then
|
||||
local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
|
||||
log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
|
||||
local count
|
||||
count=$(find "${MEDIUM_DIR}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | wc -l | tr -d ' ')
|
||||
log "SUCCESS: ${count} squashfs layer(s) available under ${MEDIUM_DIR}/live"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
@@ -96,5 +101,5 @@ if [ "$WAIT_MODE" = "1" ]; then
|
||||
sleep 5
|
||||
done
|
||||
else
|
||||
attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
|
||||
attempt || die "No ISO medium with ${SQUASHFS_GLOB} found. Reconnect the disc and re-run, or use --wait."
|
||||
fi
|
||||
|
||||
@@ -8,11 +8,17 @@ EXPORT_DIR="/appdata/bee/export"
|
||||
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
|
||||
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
|
||||
LOCK_DIR="/run/bee-selfheal.lock"
|
||||
EVENTS=0
|
||||
|
||||
# log MESSAGE...
# Write the arguments as one line on stdout, prefixed with LOG_PREFIX.
log() {
    echo "[$LOG_PREFIX] $*"
}
|
||||
|
||||
# log_event MESSAGE...
# Log MESSAGE and count it in EVENTS.
log_event() {
    EVENTS=$((1 + EVENTS))
    log "$*"
}
|
||||
|
||||
# have_nvidia_gpu
# Succeeds when "lspci -Dn" lists at least one device whose class field is
# 0300 or 0302 and whose vendor ID is 10de.
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null | awk '
        $2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit }
        END { exit(found ? 0 : 1) }
    '
}
|
||||
@@ -56,24 +62,22 @@ web_healthy() {
|
||||
mkdir -p "${EXPORT_DIR}" /run
|
||||
|
||||
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
||||
log "another self-heal run is already active"
|
||||
log_event "another self-heal run is already active"
|
||||
exit 0
|
||||
fi
|
||||
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||
|
||||
log "start"
|
||||
|
||||
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
log "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||
restart_service bee-nvidia.service || true
|
||||
fi
|
||||
|
||||
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
|
||||
if [ "${runtime_state}" != "ready" ]; then
|
||||
if [ "${runtime_state}" = "interrupted" ]; then
|
||||
log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
|
||||
log_event "runtime-health.json.tmp exists — interrupted runtime-health write detected"
|
||||
else
|
||||
log "runtime-health.json missing or empty"
|
||||
log_event "runtime-health.json missing or empty"
|
||||
fi
|
||||
restart_service bee-preflight.service || true
|
||||
fi
|
||||
@@ -81,19 +85,17 @@ fi
|
||||
audit_state="$(artifact_state "${AUDIT_JSON}")"
|
||||
if [ "${audit_state}" != "ready" ]; then
|
||||
if [ "${audit_state}" = "interrupted" ]; then
|
||||
log "bee-audit.json.tmp exists — interrupted audit write detected"
|
||||
log_event "bee-audit.json.tmp exists — interrupted audit write detected"
|
||||
else
|
||||
log "bee-audit.json missing or empty"
|
||||
log_event "bee-audit.json missing or empty"
|
||||
fi
|
||||
restart_service bee-audit.service || true
|
||||
fi
|
||||
|
||||
if ! service_active bee-web.service; then
|
||||
log "bee-web.service is not active"
|
||||
log_event "bee-web.service is not active"
|
||||
restart_service bee-web.service || true
|
||||
elif ! web_healthy; then
|
||||
log "bee-web health check failed"
|
||||
log_event "bee-web health check failed"
|
||||
restart_service bee-web.service || true
|
||||
fi
|
||||
|
||||
log "done"
|
||||
|
||||
Vendored
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user