Compare commits

..

15 Commits
v4.4 ... v5.6

50 changed files with 6689 additions and 785 deletions

View File

@@ -1,9 +1,10 @@
LISTEN ?= :8080 LISTEN ?= :8080
AUDIT_PATH ?= AUDIT_PATH ?=
EXPORT_DIR ?= $(CURDIR)/.tmp/export
VERSION ?= $(shell sh ./scripts/resolve-version.sh) VERSION ?= $(shell sh ./scripts/resolve-version.sh)
GO_LDFLAGS := -X main.Version=$(VERSION) GO_LDFLAGS := -X main.Version=$(VERSION)
RUN_ARGS := web --listen $(LISTEN) RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
ifneq ($(AUDIT_PATH),) ifneq ($(AUDIT_PATH),)
RUN_ARGS += --audit-path $(AUDIT_PATH) RUN_ARGS += --audit-path $(AUDIT_PATH)
endif endif
@@ -11,6 +12,7 @@ endif
.PHONY: run build test .PHONY: run build test
run: run:
mkdir -p $(EXPORT_DIR)
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS) go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
build: build:

View File

@@ -7,6 +7,8 @@ import (
"io" "io"
"log/slog" "log/slog"
"os" "os"
"runtime/debug"
"strconv"
"strings" "strings"
"bee/audit/internal/app" "bee/audit/internal/app"
@@ -29,10 +31,19 @@ func main() {
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr)) os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
} }
func run(args []string, stdout, stderr io.Writer) int { func run(args []string, stdout, stderr io.Writer) (exitCode int) {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo, Level: slog.LevelInfo,
}))) })))
defer func() {
if rec := recover(); rec != nil {
slog.Error("fatal panic",
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
exitCode = 1
}
}()
if len(args) == 0 { if len(args) == 0 {
printRootUsage(stderr) printRootUsage(stderr)
@@ -58,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
return runWeb(args[1:], stdout, stderr) return runWeb(args[1:], stdout, stderr)
case "sat": case "sat":
return runSAT(args[1:], stdout, stderr) return runSAT(args[1:], stdout, stderr)
case "benchmark":
return runBenchmark(args[1:], stdout, stderr)
case "version", "--version", "-version": case "version", "--version", "-version":
fmt.Fprintln(stdout, Version) fmt.Fprintln(stdout, Version)
return 0 return 0
@@ -74,8 +87,9 @@ func printRootUsage(w io.Writer) {
bee preflight --output stdout|file:<path> bee preflight --output stdout|file:<path>
bee export --target <device> bee export --target <device>
bee support-bundle --output stdout|file:<path> bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>] bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight]
bee version bee version
bee help [command]`) bee help [command]`)
} }
@@ -94,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
return runWeb([]string{"--help"}, stdout, stdout) return runWeb([]string{"--help"}, stdout, stdout)
case "sat": case "sat":
return runSAT([]string{"--help"}, stdout, stderr) return runSAT([]string{"--help"}, stdout, stderr)
case "benchmark":
return runBenchmark([]string{"--help"}, stdout, stderr)
case "version": case "version":
fmt.Fprintln(stdout, "usage: bee version") fmt.Fprintln(stdout, "usage: bee version")
return 0 return 0
@@ -280,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError) fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr) fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80") listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot") auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles") exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title") title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() { fs.Usage = func() {
@@ -383,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
slog.Info("sat archive written", "target", target, "path", archive) slog.Info("sat archive written", "target", target, "path", archive)
return 0 return 0
} }
// runBenchmark handles "bee benchmark <target>". Only the "nvidia" target is
// supported; flags select the profile, the GPU set (include/exclude lists),
// the per-GPU buffer size, and whether the multi-GPU NCCL interconnect
// benchmark runs.
//
// Exit codes: 0 on success or help, 1 on benchmark failure, 2 on usage errors.
func runBenchmark(args []string, stdout, stderr io.Writer) int {
	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
	if len(args) == 0 {
		fmt.Fprintln(stderr, usage)
		return 2
	}
	if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
		fmt.Fprintln(stdout, usage)
		return 0
	}
	target := args[0]
	if target != "nvidia" {
		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
		fmt.Fprintln(stderr, usage)
		return 2
	}
	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
	fs.SetOutput(stderr)
	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
	devices := fs.String("devices", "", "comma-separated GPU indices to include")
	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
	if err := fs.Parse(args[1:]); err != nil {
		if err == flag.ErrHelp {
			return 0
		}
		return 2
	}
	if fs.NArg() != 0 {
		fmt.Fprintln(stderr, "bee benchmark: unexpected arguments")
		return 2
	}
	includeIndices, err := parseBenchmarkIndexCSV(*devices)
	if err != nil {
		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
		return 2
	}
	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
	if err != nil {
		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
		return 2
	}
	application := app.New(platform.New())
	// Write progress to the injected stderr writer (not os.Stderr directly)
	// so output stays capturable in tests, matching run/runWeb/runSAT.
	logLine := func(s string) { fmt.Fprintln(stderr, s) }
	// Empty baseDir lets the app layer pick its default benchmark directory.
	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
		Profile:           *profile,
		SizeMB:            *sizeMB,
		GPUIndices:        includeIndices,
		ExcludeGPUIndices: excludeIndices,
		RunNCCL:           !*skipNCCL,
	}, logLine)
	if err != nil {
		slog.Error("run benchmark", "target", target, "err", err)
		return 1
	}
	slog.Info("benchmark archive written", "target", target, "path", archive)
	return 0
}
// parseBenchmarkIndexCSV parses a comma-separated list of non-negative GPU
// indices (e.g. "0,1,3"). Blank tokens are skipped, and an empty or
// whitespace-only input yields a nil slice. Any non-numeric or negative
// token produces an error naming the offending token.
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return nil, nil
	}
	var out []int
	for _, token := range strings.Split(trimmed, ",") {
		token = strings.TrimSpace(token)
		if token == "" {
			continue
		}
		n, err := strconv.Atoi(token)
		if err != nil || n < 0 {
			return nil, fmt.Errorf("bad gpu index %q", token)
		}
		out = append(out, n)
	}
	return out, nil
}

View File

@@ -19,17 +19,18 @@ import (
) )
var ( var (
DefaultExportDir = "/appdata/bee/export" DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log" DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat" DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
) )
type App struct { type App struct {
@@ -114,6 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
type satRunner interface { type satRunner interface {
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -195,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
return "stdout", err return "stdout", err
case strings.HasPrefix(output, "file:"): case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:") path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err return "", err
} }
return path, nil return path, nil
@@ -223,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
return "stdout", err return "stdout", err
case strings.HasPrefix(output, "file:"): case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:") path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err return "", err
} }
return path, nil return path, nil
@@ -532,10 +533,56 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
} }
// RunNvidiaTargetedStressValidatePack runs the dcgmi diag targeted_stress
// validation pack via the SAT runner, defaulting baseDir to
// DefaultSATBaseDir when blank, and returns the archive path.
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
} }
// RunNvidiaBenchmark runs the NVIDIA benchmark pack with a background
// context; see RunNvidiaBenchmarkCtx for the cancellable variant.
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
}
// RunNvidiaBenchmarkCtx runs the NVIDIA benchmark pack via the SAT runner,
// defaulting baseDir to DefaultBenchmarkBaseDir when blank, and returns the
// resulting archive path.
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultBenchmarkBaseDir
	}
	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
// RunNvidiaOfficialComputePack runs the max-compute-load pack (dcgmproftester
// per the SAT summary labels) via the SAT runner, defaulting baseDir to
// DefaultSATBaseDir when blank, and returns the archive path.
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
// RunNvidiaTargetedPowerPack runs the dcgmi diag targeted_power pack via the
// SAT runner, defaulting baseDir to DefaultSATBaseDir when blank, and
// returns the archive path.
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
// RunNvidiaPulseTestPack runs the dcgmi diag pulse_test pack via the SAT
// runner, defaulting baseDir to DefaultSATBaseDir when blank, and returns
// the archive path.
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
// RunNvidiaBandwidthPack runs the NVBandwidth pack via the SAT runner,
// defaulting baseDir to DefaultSATBaseDir when blank, and returns the
// archive path.
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
	if strings.TrimSpace(baseDir) == "" {
		baseDir = DefaultSATBaseDir
	}
	return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
@@ -886,6 +933,12 @@ func latestSATSummaries() []string {
prefix string prefix string
}{ }{
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, {label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
{label: "Memory SAT", prefix: "memory-"}, {label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"}, {label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"}, {label: "CPU SAT", prefix: "cpu-"},

View File

@@ -120,15 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
} }
type fakeSAT struct { type fakeSAT struct {
runNvidiaFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runMemoryFn func(string) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runStorageFn func(string) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error)
runCPUFn func(string, int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
detectVendorFn func() string runNvidiaPulseFn func(string, int, []int) (string, error)
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) runNvidiaBandwidthFn func(string, []int) (string, error)
runAMDPackFn func(string) (string, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +145,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
// RunNvidiaBenchmark delegates to runNvidiaBenchmarkFn when set, otherwise
// falls back to the generic runNvidiaFn stub.
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
	if f.runNvidiaBenchmarkFn != nil {
		return f.runNvidiaBenchmarkFn(baseDir, opts)
	}
	return f.runNvidiaFn(baseDir)
}

// RunNvidiaTargetedStressValidatePack delegates to runNvidiaTargetedStressFn
// when set, otherwise falls back to runNvidiaFn.
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
	if f.runNvidiaTargetedStressFn != nil {
		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
	}
	return f.runNvidiaFn(baseDir)
}

// RunNvidiaOfficialComputePack delegates to runNvidiaComputeFn when set,
// otherwise falls back to runNvidiaFn.
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
	if f.runNvidiaComputeFn != nil {
		return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
	}
	return f.runNvidiaFn(baseDir)
}

// RunNvidiaTargetedPowerPack delegates to runNvidiaPowerFn when set,
// otherwise falls back to runNvidiaFn.
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
	if f.runNvidiaPowerFn != nil {
		return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
	}
	return f.runNvidiaFn(baseDir)
}

// RunNvidiaPulseTestPack delegates to runNvidiaPulseFn when set, otherwise
// falls back to runNvidiaFn.
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
	if f.runNvidiaPulseFn != nil {
		return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
	}
	return f.runNvidiaFn(baseDir)
}

// RunNvidiaBandwidthPack delegates to runNvidiaBandwidthFn when set,
// otherwise falls back to runNvidiaFn.
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
	if f.runNvidiaBandwidthFn != nil {
		return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
	}
	return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
if f.runNvidiaStressFn != nil { if f.runNvidiaStressFn != nil {
return f.runNvidiaStressFn(baseDir, opts) return f.runNvidiaStressFn(baseDir, opts)
@@ -754,6 +802,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
} }
} }
for _, want := range []string{
"/system/ip-link.txt",
"/system/ip-link-stats.txt",
"/system/ethtool-info.txt",
"/system/ethtool-link.txt",
"/system/ethtool-module.txt",
"/system/mstflint-query.txt",
} {
var found bool
for _, name := range names {
if contains(name, want) {
found = true
break
}
}
if !found {
t.Fatalf("support bundle missing %s, names=%v", want, names)
}
}
var foundRaw bool var foundRaw bool
for _, name := range names { for _, name := range names {
if contains(name, "/export/bee-sat/memory-run/verbose.log") { if contains(name, "/export/bee-sat/memory-run/verbose.log") {

View File

@@ -0,0 +1,48 @@
package app
import (
"fmt"
"os"
"path/filepath"
)
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
}
tmpPath := path + ".tmp"
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
if err != nil {
return fmt.Errorf("open temp %s: %w", tmpPath, err)
}
success := false
defer func() {
_ = f.Close()
if !success {
_ = os.Remove(tmpPath)
}
}()
if _, err := f.Write(data); err != nil {
return fmt.Errorf("write temp %s: %w", tmpPath, err)
}
if err := f.Sync(); err != nil {
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
}
if err := f.Close(); err != nil {
return fmt.Errorf("close temp %s: %w", tmpPath, err)
}
if err := os.Rename(tmpPath, path); err != nil {
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
}
if dir, err := os.Open(filepath.Dir(path)); err == nil {
_ = dir.Sync()
_ = dir.Close()
}
success = true
return nil
}

View File

@@ -0,0 +1,71 @@
package app
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestAtomicWriteFileReplacesTargetWithoutLeavingTmp verifies that a
// successful atomic write replaces existing content and that the
// "<path>.tmp" scratch name is not left behind.
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
	target := filepath.Join(t.TempDir(), "bee-audit.json")
	if err := os.WriteFile(target, []byte("old\n"), 0644); err != nil {
		t.Fatalf("seed file: %v", err)
	}
	if err := atomicWriteFile(target, []byte("new\n"), 0644); err != nil {
		t.Fatalf("atomicWriteFile: %v", err)
	}
	got, err := os.ReadFile(target)
	if err != nil {
		t.Fatalf("read final: %v", err)
	}
	if want := "new\n"; string(got) != want {
		t.Fatalf("final content=%q want %q", string(got), want)
	}
	if _, err := os.Stat(target + ".tmp"); !os.IsNotExist(err) {
		t.Fatalf("tmp file should be absent after success, err=%v", err)
	}
}
// TestRunRuntimePreflightWritesAtomically checks that "file:" output lands
// at the requested path as valid JSON and leaves no ".tmp" scratch file.
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
	dest := filepath.Join(t.TempDir(), "runtime-health.json")
	application := &App{
		runtime: fakeRuntime{
			collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
				health := schema.RuntimeHealth{
					Status:      "OK",
					ExportDir:   exportDir,
					DriverReady: true,
					CUDAReady:   true,
				}
				return health, nil
			},
		},
	}
	written, err := application.RunRuntimePreflight("file:" + dest)
	if err != nil {
		t.Fatalf("RunRuntimePreflight: %v", err)
	}
	if written != dest {
		t.Fatalf("path=%q want %q", written, dest)
	}
	if _, err := os.Stat(dest + ".tmp"); !os.IsNotExist(err) {
		t.Fatalf("tmp file should be absent after success, err=%v", err)
	}
	payload, err := os.ReadFile(dest)
	if err != nil {
		t.Fatalf("read runtime file: %v", err)
	}
	var decoded schema.RuntimeHealth
	if err := json.Unmarshal(payload, &decoded); err != nil {
		t.Fatalf("json unmarshal: %v", err)
	}
	if decoded.Status != "OK" {
		t.Fatalf("status=%q want OK", decoded.Status)
	}
}

View File

@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
// ComponentStatusRecord holds the current and historical health of one hardware component. // ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct { type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"` ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown" Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"` LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"` LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"` ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"` History []ComponentStatusEntry `json:"history"`
} }
// ComponentStatusEntry is one observation written to a component's history. // ComponentStatusEntry is one observation written to a component's history.
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
// Map SAT target to component keys. // Map SAT target to component keys.
switch target { switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth": case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress": case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall) db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)

View File

@@ -19,6 +19,8 @@ var supportBundleServices = []string{
"bee-network.service", "bee-network.service",
"bee-nvidia.service", "bee-nvidia.service",
"bee-preflight.service", "bee-preflight.service",
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service", "bee-sshsetup.service",
} }
@@ -32,6 +34,8 @@ var supportBundleCommands = []struct {
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}}, {name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}}, {name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}}, {name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}}, {name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
{name: "system/mount.txt", cmd: []string{"mount"}}, {name: "system/mount.txt", cmd: []string{"mount"}},
{name: "system/df-h.txt", cmd: []string{"df", "-h"}}, {name: "system/df-h.txt", cmd: []string{"df", "-h"}},
@@ -47,6 +51,83 @@ for d in /sys/bus/pci/devices/*/; do
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)" printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
done done
done done
`}},
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool -i "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool -m "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
if ! command -v mstflint >/dev/null 2>&1; then
echo "mstflint not found"
exit 0
fi
found=0
for path in /sys/bus/pci/devices/*; do
[ -e "$path/vendor" ] || continue
vendor=$(cat "$path/vendor" 2>/dev/null)
[ "$vendor" = "0x15b3" ] || continue
bdf=$(basename "$path")
found=1
echo "=== $bdf ==="
mstflint -d "$bdf" q 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no Mellanox/NVIDIA networking devices found"
fi
`}}, `}},
} }

View File

@@ -2,18 +2,21 @@ package collector
import ( import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"context"
"log/slog" "log/slog"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"strings" "strings"
"time"
) )
const mellanoxVendorID = 0x15b3 const mellanoxVendorID = 0x15b3
const nicProbeTimeout = 2 * time.Second
var ( var (
mstflintQuery = func(bdf string) (string, error) { mstflintQuery = func(bdf string) (string, error) {
out, err := exec.Command("mstflint", "-d", bdf, "q").Output() out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
if err != nil { if err != nil {
return "", err return "", err
} }
@@ -21,7 +24,7 @@ var (
} }
ethtoolInfoQuery = func(iface string) (string, error) { ethtoolInfoQuery = func(iface string) (string, error) {
out, err := exec.Command("ethtool", "-i", iface).Output() out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
if err != nil { if err != nil {
return "", err return "", err
} }
@@ -29,6 +32,14 @@ var (
} }
netIfacesByBDF = listNetIfacesByBDF netIfacesByBDF = listNetIfacesByBDF
readNetCarrierFile = func(iface string) (string, error) {
path := filepath.Join("/sys/class/net", iface, "carrier")
raw, err := os.ReadFile(path)
if err != nil {
return "", err
}
return strings.TrimSpace(string(raw)), nil
}
) )
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
} }
return ifaces return ifaces
} }
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
return exec.CommandContext(ctx, name, args...).Output()
}
// interfaceHasCarrier reports whether the interface's sysfs carrier flag
// reads "1" (link up). Any read error is treated as no carrier.
func interfaceHasCarrier(iface string) bool {
	value, err := readNetCarrierFile(iface)
	if err != nil {
		return false
	}
	// Trim defensively even though the default reader already trims; a
	// test stub may return raw file content.
	return strings.TrimSpace(value) == "1"
}

View File

@@ -12,7 +12,7 @@ import (
var ( var (
ethtoolModuleQuery = func(iface string) (string, error) { ethtoolModuleQuery = func(iface string) (string, error) {
out, err := raidToolQuery("ethtool", "-m", iface) out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
if err != nil { if err != nil {
return "", err return "", err
} }
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
} }
} }
if out, err := ethtoolModuleQuery(iface); err == nil { if interfaceHasCarrier(iface) {
if injectSFPDOMTelemetry(&devs[i], out) { if out, err := ethtoolModuleQuery(iface); err == nil {
enriched++ if injectSFPDOMTelemetry(&devs[i], out) {
continue enriched++
continue
}
} }
} }
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil { if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {

View File

@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
origReadMAC := readNetAddressFile origReadMAC := readNetAddressFile
origEth := ethtoolInfoQuery origEth := ethtoolInfoQuery
origModule := ethtoolModuleQuery origModule := ethtoolModuleQuery
origCarrier := readNetCarrierFile
t.Cleanup(func() { t.Cleanup(func() {
queryPCILSPCIDetail = origDetail queryPCILSPCIDetail = origDetail
readPCIVPDFile = origVPD readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
readNetAddressFile = origReadMAC readNetAddressFile = origReadMAC
ethtoolInfoQuery = origEth ethtoolInfoQuery = origEth
ethtoolModuleQuery = origModule ethtoolModuleQuery = origModule
readNetCarrierFile = origCarrier
}) })
queryPCILSPCIDetail = func(bdf string) (string, error) { queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
} }
return "aa:bb:cc:dd:ee:ff", nil return "aa:bb:cc:dd:ee:ff", nil
} }
readNetCarrierFile = func(string) (string, error) { return "1", nil }
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") } ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") } ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
} }
} }
// Without link carrier the enrichment path must still collect MAC addresses
// but must never invoke "ethtool -m" (optics queries can stall on dark ports).
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
	savedIfaces := netIfacesByBDF
	savedMAC := readNetAddressFile
	savedInfo := ethtoolInfoQuery
	savedModule := ethtoolModuleQuery
	savedCarrier := readNetCarrierFile
	t.Cleanup(func() {
		netIfacesByBDF = savedIfaces
		readNetAddressFile = savedMAC
		ethtoolInfoQuery = savedInfo
		ethtoolModuleQuery = savedModule
		readNetCarrierFile = savedCarrier
	})

	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
	readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
	readNetCarrierFile = func(string) (string, error) { return "0", nil }
	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
	ethtoolModuleQuery = func(string) (string, error) {
		t.Fatal("ethtool -m should not be called without carrier")
		return "", nil
	}

	class := "EthernetController"
	bdf := "0000:18:00.0"
	input := []schema.HardwarePCIeDevice{{
		DeviceClass: &class,
		BDF:         &bdf,
	}}
	got := enrichPCIeWithNICTelemetry(input)
	if len(got[0].MacAddresses) != 1 || got[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
		t.Fatalf("mac_addresses=%v", got[0].MacAddresses)
	}
}
func TestDBMValue(t *testing.T) { func TestDBMValue(t *testing.T) {
tests := []struct { tests := []struct {
in string in string

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,141 @@
package platform
import (
"fmt"
"strings"
"time"
)
// renderBenchmarkReport renders the human-readable plain-text report for a
// completed NVIDIA benchmark run. result.json remains the canonical
// machine-readable artifact; this report mirrors it for operators with an
// executive summary, per-GPU scorecards, interconnect results, methodology,
// and the list of raw files produced by the run.
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
	var b strings.Builder
	b.WriteString("Bee NVIDIA Benchmark Report\n")
	b.WriteString("===========================\n\n")
	// Convert to UTC before formatting: the trailing "UTC" is literal label
	// text in the layout, so printing a non-UTC timestamp with that label
	// would mislabel the time. The rendered output is unchanged when
	// GeneratedAt is already UTC.
	fmt.Fprintf(&b, "Generated: %s UTC\n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05"))
	fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
	fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
	fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
	fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)

	// Summary and warnings sections appear only when they have content.
	if len(result.Findings) > 0 {
		b.WriteString("Executive Summary\n")
		b.WriteString("-----------------\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.Warnings) > 0 {
		b.WriteString("Warnings\n")
		b.WriteString("--------\n")
		for _, warning := range result.Warnings {
			fmt.Fprintf(&b, "- %s\n", warning)
		}
		b.WriteString("\n")
	}

	b.WriteString("Per GPU Scorecard\n")
	b.WriteString("-----------------\n")
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
		fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
		fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
		fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
		fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
		fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
		fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
		// Interconnect score is optional per GPU; zero means not measured.
		if gpu.Scores.InterconnectScore > 0 {
			fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
		}
		if len(gpu.DegradationReasons) > 0 {
			fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
		}
		fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
		fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
		if len(gpu.PrecisionResults) > 0 {
			b.WriteString(" Precision results:\n")
			for _, precision := range gpu.PrecisionResults {
				if precision.Supported {
					fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
				} else {
					fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
				}
			}
		}
		fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
			gpu.Throttle.SWPowerCapUS,
			gpu.Throttle.SWThermalSlowdownUS,
			gpu.Throttle.SyncBoostUS,
			gpu.Throttle.HWThermalSlowdownUS,
			gpu.Throttle.HWPowerBrakeSlowdownUS,
		)
		if len(gpu.Notes) > 0 {
			b.WriteString(" Notes:\n")
			for _, note := range gpu.Notes {
				fmt.Fprintf(&b, " - %s\n", note)
			}
		}
		b.WriteString("\n")
	}

	if result.Interconnect != nil {
		b.WriteString("Interconnect\n")
		b.WriteString("------------\n")
		fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
		// Bandwidth lines only make sense when the NCCL phase actually ran.
		if result.Interconnect.Supported {
			fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
			fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
		}
		for _, note := range result.Interconnect.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}

	b.WriteString("Methodology\n")
	b.WriteString("-----------\n")
	fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
	b.WriteString("- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
	b.WriteString("- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
	b.WriteString("- result.json is the canonical machine-readable source for this benchmark run.\n\n")
	b.WriteString("Raw Files\n")
	b.WriteString("---------\n")
	b.WriteString("- result.json\n")
	b.WriteString("- report.txt\n")
	b.WriteString("- summary.txt\n")
	b.WriteString("- verbose.log\n")
	b.WriteString("- gpu-*-baseline-metrics.csv/html/term.txt\n")
	b.WriteString("- gpu-*-warmup.log\n")
	b.WriteString("- gpu-*-steady.log\n")
	b.WriteString("- gpu-*-steady-metrics.csv/html/term.txt\n")
	b.WriteString("- gpu-*-cooldown-metrics.csv/html/term.txt\n")
	// The NCCL log exists only when an interconnect phase was attempted.
	if result.Interconnect != nil {
		b.WriteString("- nccl-all-reduce.log\n")
	}
	return b.String()
}
// renderBenchmarkSummary renders a flat key=value summary of the run, one
// datum per line, suitable for grep/shell consumption (summary.txt).
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
	var sb strings.Builder
	fmt.Fprintf(&sb, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
	fmt.Fprintf(&sb, "benchmark_profile=%s\n", result.BenchmarkProfile)
	fmt.Fprintf(&sb, "overall_status=%s\n", result.OverallStatus)
	fmt.Fprintf(&sb, "gpu_count=%d\n", len(result.GPUs))
	fmt.Fprintf(&sb, "normalization_status=%s\n", result.Normalization.Status)

	// Track the highest composite score; seed from the first GPU so that
	// negative scores are still reported faithfully.
	var best float64
	for i, gpu := range result.GPUs {
		score := gpu.Scores.CompositeScore
		fmt.Fprintf(&sb, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
		fmt.Fprintf(&sb, "gpu_%d_composite_score=%.2f\n", gpu.Index, score)
		if i == 0 || score > best {
			best = score
		}
	}
	fmt.Fprintf(&sb, "best_composite_score=%.2f\n", best)

	if result.Interconnect != nil {
		fmt.Fprintf(&sb, "interconnect_status=%s\n", result.Interconnect.Status)
		fmt.Fprintf(&sb, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
	}
	return sb.String()
}

View File

@@ -0,0 +1,147 @@
package platform
import (
"strings"
"testing"
)
// Each named profile must map to its fixed phase-duration spec; an empty
// profile name must resolve to the standard profile.
func TestResolveBenchmarkProfile(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name    string
		profile string
		want    benchmarkProfileSpec
	}{
		{
			name:    "default",
			profile: "",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
		},
		{
			name:    "stability",
			profile: "stability",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
		},
		{
			name:    "overnight",
			profile: "overnight",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
		},
	}
	for _, tt := range tests {
		tt := tt // capture for parallel-safe subtests on pre-1.22 toolchains
		t.Run(tt.name, func(t *testing.T) {
			if got := resolveBenchmarkProfile(tt.profile); got != tt.want {
				t.Fatalf("profile=%q got %+v want %+v", tt.profile, got, tt.want)
			}
		})
	}
}
// Normalization must honor an explicit RunNCCL=false instead of forcing the
// interconnect phase back on, while still canonicalizing the profile name.
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
	t.Parallel()

	got := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
		Profile: "stability",
		RunNCCL: false,
	})
	if got.Profile != NvidiaBenchmarkProfileStability {
		t.Fatalf("profile=%q want %q", got.Profile, NvidiaBenchmarkProfileStability)
	}
	if got.RunNCCL {
		t.Fatalf("RunNCCL should stay false when explicitly disabled")
	}
}
// Feed a representative bee-gpu-burn log through the parser and check the
// backend, compute capability, and the two precision profiles it reports.
func TestParseBenchmarkBurnLog(t *testing.T) {
	t.Parallel()

	lines := []string{
		"loader=bee-gpu-burn",
		"[gpu 0] device=NVIDIA H100",
		"[gpu 0] compute_capability=9.0",
		"[gpu 0] backend=cublasLt",
		"[gpu 0] duration_s=10",
		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
		"[gpu 0] fp16_tensor_iterations=200",
		"[gpu 0] fp8_e4m3_iterations=50",
		"[gpu 0] status=OK",
	}
	parsed := parseBenchmarkBurnLog(strings.Join(lines, "\n"))

	if parsed.Backend != "cublasLt" {
		t.Fatalf("backend=%q want cublasLt", parsed.Backend)
	}
	if parsed.ComputeCapability != "9.0" {
		t.Fatalf("compute capability=%q want 9.0", parsed.ComputeCapability)
	}
	if len(parsed.Profiles) != 2 {
		t.Fatalf("profiles=%d want 2", len(parsed.Profiles))
	}
	if parsed.Profiles[0].TeraOpsPerSec <= 0 {
		t.Fatalf("profile[0] teraops=%f want >0", parsed.Profiles[0].TeraOpsPerSec)
	}
	if parsed.Profiles[1].Category != "fp8" {
		t.Fatalf("profile[1] category=%q want fp8", parsed.Profiles[1].Category)
	}
}
// The rendered text report must surface the executive summary, the finding
// text, the composite score, and supported precision throughput lines.
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
	t.Parallel()

	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
		OverallStatus:      "PARTIAL",
		SelectedGPUIndices: []int{0},
		Normalization: BenchmarkNormalization{
			Status: "partial",
		},
		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
		GPUs: []BenchmarkGPUResult{
			{
				Index:  0,
				Name:   "NVIDIA H100",
				Status: "OK",
				Steady: BenchmarkTelemetrySummary{
					AvgPowerW:           680,
					AvgTempC:            79,
					AvgGraphicsClockMHz: 1725,
					P95PowerW:           700,
					P95TempC:            82,
					P95GraphicsClockMHz: 1800,
				},
				Scores: BenchmarkScorecard{
					ComputeScore:        1200,
					PowerSustainScore:   96,
					ThermalSustainScore: 88,
					StabilityScore:      92,
					CompositeScore:      1176,
				},
				PrecisionResults: []BenchmarkPrecisionResult{
					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
				},
				Throttle: BenchmarkThrottleCounters{
					SWPowerCapUS: 1000000,
				},
				DegradationReasons: []string{"power_capped"},
			},
		},
	}

	report := renderBenchmarkReport(result)
	needles := []string{
		"Executive Summary",
		"GPU 0 spent measurable time under SW power cap.",
		"Composite score: 1176.00",
		"fp16_tensor: 700.00 TOPS",
	}
	for _, needle := range needles {
		if !strings.Contains(report, needle) {
			t.Fatalf("report missing %q\n%s", needle, report)
		}
	}
}

View File

@@ -0,0 +1,132 @@
package platform
import "time"
// Benchmark profile names accepted by the NVIDIA benchmark runner. Each
// resolves to a fixed phase-duration spec (see resolveBenchmarkProfile).
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)

// NvidiaBenchmarkOptions configures one NVIDIA benchmark run.
type NvidiaBenchmarkOptions struct {
	Profile           string // profile name; empty resolves to the standard profile
	SizeMB            int    // workload size in MB; exact consumer not visible here — confirm semantics against the runner
	GPUIndices        []int  // explicit GPU selection; presumably empty means all GPUs — verify against caller
	ExcludeGPUIndices []int  // GPUs to leave out of the selection
	RunNCCL           bool   // whether to run the NCCL interconnect phase (an explicit false is preserved by normalization)
}

// NvidiaBenchmarkResult is the top-level, JSON-serializable outcome of a
// benchmark run. Its JSON encoding is the canonical machine-readable
// artifact (result.json); report.txt and summary.txt are rendered from it.
type NvidiaBenchmarkResult struct {
	BenchmarkVersion   string                       `json:"benchmark_version"`
	GeneratedAt        time.Time                    `json:"generated_at"`
	Hostname           string                       `json:"hostname,omitempty"`
	BenchmarkProfile   string                       `json:"benchmark_profile"`
	OverallStatus      string                       `json:"overall_status"`
	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
	Findings           []string                     `json:"findings,omitempty"`
	Warnings           []string                     `json:"warnings,omitempty"`
	Normalization      BenchmarkNormalization       `json:"normalization"`
	GPUs               []BenchmarkGPUResult         `json:"gpus"`
	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"` // nil when no interconnect phase was attempted
}
// BenchmarkNormalization summarizes the pre-run normalization step across
// the selected GPUs (overall status plus per-GPU detail).
type BenchmarkNormalization struct {
	Status string                      `json:"status"`
	Notes  []string                    `json:"notes,omitempty"`
	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
}

// BenchmarkNormalizationGPU records the normalization outcome for a single
// GPU: persistence mode and the requested clock locks with their statuses.
type BenchmarkNormalizationGPU struct {
	Index                 int      `json:"index"`
	PersistenceMode       string   `json:"persistence_mode,omitempty"`
	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}

// BenchmarkGPUResult is the complete per-GPU outcome: identity, clock/power
// configuration, per-phase telemetry summaries, throttle counters, precision
// throughput results, derived scores, and any degradation reasons.
type BenchmarkGPUResult struct {
	Index                  int                        `json:"index"`
	UUID                   string                     `json:"uuid,omitempty"`
	Name                   string                     `json:"name,omitempty"`
	BusID                  string                     `json:"bus_id,omitempty"`
	VBIOS                  string                     `json:"vbios,omitempty"`
	ComputeCapability      string                     `json:"compute_capability,omitempty"`
	Backend                string                     `json:"backend,omitempty"` // e.g. "cublasLt" as parsed from the bee-gpu-burn log
	Status                 string                     `json:"status"`
	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
	Scores                 BenchmarkScorecard         `json:"scores"`
	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
	Notes                  []string                   `json:"notes,omitempty"`
}
// BenchmarkTelemetrySummary aggregates GPU telemetry samples over one phase
// (baseline, steady, or cooldown). The "CV"/"Drift" fields appear, from
// their names, to be coefficient-of-variation and relative-drift percentages
// — confirm against the sampling code.
type BenchmarkTelemetrySummary struct {
	DurationSec         float64 `json:"duration_sec"`
	Samples             int     `json:"samples"`
	AvgTempC            float64 `json:"avg_temp_c"`
	P95TempC            float64 `json:"p95_temp_c"`
	AvgPowerW           float64 `json:"avg_power_w"`
	P95PowerW           float64 `json:"p95_power_w"`
	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
	AvgUsagePct         float64 `json:"avg_usage_pct"`
	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
	ClockCVPct          float64 `json:"clock_cv_pct"`
	PowerCVPct          float64 `json:"power_cv_pct"`
	TempCVPct           float64 `json:"temp_cv_pct"`
	ClockDriftPct       float64 `json:"clock_drift_pct"`
}

// BenchmarkThrottleCounters holds per-reason throttle durations in
// microseconds (the report labels them "Throttle counters (us)").
type BenchmarkThrottleCounters struct {
	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
	SyncBoostUS            uint64 `json:"sync_boost_us"`
	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}

// BenchmarkPrecisionResult reports one precision kernel's result. When
// Supported is false, Notes carries the reason and the numeric fields are
// left at zero (and omitted from JSON).
type BenchmarkPrecisionResult struct {
	Name          string  `json:"name"`
	Category      string  `json:"category"` // e.g. "fp8" for fp8_e4m3 profiles
	Supported     bool    `json:"supported"`
	Lanes         int     `json:"lanes,omitempty"`
	M             uint64  `json:"m,omitempty"`
	N             uint64  `json:"n,omitempty"`
	K             uint64  `json:"k,omitempty"`
	Iterations    uint64  `json:"iterations,omitempty"`
	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
	Notes         string  `json:"notes,omitempty"`
}

// BenchmarkScorecard holds the derived per-GPU scores; CompositeScore is the
// overall figure of merit reported as "best_composite_score" in summaries.
type BenchmarkScorecard struct {
	ComputeScore        float64 `json:"compute_score"`
	PowerSustainScore   float64 `json:"power_sustain_score"`
	ThermalSustainScore float64 `json:"thermal_sustain_score"`
	StabilityScore      float64 `json:"stability_score"`
	InterconnectScore   float64 `json:"interconnect_score"`
	CompositeScore      float64 `json:"composite_score"`
}

// BenchmarkInterconnectResult summarizes the NCCL all-reduce interconnect
// phase; bandwidth fields are in GB/s and are meaningful only when
// Supported is true.
type BenchmarkInterconnectResult struct {
	Status             string   `json:"status"`
	Attempted          bool     `json:"attempted"`
	Supported          bool     `json:"supported"`
	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
	Notes              []string `json:"notes,omitempty"`
}

View File

@@ -20,12 +20,13 @@ type GPUMetricRow struct {
MemUsagePct float64 `json:"mem_usage_pct"` MemUsagePct float64 `json:"mem_usage_pct"`
PowerW float64 `json:"power_w"` PowerW float64 `json:"power_w"`
ClockMHz float64 `json:"clock_mhz"` ClockMHz float64 `json:"clock_mhz"`
MemClockMHz float64 `json:"mem_clock_mhz"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
args := []string{ args := []string{
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics", "--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
"--format=csv,noheader,nounits", "--format=csv,noheader,nounits",
} }
if len(gpuIndices) > 0 { if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
continue continue
} }
parts := strings.Split(line, ", ") parts := strings.Split(line, ", ")
if len(parts) < 6 { if len(parts) < 7 {
continue continue
} }
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
MemUsagePct: parseGPUFloat(parts[3]), MemUsagePct: parseGPUFloat(parts[3]),
PowerW: parseGPUFloat(parts[4]), PowerW: parseGPUFloat(parts[4]),
ClockMHz: parseGPUFloat(parts[5]), ClockMHz: parseGPUFloat(parts[5]),
MemClockMHz: parseGPUFloat(parts[6]),
}) })
} }
return rows, nil return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n") b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
for _, r := range rows { for _, r := range rows {
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n", fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz) r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
const PW = plotX2 - plotX1 const PW = plotX2 - plotX1
const PH = plotY2 - plotY1 const PH = plotY2 - plotY1
// Outer axes // Outer axes
const tempAxisX = 60 // temp axis line const tempAxisX = 60 // temp axis line
const clockAxisX = 900 // clock axis line const clockAxisX = 900 // clock axis line
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"} colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}

View File

@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err)) log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
} }
log("Verifying live medium now served from RAM...")
status := s.LiveBootSource()
if err := verifyInstallToRAMStatus(status); err != nil {
return err
}
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
log("Done. Installation media can be safely disconnected.") log("Done. Installation media can be safely disconnected.")
return nil return nil
} }
// verifyInstallToRAMStatus checks that the live medium is now served from
// RAM; otherwise it returns an error naming the medium that still backs it.
func verifyInstallToRAMStatus(status LiveBootSource) error {
	if !status.InRAM {
		return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
	}
	return nil
}
// describeLiveBootSource formats a LiveBootSource for log/error messages,
// e.g. "RAM", "USB (/dev/sdb1)", or the bare source when the kind is unknown.
func describeLiveBootSource(status LiveBootSource) string {
	// Prefer the device node, then the mount source, then a placeholder.
	medium := strings.TrimSpace(status.Device)
	if medium == "" {
		medium = strings.TrimSpace(status.Source)
	}
	if medium == "" {
		medium = "unknown source"
	}
	switch strings.TrimSpace(status.Kind) {
	case "ram":
		return "RAM"
	case "usb":
		return "USB (" + medium + ")"
	case "cdrom":
		return "CD-ROM (" + medium + ")"
	case "disk":
		return "disk (" + medium + ")"
	}
	return medium
}
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error { func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
in, err := os.Open(src) in, err := os.Open(src)
if err != nil { if err != nil {

View File

@@ -3,6 +3,8 @@ package platform
import "testing" import "testing"
func TestInferLiveBootKind(t *testing.T) { func TestInferLiveBootKind(t *testing.T) {
t.Parallel()
tests := []struct { tests := []struct {
name string name string
fsType string fsType string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
{name: "unknown", source: "overlay", want: "unknown"}, {name: "unknown", source: "overlay", want: "unknown"},
} }
for _, tc := range tests { for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport) got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
if got != tc.want { if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
}) })
} }
} }
// RAM-backed media must verify cleanly; USB-backed media must fail with a
// message naming the device.
func TestVerifyInstallToRAMStatus(t *testing.T) {
	t.Parallel()

	if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
		t.Fatalf("expected success for RAM-backed status, got %v", err)
	}

	err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
	if err == nil {
		t.Fatal("expected verification failure when media is still on USB")
	}
	const want = "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)"
	if got := err.Error(); got != want {
		t.Fatalf("error=%q", got)
	}
}
// Spot-check the two interesting description paths: the RAM kind and the
// unknown-kind fallback to the raw source string.
func TestDescribeLiveBootSource(t *testing.T) {
	t.Parallel()

	ram := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"})
	if ram != "RAM" {
		t.Fatalf("got %q want RAM", ram)
	}

	fallback := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"})
	if fallback != "/run/live/medium" {
		t.Fatalf("got %q want /run/live/medium", fallback)
	}
}

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia": case "nvidia":
tools = append(tools, s.CheckTools([]string{ tools = append(tools, s.CheckTools([]string{
"nvidia-smi", "nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh", "nvidia-bug-report.sh",
"bee-gpu-burn", "bee-gpu-burn",
"bee-john-gpu-stress", "bee-john-gpu-stress",
"bee-nccl-gpu-stress", "bee-nccl-gpu-stress",
"all_reduce_perf", "all_reduce_perf",
})...) })...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd": case "amd":
tool := ToolStatus{Name: "rocm-smi"} tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 { if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools return tools
} }
// resolvedToolStatus probes the candidate binary names in order and returns
// an OK ToolStatus (labeled display) for the first one found on PATH; if
// none resolve, the returned status reports the tool as unavailable.
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
	status := ToolStatus{Name: display}
	for _, name := range candidates {
		if path, err := exec.LookPath(name); err == nil {
			status.Path = path
			status.OK = true
			break
		}
	}
	return status
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) { func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod") lsmodText := commandText("lsmod")

View File

@@ -12,19 +12,20 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"syscall"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
"syscall"
"time" "time"
) )
var ( var (
satExecCommand = exec.Command satExecCommand = exec.Command
satLookPath = exec.LookPath satLookPath = exec.LookPath
satGlob = filepath.Glob satGlob = filepath.Glob
satStat = os.Stat satStat = os.Stat
satFreeMemBytes = freeMemBytes
rocmSMIExecutableGlobs = []string{ rocmSMIExecutableGlobs = []string{
"/opt/rocm/bin/rocm-smi", "/opt/rocm/bin/rocm-smi",
@@ -38,6 +39,12 @@ var (
"/opt/rocm/bin/rvs", "/opt/rocm/bin/rvs",
"/opt/rocm-*/bin/rvs", "/opt/rocm-*/bin/rvs",
} }
dcgmProfTesterCandidates = []string{
"dcgmproftester",
"dcgmproftester13",
"dcgmproftester12",
"dcgmproftester11",
}
) )
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil). // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -76,15 +83,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
// NvidiaGPU holds basic GPU info from nvidia-smi. // NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct { type NvidiaGPU struct {
Index int Index int `json:"index"`
Name string Name string `json:"name"`
MemoryMB int MemoryMB int `json:"memory_mb"`
} }
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct { type AMDGPUInfo struct {
Index int Index int `json:"index"`
Name string Name string `json:"name"`
} }
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise. // DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
@@ -256,6 +263,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
MemoryMB: memMB, MemoryMB: memMB,
}) })
} }
sort.Slice(gpus, func(i, j int) bool {
return gpus[i].Index < gpus[j].Index
})
return gpus, nil return gpus, nil
} }
@@ -277,6 +287,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}, logFunc) }, logFunc)
} }
// RunNvidiaOfficialComputePack runs the dcgmproftester compute workload on
// the selected GPUs (all detected GPUs when gpuIndices is empty), bracketed
// by nvidia-smi state captures. Artifacts are written under baseDir by
// runAcceptancePackCtx; progress lines go to logFunc.
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	gpus, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	duration := strconv.Itoa(normalizeNvidiaBurnDuration(durationSec))
	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", duration)
	if err != nil {
		return "", err
	}
	jobs := []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
		{
			name:       "03-dcgmproftester.log",
			cmd:        profCmd,
			env:        nvidiaVisibleDevicesEnv(gpus),
			collectGPU: true,
			gpuIndices: gpus,
		},
		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", jobs, logFunc)
}
// RunNvidiaTargetedPowerPack runs the DCGM "targeted_power" diagnostic on
// the selected GPUs (all detected GPUs when gpuIndices is empty), with
// nvidia-smi snapshots before and after. Artifacts land under baseDir.
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	gpus, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	jobs := []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{
			name:       "02-dcgmi-targeted-power.log",
			cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), gpus),
			collectGPU: true,
			gpuIndices: gpus,
		},
		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", jobs, logFunc)
}
// RunNvidiaPulseTestPack runs the DCGM "pulse_test" diagnostic on the
// selected GPUs (all detected GPUs when gpuIndices is empty), with
// nvidia-smi snapshots before and after. Artifacts land under baseDir.
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	gpus, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	jobs := []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{
			name:       "02-dcgmi-pulse-test.log",
			cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), gpus),
			collectGPU: true,
			gpuIndices: gpus,
		},
		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", jobs, logFunc)
}
// RunNvidiaBandwidthPack runs the DCGM "nvbandwidth" diagnostic (no duration
// parameter) on the selected GPUs, with nvidia-smi snapshots before and
// after. Artifacts land under baseDir.
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
	gpus, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	jobs := []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{
			name:       "02-dcgmi-nvbandwidth.log",
			cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, gpus),
			collectGPU: true,
			gpuIndices: gpus,
		},
		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", jobs, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc) return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
} }
@@ -293,6 +377,23 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc) return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
} }
// RunNvidiaTargetedStressValidatePack runs the DCGM "targeted_stress"
// diagnostic on the selected GPUs (all detected GPUs when gpuIndices is
// empty), with nvidia-smi snapshots before and after. Artifacts land under
// baseDir.
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	gpus, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	jobs := []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{
			name:       "02-dcgmi-targeted-stress.log",
			cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), gpus),
			collectGPU: true,
			gpuIndices: gpus,
		},
		{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", jobs, logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
if len(gpuIndices) > 0 { if len(gpuIndices) > 0 {
return dedupeSortedIndices(gpuIndices), nil return dedupeSortedIndices(gpuIndices), nil
@@ -307,6 +408,25 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
return all, nil return all, nil
} }
// memoryStressSizeArg picks the stress-ng --vm size argument. An explicit
// BEE_VM_STRESS_SIZE_MB wins; otherwise it targets two thirds of the free
// memory reported by satFreeMemBytes, rounded down to a 256 MiB multiple so
// mixed burn runs keep OS headroom. Falls back to "80%" when free memory is
// unknown or the computed target is not positive.
func memoryStressSizeArg() string {
	if override := envInt("BEE_VM_STRESS_SIZE_MB", 0); override > 0 {
		return fmt.Sprintf("%dM", override)
	}
	freeBytes := satFreeMemBytes()
	if freeBytes <= 0 {
		return "80%"
	}
	target := (freeBytes / (1024 * 1024)) * 2 / 3
	if target >= 256 {
		target -= target % 256
	}
	if target <= 0 {
		return "80%"
	}
	return fmt.Sprintf("%dM", target)
}
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
passes := envInt("BEE_MEMTESTER_PASSES", 1) passes := envInt("BEE_MEMTESTER_PASSES", 1)
@@ -322,11 +442,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
if seconds <= 0 { if seconds <= 0 {
seconds = envInt("BEE_VM_STRESS_SECONDS", 300) seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
} }
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB. // Base the default on current MemAvailable and keep headroom for the OS and
sizeArg := "80%" // concurrent stressors so mixed burn runs do not trip the OOM killer.
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 { sizeArg := memoryStressSizeArg()
sizeArg = fmt.Sprintf("%dM", mb)
}
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stress-ng-vm.log", cmd: []string{ {name: "02-stress-ng-vm.log", cmd: []string{
@@ -473,6 +591,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
} }
} }
// nvidiaDCGMNamedDiagCommand builds a `dcgmi diag -r <name>` command line,
// optionally pinning the per-test duration and restricting the run to the
// given GPU indices.
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
	cmd := make([]string, 0, 8)
	cmd = append(cmd, "dcgmi", "diag", "-r", name)
	if durationSec > 0 {
		cmd = append(cmd, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
	}
	if len(gpuIndices) > 0 {
		cmd = append(cmd, "-i", joinIndexList(gpuIndices))
	}
	return cmd
}
// normalizeNvidiaBurnDuration returns the requested burn duration in
// seconds, substituting the 300-second default for zero or negative input.
func normalizeNvidiaBurnDuration(durationSec int) int {
	if durationSec > 0 {
		return durationSec
	}
	return 300
}
// nvidiaVisibleDevicesEnv returns the CUDA_VISIBLE_DEVICES assignment for
// the selected GPUs, or nil when no selection was made (meaning all GPUs
// stay visible).
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
	if len(gpuIndices) > 0 {
		return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
	}
	return nil
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) { func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
@@ -642,6 +785,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
} }
if strings.Contains(text, "unsupported") || if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") || strings.Contains(text, "not supported") ||
strings.Contains(text, "not found in path") ||
strings.Contains(text, "invalid opcode") || strings.Contains(text, "invalid opcode") ||
strings.Contains(text, "unknown command") || strings.Contains(text, "unknown command") ||
strings.Contains(text, "not implemented") || strings.Contains(text, "not implemented") ||
@@ -748,6 +892,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm") return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
} }
// resolveDCGMProfTesterCommand finds the first dcgmproftester binary from
// the candidate list (versioned names included) and prepends its resolved
// path to the caller's arguments.
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
	for _, name := range dcgmProfTesterCandidates {
		path, err := satLookPath(name)
		if err != nil {
			continue
		}
		return append([]string{path}, args...), nil
	}
	return nil, errors.New("dcgmproftester not found in PATH")
}
func ensureAMDRuntimeReady() error { func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil { if _, err := os.Stat("/dev/kfd"); err == nil {
return nil return nil

View File

@@ -195,6 +195,53 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
} }
} }
// TestResolveDCGMProfTesterCommandUsesVersionedBinary swaps the package's
// satLookPath hook for a fake in which only the versioned
// "dcgmproftester13" binary exists, then checks that the resolver picks it
// up and prepends its full path ahead of the caller-supplied arguments.
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
	oldLookPath := satLookPath
	// Fake PATH: only dcgmproftester13 resolves; everything else is missing.
	satLookPath = func(file string) (string, error) {
		switch file {
		case "dcgmproftester13":
			return "/usr/bin/dcgmproftester13", nil
		default:
			return "", exec.ErrNotFound
		}
	}
	// Restore the real lookup even if the test fails early.
	t.Cleanup(func() { satLookPath = oldLookPath })
	cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
	if err != nil {
		t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
	}
	// Resolved binary path plus the three pass-through arguments.
	if len(cmd) != 4 {
		t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
	}
	if cmd[0] != "/usr/bin/dcgmproftester13" {
		t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
	}
}
// TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection verifies that a
// named diag command carries both the per-test duration parameter and the
// GPU selection flag, in order and without reordering the indices.
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
	got := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
	want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
	if len(got) != len(want) {
		t.Fatalf("cmd len=%d want %d (%v)", len(got), len(want), got)
	}
	for i, expected := range want {
		if got[i] != expected {
			t.Fatalf("cmd[%d]=%q want %q", i, got[i], expected)
		}
	}
}
// TestNvidiaVisibleDevicesEnvUsesSelectedGPUs checks that a GPU selection
// maps to exactly one CUDA_VISIBLE_DEVICES entry with comma-joined indices.
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
	got := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
	if len(got) != 1 {
		t.Fatalf("env len=%d want 1 (%v)", len(got), got)
	}
	if got[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
		t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", got[0])
	}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) { func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel() t.Parallel()
@@ -229,6 +276,37 @@ func TestEnvIntFallback(t *testing.T) {
} }
} }
// TestMemoryStressSizeArgUsesAvailableMemory stubs satFreeMemBytes to report
// 96 GiB free and expects two thirds of that (98304 MB * 2/3 = 65536 MB),
// already a multiple of 256, as the size argument.
// NOTE(review): assumes BEE_VM_STRESS_SIZE_MB is unset in the test
// environment — confirm, or clear it explicitly with t.Setenv.
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
	oldFreeMemBytes := satFreeMemBytes
	satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
	if got := memoryStressSizeArg(); got != "65536M" {
		t.Fatalf("sizeArg=%q want 65536M", got)
	}
}
// TestMemoryStressSizeArgRespectsOverride sets BEE_VM_STRESS_SIZE_MB and
// verifies the override wins even though plenty of free memory is reported
// by the (stubbed) satFreeMemBytes hook.
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
	oldFreeMemBytes := satFreeMemBytes
	satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
	t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
	if got := memoryStressSizeArg(); got != "4096M" {
		t.Fatalf("sizeArg=%q want 4096M", got)
	}
}
// TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown verifies the "80%"
// fallback when satFreeMemBytes cannot determine free memory (returns 0).
// NOTE(review): like the sibling test, this assumes BEE_VM_STRESS_SIZE_MB
// is unset in the environment — confirm.
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
	oldFreeMemBytes := satFreeMemBytes
	satFreeMemBytes = func() int64 { return 0 }
	t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
	if got := memoryStressSizeArg(); got != "80%" {
		t.Fatalf("sizeArg=%q want 80%%", got)
	}
}
func TestClassifySATResult(t *testing.T) { func TestClassifySATResult(t *testing.T) {
tests := []struct { tests := []struct {
name string name string

View File

@@ -10,17 +10,30 @@ import (
func (s *System) ListBeeServices() ([]string, error) { func (s *System) ListBeeServices() ([]string, error) {
seen := map[string]bool{} seen := map[string]bool{}
var out []string var out []string
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} { for _, pattern := range []string{
"/etc/systemd/system/bee-*.service",
"/lib/systemd/system/bee-*.service",
"/etc/systemd/system/bee-*.timer",
"/lib/systemd/system/bee-*.timer",
} {
matches, err := filepath.Glob(pattern) matches, err := filepath.Glob(pattern)
if err != nil { if err != nil {
return nil, err return nil, err
} }
for _, match := range matches { for _, match := range matches {
name := strings.TrimSuffix(filepath.Base(match), ".service") base := filepath.Base(match)
name := base
if strings.HasSuffix(base, ".service") {
name = strings.TrimSuffix(base, ".service")
}
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query. // Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
if strings.HasSuffix(name, "@") { if strings.HasSuffix(name, "@") {
continue continue
} }
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
continue
}
if !seen[name] { if !seen[name] {
seen[name] = true seen[name] = true
out = append(out, name) out = append(out, name)

View File

@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
} }
type RemovableTarget struct { type RemovableTarget struct {
Device string Device string `json:"device"`
FSType string FSType string `json:"fs_type"`
Size string Size string `json:"size"`
Label string Label string `json:"label"`
Model string Model string `json:"model"`
Mountpoint string Mountpoint string `json:"mountpoint"`
} }
type ToolStatus struct { type ToolStatus struct {

View File

@@ -0,0 +1,31 @@
package platform
import (
"encoding/json"
"strings"
"testing"
)
// TestRemovableTargetJSONUsesFrontendFieldNames guards the wire contract:
// RemovableTarget must marshal with the snake_case keys the web frontend
// reads, not Go's default exported field names.
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
	t.Parallel()
	data, err := json.Marshal(RemovableTarget{
		Device: "/dev/sdb1",
		FSType: "exfat",
		Size:   "1.8T",
		Label:  "USB",
		Model:  "Flash",
	})
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}
	raw := string(data)
	// Every tagged key must appear in the encoded output.
	for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
		if !strings.Contains(raw, key) {
			t.Fatalf("json missing key %s: %s", key, raw)
		}
	}
	// And none of the untagged Go spellings may leak through.
	if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
		t.Fatalf("json still contains Go field names: %s", raw)
	}
}

View File

@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
scanDone := make(chan error, 1) scanDone := make(chan error, 1)
go func() { go func() {
defer func() {
if rec := recover(); rec != nil {
scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
}
}()
scanner := bufio.NewScanner(pr) scanner := bufio.NewScanner(pr)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
for scanner.Scan() { for scanner.Scan() {
@@ -227,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
} }
} }
// handleAPIBenchmarkNvidiaRun enqueues an NVIDIA benchmark task from an
// optional JSON body. All fields are optional: an absent or empty body
// queues a default run. RunNCCL is a *bool so "field absent" (defaults to
// true) can be distinguished from an explicit false. The response carries
// the queued task id under both "task_id" and "job_id".
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
	if h.opts.App == nil {
		writeError(w, http.StatusServiceUnavailable, "app not configured")
		return
	}
	var body struct {
		Profile           string `json:"profile"`
		SizeMB            int    `json:"size_mb"`
		GPUIndices        []int  `json:"gpu_indices"`
		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
		RunNCCL           *bool  `json:"run_nccl"`
		DisplayName       string `json:"display_name"`
	}
	if r.Body != nil {
		// An empty body decodes to io.EOF and is accepted; only malformed
		// JSON is rejected.
		if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
			writeError(w, http.StatusBadRequest, "invalid request body")
			return
		}
	}
	// NCCL runs by default unless the request explicitly disables it.
	runNCCL := true
	if body.RunNCCL != nil {
		runNCCL = *body.RunNCCL
	}
	t := &Task{
		ID:        newJobID("benchmark-nvidia"),
		Name:      taskDisplayName("nvidia-benchmark", "", ""),
		Target:    "nvidia-benchmark",
		Priority:  15, // NOTE(review): relative to other task priorities — confirm ordering convention
		Status:    TaskPending,
		CreatedAt: time.Now(),
		params: taskParams{
			GPUIndices:        body.GPUIndices,
			ExcludeGPUIndices: body.ExcludeGPUIndices,
			SizeMB:            body.SizeMB,
			BenchmarkProfile:  body.Profile,
			RunNCCL:           runNCCL,
			DisplayName:       body.DisplayName,
		},
	}
	// A caller-supplied display name overrides the generated default.
	if strings.TrimSpace(body.DisplayName) != "" {
		t.Name = body.DisplayName
	}
	globalQueue.enqueue(t)
	writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
}
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
id := r.URL.Query().Get("job_id") id := r.URL.Query().Get("job_id")
if id == "" { if id == "" {
@@ -486,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
// ── GPU presence ────────────────────────────────────────────────────────────── // ── GPU presence ──────────────────────────────────────────────────────────────
// handleAPIGNVIDIAGPUs lists the detected NVIDIA GPUs as JSON. A nil result
// from the app layer is normalized to an empty slice so the response body
// is [] rather than null, which frontend code can iterate unconditionally.
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
	if h.opts.App == nil {
		writeError(w, http.StatusServiceUnavailable, "app not configured")
		return
	}
	gpus, err := h.opts.App.ListNvidiaGPUs()
	if err != nil {
		writeError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if gpus == nil {
		gpus = []platform.NvidiaGPU{}
	}
	writeJSON(w, gpus)
}
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil { if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured") writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -511,14 +580,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
_, amdErr := os.Stat("/dev/kfd") _, amdErr := os.Stat("/dev/kfd")
nvidiaUp := nvidiaErr == nil nvidiaUp := nvidiaErr == nil
amdUp := amdErr == nil amdUp := amdErr == nil
_, dcgmErr := exec.LookPath("dcgmi")
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
_, johnErr := exec.LookPath("bee-john-gpu-stress")
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
writeJSON(w, []toolEntry{ writeJSON(w, []toolEntry{
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
{ID: "rvs", Available: amdUp, Vendor: "amd"}, {ID: "rvs", Available: amdUp, Vendor: "amd"},
}) })
} }
func lookPathAny(names ...string) error {
for _, name := range names {
if _, err := exec.LookPath(name); err == nil {
return nil
}
}
return exec.ErrNotFound
}
// ── System ──────────────────────────────────────────────────────────────────── // ── System ────────────────────────────────────────────────────────────────────
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -557,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{ var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool", "dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "memtester", "stress-ng", "nvtop", "nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode", "mstflint", "qrencode",
} }

View File

@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
} }
} }
// TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs posts a benchmark
// request with an explicit GPU selection and run_nccl=false, then asserts
// exactly one task was enqueued carrying those parameters. The shared
// global queue's task list is snapshotted and restored so concurrent or
// later tests are unaffected.
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
	// Isolate the shared queue for the duration of this test.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})
	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
	rec := httptest.NewRecorder()
	h.handleAPIBenchmarkNvidiaRun(rec, req)
	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	task := globalQueue.tasks[0]
	if task.Target != "nvidia-benchmark" {
		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
	}
	// Selection must be preserved in order, not deduped or sorted here.
	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
		t.Fatalf("gpu indices=%v want [1 3]", got)
	}
	if task.params.RunNCCL {
		t.Fatal("RunNCCL should reflect explicit false from request")
	}
}
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) { func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
h := &handler{} h := &handler{}

View File

@@ -0,0 +1,773 @@
package webui
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"sync"
"time"
"bee/audit/internal/platform"
)
// chartTimelineSegment is one span of the chart's time axis, flagged as
// task-active or idle; renderers shade the idle spans behind the plot.
type chartTimelineSegment struct {
	Start  time.Time
	End    time.Time
	Active bool
}

// chartScale is a Y-axis value range together with the tick values drawn
// along it.
type chartScale struct {
	Min   float64
	Max   float64
	Ticks []float64
}

// chartLayout is the pixel geometry of a rendered SVG chart: the overall
// canvas size and the bounds of the inner plot rectangle.
type chartLayout struct {
	Width      int
	Height     int
	PlotLeft   int
	PlotRight  int
	PlotTop    int
	PlotBottom int
}

// metricChartSeries is one named line on a chart: its color, an optional
// axis caption, and the per-point values.
type metricChartSeries struct {
	Name      string
	AxisTitle string
	Color     string
	Values    []float64
}

// metricChartPalette supplies line colors, assigned to series round-robin
// by index.
var metricChartPalette = []string{
	"#5794f2",
	"#73bf69",
	"#f2cc0c",
	"#ff9830",
	"#f2495c",
	"#b877d9",
	"#56d2f7",
	"#8ab8ff",
	"#9adf8f",
	"#ffbe5c",
}

// gpuLabelCache memoizes GPU index → display label lookups under a mutex.
// NOTE(review): loadedAt presumably drives cache refresh in the lookup
// helper (not visible here) — confirm there.
var gpuLabelCache struct {
	mu       sync.Mutex
	loadedAt time.Time
	byIndex  map[int]string
}
// renderMetricChartSVG renders a single-Y-axis line chart as SVG bytes.
// labels/times form the X axis (padded or synthesized when shorter than the
// widest input), datasets/names form the series paired by index, yMin/yMax
// optionally pin the Y scale, canvasHeight sets the SVG height, and
// timeline shades idle spans behind the plot. The error result is currently
// always nil; the signature leaves room for future failures.
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
	// Point count is the wider of labels/times, never zero.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	// Empty datasets render as flat zero lines rather than being dropped.
	for i := range datasets {
		if len(datasets[i]) == 0 {
			datasets[i] = make([]float64, pointCount)
		}
	}
	statsLabel := chartStatsLabel(datasets)
	// One legend entry per name; a name without a matching dataset keeps
	// all-zero values.
	legendItems := []metricChartSeries{}
	for i, name := range names {
		color := metricChartPalette[i%len(metricChartPalette)]
		values := make([]float64, pointCount)
		if i < len(datasets) {
			copy(values, coalesceDataset(datasets[i], pointCount))
		}
		legendItems = append(legendItems, metricChartSeries{
			Name:   name,
			Color:  color,
			Values: values,
		})
	}
	scale := singleAxisChartScale(datasets, yMin, yMax)
	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
	start, end := chartTimeBounds(times)
	var b strings.Builder
	// Paint order matters: frame, idle shading, grids, boundaries, border,
	// axis, X labels, series lines, legend.
	writeSVGOpen(&b, layout.Width, layout.Height)
	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	writeHorizontalGrid(&b, layout, scale)
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)
	writeSingleAxisY(&b, layout, scale)
	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	for _, item := range legendItems {
		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
	}
	writeLegend(&b, layout, legendItems)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
// renderGPUOverviewChartSVG builds the three-series overview chart (temp,
// power, core clock) for GPU idx from live metric samples. The bool result
// is false — and no SVG is produced — when the samples hold no data at all
// for that GPU.
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
	temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
	power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
	coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
	if temp == nil && power == nil && coreClock == nil {
		return nil, false, nil
	}
	labels := sampleTimeLabels(samples)
	times := sampleTimes(samples)
	// Series order is fixed: drawGPUOverviewChartSVG requires exactly these
	// three, each getting its own Y axis.
	svg, err := drawGPUOverviewChartSVG(
		gpuDisplayLabel(idx)+" Overview",
		labels,
		times,
		[]metricChartSeries{
			{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
			{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
			{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
		},
		timeline,
	)
	if err != nil {
		return nil, false, err
	}
	return svg, true, nil
}
// drawGPUOverviewChartSVG renders a fixed-size chart with exactly three
// series, each plotted against its own independently-scaled Y axis: two
// axes sit left of the plot (labels to their left) and one sits right of it
// (labels to its right). Gridlines follow the first series' scale only.
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
	if len(series) != 3 {
		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
	}
	// Fixed canvas and plot geometry for the overview chart.
	const (
		width      = 1400
		height     = 840
		plotLeft   = 180
		plotRight  = 1220
		plotTop    = 96
		plotBottom = 660
	)
	// X positions of the three per-series axis lines (two left, one right).
	const (
		leftOuterAxis  = 72
		leftInnerAxis  = 132
		rightInnerAxis = 1268
	)
	layout := chartLayout{
		Width:      width,
		Height:     height,
		PlotLeft:   plotLeft,
		PlotRight:  plotRight,
		PlotTop:    plotTop,
		PlotBottom: plotBottom,
	}
	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
	// Pad labels/times to a common point count, synthesizing times if short.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	for i := range series {
		if len(series[i].Values) == 0 {
			series[i].Values = make([]float64, pointCount)
		}
	}
	// Each series gets its own scale snapped to its nice-tick bounds.
	scales := make([]chartScale, len(series))
	for i := range series {
		min, max := chartSeriesBounds(series[i].Values)
		ticks := chartNiceTicks(min, max, 8)
		scales[i] = chartScale{
			Min:   ticks[0],
			Max:   ticks[len(ticks)-1],
			Ticks: ticks,
		}
	}
	start, end := chartTimeBounds(times)
	var b strings.Builder
	writeSVGOpen(&b, width, height)
	writeChartFrame(&b, title, "", width, height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	// Horizontal gridlines track only the first series' scale.
	writeHorizontalGrid(&b, layout, scales[0])
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)
	for i, axisLineX := range axisX {
		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
		for _, tick := range scales[i].Ticks {
			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
			label := sanitizeChartText(chartYAxisNumber(tick))
			// Left-hand axes: tick marks point right, labels sit left.
			if i < 2 {
				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
					axisLineX, y, axisLineX+6, y, series[i].Color)
				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
					axisLineX-8, y, series[i].Color, label)
				continue
			}
			// Right-hand axis: tick marks point left, labels sit right.
			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
				axisLineX, y, axisLineX-6, y, series[i].Color)
			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
				axisLineX+8, y, series[i].Color, label)
		}
	}
	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	for i := range series {
		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
	}
	writeLegend(&b, layout, series)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
// metricsTimelineSegments derives active/idle shading segments covering the
// sample window, using the global task queue's run history. Returns nil
// when there are no samples or no usable time bounds.
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
	if len(samples) == 0 {
		return nil
	}
	times := sampleTimes(samples)
	start, end := chartTimeBounds(times)
	if start.IsZero() || end.IsZero() {
		return nil
	}
	return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
}
// snapshotTaskHistory copies the global queue's task list under its lock,
// dereferencing each entry so callers can read task timing without holding
// the lock or racing queue mutations.
func snapshotTaskHistory() []Task {
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	tasks := globalQueue.tasks
	out := make([]Task, len(tasks))
	for i := range tasks {
		out[i] = *tasks[i]
	}
	return out
}
// chartTimelineSegmentsForRange splits [start, end] into alternating
// active/idle segments based on when tasks were running. A task is counted
// from StartedAt until DoneAt (or "now" if still running); intervals are
// clamped to the range, merged when overlapping or touching, and the gaps
// between them become idle segments. Returns nil only for zero bounds; an
// entirely task-free range yields one idle segment spanning it.
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
	if start.IsZero() || end.IsZero() {
		return nil
	}
	if end.Before(start) {
		start, end = end, start
	}
	type interval struct {
		start time.Time
		end   time.Time
	}
	// Collect each task's run interval, clamped to [start, end].
	active := make([]interval, 0, len(tasks))
	for _, task := range tasks {
		if task.StartedAt == nil {
			continue
		}
		intervalStart := task.StartedAt.UTC()
		intervalEnd := now.UTC()
		if task.DoneAt != nil {
			intervalEnd = task.DoneAt.UTC()
		}
		// Drop empty/inverted intervals and ones fully outside the range.
		if !intervalEnd.After(intervalStart) {
			continue
		}
		if intervalEnd.Before(start) || intervalStart.After(end) {
			continue
		}
		if intervalStart.Before(start) {
			intervalStart = start
		}
		if intervalEnd.After(end) {
			intervalEnd = end
		}
		active = append(active, interval{start: intervalStart, end: intervalEnd})
	}
	// Sort by start (then end) so overlapping intervals are adjacent.
	sort.Slice(active, func(i, j int) bool {
		if active[i].start.Equal(active[j].start) {
			return active[i].end.Before(active[j].end)
		}
		return active[i].start.Before(active[j].start)
	})
	// Merge overlapping or touching intervals into disjoint active spans.
	merged := make([]interval, 0, len(active))
	for _, span := range active {
		if len(merged) == 0 {
			merged = append(merged, span)
			continue
		}
		last := &merged[len(merged)-1]
		if !span.start.After(last.end) {
			if span.end.After(last.end) {
				last.end = span.end
			}
			continue
		}
		merged = append(merged, span)
	}
	// Walk the range, emitting idle segments between the active spans.
	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
	cursor := start
	for _, span := range merged {
		if span.start.After(cursor) {
			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
		}
		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
		cursor = span.end
	}
	if cursor.Before(end) {
		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
	}
	if len(segments) == 0 {
		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
	}
	return segments
}
// sampleTimes extracts the timestamp column from a metric sample series,
// preserving order.
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
	out := make([]time.Time, len(samples))
	for i := range samples {
		out[i] = samples[i].Timestamp
	}
	return out
}
// singleAxisChartScale derives the shared Y scale for all datasets of a
// single-axis chart. Explicit yMin/yMax pins override the observed data
// bounds; the final scale snaps to the first and last nice tick.
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
	lo, hi := 0.0, 1.0
	if yMin != nil && yMax != nil {
		lo, hi = *yMin, *yMax
	} else {
		lo, hi = chartSeriesBounds(flattenDatasets(datasets))
		if yMin != nil {
			lo = *yMin
		}
		if yMax != nil {
			hi = *yMax
		}
	}
	ticks := chartNiceTicks(lo, hi, 8)
	return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
}
// flattenDatasets concatenates every dataset into a single slice, sized up
// front to the combined length.
func flattenDatasets(datasets [][]float64) []float64 {
	n := 0
	for _, ds := range datasets {
		n += len(ds)
	}
	flat := make([]float64, 0, n)
	for _, ds := range datasets {
		flat = append(flat, ds...)
	}
	return flat
}
// singleAxisChartLayout computes the fixed-width canvas geometry for a
// single-axis chart, shrinking the plot bottom to reserve room under it for
// the legend when one will be shown (up to 4 columns, 24px per row plus
// 24px of padding).
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
	rows := 0
	if chartLegendVisible(seriesCount) && seriesCount > 0 {
		cols := seriesCount
		if cols > 4 {
			cols = 4
		}
		rows = (seriesCount + cols - 1) / cols
	}
	reserved := 0
	if rows > 0 {
		reserved = rows*24 + 24
	}
	return chartLayout{
		Width:      1400,
		Height:     canvasHeight,
		PlotLeft:   96,
		PlotRight:  1352,
		PlotTop:    72,
		PlotBottom: canvasHeight - 60 - reserved,
	}
}
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
if len(times) == 0 {
return time.Time{}, time.Time{}
}
start := times[0].UTC()
end := start
for _, ts := range times[1:] {
t := ts.UTC()
if t.Before(start) {
start = t
}
if t.After(end) {
end = t
}
}
return start, end
}
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
if count <= 0 {
return nil
}
if len(times) == count {
return times
}
if len(times) == 1 {
out := make([]time.Time, count)
for i := range out {
out[i] = times[0].Add(time.Duration(i) * time.Minute)
}
return out
}
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
out := make([]time.Time, count)
for i := range out {
out[i] = base.Add(time.Duration(i) * time.Minute)
}
return out
}
func writeSVGOpen(b *strings.Builder, width, height int) {
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
}
func writeSVGClose(b *strings.Builder) {
b.WriteString("</svg>\n")
}
// writeChartFrame draws the rounded background card, the centered title,
// and — when non-blank — a smaller subtitle line beneath it.
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
	fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
	fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
		width/2, sanitizeChartText(title))
	if strings.TrimSpace(subtitle) != "" {
		fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
			width/2, sanitizeChartText(subtitle))
	}
}
// writePlotBorder outlines the inner plot rectangle.
func writePlotBorder(b *strings.Builder, layout chartLayout) {
	fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
		layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
}
// writeHorizontalGrid draws one light horizontal gridline per Y tick.
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
	b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
	for _, tick := range scale.Ticks {
		y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
			layout.PlotLeft, y, layout.PlotRight, y)
	}
	b.WriteString(`</g>` + "\n")
}
// writeVerticalGrid draws faint vertical gridlines at roughly `target`
// evenly-chosen sample indices, positioned by timestamp.
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
	if pointCount <= 0 {
		return
	}
	start, end := chartTimeBounds(times)
	b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
	for _, idx := range gpuChartLabelIndices(pointCount, target) {
		ts := chartPointTime(times, idx)
		x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
		fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
			x, layout.PlotTop, x, layout.PlotBottom)
	}
	b.WriteString(`</g>` + "\n")
}
// writeSingleAxisY draws the left Y axis line with one tick mark and
// right-aligned numeric label per tick value.
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
	fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
		layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
	for _, tick := range scale.Ticks {
		y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
		fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
			layout.PlotLeft, y, layout.PlotLeft-6, y)
		fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
			layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
	}
}
// writeXAxisLabels renders roughly `target` time labels under the plot at
// their timestamp positions, plus a centered "Time" axis caption. Indices
// beyond the labels slice render as empty strings.
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
	for _, idx := range gpuChartLabelIndices(pointCount, target) {
		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
		label := ""
		if idx < len(labels) {
			label = labels[idx]
		}
		fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
	}
	b.WriteString(`</g>` + "\n")
	fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
		(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
}
// writeSeriesPolyline draws one data series as a polyline, then marks its
// peak: a single-point series gets a plain dot, while longer series get a
// highlighted circle plus a small triangle above the peak. With `>=` in the
// scan, the LAST occurrence of the maximum value wins.
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
	if len(values) == 0 {
		return
	}
	// Build the space-separated "x,y x,y ..." point list.
	var points strings.Builder
	for idx, value := range values {
		if idx > 0 {
			points.WriteByte(' ')
		}
		x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
		points.WriteByte(',')
		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
	}
	fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
		points.String(), color)
	if len(values) == 1 {
		x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
		fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
		return
	}
	// Locate the peak (last index of the max value).
	peakIdx := 0
	peakValue := values[0]
	for idx, value := range values[1:] {
		if value >= peakValue {
			peakIdx = idx + 1
			peakValue = value
		}
	}
	x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
	y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
	fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
	// Downward-pointing triangle hovering just above the peak marker.
	fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
		x, y-10, x-5, y-18, x+5, y-18, color)
}
// writeLegend draws a legend grid beneath the plot area: up to four
// entries per row, each entry a colored swatch line followed by the
// (sanitized) series name.
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
	if !chartLegendVisible(len(series)) || len(series) == 0 {
		return
	}
	cols := len(series)
	if cols > 4 {
		cols = 4
	}
	cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
	baseY := layout.PlotBottom + 74
	for i, entry := range series {
		x := float64(layout.PlotLeft) + cellWidth*float64(i%cols) + 8
		y := float64(baseY + (i/cols)*24)
		fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
			x, y, x+28, y, entry.Color)
		fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
			x+38, y+4, sanitizeChartText(entry.Name))
	}
}
// writeTimelineIdleSpans shades the idle (non-active) timeline segments as
// translucent rectangles spanning the full plot height. Zero- or
// negative-width segments are skipped; drawn spans are at least 1px wide.
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
	if len(segments) == 0 {
		return
	}
	b.WriteString(`<g data-role="timeline-overlay">` + "\n")
	for _, seg := range segments {
		if seg.Active {
			continue
		}
		if !seg.End.After(seg.Start) {
			continue
		}
		left := chartXForTime(seg.Start, start, end, layout.PlotLeft, layout.PlotRight)
		right := chartXForTime(seg.End, start, end, layout.PlotLeft, layout.PlotRight)
		fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
			left, layout.PlotTop, math.Max(1, right-left), layout.PlotBottom-layout.PlotTop)
	}
	b.WriteString(`</g>` + "\n")
}
// writeTimelineBoundaries draws vertical separator lines at the interior
// boundaries between timeline segments (the start of every segment after
// the first, and the end of every segment before the last). Boundaries
// that round to the same pixel column are drawn only once.
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
	if len(segments) == 0 {
		return
	}
	seen := map[int]bool{}
	b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
	// drawAt emits one vertical line at the pixel column for ts, skipping
	// columns already drawn. Extracted because the original duplicated this
	// logic verbatim for segment starts and ends.
	drawAt := func(ts time.Time) {
		x := int(math.Round(chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)))
		if seen[x] {
			return
		}
		seen[x] = true
		fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
	}
	for i, segment := range segments {
		if i > 0 {
			drawAt(segment.Start)
		}
		if i < len(segments)-1 {
			drawAt(segment.End)
		}
	}
	b.WriteString(`</g>` + "\n")
}
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
if !end.After(start) {
return float64(left+right) / 2
}
if ts.Before(start) {
ts = start
}
if ts.After(end) {
ts = end
}
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
return float64(left) + ratio*float64(right-left)
}
func chartPointTime(times []time.Time, idx int) time.Time {
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
return times[idx].UTC()
}
if len(times) > 0 && !times[0].IsZero() {
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
}
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
}
// chartYForValue maps value onto the vertical pixel range of the plot,
// placing scale.Min at plotBottom and scale.Max at plotTop. A degenerate
// scale (Max <= Min) collapses to the vertical midpoint.
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
	if scale.Max <= scale.Min {
		return float64(plotTop+plotBottom) / 2
	}
	frac := (value - scale.Min) / (scale.Max - scale.Min)
	return float64(plotBottom) - frac*float64(plotBottom-plotTop)
}
// chartSeriesBounds returns a display range [lo, hi] covering values.
// A flat series is padded by 10% of its magnitude (or 1 when it sits at
// zero); a strictly positive lower bound gains 20%-of-span padding on both
// sides with the low end clamped at zero. Empty input yields [0, 1].
func chartSeriesBounds(values []float64) (float64, float64) {
	if len(values) == 0 {
		return 0, 1
	}
	lo, hi := values[0], values[0]
	for _, v := range values[1:] {
		if v < lo {
			lo = v
		}
		if v > hi {
			hi = v
		}
	}
	if lo == hi {
		if hi == 0 {
			return 0, 1
		}
		pad := math.Abs(hi) * 0.1
		if pad == 0 {
			pad = 1
		}
		lo, hi = lo-pad, hi+pad
	}
	if lo > 0 {
		pad := (hi - lo) * 0.2
		if pad == 0 {
			pad = hi * 0.1
		}
		lo -= pad
		if lo < 0 {
			lo = 0
		}
		hi += pad
	}
	return lo, hi
}
// chartNiceTicks produces axis tick positions spanning [min, max] using a
// "nice" step from the 1/2/5/10 progression, aiming for roughly target
// intervals. Ticks snap to the step grid and are rounded to 1e-9 to
// suppress accumulated floating-point error.
func chartNiceTicks(min, max float64, target int) []float64 {
	if min == max {
		max = min + 1
	}
	span := max - min
	step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
	for _, mult := range []float64{1, 2, 5, 10} {
		if span/(mult*step) <= float64(target)*1.5 {
			step *= mult
			break
		}
	}
	first := math.Floor(min/step) * step
	last := math.Ceil(max/step) * step
	var ticks []float64
	// The tiny step*0.001 slack keeps the final tick despite float drift.
	for v := first; v <= last+step*0.001; v += step {
		ticks = append(ticks, math.Round(v*1e9)/1e9)
	}
	return ticks
}
// valueClamp constrains value to the scale's [Min, Max] range.
func valueClamp(value float64, scale chartScale) float64 {
	switch {
	case value < scale.Min:
		return scale.Min
	case value > scale.Max:
		return scale.Max
	default:
		return value
	}
}
// chartStatsLabel formats a "min … avg … max …" summary for the datasets,
// or returns the empty string when all three aggregates are non-positive
// (nothing worth labeling).
func chartStatsLabel(datasets [][]float64) string {
	lo, mean, hi := globalStats(datasets)
	if lo <= 0 && mean <= 0 && hi <= 0 {
		return ""
	}
	return fmt.Sprintf("min %s avg %s max %s",
		chartLegendNumber(lo),
		chartLegendNumber(mean),
		chartLegendNumber(hi),
	)
}
// gpuDisplayLabel returns "GPU <idx> — <model>" when the model name is
// known, falling back to the bare index label otherwise.
func gpuDisplayLabel(idx int) string {
	name := gpuModelNameByIndex(idx)
	if name == "" {
		return fmt.Sprintf("GPU %d", idx)
	}
	return fmt.Sprintf("GPU %d — %s", idx, name)
}
// gpuModelNameByIndex returns the trimmed model name cached for the GPU at
// idx, or "" if unknown. The cache is refreshed from the platform layer at
// most once every 30 seconds; the refresh runs while holding the cache
// lock, so concurrent callers wait rather than stampede.
func gpuModelNameByIndex(idx int) string {
	gpuLabelCache.mu.Lock()
	defer gpuLabelCache.mu.Unlock()
	now := time.Now()
	if gpuLabelCache.byIndex == nil || now.Sub(gpuLabelCache.loadedAt) > 30*time.Second {
		gpuLabelCache.loadedAt = now
		gpuLabelCache.byIndex = loadGPUModelNames()
	}
	return strings.TrimSpace(gpuLabelCache.byIndex[idx])
}
// loadGPUModelNames queries the platform layer for NVIDIA GPUs and returns
// a map of GPU index to non-empty, whitespace-trimmed model name. Lookup
// errors yield an empty map (best-effort; callers treat missing as
// "unknown model").
func loadGPUModelNames() map[int]string {
	names := map[int]string{}
	gpus, err := platform.New().ListNvidiaGPUs()
	if err != nil {
		return names
	}
	for _, gpu := range gpus {
		if name := strings.TrimSpace(gpu.Name); name != "" {
			names[gpu.Index] = name
		}
	}
	return names
}

View File

@@ -9,13 +9,14 @@ import (
// jobState holds the output lines and completion status of an async job. // jobState holds the output lines and completion status of an async job.
type jobState struct { type jobState struct {
lines []string lines []string
done bool done bool
err string err string
mu sync.Mutex mu sync.Mutex
subs []chan string subs []chan string
cancel func() // optional cancel function; nil if job is not cancellable cancel func() // optional cancel function; nil if job is not cancellable
logPath string logPath string
serialPrefix string
} }
// abort cancels the job if it has a cancel function and is not yet done. // abort cancels the job if it has a cancel function and is not yet done.
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
if j.logPath != "" { if j.logPath != "" {
appendJobLog(j.logPath, line) appendJobLog(j.logPath, line)
} }
if j.serialPrefix != "" {
taskSerialWriteLine(j.serialPrefix + line)
}
for _, ch := range j.subs { for _, ch := range j.subs {
select { select {
case ch <- line: case ch <- line:
@@ -84,12 +88,12 @@ func (m *jobManager) create(id string) *jobState {
j := &jobState{} j := &jobState{}
m.jobs[id] = j m.jobs[id] = j
// Schedule cleanup after 30 minutes // Schedule cleanup after 30 minutes
go func() { goRecoverOnce("job cleanup", func() {
time.Sleep(30 * time.Minute) time.Sleep(30 * time.Minute)
m.mu.Lock() m.mu.Lock()
delete(m.jobs, id) delete(m.jobs, id)
m.mu.Unlock() m.mu.Unlock()
}() })
return j return j
} }
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
return j, ok return j, ok
} }
func newTaskJobState(logPath string) *jobState { func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
j := &jobState{logPath: logPath} j := &jobState{logPath: logPath}
if len(serialPrefix) > 0 {
j.serialPrefix = serialPrefix[0]
}
if logPath == "" { if logPath == "" {
return j return j
} }

View File

@@ -17,10 +17,10 @@ import (
// It supports multiple concurrent SAT tasks: a shared event window is open // It supports multiple concurrent SAT tasks: a shared event window is open
// while any SAT task is running, and flushed when all tasks complete. // while any SAT task is running, and flushed when all tasks complete.
type kmsgWatcher struct { type kmsgWatcher struct {
mu sync.Mutex mu sync.Mutex
activeCount int // number of in-flight SAT tasks activeCount int // number of in-flight SAT tasks
window *kmsgWindow window *kmsgWindow
statusDB *app.ComponentStatusDB statusDB *app.ComponentStatusDB
} }
type kmsgWindow struct { type kmsgWindow struct {
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
// start launches the background kmsg reading goroutine. // start launches the background kmsg reading goroutine.
func (w *kmsgWatcher) start() { func (w *kmsgWatcher) start() {
go w.run() goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
} }
func (w *kmsgWatcher) run() { func (w *kmsgWatcher) run() {
f, err := os.Open("/dev/kmsg") for {
if err != nil { f, err := os.Open("/dev/kmsg")
slog.Warn("kmsg watcher unavailable", "err", err) if err != nil {
return slog.Warn("kmsg watcher unavailable", "err", err)
} time.Sleep(30 * time.Second)
defer f.Close()
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
continue continue
} }
w.mu.Lock() // Best-effort seek to end so we only capture events from now forward.
if w.window != nil { _, _ = f.Seek(0, io.SeekEnd)
w.recordEvent(evt)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
continue
}
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
}
w.mu.Unlock()
} }
w.mu.Unlock() if err := scanner.Err(); err != nil {
} slog.Warn("kmsg watcher stopped", "err", err)
if err := scanner.Err(); err != nil { }
slog.Warn("kmsg watcher stopped", "err", err) _ = f.Close()
time.Sleep(2 * time.Second)
} }
} }
@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
if window == nil || len(window.events) == 0 { if window == nil || len(window.events) == 0 {
return return
} }
go w.flushWindow(window) goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
} }
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) { func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
@@ -229,7 +232,8 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests. // isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool { func isSATTarget(target string) bool {
switch target { switch target {
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage", case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress": "platform-stress":
return true return true

View File

@@ -8,6 +8,7 @@ import (
"path/filepath" "path/filepath"
"sort" "sort"
"strconv" "strconv"
"strings"
"time" "time"
"bee/audit/internal/platform" "bee/audit/internal/platform"
@@ -21,6 +22,13 @@ type MetricsDB struct {
db *sql.DB db *sql.DB
} }
func (m *MetricsDB) Close() error {
if m == nil || m.db == nil {
return nil
}
return m.db.Close()
}
// openMetricsDB opens (or creates) the metrics database at the given path. // openMetricsDB opens (or creates) the metrics database at the given path.
func openMetricsDB(path string) (*MetricsDB, error) { func openMetricsDB(path string) (*MetricsDB, error) {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
@@ -54,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
usage_pct REAL, usage_pct REAL,
mem_usage_pct REAL, mem_usage_pct REAL,
power_w REAL, power_w REAL,
clock_mhz REAL,
mem_clock_mhz REAL,
PRIMARY KEY (ts, gpu_index) PRIMARY KEY (ts, gpu_index)
); );
CREATE TABLE IF NOT EXISTS fan_metrics ( CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -70,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
PRIMARY KEY (ts, name) PRIMARY KEY (ts, name)
); );
`) `)
if err != nil {
return err
}
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
return err
}
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
}
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
rows, err := db.Query("PRAGMA table_info(" + table + ")")
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var cid int
var name, ctype string
var notNull, pk int
var dflt sql.NullString
if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
return err
}
if strings.EqualFold(name, column) {
return nil
}
}
if err := rows.Err(); err != nil {
return err
}
_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
return err return err
} }
@@ -91,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
} }
for _, g := range s.GPUs { for _, g := range s.GPUs {
_, err = tx.Exec( _, err = tx.Exec(
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`, `INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
) )
if err != nil { if err != nil {
return err return err
@@ -129,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil) return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
} }
// LoadBetween returns samples in chronological order within the given time window.
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
if m == nil {
return nil, nil
}
if start.IsZero() || end.IsZero() {
return nil, nil
}
if end.Before(start) {
start, end = end, start
}
return m.loadSamples(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
start.Unix(), end.Unix(),
)
}
// loadSamples reconstructs LiveMetricSample rows from the normalized tables. // loadSamples reconstructs LiveMetricSample rows from the normalized tables.
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) { func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(query, args...) rows, err := m.db.Query(query, args...)
@@ -163,7 +222,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
} }
gpuData := map[gpuKey]platform.GPUMetricRow{} gpuData := map[gpuKey]platform.GPUMetricRow{}
gRows, err := m.db.Query( gRows, err := m.db.Query(
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`, `SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
minTS, maxTS, minTS, maxTS,
) )
if err == nil { if err == nil {
@@ -171,7 +230,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
for gRows.Next() { for gRows.Next() {
var ts int64 var ts int64
var g platform.GPUMetricRow var g platform.GPUMetricRow
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil { if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
gpuData[gpuKey{ts, g.GPUIndex}] = g gpuData[gpuKey{ts, g.GPUIndex}] = g
} }
} }
@@ -283,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
func (m *MetricsDB) ExportCSV(w io.Writer) error { func (m *MetricsDB) ExportCSV(w io.Writer) error {
rows, err := m.db.Query(` rows, err := m.db.Query(`
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w, SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
g.clock_mhz, g.mem_clock_mhz
FROM sys_metrics s FROM sys_metrics s
LEFT JOIN gpu_metrics g ON g.ts = s.ts LEFT JOIN gpu_metrics g ON g.ts = s.ts
ORDER BY s.ts, g.gpu_index ORDER BY s.ts, g.gpu_index
@@ -294,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
defer rows.Close() defer rows.Close()
cw := csv.NewWriter(w) cw := csv.NewWriter(w)
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"}) _ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
for rows.Next() { for rows.Next() {
var ts int64 var ts int64
var cpu, mem, pwr float64 var cpu, mem, pwr float64
var gpuIdx sql.NullInt64 var gpuIdx sql.NullInt64
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64 var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil { if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
continue continue
} }
row := []string{ row := []string{
@@ -316,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64), strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64), strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64), strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
) )
} else { } else {
row = append(row, "", "", "", "", "") row = append(row, "", "", "", "", "", "", "")
} }
_ = cw.Write(row) _ = cw.Write(row)
} }
@@ -326,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
return cw.Error() return cw.Error()
} }
// Close closes the database.
func (m *MetricsDB) Close() { _ = m.db.Close() }
func nullFloat(v float64) sql.NullFloat64 { func nullFloat(v float64) sql.NullFloat64 {
return sql.NullFloat64{Float64: v, Valid: true} return sql.NullFloat64{Float64: v, Valid: true}
} }

View File

@@ -1,11 +1,13 @@
package webui package webui
import ( import (
"database/sql"
"path/filepath" "path/filepath"
"testing" "testing"
"time" "time"
"bee/audit/internal/platform" "bee/audit/internal/platform"
_ "modernc.org/sqlite"
) )
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) { func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
} }
} }
} }
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
path := filepath.Join(t.TempDir(), "metrics.db")
raw, err := sql.Open("sqlite", path)
if err != nil {
t.Fatalf("sql.Open: %v", err)
}
_, err = raw.Exec(`
CREATE TABLE gpu_metrics (
ts INTEGER NOT NULL,
gpu_index INTEGER NOT NULL,
temp_c REAL,
usage_pct REAL,
mem_usage_pct REAL,
power_w REAL,
PRIMARY KEY (ts, gpu_index)
);
CREATE TABLE sys_metrics (
ts INTEGER NOT NULL,
cpu_load_pct REAL,
mem_load_pct REAL,
power_w REAL,
PRIMARY KEY (ts)
);
CREATE TABLE fan_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
rpm REAL,
PRIMARY KEY (ts, name)
);
CREATE TABLE temp_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
grp TEXT NOT NULL,
celsius REAL,
PRIMARY KEY (ts, name)
);
`)
if err != nil {
t.Fatalf("create legacy schema: %v", err)
}
_ = raw.Close()
db, err := openMetricsDB(path)
if err != nil {
t.Fatalf("openMetricsDB: %v", err)
}
defer db.Close()
now := time.Unix(1_700_000_100, 0).UTC()
err = db.Write(platform.LiveMetricSample{
Timestamp: now,
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
},
})
if err != nil {
t.Fatalf("Write: %v", err)
}
samples, err := db.LoadAll()
if err != nil {
t.Fatalf("LoadAll: %v", err)
}
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
t.Fatalf("samples=%+v", samples)
}
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
t.Fatalf("ClockMHz=%v want 1410", got)
}
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
t.Fatalf("MemClockMHz=%v want 2600", got)
}
}
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
if err != nil {
t.Fatalf("openMetricsDB: %v", err)
}
defer db.Close()
base := time.Unix(1_700_000_000, 0).UTC()
for i := 0; i < 5; i++ {
if err := db.Write(platform.LiveMetricSample{
Timestamp: base.Add(time.Duration(i) * time.Minute),
CPULoadPct: float64(i),
}); err != nil {
t.Fatalf("Write(%d): %v", i, err)
}
}
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
if err != nil {
t.Fatalf("LoadBetween: %v", err)
}
if len(got) != 3 {
t.Fatalf("LoadBetween len=%d want 3", len(got))
}
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,41 @@
package webui
import (
"fmt"
"os"
"strings"
"time"
)
var taskSerialWriteLine = writeTaskSerialLine
func writeTaskSerialLine(line string) {
line = strings.TrimSpace(line)
if line == "" {
return
}
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
if err != nil {
continue
}
_, _ = f.WriteString(payload)
_ = f.Close()
return
}
}
func taskSerialPrefix(t *Task) string {
if t == nil {
return "[task] "
}
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
}
func taskSerialEvent(t *Task, event string) {
if t == nil {
return
}
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
}

View File

@@ -1,15 +1,19 @@
package webui package webui
import ( import (
"bufio"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"html" "html"
"io"
"log/slog" "log/slog"
"mime" "mime"
"net"
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"runtime/debug"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@@ -18,7 +22,6 @@ import (
"bee/audit/internal/app" "bee/audit/internal/app"
"bee/audit/internal/platform" "bee/audit/internal/platform"
"bee/audit/internal/runtimeenv" "bee/audit/internal/runtimeenv"
gocharts "github.com/go-analyze/charts"
"reanimator/chart/viewer" "reanimator/chart/viewer"
"reanimator/chart/web" "reanimator/chart/web"
) )
@@ -234,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
// SAT // SAT
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia")) mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress")) mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory")) mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage")) mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -247,6 +256,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress")) mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream) mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
// Tasks // Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList) mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -255,6 +265,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel) mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority) mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream) mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
// Services // Services
mux.HandleFunc("GET /api/services", h.handleAPIServicesList) mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
@@ -283,6 +294,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// GPU presence / tools // GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence) mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools) mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
// System // System
@@ -309,11 +321,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("GET /", h.handlePage) mux.HandleFunc("GET /", h.handlePage)
h.mux = mux h.mux = mux
return mux return recoverMiddleware(mux)
} }
func (h *handler) startMetricsCollector() { func (h *handler) startMetricsCollector() {
go func() { goRecoverLoop("metrics collector", 2*time.Second, func() {
ticker := time.NewTicker(metricsCollectInterval) ticker := time.NewTicker(metricsCollectInterval)
defer ticker.Stop() defer ticker.Stop()
for range ticker.C { for range ticker.C {
@@ -324,7 +336,7 @@ func (h *handler) startMetricsCollector() {
h.feedRings(sample) h.feedRings(sample)
h.setLatestMetric(sample) h.setLatestMetric(sample)
} }
}() })
} }
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) { func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -345,7 +357,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
// ListenAndServe starts the HTTP server. // ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error { func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts)) srv := &http.Server{
Addr: addr,
Handler: NewHandler(opts),
ReadHeaderTimeout: 5 * time.Second,
ReadTimeout: 30 * time.Second,
IdleTimeout: 2 * time.Minute,
}
return srv.ListenAndServe()
}
type trackingResponseWriter struct {
http.ResponseWriter
wroteHeader bool
}
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
w.wroteHeader = true
w.ResponseWriter.WriteHeader(statusCode)
}
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
w.wroteHeader = true
return w.ResponseWriter.Write(p)
}
func (w *trackingResponseWriter) Flush() {
w.wroteHeader = true
if f, ok := w.ResponseWriter.(http.Flusher); ok {
f.Flush()
}
}
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
h, ok := w.ResponseWriter.(http.Hijacker)
if !ok {
return nil, nil, fmt.Errorf("hijacking not supported")
}
return h.Hijack()
}
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
p, ok := w.ResponseWriter.(http.Pusher)
if !ok {
return http.ErrNotSupported
}
return p.Push(target, opts)
}
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
rf, ok := w.ResponseWriter.(io.ReaderFrom)
if !ok {
return io.Copy(w.ResponseWriter, r)
}
w.wroteHeader = true
return rf.ReadFrom(r)
}
func recoverMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tw := &trackingResponseWriter{ResponseWriter: w}
defer func() {
if rec := recover(); rec != nil {
slog.Error("http handler panic",
"method", r.Method,
"path", r.URL.Path,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
if !tw.wroteHeader {
http.Error(tw, "internal server error", http.StatusInternalServerError)
}
}
}()
next.ServeHTTP(tw, r)
})
} }
// ── Infrastructure handlers ────────────────────────────────────────────────── // ── Infrastructure handlers ──────────────────────────────────────────────────
@@ -475,13 +561,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
http.Error(w, "metrics database not available", http.StatusServiceUnavailable) http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
return return
} }
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path) samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
timeline := metricsTimelineSegments(samples, time.Now())
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
if !ok { if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable) http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return return
} }
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax) buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return
@@ -491,14 +608,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf) _, _ = w.Write(buf)
} }
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
return nil, nil, nil, "", nil, nil, false
}
return chartDataFromSamples(path, samples)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) { func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64 var datasets [][]float64
var names []string var names []string
@@ -578,18 +687,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
yMin = floatPtr(0) yMin = floatPtr(0)
yMax = autoMax120(datasets...) yMax = autoMax120(datasets...)
case path == "gpu-all-clock":
title = "GPU Core Clock"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-memclock":
title = "GPU Memory Clock"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
yMin, yMax = autoBounds120(datasets...)
case strings.HasPrefix(path, "gpu/"): case strings.HasPrefix(path, "gpu/"):
rest := strings.TrimPrefix(path, "gpu/") idx, sub, ok := parseGPUChartPath(path)
sub := "" if !ok {
if i := strings.LastIndex(rest, "-"); i > 0 { return nil, nil, nil, "", nil, nil, false
sub = rest[i+1:]
rest = rest[:i]
} }
idx := 0
fmt.Sscanf(rest, "%d", &idx)
switch sub { switch sub {
case "load": case "load":
title = fmt.Sprintf("GPU %d Load", idx) title = gpuDisplayLabel(idx) + " Load"
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct }) util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct }) mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil { if util == nil && mem == nil {
@@ -600,7 +715,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
yMin = floatPtr(0) yMin = floatPtr(0)
yMax = floatPtr(100) yMax = floatPtr(100)
case "temp": case "temp":
title = fmt.Sprintf("GPU %d Temperature", idx) title = gpuDisplayLabel(idx) + " Temperature"
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC }) temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil { if temp == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false
@@ -609,8 +724,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
names = []string{"Temp °C"} names = []string{"Temp °C"}
yMin = floatPtr(0) yMin = floatPtr(0)
yMax = autoMax120(temp) yMax = autoMax120(temp)
case "clock":
title = gpuDisplayLabel(idx) + " Core Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{clock}
names = []string{"Core Clock MHz"}
yMin, yMax = autoBounds120(clock)
case "memclock":
title = gpuDisplayLabel(idx) + " Memory Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{clock}
names = []string{"Memory Clock MHz"}
yMin, yMax = autoBounds120(clock)
default: default:
title = fmt.Sprintf("GPU %d Power", idx) title = gpuDisplayLabel(idx) + " Power"
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW }) power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil { if power == nil {
return nil, nil, nil, "", nil, nil, false return nil, nil, nil, "", nil, nil, false
@@ -627,6 +760,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0 return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
} }
// parseGPUChartPath splits a per-GPU chart path of the form "gpu/<index>" or
// "gpu/<index>-<sub>" (e.g. "gpu/3-load", "gpu/0-overview") into the GPU
// index and the chart sub-kind. It reports ok=false for anything that is not
// a well-formed GPU chart path.
//
// The index segment must consist solely of ASCII digits: with a bare
// fmt.Sscanf, inputs such as "gpu/5x-load" (trailing garbage), "gpu/ 5"
// (leading spaces) and "gpu/-1-load" (negative index) were accepted.
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
	rest, found := strings.CutPrefix(path, "gpu/")
	if !found || rest == "" {
		return 0, "", false
	}
	// "<index>-<sub>": the sub-kind follows the last dash. Requiring i > 0
	// keeps a leading dash from being treated as the separator.
	if i := strings.LastIndex(rest, "-"); i > 0 {
		sub = rest[i+1:]
		rest = rest[:i]
	}
	// Accept only a pure non-negative decimal index.
	for _, r := range rest {
		if r < '0' || r > '9' {
			return 0, "", false
		}
	}
	n, err := fmt.Sscanf(rest, "%d", &idx)
	if err != nil || n != 1 {
		return 0, "", false
	}
	return idx, sub, true
}
func sampleTimeLabels(samples []platform.LiveMetricSample) []string { func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
labels := make([]string, len(samples)) labels := make([]string, len(samples))
if len(samples) == 0 { if len(samples) == 0 {
@@ -719,7 +872,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
continue continue
} }
datasets = append(datasets, ds) datasets = append(datasets, ds)
names = append(names, fmt.Sprintf("GPU %d", idx)) names = append(names, gpuDisplayLabel(idx))
} }
return datasets, names return datasets, names
} }
@@ -852,64 +1005,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
return floatPtr(low), floatPtr(high) return floatPtr(low), floatPtr(high)
} }
// renderChartSVG renders a line chart SVG with a fixed Y-axis range. func gpuChartLabelIndices(total, target int) []int {
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) { if total <= 0 {
n := len(labels) return nil
if n == 0 {
n = 1
labels = []string{""}
} }
for i := range datasets { if total == 1 {
if len(datasets[i]) == 0 { return []int{0}
datasets[i] = make([]float64, n)
}
} }
// Append global min/avg/max to title. step := total / target
mn, avg, mx := globalStats(datasets) if step < 1 {
if mx > 0 { step = 1
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
title,
chartLegendNumber(mn),
chartLegendNumber(avg),
chartLegendNumber(mx),
)
} }
title = sanitizeChartText(title) var indices []int
names = sanitizeChartTexts(names) for i := 0; i < total; i += step {
sparse := sanitizeChartTexts(sparseLabels(labels, 6)) indices = append(indices, i)
}
if indices[len(indices)-1] != total-1 {
indices = append(indices, total-1)
}
return indices
}
opt := gocharts.NewLineChartOptionWithData(datasets) func chartCanvasHeightForPath(path string, seriesCount int) int {
opt.Title = gocharts.TitleOption{Text: title} height := chartCanvasHeight(seriesCount)
opt.XAxis.Labels = sparse if isGPUChartPath(path) {
opt.Legend = gocharts.LegendOption{SeriesNames: names} return height * 2
if chartLegendVisible(len(names)) {
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
opt.Legend.OverlayChart = gocharts.Ptr(false)
} else {
opt.Legend.Show = gocharts.Ptr(false)
}
opt.Symbol = gocharts.SymbolNone
// Right padding: reserve space for the MarkLine label (library recommendation).
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
if yMin != nil || yMax != nil {
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
} }
return height
}
// Add a single peak mark line on the series that holds the global maximum. func isGPUChartPath(path string) bool {
peakIdx, _ := globalPeakSeries(datasets) return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
}
p := gocharts.NewPainter(gocharts.PainterOptions{
OutputFormat: gocharts.ChartOutputSVG,
Width: 1400,
Height: chartCanvasHeight(len(names)),
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
if err := p.LineChart(opt); err != nil {
return nil, err
}
return p.Bytes()
} }
func chartLegendVisible(seriesCount int) bool { func chartLegendVisible(seriesCount int) bool {
@@ -923,30 +1049,6 @@ func chartCanvasHeight(seriesCount int) int {
return 288 return 288
} }
// chartYAxisOption builds the shared Y-axis configuration: optional fixed
// min/max bounds, eleven tick labels, and compact number formatting via
// chartYAxisNumber.
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
	opt := gocharts.YAxisOption{
		Min:            yMin,
		Max:            yMax,
		LabelCount:     11,
		ValueFormatter: chartYAxisNumber,
	}
	return opt
}
// globalPeakSeries reports which series holds the largest value seen across
// all datasets, together with that value. When no value exceeds zero
// (including empty input) it returns idx == -1 and peak == 0.
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
	idx = -1
	for seriesNo := range datasets {
		for _, sample := range datasets[seriesNo] {
			if sample <= peak {
				continue
			}
			peak = sample
			idx = seriesNo
		}
	}
	return idx, peak
}
// globalStats returns min, average, and max across all values in all datasets. // globalStats returns min, average, and max across all values in all datasets.
func globalStats(datasets [][]float64) (mn, avg, mx float64) { func globalStats(datasets [][]float64) (mn, avg, mx float64) {
var sum float64 var sum float64
@@ -986,21 +1088,6 @@ func sanitizeChartText(s string) string {
}, s)) }, s))
} }
// sanitizeChartTexts applies sanitizeChartText to every element and returns
// a new slice of the same length; the input is left untouched.
func sanitizeChartTexts(in []string) []string {
	out := make([]string, 0, len(in))
	for _, s := range in {
		out = append(out, sanitizeChartText(s))
	}
	return out
}
// safeIdx returns s[i], or 0 when i is past the end of s.
func safeIdx(s []float64, i int) float64 {
	if i >= len(s) {
		return 0
	}
	return s[i]
}
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) { func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
var datasets [][]float64 var datasets [][]float64
var names []string var names []string
@@ -1087,20 +1174,6 @@ func chartYAxisNumber(v float64) string {
return out return out
} }
// sparseLabels returns a copy of labels where only every step-th entry is
// kept and the rest are blanked, so the X axis shows roughly n tick labels.
// Blanked positions hold the empty string, preserving slice length and index
// alignment with the data points.
//
// n < 1 is treated as "keep every label" instead of panicking with a
// division by zero (the original computed len(labels) / n unconditionally).
func sparseLabels(labels []string, n int) []string {
	out := make([]string, len(labels))
	step := 1
	if n > 0 {
		if s := len(labels) / n; s > 1 {
			step = s
		}
	}
	for i, l := range labels {
		if i%step == 0 {
			out[i] = l
		}
	}
	return out
}
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
if h.metricsDB == nil { if h.metricsDB == nil {
http.Error(w, "metrics database not available", http.StatusServiceUnavailable) http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
@@ -1116,6 +1189,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) { func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store") w.Header().Set("Cache-Control", "no-store")
if strings.TrimSpace(h.opts.AuditPath) == "" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ready"))
return
}
if _, err := os.Stat(h.opts.AuditPath); err != nil { if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.WriteHeader(http.StatusServiceUnavailable) w.WriteHeader(http.StatusServiceUnavailable)
_, _ = w.Write([]byte("starting")) _, _ = w.Write([]byte("starting"))

View File

@@ -34,6 +34,49 @@ func TestChartLegendNumber(t *testing.T) {
} }
} }
// TestRecoverMiddlewareReturns500OnPanic verifies that a panic inside a
// wrapped handler becomes a 500 response containing "internal server error"
// instead of crashing the server.
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		panic("boom")
	}))
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/panic", nil)
	handler.ServeHTTP(rec, req)
	if rec.Code != http.StatusInternalServerError {
		t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
	}
	if !strings.Contains(rec.Body.String(), "internal server error") {
		t.Fatalf("body=%q", rec.Body.String())
	}
}
// TestRecoverMiddlewarePreservesStreamingInterfaces ensures the recover
// wrapper does not break SSE streaming: sseStart must be able to set the
// text/event-stream content type through the wrapped ResponseWriter and
// sseWrite's event/data frames must reach the client.
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
	handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if !sseStart(w) {
			return
		}
		if !sseWrite(w, "tick", "ok") {
			t.Fatal("expected sse write to succeed")
		}
	}))
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/stream", nil)
	handler.ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
		t.Fatalf("content-type=%q", got)
	}
	body := rec.Body.String()
	if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
		t.Fatalf("body=%q", body)
	}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) { func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{ samples := []platform.LiveMetricSample{
{ {
@@ -136,6 +179,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
} }
} }
// TestChartDataFromSamplesIncludesGPUClockCharts covers the "gpu-all-clock"
// aggregate chart: each GPU present in the samples must yield a named series
// (in stable index order) carrying its core-clock values.
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
	samples := []platform.LiveMetricSample{
		{
			Timestamp: time.Now().Add(-2 * time.Minute),
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, ClockMHz: 1400},
				{GPUIndex: 3, ClockMHz: 1500},
			},
		},
		{
			Timestamp: time.Now().Add(-1 * time.Minute),
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, ClockMHz: 1410},
				{GPUIndex: 3, ClockMHz: 1510},
			},
		},
	}
	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
	if !ok {
		t.Fatal("gpu-all-clock returned ok=false")
	}
	if title != "GPU Core Clock" {
		t.Fatalf("title=%q", title)
	}
	if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
		t.Fatalf("names=%v", names)
	}
	// Second series (GPU 3), second sample: the 1510 MHz reading.
	if got := datasets[1][1]; got != 1510 {
		t.Fatalf("GPU 3 core clock=%v want 1510", got)
	}
}
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) { func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0}) got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
want := []float64{0, 480, 480, 480, 510, 510} want := []float64{0, 480, 480, 480, 510, 510}
@@ -157,6 +233,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
if !strings.Contains(body, "el.dataset.loading === '1'") { if !strings.Contains(body, "el.dataset.loading === '1'") {
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body) t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
} }
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
}
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
}
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
}
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
}
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
}
} }
func TestChartLegendVisible(t *testing.T) { func TestChartLegendVisible(t *testing.T) {
@@ -199,6 +290,124 @@ func TestChartCanvasHeight(t *testing.T) {
} }
} }
// TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps checks that
// overlapping task windows merge into a single active segment and that the
// gaps before, between, and after them come back as idle segments covering
// the whole requested range.
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
	end := start.Add(10 * time.Minute)
	// taskWindow builds a completed task spanning [start+offsetStart, start+offsetEnd].
	taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
		s := start.Add(offsetStart)
		e := start.Add(offsetEnd)
		return Task{
			Name:      "task",
			Status:    TaskDone,
			StartedAt: &s,
			DoneAt:    &e,
		}
	}
	// Windows 1-3m and 2-5m overlap and should merge; 7-8m stands alone.
	segments := chartTimelineSegmentsForRange(start, end, end, []Task{
		taskWindow(1*time.Minute, 3*time.Minute),
		taskWindow(2*time.Minute, 5*time.Minute),
		taskWindow(7*time.Minute, 8*time.Minute),
	})
	if len(segments) != 5 {
		t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
	}
	wantActive := []bool{false, true, false, true, false}
	wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
	for i, segment := range segments {
		if segment.Active != wantActive[i] {
			t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
		}
		if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
			t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
		}
		if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
			t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
		}
	}
}
// TestRenderMetricChartSVGIncludesTimelineOverlay renders a small power chart
// with one idle and one active timeline segment and asserts the SVG output
// carries the timeline overlay group, the idle-shading opacity, and the
// chart title.
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
	labels := []string{"12:00", "12:01", "12:02"}
	times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
	svg, err := renderMetricChartSVG(
		"System Power",
		labels,
		times,
		[][]float64{{300, 320, 310}},
		[]string{"Power W"},
		floatPtr(0),
		floatPtr(400),
		360,
		[]chartTimelineSegment{
			{Start: start, End: start.Add(time.Minute), Active: false},
			{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
		},
	)
	if err != nil {
		t.Fatal(err)
	}
	body := string(svg)
	if !strings.Contains(body, `data-role="timeline-overlay"`) {
		t.Fatalf("svg missing timeline overlay: %s", body)
	}
	if !strings.Contains(body, `opacity="0.10"`) {
		t.Fatalf("svg missing idle overlay opacity: %s", body)
	}
	if !strings.Contains(body, `System Power`) {
		t.Fatalf("svg missing chart title: %s", body)
	}
}
// TestHandleMetricsChartSVGRendersCustomSVG drives the chart handler end to
// end against a real on-disk metrics DB and asserts the custom SVG renderer
// produced the response (timeline overlay, rounded polyline styling).
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
	dir := t.TempDir()
	db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
	if err != nil {
		t.Fatal(err)
	}
	t.Cleanup(func() { _ = db.db.Close() })
	// Seed three one-minute-apart power samples for the chart to draw.
	start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
	for i, sample := range []platform.LiveMetricSample{
		{Timestamp: start, PowerW: 300},
		{Timestamp: start.Add(time.Minute), PowerW: 320},
		{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
	} {
		if err := db.Write(sample); err != nil {
			t.Fatalf("write sample %d: %v", i, err)
		}
	}
	// Install one finished task so the timeline overlay has an active span;
	// restore the previous global queue contents on cleanup.
	globalQueue.mu.Lock()
	prevTasks := globalQueue.tasks
	s := start.Add(30 * time.Second)
	e := start.Add(90 * time.Second)
	globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = prevTasks
		globalQueue.mu.Unlock()
	})
	h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
	h.handleMetricsChartSVG(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	body := rec.Body.String()
	if !strings.Contains(body, `data-role="timeline-overlay"`) {
		t.Fatalf("custom svg response missing timeline overlay: %s", body)
	}
	if !strings.Contains(body, `stroke-linecap="round"`) {
		t.Fatalf("custom svg response missing custom polyline styling: %s", body)
	}
}
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) { func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0}) got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
want := []float64{4200, 4200, 4200, 4300, 4300} want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -212,21 +421,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
} }
} }
// TestChartYAxisOption pins the shared Y-axis configuration: the min/max
// pointers pass through unchanged, eleven tick labels are requested, and the
// value formatter abbreviates 1000 as "1к" (compact thousands suffix).
func TestChartYAxisOption(t *testing.T) {
	min := floatPtr(0)
	max := floatPtr(100)
	opt := chartYAxisOption(min, max)
	if opt.Min != min || opt.Max != max {
		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
	}
	if opt.LabelCount != 11 {
		t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
	}
	if got := opt.ValueFormatter(1000); got != "1к" {
		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
	}
}
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) { func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
r1 := newMetricsRing(4) r1 := newMetricsRing(4)
r2 := newMetricsRing(4) r2 := newMetricsRing(4)
@@ -335,7 +529,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) { if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
t.Fatalf("dashboard missing run audit button: %s", body) t.Fatalf("dashboard missing run audit button: %s", body)
} }
if strings.Contains(body, `No audit data`) { if strings.Contains(body, `No audit data`) {
@@ -343,6 +537,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
} }
} }
// TestReadyIsOKWhenAuditPathIsUnset confirms /api/ready reports "ready" when
// no audit path is configured, instead of failing the os.Stat probe on an
// empty path.
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
	handler := NewHandler(HandlerOptions{})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	if strings.TrimSpace(rec.Body.String()) != "ready" {
		t.Fatalf("body=%q want ready", rec.Body.String())
	}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) { func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")
@@ -365,7 +571,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
} }
} }
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) { func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
@@ -373,8 +579,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `id="task-log-overlay"`) { if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
t.Fatalf("tasks page missing log modal overlay: %s", body) t.Fatalf("tasks page missing task report hint: %s", body)
} }
if !strings.Contains(body, `_taskPageSize = 50`) { if !strings.Contains(body, `_taskPageSize = 50`) {
t.Fatalf("tasks page missing pagination size config: %s", body) t.Fatalf("tasks page missing pagination size config: %s", body)
@@ -409,37 +615,111 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
} }
} }
func TestTasksPageRendersScrollableLogModal(t *testing.T) { func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
dir := t.TempDir() handler := NewHandler(HandlerOptions{})
path := filepath.Join(dir, "audit.json")
exportDir := filepath.Join(dir, "export")
if err := os.MkdirAll(exportDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{
Title: "Bee Hardware Audit",
AuditPath: path,
ExportDir: exportDir,
})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
if rec.Code != http.StatusOK { if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `height:calc(100vh - 32px)`) { for _, needle := range []string{
t.Fatalf("tasks page missing bounded log modal height: %s", body) `href="/benchmark"`,
`id="benchmark-gpu-list"`,
`/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`,
`benchmark-run-nccl`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)
}
} }
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) { }
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
} }
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) { body := rec.Body.String()
t.Fatalf("tasks page missing scrollable log wrapper: %s", body) for _, needle := range []string{
`NVIDIA GPU Targeted Stress`,
`nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
// TestBurnPageRendersGoalBasedNVIDIACards asserts the /burn page contains the
// goal-based NVIDIA cards (max compute load via dcgmproftester, NCCL
// interconnect test), the GPU selection list, and the cross-link telling
// users that targeted_stress lives on the Validate page.
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
	handler := NewHandler(HandlerOptions{})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}
	body := rec.Body.String()
	for _, needle := range []string{
		`NVIDIA Max Compute Load`,
		`dcgmproftester`,
		`targeted_stress remain in <a href="/validate">Validate</a>`,
		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
		`id="burn-gpu-list"`,
	} {
		if !strings.Contains(body, needle) {
			t.Fatalf("burn page missing %q: %s", needle, body)
		}
	}
}
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
dir := t.TempDir()
exportDir := filepath.Join(dir, "export")
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
if err := os.MkdirAll(reportDir, 0755); err != nil {
t.Fatal(err)
}
reportPath := filepath.Join(reportDir, "report.html")
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
t.Fatal(err)
}
globalQueue.mu.Lock()
origTasks := globalQueue.tasks
globalQueue.tasks = []*Task{{
ID: "task-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskDone,
CreatedAt: time.Now(),
ArtifactsDir: reportDir,
ReportHTMLPath: reportPath,
}}
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = origTasks
globalQueue.mu.Unlock()
})
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `saved report`) {
t.Fatalf("task detail page missing saved report: %s", body)
}
if !strings.Contains(body, `Back to Tasks`) {
t.Fatalf("task detail page missing back link: %s", body)
} }
} }
@@ -564,3 +844,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body) t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
} }
} }
// TestDashboardRendersRuntimeHealthTable renders the dashboard against fixture
// runtime-health.json and component-status.json files and asserts the Runtime
// Health table shows every check row, the component statuses, the missing
// tool, the inactive service, and the per-component SAT failure details.
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "audit.json")
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	// Minimal audit snapshot so the dashboard renders instead of the
	// "run audit" empty state.
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
		t.Fatal(err)
	}
	// Runtime health fixture: PARTIAL overall, CUDA not ready, one missing
	// tool and one inactive service.
	health := `{
  "status":"PARTIAL",
  "checked_at":"2026-03-16T10:00:00Z",
  "export_dir":"/tmp/export",
  "driver_ready":true,
  "cuda_ready":false,
  "network_status":"PARTIAL",
  "issues":[
    {"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
    {"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
  ],
  "tools":[
    {"name":"dmidecode","ok":true},
    {"name":"nvidia-smi","ok":false}
  ],
  "services":[
    {"name":"bee-web","status":"active"},
    {"name":"bee-nvidia","status":"inactive"}
  ]
}`
	if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
		t.Fatal(err)
	}
	// Component status fixture: one entry per hardware class with mixed
	// OK/Warning/Critical results from SAT runs.
	componentStatus := `[
  {
    "component_key":"cpu:all",
    "status":"Warning",
    "error_summary":"cpu SAT: FAILED",
    "history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
  },
  {
    "component_key":"memory:all",
    "status":"OK",
    "history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
  },
  {
    "component_key":"storage:nvme0n1",
    "status":"Critical",
    "error_summary":"storage SAT: FAILED",
    "history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
  },
  {
    "component_key":"pcie:gpu:nvidia",
    "status":"Warning",
    "error_summary":"nvidia SAT: FAILED",
    "history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
  }
]`
	if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
		t.Fatal(err)
	}
	handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	body := rec.Body.String()
	for _, needle := range []string{
		`Runtime Health`,
		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
		`Export Directory`,
		`Network`,
		`NVIDIA/AMD Driver`,
		`CUDA / ROCm`,
		`Required Utilities`,
		`Bee Services`,
		`<td>CPU</td>`,
		`<td>Memory</td>`,
		`<td>Storage</td>`,
		`<td>GPU</td>`,
		`CUDA runtime is not ready for GPU SAT.`,
		`Missing: nvidia-smi`,
		`bee-nvidia=inactive`,
		`cpu SAT: FAILED`,
		`storage SAT: FAILED`,
		`sat:nvidia`,
	} {
		if !strings.Contains(body, needle) {
			t.Fatalf("dashboard missing %q: %s", needle, body)
		}
	}
}

View File

@@ -0,0 +1,42 @@
package webui
import (
"fmt"
"log/slog"
"runtime/debug"
"time"
)
// goRecoverLoop runs fn on a background goroutine and restarts it after
// restartDelay each time it exits by panicking. The loop ends for good once
// fn returns normally.
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
	go func() {
		for runRecoverable(name, fn) {
			if restartDelay > 0 {
				time.Sleep(restartDelay)
			}
		}
	}()
}
// goRecoverOnce runs fn on a background goroutine exactly once, logging and
// swallowing any panic instead of crashing the process.
func goRecoverOnce(name string, fn func()) {
	go func() {
		// Whether fn panicked is irrelevant here; the panic is already logged.
		_ = runRecoverable(name, fn)
	}()
}
// runRecoverable invokes fn synchronously and reports whether it panicked.
// A panic is recovered and logged with the component name and stack trace
// rather than propagating.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(r),
			"stack", string(debug.Stack()),
		)
	}()
	fn()
	return
}

View File

@@ -0,0 +1,85 @@
package webui
import (
	"fmt"
	"html"
	"net/http"
	"net/url"
	"os"
	"strings"
)
// handleTaskPage serves the per-task detail page at /tasks/{id}; unknown IDs
// get a 404.
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")
	task, ok := globalQueue.findByID(id)
	if !ok {
		http.NotFound(w, r)
		return
	}
	// Render from a point-in-time copy so a concurrently updating task does
	// not change mid-render. NOTE(review): assumes the Task value is safe to
	// shallow-copy without holding the queue lock — confirm Task carries no
	// lock or runner-mutated slice state.
	snapshot := *task
	body := renderTaskDetailPage(h.opts, snapshot)
	w.Header().Set("Cache-Control", "no-store")
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	_, _ = w.Write([]byte(body))
}
// renderTaskDetailPage builds the full HTML document for one task: a back
// link, either the saved report fragment or a fallback summary card, and —
// while the task is pending or running — a live log panel fed by the task's
// SSE stream that reloads the page once the task finishes.
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
	// Fall back to the task ID when the task has no human-readable name.
	title := task.Name
	if strings.TrimSpace(title) == "" {
		title = task.ID
	}
	var body strings.Builder
	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
	body.WriteString(`</div>`)
	// Prefer the pre-rendered report saved with the task's artifacts; only
	// synthesize a summary card when no report file is available.
	if report := loadTaskReportFragment(task); report != "" {
		body.WriteString(report)
	} else {
		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
		if strings.TrimSpace(task.ErrMsg) != "" {
			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
		}
		body.WriteString(`</div></div>`)
	}
	if task.Status == TaskRunning || task.Status == TaskPending {
		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
		body.WriteString(`</div></div>`)
		// The 'done' SSE event triggers a delayed reload so the report written
		// at completion replaces the live log view.
		// NOTE(review): task.ID is interpolated into a JS string using HTML
		// escaping only — fine for server-generated IDs; confirm IDs can never
		// contain quotes or backslashes.
		body.WriteString(`<script>
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log');
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
_taskDetailES.onerror = function(){ _taskDetailES.close(); };
</script>`)
	}
	return layoutHead(opts.Title+" — "+title) +
		layoutNav("tasks", opts.BuildLabel) +
		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
		body.String() +
		`</div></div></body></html>`
}
// loadTaskReportFragment reads the task's pre-rendered HTML report from disk.
// It returns the empty string when no report path is configured, the file
// cannot be read, or the file is empty.
func loadTaskReportFragment(task Task) string {
	if strings.TrimSpace(task.ReportHTMLPath) == "" {
		return ""
	}
	fragment, err := os.ReadFile(task.ReportHTMLPath)
	if err != nil {
		return ""
	}
	if len(fragment) == 0 {
		return ""
	}
	return string(fragment)
}
// taskArtifactDownloadLink builds the download URL for an artifact file that
// belongs to a task. It returns "" when absPath is blank.
//
// The path is query-escaped so that spaces, '&', '#', '%', or '+' in file
// names cannot truncate or corrupt the generated URL (the previous raw
// interpolation would break on such names).
func taskArtifactDownloadLink(task Task, absPath string) string {
	if strings.TrimSpace(absPath) == "" {
		return ""
	}
	return fmt.Sprintf(`/export/file?path=%s`, url.QueryEscape(absPath))
}

View File

@@ -0,0 +1,286 @@
package webui
import (
"encoding/json"
"fmt"
"html"
"os"
"path/filepath"
"sort"
"strings"
"time"
"bee/audit/internal/platform"
)
// taskReportMetricsDBPath is the metrics database path consulted when
// building task reports. It is a package-level indirection so tests can
// redirect it to a temporary database.
var taskReportMetricsDBPath = metricsDBPath

// taskReport is the machine-readable document persisted as report.json in a
// task's artifacts directory, summarising the run and its generated charts.
type taskReport struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Target      string            `json:"target"`
	Status      string            `json:"status"`
	CreatedAt   time.Time         `json:"created_at"`
	StartedAt   *time.Time        `json:"started_at,omitempty"`
	DoneAt      *time.Time        `json:"done_at,omitempty"`
	DurationSec int               `json:"duration_sec,omitempty"`
	Error       string            `json:"error,omitempty"`
	LogFile     string            `json:"log_file,omitempty"` // base name of the task log, relative to the artifacts dir
	Charts      []taskReportChart `json:"charts,omitempty"`
	GeneratedAt time.Time         `json:"generated_at"` // UTC timestamp of report generation
}
// taskReportChart names one chart file referenced by a task report.
type taskReportChart struct {
	Title string `json:"title"`
	File  string `json:"file"` // SVG file name, relative to the artifacts dir
}

// taskChartSpec pairs a dashboard chart path with its output SVG file name.
type taskChartSpec struct {
	Path string // chart identifier passed to the chart renderer
	File string // output file name within the artifacts dir
}

// taskDashboardChartSpecs lists the standard server- and GPU-level charts
// rendered into every task report, in display order.
var taskDashboardChartSpecs = []taskChartSpec{
	{Path: "server-load", File: "server-load.svg"},
	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
	{Path: "server-power", File: "server-power.svg"},
	{Path: "server-fans", File: "server-fans.svg"},
	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
}
// writeTaskReportArtifacts generates the on-disk report for a task: metric
// charts (SVG), a machine-readable report.json, and a self-contained
// report.html fragment, all inside t.ArtifactsDir. A nil task or a task
// without an artifacts directory is a silent no-op.
func writeTaskReportArtifacts(t *Task) error {
	if t == nil {
		return nil
	}
	ensureTaskReportPaths(t)
	if strings.TrimSpace(t.ArtifactsDir) == "" {
		return nil
	}
	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
		return err
	}
	// Chart the metrics sampled during the task's active window; a load
	// failure simply yields an empty sample set (charts are best-effort).
	start, end := taskTimeWindow(t)
	samples, _ := loadTaskMetricSamples(start, end)
	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
	// Embed the task log verbatim; an unreadable log leaves it empty.
	logText := ""
	if data, err := os.ReadFile(t.LogPath); err == nil {
		logText = string(data)
	}
	report := taskReport{
		ID:          t.ID,
		Name:        t.Name,
		Target:      t.Target,
		Status:      t.Status,
		CreatedAt:   t.CreatedAt,
		StartedAt:   t.StartedAt,
		DoneAt:      t.DoneAt,
		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
		Error:       t.ErrMsg,
		LogFile:     filepath.Base(t.LogPath),
		Charts:      charts,
		GeneratedAt: time.Now().UTC(),
	}
	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
		return err
	}
	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
}
// reportDoneTime returns the task's recorded completion time, or the current
// time when the task is nil or has no non-zero DoneAt.
func reportDoneTime(t *Task) time.Time {
	if t == nil || t.DoneAt == nil || t.DoneAt.IsZero() {
		return time.Now()
	}
	return *t.DoneAt
}
// taskTimeWindow derives the metric-sampling window for a task: from
// StartedAt (falling back to CreatedAt) until DoneAt (falling back to now),
// all in UTC. The end is clamped so it never precedes the start.
func taskTimeWindow(t *Task) (time.Time, time.Time) {
	now := time.Now().UTC()
	if t == nil {
		return now, now
	}
	start := t.CreatedAt.UTC()
	if s := t.StartedAt; s != nil && !s.IsZero() {
		start = s.UTC()
	}
	end := now
	if d := t.DoneAt; d != nil && !d.IsZero() {
		end = d.UTC()
	}
	if end.Before(start) {
		end = start
	}
	return start, end
}
// loadTaskMetricSamples opens the metrics database at taskReportMetricsDBPath
// and returns every sample recorded between start and end. The database is
// closed before returning.
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
	db, err := openMetricsDB(taskReportMetricsDBPath)
	if err != nil {
		return nil, err
	}
	defer db.Close()
	return db.LoadBetween(start, end)
}
// writeTaskCharts renders the standard dashboard charts plus one overview
// chart per GPU present in the samples. Each chart is written as an SVG file
// into dir and also returned inline (file name → SVG markup) so the HTML
// report can embed it without extra requests. Charts that fail to render or
// write are skipped silently. Returns (nil, nil) when there are no samples.
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
	if len(samples) == 0 {
		return nil, nil
	}
	// The whole task window is one continuous "active" timeline segment.
	timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
	var charts []taskReportChart
	inline := make(map[string]string)
	for _, spec := range taskDashboardChartSpecs {
		title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
		if !ok || len(svg) == 0 {
			continue
		}
		path := filepath.Join(dir, spec.File)
		if err := os.WriteFile(path, svg, 0644); err != nil {
			continue
		}
		charts = append(charts, taskReportChart{Title: title, File: spec.File})
		inline[spec.File] = string(svg)
	}
	// Per-GPU overview charts, in ascending GPU-index order.
	for _, idx := range taskGPUIndices(samples) {
		file := fmt.Sprintf("gpu-%d-overview.svg", idx)
		svg, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
		if err != nil || !ok || len(svg) == 0 {
			continue
		}
		path := filepath.Join(dir, file)
		if err := os.WriteFile(path, svg, 0644); err != nil {
			continue
		}
		charts = append(charts, taskReportChart{Title: gpuDisplayLabel(idx) + " Overview", File: file})
		inline[file] = string(svg)
	}
	return charts, inline
}
// renderTaskChartSVG renders one named dashboard chart from the given samples.
// It returns the chart title, the SVG bytes, and ok=false when the chart has
// no data for these samples or rendering fails.
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
	// chartDataFromSamples resolves the chart identifier into datasets,
	// series names, axis labels, title, and y-axis bounds.
	datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
	if !ok {
		return "", nil, false
	}
	buf, err := renderMetricChartSVG(
		title,
		labels,
		sampleTimes(samples),
		datasets,
		names,
		yMin,
		yMax,
		chartCanvasHeightForPath(path, len(names)),
		timeline,
	)
	if err != nil {
		return "", nil, false
	}
	return title, buf, true
}
// taskGPUIndices returns the sorted set of distinct GPU indices observed
// anywhere in the sample window.
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
	seen := make(map[int]struct{})
	var indices []int
	for _, sample := range samples {
		for _, gpu := range sample.GPUs {
			if _, dup := seen[gpu.GPUIndex]; dup {
				continue
			}
			seen[gpu.GPUIndex] = struct{}{}
			indices = append(indices, gpu.GPUIndex)
		}
	}
	sort.Ints(indices)
	return indices
}
// writeJSONFile marshals v as indented JSON and writes it to path with
// permissions 0644. Marshalling errors are returned before anything is
// written.
func writeJSONFile(path string, v any) error {
	payload, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, payload, 0644)
}
// renderTaskReportFragment builds the embeddable HTML fragment persisted as
// report.html: a header card with name/target/status/timings, the inlined
// chart SVGs (or an informational notice when none exist), and the full task
// log in a terminal block. charts maps chart file name → inline SVG markup.
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
	var b strings.Builder
	b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
	b.WriteString(`<div class="grid2">`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
	b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
	if strings.TrimSpace(report.Error) != "" {
		b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
	}
	b.WriteString(`</div></div>`)
	// Timing summary: StartedAt falls back to CreatedAt; DoneAt has no
	// fallback and renders "n/a" when unset.
	b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
	b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
	b.WriteString(`</div></div></div>`)
	if len(report.Charts) > 0 {
		// SVG markup is inlined as-is; it is generated by this package, not
		// user input, so it is not HTML-escaped.
		b.WriteString(`<div class="grid2">`)
		for _, chart := range report.Charts {
			b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
			b.WriteString(charts[chart.File])
			b.WriteString(`</div></div>`)
		}
		b.WriteString(`</div>`)
	} else {
		b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
	}
	b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
	b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
	b.WriteString(`</div></div>`)
	return b.String()
}
// renderTaskStatusBadge maps a task status string to a coloured badge <span>.
// Unknown or empty statuses render as the neutral "unknown" badge.
func renderTaskStatusBadge(status string) string {
	var className string
	switch status {
	case TaskRunning, TaskDone:
		className = "badge-ok"
	case TaskFailed:
		className = "badge-err"
	case TaskPending, TaskCancelled:
		className = "badge-unknown"
	default:
		className = "badge-unknown"
	}
	label := strings.TrimSpace(status)
	if label == "" {
		label = "unknown"
	}
	return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
}
func formatTaskTime(ts *time.Time, fallback time.Time) string {
if ts != nil && !ts.IsZero() {
return ts.Local().Format("2006-01-02 15:04:05")
}
if !fallback.IsZero() {
return fallback.Local().Format("2006-01-02 15:04:05")
}
return "n/a"
}
// formatTaskDuration renders a second count as "Ns", "Nm SSs", or
// "Nh MMm SSs"; non-positive values render as "n/a".
func formatTaskDuration(sec int) string {
	switch {
	case sec <= 0:
		return "n/a"
	case sec < 60:
		return fmt.Sprintf("%ds", sec)
	case sec < 3600:
		return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
	default:
		return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
	}
}

View File

@@ -4,10 +4,12 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"log/slog"
"net/http" "net/http"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"runtime/debug"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@@ -28,22 +30,29 @@ const (
// taskNames maps target → human-readable name for validate (SAT) runs. // taskNames maps target → human-readable name for validate (SAT) runs.
var taskNames = map[string]string{ var taskNames = map[string]string{
"nvidia": "NVIDIA SAT", "nvidia": "NVIDIA SAT",
"nvidia-stress": "NVIDIA GPU Stress", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"memory": "Memory SAT", "nvidia-benchmark": "NVIDIA Benchmark",
"storage": "Storage SAT", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"cpu": "CPU SAT", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"amd": "AMD GPU SAT", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
"amd-mem": "AMD GPU MEM Integrity", "nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"amd-bandwidth": "AMD GPU MEM Bandwidth", "nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"amd-stress": "AMD GPU Burn-in", "nvidia-stress": "NVIDIA GPU Stress",
"memory-stress": "Memory Burn-in", "memory": "Memory SAT",
"sat-stress": "SAT Stress (stressapptest)", "storage": "Storage SAT",
"platform-stress": "Platform Thermal Cycling", "cpu": "CPU SAT",
"audit": "Audit", "amd": "AMD GPU SAT",
"support-bundle": "Support Bundle", "amd-mem": "AMD GPU MEM Integrity",
"install": "Install to Disk", "amd-bandwidth": "AMD GPU MEM Bandwidth",
"install-to-ram": "Install to RAM", "amd-stress": "AMD GPU Burn-in",
"memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
} }
// burnNames maps target → human-readable name when a burn profile is set. // burnNames maps target → human-readable name when a burn profile is set.
@@ -83,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {
// Task represents one unit of work in the queue. // Task represents one unit of work in the queue.
type Task struct { type Task struct {
ID string `json:"id"` ID string `json:"id"`
Name string `json:"name"` Name string `json:"name"`
Target string `json:"target"` Target string `json:"target"`
Priority int `json:"priority"` Priority int `json:"priority"`
Status string `json:"status"` Status string `json:"status"`
CreatedAt time.Time `json:"created_at"` CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"` StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"` DoneAt *time.Time `json:"done_at,omitempty"`
ElapsedSec int `json:"elapsed_sec,omitempty"` ElapsedSec int `json:"elapsed_sec,omitempty"`
ErrMsg string `json:"error,omitempty"` ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"` LogPath string `json:"log_path,omitempty"`
ArtifactsDir string `json:"artifacts_dir,omitempty"`
ReportJSONPath string `json:"report_json_path,omitempty"`
ReportHTMLPath string `json:"report_html_path,omitempty"`
// runtime fields (not serialised) // runtime fields (not serialised)
job *jobState job *jobState
@@ -106,67 +118,81 @@ type taskParams struct {
DiagLevel int `json:"diag_level,omitempty"` DiagLevel int `json:"diag_level,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
SizeMB int `json:"size_mb,omitempty"`
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"` BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"`
DisplayName string `json:"display_name,omitempty"` DisplayName string `json:"display_name,omitempty"`
Device string `json:"device,omitempty"` // for install Device string `json:"device,omitempty"` // for install
PlatformComponents []string `json:"platform_components,omitempty"` PlatformComponents []string `json:"platform_components,omitempty"`
} }
type persistedTask struct { type persistedTask struct {
ID string `json:"id"` ID string `json:"id"`
Name string `json:"name"` Name string `json:"name"`
Target string `json:"target"` Target string `json:"target"`
Priority int `json:"priority"` Priority int `json:"priority"`
Status string `json:"status"` Status string `json:"status"`
CreatedAt time.Time `json:"created_at"` CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"` StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"` DoneAt *time.Time `json:"done_at,omitempty"`
ErrMsg string `json:"error,omitempty"` ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"` LogPath string `json:"log_path,omitempty"`
Params taskParams `json:"params,omitempty"` ArtifactsDir string `json:"artifacts_dir,omitempty"`
ReportJSONPath string `json:"report_json_path,omitempty"`
ReportHTMLPath string `json:"report_html_path,omitempty"`
Params taskParams `json:"params,omitempty"`
} }
type burnPreset struct { type burnPreset struct {
NvidiaDiag int
DurationSec int DurationSec int
} }
func resolveBurnPreset(profile string) burnPreset { func resolveBurnPreset(profile string) burnPreset {
switch profile { switch profile {
case "overnight": case "overnight":
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60} return burnPreset{DurationSec: 8 * 60 * 60}
case "acceptance": case "acceptance":
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60} return burnPreset{DurationSec: 60 * 60}
default: default:
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60} return burnPreset{DurationSec: 5 * 60}
} }
} }
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
{LoadSec: 100, IdleSec: 10},
{LoadSec: 145, IdleSec: 15},
{LoadSec: 190, IdleSec: 20},
{LoadSec: 235, IdleSec: 25},
{LoadSec: 280, IdleSec: 30},
{LoadSec: 325, IdleSec: 35},
{LoadSec: 370, IdleSec: 40},
{LoadSec: 415, IdleSec: 45},
{LoadSec: 460, IdleSec: 50},
{LoadSec: 510, IdleSec: 0},
}
switch profile { switch profile {
case "overnight": case "overnight":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
{LoadSec: 600, IdleSec: 120}, for range 8 {
{LoadSec: 600, IdleSec: 60}, cycles = append(cycles, acceptanceCycles...)
{LoadSec: 600, IdleSec: 30}, }
{LoadSec: 600, IdleSec: 120}, return platform.PlatformStressOptions{Cycles: cycles}
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
}}
case "acceptance": case "acceptance":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: acceptanceCycles}
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
}}
default: // smoke default: // smoke
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 90, IdleSec: 60}, {LoadSec: 85, IdleSec: 5},
{LoadSec: 90, IdleSec: 30}, {LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
}} }}
} }
} }
@@ -232,6 +258,7 @@ func (q *taskQueue) enqueue(t *Task) {
q.prune() q.prune()
q.persistLocked() q.persistLocked()
q.mu.Unlock() q.mu.Unlock()
taskSerialEvent(t, "queued")
select { select {
case q.trigger <- struct{}{}: case q.trigger <- struct{}{}:
default: default:
@@ -377,7 +404,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
if !q.started { if !q.started {
q.loadLocked() q.loadLocked()
q.started = true q.started = true
go q.worker() goRecoverLoop("task worker", 2*time.Second, q.worker)
} }
hasPending := q.nextPending() != nil hasPending := q.nextPending() != nil
q.mu.Unlock() q.mu.Unlock()
@@ -392,78 +419,115 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
func (q *taskQueue) worker() { func (q *taskQueue) worker() {
for { for {
<-q.trigger <-q.trigger
setCPUGovernor("performance") func() {
setCPUGovernor("performance")
defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel. // Drain all pending tasks and start them in parallel.
q.mu.Lock() q.mu.Lock()
var batch []*Task var batch []*Task
for { for {
t := q.nextPending() t := q.nextPending()
if t == nil { if t == nil {
break break
}
now := time.Now()
t.Status = TaskRunning
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
t.job = j
batch = append(batch, t)
} }
now := time.Now() if len(batch) > 0 {
t.Status = TaskRunning q.persistLocked()
t.StartedAt = &now }
t.DoneAt = nil q.mu.Unlock()
t.ErrMsg = ""
j := newTaskJobState(t.LogPath)
t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked()
}
q.mu.Unlock()
var wg sync.WaitGroup var wg sync.WaitGroup
for _, t := range batch { for _, t := range batch {
t := t t := t
j := t.job j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background()) taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel j.cancel = taskCancel
wg.Add(1) wg.Add(1)
go func() { goRecoverOnce("task "+t.Target, func() {
defer wg.Done() defer wg.Done()
defer taskCancel()
if q.kmsgWatcher != nil && isSATTarget(t.Target) { q.executeTask(t, j, taskCtx)
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target) })
} }
wg.Wait()
q.runTask(t, j, taskCtx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
if len(batch) > 0 {
q.mu.Lock() q.mu.Lock()
now2 := time.Now() q.prune()
t.DoneAt = &now2
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
}
}
q.persistLocked() q.persistLocked()
q.mu.Unlock() q.mu.Unlock()
}() }
} }()
wg.Wait()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}
setCPUGovernor("powersave")
} }
} }
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
startedKmsgWatch := false
defer q.finalizeTaskRun(t, j)
defer func() {
if startedKmsgWatch && q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
}()
defer func() {
if rec := recover(); rec != nil {
msg := fmt.Sprintf("task panic: %v", rec)
slog.Error("task panic",
"task_id", t.ID,
"target", t.Target,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
j.append("ERROR: " + msg)
j.finish(msg)
}
}()
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
startedKmsgWatch = true
}
q.runTask(t, j, ctx)
}
func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
q.mu.Lock()
now := time.Now()
t.DoneAt = &now
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
t.ErrMsg = ""
}
}
q.finalizeTaskArtifactPathsLocked(t)
q.persistLocked()
q.mu.Unlock()
if err := writeTaskReportArtifacts(t); err != nil {
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
}
if t.ErrMsg != "" {
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
return
}
taskSerialEvent(t, "finished with status="+t.Status)
}
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files. // setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
// Silently ignores errors (e.g. when cpufreq is not available). // Silently ignores errors (e.g. when cpufreq is not available).
func setCPUGovernor(governor string) { func setCPUGovernor(governor string) {
@@ -502,9 +566,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break break
} }
diagLevel := t.params.DiagLevel diagLevel := t.params.DiagLevel
if t.params.BurnProfile != "" && diagLevel <= 0 {
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 { if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions( result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append, ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -517,6 +578,78 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
} else { } else {
archive, err = a.RunNvidiaAcceptancePack("", j.append) archive, err = a.RunNvidiaAcceptancePack("", j.append)
} }
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -731,6 +864,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
now := time.Now() now := time.Now()
t.DoneAt = &now t.DoneAt = &now
globalQueue.persistLocked() globalQueue.persistLocked()
taskSerialEvent(t, "finished with status="+t.Status)
writeJSON(w, map[string]string{"status": "cancelled"}) writeJSON(w, map[string]string{"status": "cancelled"})
case TaskRunning: case TaskRunning:
if t.job != nil { if t.job != nil {
@@ -740,6 +874,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
now := time.Now() now := time.Now()
t.DoneAt = &now t.DoneAt = &now
globalQueue.persistLocked() globalQueue.persistLocked()
taskSerialEvent(t, "finished with status="+t.Status)
writeJSON(w, map[string]string{"status": "cancelled"}) writeJSON(w, map[string]string{"status": "cancelled"})
default: default:
writeError(w, http.StatusConflict, "task is not running or pending") writeError(w, http.StatusConflict, "task is not running or pending")
@@ -780,6 +915,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
case TaskPending: case TaskPending:
t.Status = TaskCancelled t.Status = TaskCancelled
t.DoneAt = &now t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
n++ n++
case TaskRunning: case TaskRunning:
if t.job != nil { if t.job != nil {
@@ -787,6 +923,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
} }
t.Status = TaskCancelled t.Status = TaskCancelled
t.DoneAt = &now t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
n++ n++
} }
} }
@@ -805,6 +942,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
case TaskPending: case TaskPending:
t.Status = TaskCancelled t.Status = TaskCancelled
t.DoneAt = &now t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
cancelled++ cancelled++
case TaskRunning: case TaskRunning:
if t.job != nil { if t.job != nil {
@@ -812,6 +950,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
} }
t.Status = TaskCancelled t.Status = TaskCancelled
t.DoneAt = &now t.DoneAt = &now
taskSerialEvent(t, "finished with status="+t.Status)
cancelled++ cancelled++
} }
} }
@@ -875,10 +1014,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
} }
func (q *taskQueue) assignTaskLogPathLocked(t *Task) { func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
if t.LogPath != "" || q.logsDir == "" || t.ID == "" { if q.logsDir == "" || t.ID == "" {
return return
} }
t.LogPath = filepath.Join(q.logsDir, t.ID+".log") q.ensureTaskArtifactPathsLocked(t)
} }
func (q *taskQueue) loadLocked() { func (q *taskQueue) loadLocked() {
@@ -895,17 +1034,20 @@ func (q *taskQueue) loadLocked() {
} }
for _, pt := range persisted { for _, pt := range persisted {
t := &Task{ t := &Task{
ID: pt.ID, ID: pt.ID,
Name: pt.Name, Name: pt.Name,
Target: pt.Target, Target: pt.Target,
Priority: pt.Priority, Priority: pt.Priority,
Status: pt.Status, Status: pt.Status,
CreatedAt: pt.CreatedAt, CreatedAt: pt.CreatedAt,
StartedAt: pt.StartedAt, StartedAt: pt.StartedAt,
DoneAt: pt.DoneAt, DoneAt: pt.DoneAt,
ErrMsg: pt.ErrMsg, ErrMsg: pt.ErrMsg,
LogPath: pt.LogPath, LogPath: pt.LogPath,
params: pt.Params, ArtifactsDir: pt.ArtifactsDir,
ReportJSONPath: pt.ReportJSONPath,
ReportHTMLPath: pt.ReportHTMLPath,
params: pt.Params,
} }
q.assignTaskLogPathLocked(t) q.assignTaskLogPathLocked(t)
if t.Status == TaskRunning { if t.Status == TaskRunning {
@@ -936,17 +1078,20 @@ func (q *taskQueue) persistLocked() {
state := make([]persistedTask, 0, len(q.tasks)) state := make([]persistedTask, 0, len(q.tasks))
for _, t := range q.tasks { for _, t := range q.tasks {
state = append(state, persistedTask{ state = append(state, persistedTask{
ID: t.ID, ID: t.ID,
Name: t.Name, Name: t.Name,
Target: t.Target, Target: t.Target,
Priority: t.Priority, Priority: t.Priority,
Status: t.Status, Status: t.Status,
CreatedAt: t.CreatedAt, CreatedAt: t.CreatedAt,
StartedAt: t.StartedAt, StartedAt: t.StartedAt,
DoneAt: t.DoneAt, DoneAt: t.DoneAt,
ErrMsg: t.ErrMsg, ErrMsg: t.ErrMsg,
LogPath: t.LogPath, LogPath: t.LogPath,
Params: t.params, ArtifactsDir: t.ArtifactsDir,
ReportJSONPath: t.ReportJSONPath,
ReportHTMLPath: t.ReportHTMLPath,
Params: t.params,
}) })
} }
data, err := json.MarshalIndent(state, "", " ") data, err := json.MarshalIndent(state, "", " ")
@@ -977,3 +1122,88 @@ func taskElapsedSec(t *Task, now time.Time) int {
} }
return int(end.Sub(start).Round(time.Second) / time.Second) return int(end.Sub(start).Round(time.Second) / time.Second)
} }
func taskFolderStatus(status string) string {
status = strings.TrimSpace(strings.ToLower(status))
switch status {
case TaskRunning, TaskDone, TaskFailed, TaskCancelled:
return status
default:
return TaskPending
}
}
func sanitizeTaskFolderPart(s string) string {
s = strings.TrimSpace(strings.ToLower(s))
if s == "" {
return "task"
}
var b strings.Builder
lastDash := false
for _, r := range s {
isAlnum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
if isAlnum {
b.WriteRune(r)
lastDash = false
continue
}
if !lastDash {
b.WriteByte('-')
lastDash = true
}
}
out := strings.Trim(b.String(), "-")
if out == "" {
return "task"
}
return out
}
func taskArtifactsDir(root string, t *Task, status string) string {
if strings.TrimSpace(root) == "" || t == nil {
return ""
}
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
}
func ensureTaskReportPaths(t *Task) {
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
return
}
if t.LogPath == "" || filepath.Base(t.LogPath) == "task.log" {
t.LogPath = filepath.Join(t.ArtifactsDir, "task.log")
}
t.ReportJSONPath = filepath.Join(t.ArtifactsDir, "report.json")
t.ReportHTMLPath = filepath.Join(t.ArtifactsDir, "report.html")
}
func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
return
}
if strings.TrimSpace(t.ArtifactsDir) == "" {
t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
}
if t.ArtifactsDir != "" {
_ = os.MkdirAll(t.ArtifactsDir, 0755)
}
ensureTaskReportPaths(t)
}
func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
return
}
q.ensureTaskArtifactPathsLocked(t)
dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
if dstDir == "" {
return
}
if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
if _, err := os.Stat(dstDir); err != nil {
_ = os.Rename(t.ArtifactsDir, dstDir)
}
t.ArtifactsDir = dstDir
}
ensureTaskReportPaths(t)
}

View File

@@ -2,6 +2,7 @@ package webui
import ( import (
"context" "context"
"encoding/json"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
@@ -12,6 +13,7 @@ import (
"time" "time"
"bee/audit/internal/app" "bee/audit/internal/app"
"bee/audit/internal/platform"
) )
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) { func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
@@ -248,15 +250,133 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String()) t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
} }
// TestFinalizeTaskRunCreatesReportFolderAndArtifacts verifies that finishing
// a running task (a) flips its status to done, (b) renames the artifacts
// folder to the "_done"-suffixed layout, and (c) writes report.json and
// report.html, with chart data sourced from metrics-DB samples recorded
// during the run window.
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
	dir := t.TempDir()
	metricsPath := filepath.Join(dir, "metrics.db")
	// Swap the package-level metrics DB path for a temp file; restore on cleanup.
	prevMetricsPath := taskReportMetricsDBPath
	taskReportMetricsDBPath = metricsPath
	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
	db, err := openMetricsDB(metricsPath)
	if err != nil {
		t.Fatalf("openMetricsDB: %v", err)
	}
	// Seed one live-metric sample 45s in the past so it falls inside the
	// task's run window (task started 90s ago, see below) and report
	// generation has something to chart.
	base := time.Now().UTC().Add(-45 * time.Second)
	if err := db.Write(platform.LiveMetricSample{
		Timestamp:  base,
		CPULoadPct: 42,
		MemLoadPct: 35,
		PowerW:     510,
	}); err != nil {
		t.Fatalf("Write: %v", err)
	}
	_ = db.Close()
	q := &taskQueue{
		statePath: filepath.Join(dir, "tasks-state.json"),
		logsDir:   filepath.Join(dir, "tasks"),
		trigger:   make(chan struct{}, 1),
	}
	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
		t.Fatal(err)
	}
	started := time.Now().UTC().Add(-90 * time.Second)
	task := &Task{
		ID:        "task-1",
		Name:      "CPU SAT",
		Target:    "cpu",
		Status:    TaskRunning,
		CreatedAt: started.Add(-10 * time.Second),
		StartedAt: &started,
	}
	q.assignTaskLogPathLocked(task)
	appendJobLog(task.LogPath, "line-1")
	// Simulate a job that completed without error, then finalize.
	job := newTaskJobState(task.LogPath)
	job.finish("")
	q.finalizeTaskRun(task, job)
	if task.Status != TaskDone {
		t.Fatalf("status=%q want %q", task.Status, TaskDone)
	}
	// The artifacts directory must carry the final-status suffix.
	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
	}
	if _, err := os.Stat(task.ReportJSONPath); err != nil {
		t.Fatalf("report json: %v", err)
	}
	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
		t.Fatalf("report html: %v", err)
	}
	// report.json must round-trip, reference this task, and contain charts.
	var report taskReport
	data, err := os.ReadFile(task.ReportJSONPath)
	if err != nil {
		t.Fatalf("ReadFile(report.json): %v", err)
	}
	if err := json.Unmarshal(data, &report); err != nil {
		t.Fatalf("Unmarshal(report.json): %v", err)
	}
	if report.ID != task.ID || report.Status != TaskDone {
		t.Fatalf("report=%+v", report)
	}
	if len(report.Charts) == 0 {
		t.Fatalf("expected charts in report, got none")
	}
}
// TestTaskLifecycleMirrorsToSerialConsole verifies that task lifecycle
// events and job log lines are mirrored to the serial console through the
// taskSerialWriteLine hook: after enqueue → run → finalize, the captured
// serial output contains the lifecycle markers and the job's own log lines.
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
	// Capture serial output by swapping the package-level write hook;
	// restore the original on cleanup.
	var lines []string
	prev := taskSerialWriteLine
	taskSerialWriteLine = func(line string) { lines = append(lines, line) }
	t.Cleanup(func() { taskSerialWriteLine = prev })
	dir := t.TempDir()
	q := &taskQueue{
		statePath: filepath.Join(dir, "tasks-state.json"),
		logsDir:   filepath.Join(dir, "tasks"),
		trigger:   make(chan struct{}, 1),
	}
	task := &Task{
		ID:        "task-serial-1",
		Name:      "CPU SAT",
		Target:    "cpu",
		Status:    TaskPending,
		CreatedAt: time.Now().UTC(),
	}
	q.enqueue(task)
	// Move the task to running by hand, then drive a job that logs two lines.
	started := time.Now().UTC()
	task.Status = TaskRunning
	task.StartedAt = &started
	// The job state carries a per-task serial prefix so interleaved tasks
	// remain distinguishable on the console.
	job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
	job.append("Starting CPU SAT...")
	job.append("CPU stress duration: 60s")
	job.finish("")
	q.finalizeTaskRun(task, job)
	// Every expected marker — lifecycle ("queued", "finished with
	// status=done") and the job's own lines — must appear in the mirror.
	joined := strings.Join(lines, "\n")
	for _, needle := range []string{
		"queued",
		"Starting CPU SAT...",
		"CPU stress duration: 60s",
		"finished with status=done",
	} {
		if !strings.Contains(joined, needle) {
			t.Fatalf("serial mirror missing %q in %q", needle, joined)
		}
	}
}
func TestResolveBurnPreset(t *testing.T) { func TestResolveBurnPreset(t *testing.T) {
tests := []struct { tests := []struct {
profile string profile string
want burnPreset want burnPreset
}{ }{
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}}, {profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}}, {profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "", want: burnPreset{DurationSec: 5 * 60}},
} }
for _, tc := range tests { for _, tc := range tests {
if got := resolveBurnPreset(tc.profile); got != tc.want { if got := resolveBurnPreset(tc.profile); got != tc.want {
@@ -467,3 +587,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
t.Fatalf("unexpected error: %q", j.err) t.Fatalf("unexpected error: %q", j.err)
} }
} }
// TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow verifies that a
// panic raised inside a task runner is recovered by executeTask: the task is
// marked failed with a "task panic: ..." message, DoneAt is stamped, the job
// records the same error, and the kmsg capture window opened for the run is
// closed again (no active captures, no lingering window).
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
	dir := t.TempDir()
	q := &taskQueue{
		opts:        &HandlerOptions{App: &app.App{}},
		statePath:   filepath.Join(dir, "tasks-state.json"),
		logsDir:     filepath.Join(dir, "tasks"),
		kmsgWatcher: newKmsgWatcher(nil),
	}
	tk := &Task{
		ID:        "cpu-panic-1",
		Name:      "CPU SAT",
		Target:    "cpu",
		Status:    TaskRunning,
		CreatedAt: time.Now(),
	}
	j := &jobState{}
	// Swap the CPU runner hook for one that panics; restore it afterwards.
	orig := runCPUAcceptancePackCtx
	runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
		panic("boom")
	}
	defer func() { runCPUAcceptancePackCtx = orig }()
	q.executeTask(tk, j, context.Background())
	if tk.Status != TaskFailed {
		t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
	}
	if tk.DoneAt == nil {
		t.Fatal("expected done_at to be set")
	}
	if !strings.Contains(tk.ErrMsg, "task panic: boom") {
		t.Fatalf("task error=%q", tk.ErrMsg)
	}
	if !strings.Contains(j.err, "task panic: boom") {
		t.Fatalf("job error=%q", j.err)
	}
	// Inspect watcher state under its lock: it must be back to idle.
	q.kmsgWatcher.mu.Lock()
	activeCount := q.kmsgWatcher.activeCount
	window := q.kmsgWatcher.window
	q.kmsgWatcher.mu.Unlock()
	if activeCount != 0 {
		t.Fatalf("activeCount=%d want 0", activeCount)
	}
	if window != nil {
		t.Fatalf("expected kmsg window to be cleared, got %+v", window)
	}
}

2
bible

Submodule bible updated: 688b87e98d...1d89a4918e

View File

@@ -302,6 +302,12 @@ memtest_fail() {
return 0 return 0
} }
# nvidia_runtime_fail MSG — report a failed NVIDIA runtime check on stderr
# and abort the whole build. Note: this exits the script, not just the
# calling function.
nvidia_runtime_fail() {
	echo "ERROR: $1" >&2
	exit 1
}
iso_memtest_present() { iso_memtest_present() {
iso_path="$1" iso_path="$1"
iso_files="$(mktemp)" iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ===" echo "=== memtest validation OK ==="
} }
# validate_iso_nvidia_runtime ISO_PATH
# Final-ISO guard for NVIDIA builds: extracts live/filesystem.squashfs from
# the ISO and asserts that the DCGM userspace tools (dcgmi, nv-hostengine,
# dcgmproftester*) made it into the image. Any failure goes through
# nvidia_runtime_fail, which exits the whole build.
validate_iso_nvidia_runtime() {
	iso_path="$1"
	# Only NVIDIA images ship the DCGM stack; other vendors skip the check.
	[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
	echo "=== validating NVIDIA runtime in ISO ==="
	[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
	require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
	command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
	squashfs_tmp="$(mktemp)"
	squashfs_list="$(mktemp)"
	# From here on, each exiting failure path removes both temp files first
	# (nvidia_runtime_fail never returns, so cleanup must precede it).
	iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
		rm -f "$squashfs_tmp" "$squashfs_list"
		nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
	}
	# List the squashfs contents once; the tool checks below grep this listing.
	unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
		rm -f "$squashfs_tmp" "$squashfs_list"
		nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
	}
	grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
		rm -f "$squashfs_tmp" "$squashfs_list"
		nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
	}
	grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
		rm -f "$squashfs_tmp" "$squashfs_list"
		nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
	}
	# dcgmproftester binaries may carry a CUDA-major suffix (e.g.
	# dcgmproftester13); accept both plain and suffixed names.
	grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
		rm -f "$squashfs_tmp" "$squashfs_list"
		nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
	}
	rm -f "$squashfs_tmp" "$squashfs_list"
	echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() { append_memtest_grub_entry() {
grub_cfg="$1" grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1 [ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi fi
fi fi
validate_iso_memtest "$ISO_RAW" validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"
echo "" echo ""
echo "=== done (${BEE_GPU_VENDOR}) ===" echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
systemctl enable bee-audit.service systemctl enable bee-audit.service
systemctl enable bee-web.service systemctl enable bee-web.service
systemctl enable bee-sshsetup.service systemctl enable bee-sshsetup.service
systemctl enable bee-selfheal.timer
systemctl enable ssh.service systemctl enable ssh.service
systemctl enable lightdm.service 2>/dev/null || true systemctl enable lightdm.service 2>/dev/null || true
systemctl enable qemu-guest-agent.service 2>/dev/null || true systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
if [ "$GPU_VENDOR" = "nvidia" ]; then if [ "$GPU_VENDOR" = "nvidia" ]; then
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing. # NVIDIA DCGM (Data Center GPU Manager).
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace, # Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
# so install the CUDA 13 build plus proprietary diagnostic components explicitly. # NVIDIA max-compute recipe. The smoketest/runtime contract treats
# dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND" fail "nvidia-smi: NOT FOUND"
fi fi
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
ok "dcgmi found: $p"
else
fail "dcgmi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
ok "nv-hostengine found: $p"
else
fail "nv-hostengine: NOT FOUND"
fi
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
DCGM_PROFTESTER="$p"
break
fi
done
if [ -n "$DCGM_PROFTESTER" ]; then
ok "dcgmproftester found: $DCGM_PROFTESTER"
else
fail "dcgmproftester: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p" ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi fi
done done
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
ok "nvbandwidth found: $p"
else
warn "nvbandwidth: NOT FOUND"
fi
echo "" echo ""
echo "-- NVIDIA modules --" echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia" KO_DIR="/usr/local/lib/nvidia"
@@ -171,6 +202,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
fi fi
done done
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
ok "timer active: bee-selfheal.timer"
else
fail "timer NOT active: bee-selfheal.timer"
fi
echo "" echo ""
echo "-- runtime health --" echo "-- runtime health --"
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then

View File

@@ -1,7 +1,6 @@
[Unit] [Unit]
Description=Bee: hardware audit Description=Bee: hardware audit
After=bee-preflight.service bee-network.service bee-nvidia.service After=bee-preflight.service bee-network.service bee-nvidia.service
Before=bee-web.service
[Service] [Service]
Type=oneshot Type=oneshot

View File

@@ -0,0 +1,9 @@
# bee-selfheal.service — one-shot self-heal pass, triggered periodically by
# bee-selfheal.timer (no [Install] section: the timer activates it).
[Unit]
Description=Bee: periodic runtime self-heal
# Order after the services this pass may inspect/restart.
After=bee-web.service bee-audit.service bee-preflight.service
[Service]
Type=oneshot
# bee-log-run tees output into the export dir alongside the other bee logs.
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
StandardOutput=journal
StandardError=journal

View File

@@ -0,0 +1,11 @@
# bee-selfheal.timer — fires bee-selfheal.service 45s after boot and then
# every 60s while the system is up.
[Unit]
Description=Bee: run self-heal checks periodically
[Timer]
OnBootSec=45sec
OnUnitActiveSec=60sec
# Allow up to 15s of coalescing jitter so timer wakeups can be batched.
AccuracySec=15sec
Unit=bee-selfheal.service
[Install]
WantedBy=timers.target

View File

@@ -1,12 +1,12 @@
[Unit] [Unit]
Description=Bee: hardware audit web viewer Description=Bee: hardware audit web viewer
After=bee-audit.service StartLimitIntervalSec=0
[Service] [Service]
Type=simple Type=simple
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit" ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
Restart=always Restart=always
RestartSec=2 RestartSec=3
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
LimitMEMLOCK=infinity LimitMEMLOCK=infinity

View File

@@ -0,0 +1,99 @@
#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
set -u
LOG_PREFIX="bee-selfheal"
EXPORT_DIR="/appdata/bee/export"
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
LOCK_DIR="/run/bee-selfheal.lock"
# log — emit a self-heal message with the standard prefix so journal lines
# are easy to grep.
log() {
	printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}
# have_nvidia_gpu — true when an NVIDIA PCI device is present
# (10de is the NVIDIA PCI vendor ID).
have_nvidia_gpu() {
	lspci -nn 2>/dev/null | grep -qi '10de:'
}
# service_active UNIT — true when systemd reports the unit as active.
service_active() {
	systemctl is-active --quiet "$1" 2>/dev/null
}
# restart_service UNIT — restart a systemd unit, logging the outcome.
# Returns 0 on success, 1 on failure; callers treat failure as best-effort.
restart_service() {
	local unit="$1"
	if ! systemctl restart "$unit" >/dev/null 2>&1; then
		log "WARN: failed to restart ${unit}"
		return 1
	fi
	log "restarted ${unit}"
	return 0
}
# file_ready PATH — true when PATH exists and is non-empty.
# NOTE(review): not referenced anywhere in this script (artifact_state
# performs the same -s check) — confirm it is intentional before removing.
file_ready() {
	[ -s "$1" ]
}
# artifact_state PATH — classify an exported artifact on stdout:
#   ready       — file exists and is non-empty
#   interrupted — a leftover PATH.tmp indicates a write that never completed
#   missing     — neither the file nor its temp sibling is usable
artifact_state() {
	local f="$1"
	if [ -s "${f}" ]; then
		echo "ready"
	elif [ -e "${f}.tmp" ]; then
		echo "interrupted"
	else
		echo "missing"
	fi
}
# web_healthy — probe the local web UI without curl/wget: open a raw TCP
# connection to 127.0.0.1:80 via bash's /dev/tcp, send GET /healthz, and
# require a line that is exactly "ok" somewhere in the response.
web_healthy() {
	bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \
		>/dev/null 2>&1
}
mkdir -p "${EXPORT_DIR}" /run
# Single-instance guard: mkdir on a fixed path is atomic, so a second
# timer-fired run bails out instead of racing the first.
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
	log "another self-heal run is already active"
	exit 0
fi
# Release the lock on any exit path, including signals that run the EXIT trap.
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
log "start"
# NVIDIA device node missing despite an NVIDIA GPU → kick the driver unit.
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
	log "NVIDIA GPU detected but /dev/nvidia0 is missing"
	restart_service bee-nvidia.service || true
fi
# runtime-health.json absent/empty/interrupted → re-run preflight.
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
if [ "${runtime_state}" != "ready" ]; then
	if [ "${runtime_state}" = "interrupted" ]; then
		log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
	else
		log "runtime-health.json missing or empty"
	fi
	restart_service bee-preflight.service || true
fi
# bee-audit.json absent/empty/interrupted → re-run the audit.
audit_state="$(artifact_state "${AUDIT_JSON}")"
if [ "${audit_state}" != "ready" ]; then
	if [ "${audit_state}" = "interrupted" ]; then
		log "bee-audit.json.tmp exists — interrupted audit write detected"
	else
		log "bee-audit.json missing or empty"
	fi
	restart_service bee-audit.service || true
fi
# Web UI: restart when the unit is down, or up but failing the health probe.
# All restarts are best-effort (|| true) so one failure never aborts the pass.
if ! service_active bee-web.service; then
	log "bee-web.service is not active"
	restart_service bee-web.service || true
elif ! web_healthy; then
	log "bee-web health check failed"
	restart_service bee-web.service || true
fi
log "done"