Compare commits

...

20 Commits
v4.3 ... v5.5

Author SHA1 Message Date
e609fbbc26 Add task reports and streamline GPU charts 2026-04-05 18:13:58 +03:00
cc2b49ea41 Improve validate GPU runs and web UI feedback 2026-04-05 17:50:13 +03:00
33e0a5bef2 Refine validate UI and runtime health table 2026-04-05 16:24:45 +03:00
38e79143eb Refine burn UI and NVIDIA stress flows 2026-04-05 13:43:43 +03:00
25af2df23a Unify metrics charts on custom SVG renderer 2026-04-05 12:17:50 +03:00
20abff7f90 WIP: checkpoint current tree 2026-04-05 12:05:00 +03:00
a14ec8631c Persist GPU chart mode and expand GPU charts 2026-04-05 11:52:32 +03:00
f58c7e58d3 Fix webui streaming recovery regressions 2026-04-05 10:39:09 +03:00
bf47c8dbd2 Add NVIDIA benchmark reporting flow 2026-04-05 10:30:56 +03:00
143b7dca5d Add stability hardening and self-heal recovery 2026-04-05 10:29:37 +03:00
9826d437a5 Add GPU clock charts and grouped GPU metrics view 2026-04-05 09:57:38 +03:00
Mikhail Chusavitin
f3c14cd893 Harden NIC probing for empty SFP ports 2026-04-04 15:23:15 +03:00
Mikhail Chusavitin
728270dc8e Unblock bee-web startup and expand support bundle diagnostics 2026-04-04 15:18:43 +03:00
Mikhail Chusavitin
8692f825bc Use plain repo tags for build version 2026-04-03 10:48:51 +03:00
Mikhail Chusavitin
11f52ac710 Fix task log modal scrolling 2026-04-03 10:36:11 +03:00
Mikhail Chusavitin
1cb398fe83 Show tag version at top of sidebar 2026-04-03 10:08:00 +03:00
Mikhail Chusavitin
7a843be6b0 Stabilize DCGM GPU discovery 2026-04-03 09:50:33 +03:00
Mikhail Chusavitin
7f6386dccc Restore USB support bundle export on tools page 2026-04-03 09:48:22 +03:00
Mikhail Chusavitin
eea2591bcc Fix John GPU stress duration semantics 2026-04-03 09:46:16 +03:00
Mikhail Chusavitin
295a19b93a feat(tasks): run all queued tasks in parallel
Tasks are now started simultaneously when multiple are enqueued (e.g.
Run All). The worker drains all pending tasks at once and launches each
in its own goroutine, waiting via WaitGroup. kmsg watcher updated to
use a shared event window with a reference counter across concurrent tasks.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-03 09:15:06 +03:00
53 changed files with 6802 additions and 855 deletions

View File

@@ -1,7 +1,10 @@
LISTEN ?= :8080
AUDIT_PATH ?=
EXPORT_DIR ?= $(CURDIR)/.tmp/export
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
GO_LDFLAGS := -X main.Version=$(VERSION)
RUN_ARGS := web --listen $(LISTEN)
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
ifneq ($(AUDIT_PATH),)
RUN_ARGS += --audit-path $(AUDIT_PATH)
endif
@@ -9,10 +12,11 @@ endif
.PHONY: run build test
run:
go run ./cmd/bee $(RUN_ARGS)
mkdir -p $(EXPORT_DIR)
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
build:
go build -o bee ./cmd/bee
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
test:
go test ./...

View File

@@ -8,6 +8,7 @@ import (
"log/slog"
"os"
"runtime/debug"
"strconv"
"strings"
"bee/audit/internal/app"
@@ -21,30 +22,7 @@ var Version = "dev"
func buildLabel() string {
label := strings.TrimSpace(Version)
if label == "" {
label = "dev"
}
if info, ok := debug.ReadBuildInfo(); ok {
var revision string
var modified bool
for _, setting := range info.Settings {
switch setting.Key {
case "vcs.revision":
revision = setting.Value
case "vcs.modified":
modified = setting.Value == "true"
}
}
if revision != "" {
short := revision
if len(short) > 12 {
short = short[:12]
}
label += " (" + short
if modified {
label += "+"
}
label += ")"
}
return "dev"
}
return label
}
@@ -53,10 +31,19 @@ func main() {
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
}
func run(args []string, stdout, stderr io.Writer) int {
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo,
})))
defer func() {
if rec := recover(); rec != nil {
slog.Error("fatal panic",
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
exitCode = 1
}
}()
if len(args) == 0 {
printRootUsage(stderr)
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
return runWeb(args[1:], stdout, stderr)
case "sat":
return runSAT(args[1:], stdout, stderr)
case "benchmark":
return runBenchmark(args[1:], stdout, stderr)
case "version", "--version", "-version":
fmt.Fprintln(stdout, Version)
return 0
@@ -98,8 +87,9 @@ func printRootUsage(w io.Writer) {
bee preflight --output stdout|file:<path>
bee export --target <device>
bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight]
bee version
bee help [command]`)
}
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
return runWeb([]string{"--help"}, stdout, stdout)
case "sat":
return runSAT([]string{"--help"}, stdout, stderr)
case "benchmark":
return runBenchmark([]string{"--help"}, stdout, stderr)
case "version":
fmt.Fprintln(stdout, "usage: bee version")
return 0
@@ -304,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() {
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
slog.Info("sat archive written", "target", target, "path", archive)
return 0
}
func runBenchmark(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 2
}
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 0
}
target := args[0]
if target != "nvidia" {
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 2
}
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
fs.SetOutput(stderr)
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
devices := fs.String("devices", "", "comma-separated GPU indices to include")
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
if err := fs.Parse(args[1:]); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
return 2
}
includeIndices, err := parseBenchmarkIndexCSV(*devices)
if err != nil {
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
return 2
}
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
if err != nil {
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
return 2
}
application := app.New(platform.New())
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
Profile: *profile,
SizeMB: *sizeMB,
GPUIndices: includeIndices,
ExcludeGPUIndices: excludeIndices,
RunNCCL: !*skipNCCL,
}, logLine)
if err != nil {
slog.Error("run benchmark", "target", target, "err", err)
return 1
}
slog.Info("benchmark archive written", "target", target, "path", archive)
return 0
}
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
raw = strings.TrimSpace(raw)
if raw == "" {
return nil, nil
}
var indices []int
for _, part := range strings.Split(raw, ",") {
part = strings.TrimSpace(part)
if part == "" {
continue
}
value, err := strconv.Atoi(part)
if err != nil || value < 0 {
return nil, fmt.Errorf("bad gpu index %q", part)
}
indices = append(indices, value)
}
return indices, nil
}

View File

@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
}
func TestRunVersion(t *testing.T) {
t.Parallel()
old := Version
Version = "test-version"
t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
}
}
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
old := Version
Version = "1.2.3"
t.Cleanup(func() { Version = old })
if got := buildLabel(); got != "1.2.3" {
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
}
}
func TestRunExportRequiresTarget(t *testing.T) {
t.Parallel()

View File

@@ -19,17 +19,18 @@ import (
)
var (
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultExportDir = "/appdata/bee/export"
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
)
type App struct {
@@ -114,6 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
type satRunner interface {
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -195,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
@@ -223,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
return "stdout", err
case strings.HasPrefix(output, "file:"):
path := strings.TrimPrefix(output, "file:")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return "", err
}
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
return "", err
}
return path, nil
@@ -532,10 +533,56 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
}
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
}
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
}
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBenchmarkBaseDir
}
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
@@ -886,6 +933,12 @@ func latestSATSummaries() []string {
prefix string
}{
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
{label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"},

View File

@@ -120,15 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
}
type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
}
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -139,6 +145,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
if f.runNvidiaBenchmarkFn != nil {
return f.runNvidiaBenchmarkFn(baseDir, opts)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPowerFn != nil {
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPulseFn != nil {
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaBandwidthFn != nil {
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
if f.runNvidiaStressFn != nil {
return f.runNvidiaStressFn(baseDir, opts)
@@ -754,6 +802,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
}
}
for _, want := range []string{
"/system/ip-link.txt",
"/system/ip-link-stats.txt",
"/system/ethtool-info.txt",
"/system/ethtool-link.txt",
"/system/ethtool-module.txt",
"/system/mstflint-query.txt",
} {
var found bool
for _, name := range names {
if contains(name, want) {
found = true
break
}
}
if !found {
t.Fatalf("support bundle missing %s, names=%v", want, names)
}
}
var foundRaw bool
for _, name := range names {
if contains(name, "/export/bee-sat/memory-run/verbose.log") {

View File

@@ -0,0 +1,48 @@
package app
import (
"fmt"
"os"
"path/filepath"
)
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
}
tmpPath := path + ".tmp"
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
if err != nil {
return fmt.Errorf("open temp %s: %w", tmpPath, err)
}
success := false
defer func() {
_ = f.Close()
if !success {
_ = os.Remove(tmpPath)
}
}()
if _, err := f.Write(data); err != nil {
return fmt.Errorf("write temp %s: %w", tmpPath, err)
}
if err := f.Sync(); err != nil {
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
}
if err := f.Close(); err != nil {
return fmt.Errorf("close temp %s: %w", tmpPath, err)
}
if err := os.Rename(tmpPath, path); err != nil {
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
}
if dir, err := os.Open(filepath.Dir(path)); err == nil {
_ = dir.Sync()
_ = dir.Close()
}
success = true
return nil
}

View File

@@ -0,0 +1,71 @@
package app
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
path := filepath.Join(t.TempDir(), "bee-audit.json")
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
t.Fatalf("seed file: %v", err)
}
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
t.Fatalf("atomicWriteFile: %v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read final: %v", err)
}
if string(raw) != "new\n" {
t.Fatalf("final content=%q want %q", string(raw), "new\n")
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Fatalf("tmp file should be absent after success, err=%v", err)
}
}
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
path := filepath.Join(t.TempDir(), "runtime-health.json")
a := &App{
runtime: fakeRuntime{
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
return schema.RuntimeHealth{
Status: "OK",
ExportDir: exportDir,
DriverReady: true,
CUDAReady: true,
}, nil
},
},
}
got, err := a.RunRuntimePreflight("file:" + path)
if err != nil {
t.Fatalf("RunRuntimePreflight: %v", err)
}
if got != path {
t.Fatalf("path=%q want %q", got, path)
}
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
t.Fatalf("tmp file should be absent after success, err=%v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("read runtime file: %v", err)
}
var health schema.RuntimeHealth
if err := json.Unmarshal(raw, &health); err != nil {
t.Fatalf("json unmarshal: %v", err)
}
if health.Status != "OK" {
t.Fatalf("status=%q want OK", health.Status)
}
}

View File

@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
// ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"`
ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"`
}
// ComponentStatusEntry is one observation written to a component's history.
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
// Map SAT target to component keys.
switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)

View File

@@ -19,6 +19,8 @@ var supportBundleServices = []string{
"bee-network.service",
"bee-nvidia.service",
"bee-preflight.service",
"bee-selfheal.service",
"bee-selfheal.timer",
"bee-sshsetup.service",
}
@@ -32,6 +34,8 @@ var supportBundleCommands = []struct {
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
{name: "system/mount.txt", cmd: []string{"mount"}},
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
@@ -47,6 +51,83 @@ for d in /sys/bus/pci/devices/*/; do
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
done
done
`}},
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool -i "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
if ! command -v ethtool >/dev/null 2>&1; then
echo "ethtool not found"
exit 0
fi
found=0
for path in /sys/class/net/*; do
[ -e "$path" ] || continue
iface=$(basename "$path")
[ "$iface" = "lo" ] && continue
found=1
echo "=== $iface ==="
ethtool -m "$iface" 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no interfaces found"
fi
`}},
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
if ! command -v mstflint >/dev/null 2>&1; then
echo "mstflint not found"
exit 0
fi
found=0
for path in /sys/bus/pci/devices/*; do
[ -e "$path/vendor" ] || continue
vendor=$(cat "$path/vendor" 2>/dev/null)
[ "$vendor" = "0x15b3" ] || continue
bdf=$(basename "$path")
found=1
echo "=== $bdf ==="
mstflint -d "$bdf" q 2>&1 || true
echo
done
if [ "$found" -eq 0 ]; then
echo "no Mellanox/NVIDIA networking devices found"
fi
`}},
}

View File

@@ -2,18 +2,21 @@ package collector
import (
"bee/audit/internal/schema"
"context"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
const mellanoxVendorID = 0x15b3
const nicProbeTimeout = 2 * time.Second
var (
mstflintQuery = func(bdf string) (string, error) {
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
if err != nil {
return "", err
}
@@ -21,7 +24,7 @@ var (
}
ethtoolInfoQuery = func(iface string) (string, error) {
out, err := exec.Command("ethtool", "-i", iface).Output()
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
if err != nil {
return "", err
}
@@ -29,6 +32,14 @@ var (
}
netIfacesByBDF = listNetIfacesByBDF
readNetCarrierFile = func(iface string) (string, error) {
path := filepath.Join("/sys/class/net", iface, "carrier")
raw, err := os.ReadFile(path)
if err != nil {
return "", err
}
return strings.TrimSpace(string(raw)), nil
}
)
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
}
return ifaces
}
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
return exec.CommandContext(ctx, name, args...).Output()
}
func interfaceHasCarrier(iface string) bool {
raw, err := readNetCarrierFile(iface)
if err != nil {
return false
}
return strings.TrimSpace(raw) == "1"
}

View File

@@ -12,7 +12,7 @@ import (
var (
ethtoolModuleQuery = func(iface string) (string, error) {
out, err := raidToolQuery("ethtool", "-m", iface)
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
if err != nil {
return "", err
}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
}
}
if out, err := ethtoolModuleQuery(iface); err == nil {
if injectSFPDOMTelemetry(&devs[i], out) {
enriched++
continue
if interfaceHasCarrier(iface) {
if out, err := ethtoolModuleQuery(iface); err == nil {
if injectSFPDOMTelemetry(&devs[i], out) {
enriched++
continue
}
}
}
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {

View File

@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
origReadMAC := readNetAddressFile
origEth := ethtoolInfoQuery
origModule := ethtoolModuleQuery
origCarrier := readNetCarrierFile
t.Cleanup(func() {
queryPCILSPCIDetail = origDetail
readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
readNetAddressFile = origReadMAC
ethtoolInfoQuery = origEth
ethtoolModuleQuery = origModule
readNetCarrierFile = origCarrier
})
queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
}
return "aa:bb:cc:dd:ee:ff", nil
}
readNetCarrierFile = func(string) (string, error) { return "1", nil }
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
}
}
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
origIfaces := netIfacesByBDF
origReadMAC := readNetAddressFile
origEth := ethtoolInfoQuery
origModule := ethtoolModuleQuery
origCarrier := readNetCarrierFile
t.Cleanup(func() {
netIfacesByBDF = origIfaces
readNetAddressFile = origReadMAC
ethtoolInfoQuery = origEth
ethtoolModuleQuery = origModule
readNetCarrierFile = origCarrier
})
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
readNetCarrierFile = func(string) (string, error) { return "0", nil }
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
ethtoolModuleQuery = func(string) (string, error) {
t.Fatal("ethtool -m should not be called without carrier")
return "", nil
}
class := "EthernetController"
bdf := "0000:18:00.0"
devs := []schema.HardwarePCIeDevice{{
DeviceClass: &class,
BDF: &bdf,
}}
out := enrichPCIeWithNICTelemetry(devs)
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
}
}
func TestDBMValue(t *testing.T) {
tests := []struct {
in string

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,141 @@
package platform
import (
"fmt"
"strings"
"time"
)
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
var b strings.Builder
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
fmt.Fprintf(&b, "===========================\n\n")
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
if len(result.Findings) > 0 {
fmt.Fprintf(&b, "Executive Summary\n")
fmt.Fprintf(&b, "-----------------\n")
for _, finding := range result.Findings {
fmt.Fprintf(&b, "- %s\n", finding)
}
b.WriteString("\n")
}
if len(result.Warnings) > 0 {
fmt.Fprintf(&b, "Warnings\n")
fmt.Fprintf(&b, "--------\n")
for _, warning := range result.Warnings {
fmt.Fprintf(&b, "- %s\n", warning)
}
b.WriteString("\n")
}
fmt.Fprintf(&b, "Per GPU Scorecard\n")
fmt.Fprintf(&b, "-----------------\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
if gpu.Scores.InterconnectScore > 0 {
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
}
if len(gpu.DegradationReasons) > 0 {
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
}
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
if len(gpu.PrecisionResults) > 0 {
fmt.Fprintf(&b, " Precision results:\n")
for _, precision := range gpu.PrecisionResults {
if precision.Supported {
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
} else {
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
}
}
}
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
gpu.Throttle.SWPowerCapUS,
gpu.Throttle.SWThermalSlowdownUS,
gpu.Throttle.SyncBoostUS,
gpu.Throttle.HWThermalSlowdownUS,
gpu.Throttle.HWPowerBrakeSlowdownUS,
)
if len(gpu.Notes) > 0 {
fmt.Fprintf(&b, " Notes:\n")
for _, note := range gpu.Notes {
fmt.Fprintf(&b, " - %s\n", note)
}
}
b.WriteString("\n")
}
if result.Interconnect != nil {
fmt.Fprintf(&b, "Interconnect\n")
fmt.Fprintf(&b, "------------\n")
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
if result.Interconnect.Supported {
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
}
for _, note := range result.Interconnect.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
}
fmt.Fprintf(&b, "Methodology\n")
fmt.Fprintf(&b, "-----------\n")
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
fmt.Fprintf(&b, "Raw Files\n")
fmt.Fprintf(&b, "---------\n")
fmt.Fprintf(&b, "- result.json\n")
fmt.Fprintf(&b, "- report.txt\n")
fmt.Fprintf(&b, "- summary.txt\n")
fmt.Fprintf(&b, "- verbose.log\n")
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
if result.Interconnect != nil {
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
}
return b.String()
}
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
var best float64
for i, gpu := range result.GPUs {
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
if i == 0 || gpu.Scores.CompositeScore > best {
best = gpu.Scores.CompositeScore
}
}
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
if result.Interconnect != nil {
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
}
return b.String()
}

View File

@@ -0,0 +1,147 @@
package platform
import (
"strings"
"testing"
)
func TestResolveBenchmarkProfile(t *testing.T) {
t.Parallel()
cases := []struct {
name string
profile string
want benchmarkProfileSpec
}{
{
name: "default",
profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
},
{
name: "stability",
profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
},
{
name: "overnight",
profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got := resolveBenchmarkProfile(tc.profile)
if got != tc.want {
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
}
})
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
Profile: "stability",
RunNCCL: false,
})
if opts.Profile != NvidiaBenchmarkProfileStability {
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
}
if opts.RunNCCL {
t.Fatalf("RunNCCL should stay false when explicitly disabled")
}
}
func TestParseBenchmarkBurnLog(t *testing.T) {
t.Parallel()
raw := strings.Join([]string{
"loader=bee-gpu-burn",
"[gpu 0] device=NVIDIA H100",
"[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK",
}, "\n")
got := parseBenchmarkBurnLog(raw)
if got.Backend != "cublasLt" {
t.Fatalf("backend=%q want cublasLt", got.Backend)
}
if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
}
if len(got.Profiles) != 2 {
t.Fatalf("profiles=%d want 2", len(got.Profiles))
}
if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
}
if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
}
}
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
t.Parallel()
result := NvidiaBenchmarkResult{
BenchmarkVersion: benchmarkVersion,
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
OverallStatus: "PARTIAL",
SelectedGPUIndices: []int{0},
Normalization: BenchmarkNormalization{
Status: "partial",
},
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
GPUs: []BenchmarkGPUResult{
{
Index: 0,
Name: "NVIDIA H100",
Status: "OK",
Steady: BenchmarkTelemetrySummary{
AvgPowerW: 680,
AvgTempC: 79,
AvgGraphicsClockMHz: 1725,
P95PowerW: 700,
P95TempC: 82,
P95GraphicsClockMHz: 1800,
},
Scores: BenchmarkScorecard{
ComputeScore: 1200,
PowerSustainScore: 96,
ThermalSustainScore: 88,
StabilityScore: 92,
CompositeScore: 1176,
},
PrecisionResults: []BenchmarkPrecisionResult{
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
},
Throttle: BenchmarkThrottleCounters{
SWPowerCapUS: 1000000,
},
DegradationReasons: []string{"power_capped"},
},
},
}
report := renderBenchmarkReport(result)
for _, needle := range []string{
"Executive Summary",
"GPU 0 spent measurable time under SW power cap.",
"Composite score: 1176.00",
"fp16_tensor: 700.00 TOPS",
} {
if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report)
}
}
}

View File

@@ -0,0 +1,132 @@
package platform
import "time"
const (
NvidiaBenchmarkProfileStandard = "standard"
NvidiaBenchmarkProfileStability = "stability"
NvidiaBenchmarkProfileOvernight = "overnight"
)
type NvidiaBenchmarkOptions struct {
Profile string
SizeMB int
GPUIndices []int
ExcludeGPUIndices []int
RunNCCL bool
}
type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
OverallStatus string `json:"overall_status"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
}
type BenchmarkNormalization struct {
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
}
type BenchmarkNormalizationGPU struct {
Index int `json:"index"`
PersistenceMode string `json:"persistence_mode,omitempty"`
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type BenchmarkGPUResult struct {
Index int `json:"index"`
UUID string `json:"uuid,omitempty"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
VBIOS string `json:"vbios,omitempty"`
ComputeCapability string `json:"compute_capability,omitempty"`
Backend string `json:"backend,omitempty"`
Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type BenchmarkTelemetrySummary struct {
DurationSec float64 `json:"duration_sec"`
Samples int `json:"samples"`
AvgTempC float64 `json:"avg_temp_c"`
P95TempC float64 `json:"p95_temp_c"`
AvgPowerW float64 `json:"avg_power_w"`
P95PowerW float64 `json:"p95_power_w"`
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
AvgUsagePct float64 `json:"avg_usage_pct"`
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
ClockCVPct float64 `json:"clock_cv_pct"`
PowerCVPct float64 `json:"power_cv_pct"`
TempCVPct float64 `json:"temp_cv_pct"`
ClockDriftPct float64 `json:"clock_drift_pct"`
}
type BenchmarkThrottleCounters struct {
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
SyncBoostUS uint64 `json:"sync_boost_us"`
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}
type BenchmarkPrecisionResult struct {
Name string `json:"name"`
Category string `json:"category"`
Supported bool `json:"supported"`
Lanes int `json:"lanes,omitempty"`
M uint64 `json:"m,omitempty"`
N uint64 `json:"n,omitempty"`
K uint64 `json:"k,omitempty"`
Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
Notes string `json:"notes,omitempty"`
}
type BenchmarkScorecard struct {
ComputeScore float64 `json:"compute_score"`
PowerSustainScore float64 `json:"power_sustain_score"`
ThermalSustainScore float64 `json:"thermal_sustain_score"`
StabilityScore float64 `json:"stability_score"`
InterconnectScore float64 `json:"interconnect_score"`
CompositeScore float64 `json:"composite_score"`
}
type BenchmarkInterconnectResult struct {
Status string `json:"status"`
Attempted bool `json:"attempted"`
Supported bool `json:"supported"`
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
Notes []string `json:"notes,omitempty"`
}

View File

@@ -20,12 +20,13 @@ type GPUMetricRow struct {
MemUsagePct float64 `json:"mem_usage_pct"`
PowerW float64 `json:"power_w"`
ClockMHz float64 `json:"clock_mhz"`
MemClockMHz float64 `json:"mem_clock_mhz"`
}
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
args := []string{
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
continue
}
parts := strings.Split(line, ", ")
if len(parts) < 6 {
if len(parts) < 7 {
continue
}
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
MemUsagePct: parseGPUFloat(parts[3]),
PowerW: parseGPUFloat(parts[4]),
ClockMHz: parseGPUFloat(parts[5]),
MemClockMHz: parseGPUFloat(parts[6]),
})
}
return rows, nil
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
for _, r := range rows {
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
}
return os.WriteFile(path, b.Bytes(), 0644)
}
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
const PW = plotX2 - plotX1
const PH = plotY2 - plotY1
// Outer axes
const tempAxisX = 60 // temp axis line
const tempAxisX = 60 // temp axis line
const clockAxisX = 900 // clock axis line
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}

View File

@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
}
log("Verifying live medium now served from RAM...")
status := s.LiveBootSource()
if err := verifyInstallToRAMStatus(status); err != nil {
return err
}
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
log("Done. Installation media can be safely disconnected.")
return nil
}
func verifyInstallToRAMStatus(status LiveBootSource) error {
if status.InRAM {
return nil
}
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
}
func describeLiveBootSource(status LiveBootSource) string {
source := strings.TrimSpace(status.Device)
if source == "" {
source = strings.TrimSpace(status.Source)
}
if source == "" {
source = "unknown source"
}
switch strings.TrimSpace(status.Kind) {
case "ram":
return "RAM"
case "usb":
return "USB (" + source + ")"
case "cdrom":
return "CD-ROM (" + source + ")"
case "disk":
return "disk (" + source + ")"
default:
return source
}
}
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
in, err := os.Open(src)
if err != nil {

View File

@@ -3,6 +3,8 @@ package platform
import "testing"
func TestInferLiveBootKind(t *testing.T) {
t.Parallel()
tests := []struct {
name string
fsType string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
{name: "unknown", source: "overlay", want: "unknown"},
}
for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
})
}
}
func TestVerifyInstallToRAMStatus(t *testing.T) {
t.Parallel()
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
t.Fatalf("expected success for RAM-backed status, got %v", err)
}
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
if err == nil {
t.Fatal("expected verification failure when media is still on USB")
}
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
t.Fatalf("error=%q", got)
}
}
func TestDescribeLiveBootSource(t *testing.T) {
t.Parallel()
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
t.Fatalf("got %q want RAM", got)
}
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
t.Fatalf("got %q want /run/live/medium", got)
}
}

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh",
"bee-gpu-burn",
"bee-john-gpu-stress",
"bee-nccl-gpu-stress",
"all_reduce_perf",
})...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools
}
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
for _, candidate := range candidates {
path, err := exec.LookPath(candidate)
if err == nil {
return ToolStatus{Name: display, Path: path, OK: true}
}
}
return ToolStatus{Name: display}
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")

View File

@@ -12,11 +12,11 @@ import (
"os"
"os/exec"
"path/filepath"
"syscall"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
)
@@ -38,6 +38,12 @@ var (
"/opt/rocm/bin/rvs",
"/opt/rocm-*/bin/rvs",
}
dcgmProfTesterCandidates = []string{
"dcgmproftester",
"dcgmproftester13",
"dcgmproftester12",
"dcgmproftester11",
}
)
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -76,15 +82,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
Name string
MemoryMB int
Index int `json:"index"`
Name string `json:"name"`
MemoryMB int `json:"memory_mb"`
}
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
Index int
Name string
Index int `json:"index"`
Name string `json:"name"`
}
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
@@ -256,6 +262,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
MemoryMB: memMB,
})
}
sort.Slice(gpus, func(i, j int) bool {
return gpus[i].Index < gpus[j].Index
})
return gpus, nil
}
@@ -277,6 +286,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}, logFunc)
}
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}
@@ -286,7 +369,42 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
}
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
if len(gpuIndices) > 0 {
return dedupeSortedIndices(gpuIndices), nil
}
all, err := listNvidiaGPUIndices()
if err != nil {
return nil, err
}
if len(all) == 0 {
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
}
return all, nil
}
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
@@ -455,6 +573,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
}
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
args := []string{"dcgmi", "diag", "-r", name}
if durationSec > 0 {
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
}
if len(gpuIndices) > 0 {
args = append(args, "-i", joinIndexList(gpuIndices))
}
return args
}
func normalizeNvidiaBurnDuration(durationSec int) int {
if durationSec <= 0 {
return 300
}
return durationSec
}
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
@@ -624,6 +767,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
}
if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") ||
strings.Contains(text, "not found in path") ||
strings.Contains(text, "invalid opcode") ||
strings.Contains(text, "unknown command") ||
strings.Contains(text, "not implemented") ||
@@ -730,6 +874,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
for _, candidate := range dcgmProfTesterCandidates {
if path, err := satLookPath(candidate); err == nil {
return append([]string{path}, args...), nil
}
}
return nil, errors.New("dcgmproftester not found in PATH")
}
func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil {
return nil

View File

@@ -162,6 +162,86 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
}
}
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
t.Parallel()
oldExecCommand := satExecCommand
satExecCommand = func(name string, args ...string) *exec.Cmd {
if name == "nvidia-smi" {
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
}
return exec.Command(name, args...)
}
t.Cleanup(func() { satExecCommand = oldExecCommand })
got, err := resolveDCGMGPUIndices(nil)
if err != nil {
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
}
if want := "0,1,2"; joinIndexList(got) != want {
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
}
}
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
t.Parallel()
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
if err != nil {
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
}
if want := "1,3"; joinIndexList(got) != want {
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
}
}
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
switch file {
case "dcgmproftester13":
return "/usr/bin/dcgmproftester13", nil
default:
return "", exec.ErrNotFound
}
}
t.Cleanup(func() { satLookPath = oldLookPath })
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
if err != nil {
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
}
if len(cmd) != 4 {
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
}
if cmd[0] != "/usr/bin/dcgmproftester13" {
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
}
}
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel()

View File

@@ -10,17 +10,30 @@ import (
func (s *System) ListBeeServices() ([]string, error) {
seen := map[string]bool{}
var out []string
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
for _, pattern := range []string{
"/etc/systemd/system/bee-*.service",
"/lib/systemd/system/bee-*.service",
"/etc/systemd/system/bee-*.timer",
"/lib/systemd/system/bee-*.timer",
} {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, err
}
for _, match := range matches {
name := strings.TrimSuffix(filepath.Base(match), ".service")
base := filepath.Base(match)
name := base
if strings.HasSuffix(base, ".service") {
name = strings.TrimSuffix(base, ".service")
}
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
if strings.HasSuffix(name, "@") {
continue
}
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
continue
}
if !seen[name] {
seen[name] = true
out = append(out, name)

View File

@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
}
type RemovableTarget struct {
Device string
FSType string
Size string
Label string
Model string
Mountpoint string
Device string `json:"device"`
FSType string `json:"fs_type"`
Size string `json:"size"`
Label string `json:"label"`
Model string `json:"model"`
Mountpoint string `json:"mountpoint"`
}
type ToolStatus struct {

View File

@@ -0,0 +1,31 @@
package platform
import (
"encoding/json"
"strings"
"testing"
)
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
t.Parallel()
data, err := json.Marshal(RemovableTarget{
Device: "/dev/sdb1",
FSType: "exfat",
Size: "1.8T",
Label: "USB",
Model: "Flash",
})
if err != nil {
t.Fatalf("marshal: %v", err)
}
raw := string(data)
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
if !strings.Contains(raw, key) {
t.Fatalf("json missing key %s: %s", key, raw)
}
}
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
t.Fatalf("json still contains Go field names: %s", raw)
}
}

View File

@@ -110,6 +110,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
scanDone := make(chan error, 1)
go func() {
defer func() {
if rec := recover(); rec != nil {
scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
}
}()
scanner := bufio.NewScanner(pr)
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
for scanner.Scan() {
@@ -227,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
}
}
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
}
runNCCL := true
if body.RunNCCL != nil {
runNCCL = *body.RunNCCL
}
t := &Task{
ID: newJobID("benchmark-nvidia"),
Name: taskDisplayName("nvidia-benchmark", "", ""),
Target: "nvidia-benchmark",
Priority: 15,
Status: TaskPending,
CreatedAt: time.Now(),
params: taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
DisplayName: body.DisplayName,
},
}
if strings.TrimSpace(body.DisplayName) != "" {
t.Name = body.DisplayName
}
globalQueue.enqueue(t)
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
}
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
id := r.URL.Query().Get("job_id")
if id == "" {
@@ -486,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
// ── GPU presence ──────────────────────────────────────────────────────────────
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
gpus, err := h.opts.App.ListNvidiaGPUs()
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
if gpus == nil {
gpus = []platform.NvidiaGPU{}
}
writeJSON(w, gpus)
}
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -511,14 +580,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
_, amdErr := os.Stat("/dev/kfd")
nvidiaUp := nvidiaErr == nil
amdUp := amdErr == nil
_, dcgmErr := exec.LookPath("dcgmi")
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
_, johnErr := exec.LookPath("bee-john-gpu-stress")
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
writeJSON(w, []toolEntry{
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
{ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
{ID: "rvs", Available: amdUp, Vendor: "amd"},
})
}
func lookPathAny(names ...string) error {
for _, name := range names {
if _, err := exec.LookPath(name); err == nil {
return nil
}
}
return exec.ErrNotFound
}
// ── System ────────────────────────────────────────────────────────────────────
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -557,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "memtester", "stress-ng", "nvtop",
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode",
}

View File

@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
}
}
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 1 {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
}
task := globalQueue.tasks[0]
if task.Target != "nvidia-benchmark" {
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
}
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
t.Fatalf("gpu indices=%v want [1 3]", got)
}
if task.params.RunNCCL {
t.Fatal("RunNCCL should reflect explicit false from request")
}
}
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
h := &handler{}

View File

@@ -0,0 +1,773 @@
package webui
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"sync"
"time"
"bee/audit/internal/platform"
)
type chartTimelineSegment struct {
Start time.Time
End time.Time
Active bool
}
type chartScale struct {
Min float64
Max float64
Ticks []float64
}
type chartLayout struct {
Width int
Height int
PlotLeft int
PlotRight int
PlotTop int
PlotBottom int
}
type metricChartSeries struct {
Name string
AxisTitle string
Color string
Values []float64
}
var metricChartPalette = []string{
"#5794f2",
"#73bf69",
"#f2cc0c",
"#ff9830",
"#f2495c",
"#b877d9",
"#56d2f7",
"#8ab8ff",
"#9adf8f",
"#ffbe5c",
}
var gpuLabelCache struct {
mu sync.Mutex
loadedAt time.Time
byIndex map[int]string
}
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
if pointCount == 0 {
pointCount = 1
labels = []string{""}
times = []time.Time{time.Time{}}
}
if len(labels) < pointCount {
padded := make([]string, pointCount)
copy(padded, labels)
labels = padded
}
if len(times) < pointCount {
times = synthesizeChartTimes(times, pointCount)
}
for i := range datasets {
if len(datasets[i]) == 0 {
datasets[i] = make([]float64, pointCount)
}
}
statsLabel := chartStatsLabel(datasets)
legendItems := []metricChartSeries{}
for i, name := range names {
color := metricChartPalette[i%len(metricChartPalette)]
values := make([]float64, pointCount)
if i < len(datasets) {
copy(values, coalesceDataset(datasets[i], pointCount))
}
legendItems = append(legendItems, metricChartSeries{
Name: name,
Color: color,
Values: values,
})
}
scale := singleAxisChartScale(datasets, yMin, yMax)
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
start, end := chartTimeBounds(times)
var b strings.Builder
writeSVGOpen(&b, layout.Width, layout.Height)
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
writeTimelineIdleSpans(&b, layout, start, end, timeline)
writeVerticalGrid(&b, layout, times, pointCount, 8)
writeHorizontalGrid(&b, layout, scale)
writeTimelineBoundaries(&b, layout, start, end, timeline)
writePlotBorder(&b, layout)
writeSingleAxisY(&b, layout, scale)
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
for _, item := range legendItems {
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
}
writeLegend(&b, layout, legendItems)
writeSVGClose(&b)
return []byte(b.String()), nil
}
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
if temp == nil && power == nil && coreClock == nil {
return nil, false, nil
}
labels := sampleTimeLabels(samples)
times := sampleTimes(samples)
svg, err := drawGPUOverviewChartSVG(
gpuDisplayLabel(idx)+" Overview",
labels,
times,
[]metricChartSeries{
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
},
timeline,
)
if err != nil {
return nil, false, err
}
return svg, true, nil
}
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
if len(series) != 3 {
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
}
const (
width = 1400
height = 840
plotLeft = 180
plotRight = 1220
plotTop = 96
plotBottom = 660
)
const (
leftOuterAxis = 72
leftInnerAxis = 132
rightInnerAxis = 1268
)
layout := chartLayout{
Width: width,
Height: height,
PlotLeft: plotLeft,
PlotRight: plotRight,
PlotTop: plotTop,
PlotBottom: plotBottom,
}
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
if pointCount == 0 {
pointCount = 1
labels = []string{""}
times = []time.Time{time.Time{}}
}
if len(labels) < pointCount {
padded := make([]string, pointCount)
copy(padded, labels)
labels = padded
}
if len(times) < pointCount {
times = synthesizeChartTimes(times, pointCount)
}
for i := range series {
if len(series[i].Values) == 0 {
series[i].Values = make([]float64, pointCount)
}
}
scales := make([]chartScale, len(series))
for i := range series {
min, max := chartSeriesBounds(series[i].Values)
ticks := chartNiceTicks(min, max, 8)
scales[i] = chartScale{
Min: ticks[0],
Max: ticks[len(ticks)-1],
Ticks: ticks,
}
}
start, end := chartTimeBounds(times)
var b strings.Builder
writeSVGOpen(&b, width, height)
writeChartFrame(&b, title, "", width, height)
writeTimelineIdleSpans(&b, layout, start, end, timeline)
writeVerticalGrid(&b, layout, times, pointCount, 8)
writeHorizontalGrid(&b, layout, scales[0])
writeTimelineBoundaries(&b, layout, start, end, timeline)
writePlotBorder(&b, layout)
for i, axisLineX := range axisX {
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
for _, tick := range scales[i].Ticks {
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
label := sanitizeChartText(chartYAxisNumber(tick))
if i < 2 {
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX+6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX-8, y, series[i].Color, label)
continue
}
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX-6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX+8, y, series[i].Color, label)
}
}
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
for i := range series {
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
}
writeLegend(&b, layout, series)
writeSVGClose(&b)
return []byte(b.String()), nil
}
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
if len(samples) == 0 {
return nil
}
times := sampleTimes(samples)
start, end := chartTimeBounds(times)
if start.IsZero() || end.IsZero() {
return nil
}
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
}
func snapshotTaskHistory() []Task {
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
out := make([]Task, len(globalQueue.tasks))
for i, t := range globalQueue.tasks {
out[i] = *t
}
return out
}
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
if start.IsZero() || end.IsZero() {
return nil
}
if end.Before(start) {
start, end = end, start
}
type interval struct {
start time.Time
end time.Time
}
active := make([]interval, 0, len(tasks))
for _, task := range tasks {
if task.StartedAt == nil {
continue
}
intervalStart := task.StartedAt.UTC()
intervalEnd := now.UTC()
if task.DoneAt != nil {
intervalEnd = task.DoneAt.UTC()
}
if !intervalEnd.After(intervalStart) {
continue
}
if intervalEnd.Before(start) || intervalStart.After(end) {
continue
}
if intervalStart.Before(start) {
intervalStart = start
}
if intervalEnd.After(end) {
intervalEnd = end
}
active = append(active, interval{start: intervalStart, end: intervalEnd})
}
sort.Slice(active, func(i, j int) bool {
if active[i].start.Equal(active[j].start) {
return active[i].end.Before(active[j].end)
}
return active[i].start.Before(active[j].start)
})
merged := make([]interval, 0, len(active))
for _, span := range active {
if len(merged) == 0 {
merged = append(merged, span)
continue
}
last := &merged[len(merged)-1]
if !span.start.After(last.end) {
if span.end.After(last.end) {
last.end = span.end
}
continue
}
merged = append(merged, span)
}
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
cursor := start
for _, span := range merged {
if span.start.After(cursor) {
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
}
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
cursor = span.end
}
if cursor.Before(end) {
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
}
if len(segments) == 0 {
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
}
return segments
}
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
times := make([]time.Time, 0, len(samples))
for _, sample := range samples {
times = append(times, sample.Timestamp)
}
return times
}
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
min, max := 0.0, 1.0
if yMin != nil && yMax != nil {
min, max = *yMin, *yMax
} else {
min, max = chartSeriesBounds(flattenDatasets(datasets))
if yMin != nil {
min = *yMin
}
if yMax != nil {
max = *yMax
}
}
ticks := chartNiceTicks(min, max, 8)
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
}
func flattenDatasets(datasets [][]float64) []float64 {
total := 0
for _, ds := range datasets {
total += len(ds)
}
out := make([]float64, 0, total)
for _, ds := range datasets {
out = append(out, ds...)
}
return out
}
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
legendRows := 0
if chartLegendVisible(seriesCount) && seriesCount > 0 {
cols := 4
if seriesCount < cols {
cols = seriesCount
}
legendRows = (seriesCount + cols - 1) / cols
}
legendHeight := 0
if legendRows > 0 {
legendHeight = legendRows*24 + 24
}
return chartLayout{
Width: 1400,
Height: canvasHeight,
PlotLeft: 96,
PlotRight: 1352,
PlotTop: 72,
PlotBottom: canvasHeight - 60 - legendHeight,
}
}
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
if len(times) == 0 {
return time.Time{}, time.Time{}
}
start := times[0].UTC()
end := start
for _, ts := range times[1:] {
t := ts.UTC()
if t.Before(start) {
start = t
}
if t.After(end) {
end = t
}
}
return start, end
}
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
if count <= 0 {
return nil
}
if len(times) == count {
return times
}
if len(times) == 1 {
out := make([]time.Time, count)
for i := range out {
out[i] = times[0].Add(time.Duration(i) * time.Minute)
}
return out
}
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
out := make([]time.Time, count)
for i := range out {
out[i] = base.Add(time.Duration(i) * time.Minute)
}
return out
}
func writeSVGOpen(b *strings.Builder, width, height int) {
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
}
func writeSVGClose(b *strings.Builder) {
b.WriteString("</svg>\n")
}
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
width/2, sanitizeChartText(title))
if strings.TrimSpace(subtitle) != "" {
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
width/2, sanitizeChartText(subtitle))
}
}
func writePlotBorder(b *strings.Builder, layout chartLayout) {
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
}
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
for _, tick := range scale.Ticks {
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
layout.PlotLeft, y, layout.PlotRight, y)
}
b.WriteString(`</g>` + "\n")
}
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
if pointCount <= 0 {
return
}
start, end := chartTimeBounds(times)
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
for _, idx := range gpuChartLabelIndices(pointCount, target) {
ts := chartPointTime(times, idx)
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
x, layout.PlotTop, x, layout.PlotBottom)
}
b.WriteString(`</g>` + "\n")
}
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
for _, tick := range scale.Ticks {
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
layout.PlotLeft, y, layout.PlotLeft-6, y)
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
}
}
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
for _, idx := range gpuChartLabelIndices(pointCount, target) {
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
label := ""
if idx < len(labels) {
label = labels[idx]
}
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
}
b.WriteString(`</g>` + "\n")
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
}
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
if len(values) == 0 {
return
}
var points strings.Builder
for idx, value := range values {
if idx > 0 {
points.WriteByte(' ')
}
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
points.WriteByte(',')
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
}
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
points.String(), color)
if len(values) == 1 {
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
return
}
peakIdx := 0
peakValue := values[0]
for idx, value := range values[1:] {
if value >= peakValue {
peakIdx = idx + 1
peakValue = value
}
}
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
x, y-10, x-5, y-18, x+5, y-18, color)
}
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
if !chartLegendVisible(len(series)) || len(series) == 0 {
return
}
cols := 4
if len(series) < cols {
cols = len(series)
}
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
baseY := layout.PlotBottom + 74
for i, item := range series {
row := i / cols
col := i % cols
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
y := float64(baseY + row*24)
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
x, y, x+28, y, item.Color)
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
x+38, y+4, sanitizeChartText(item.Name))
}
}
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
if len(segments) == 0 {
return
}
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
for _, segment := range segments {
if segment.Active || !segment.End.After(segment.Start) {
continue
}
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
}
b.WriteString(`</g>` + "\n")
}
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
if len(segments) == 0 {
return
}
seen := map[int]bool{}
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
for i, segment := range segments {
if i > 0 {
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
if !seen[x] {
seen[x] = true
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
}
}
if i < len(segments)-1 {
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
if !seen[x] {
seen[x] = true
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
}
}
}
b.WriteString(`</g>` + "\n")
}
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
if !end.After(start) {
return float64(left+right) / 2
}
if ts.Before(start) {
ts = start
}
if ts.After(end) {
ts = end
}
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
return float64(left) + ratio*float64(right-left)
}
func chartPointTime(times []time.Time, idx int) time.Time {
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
return times[idx].UTC()
}
if len(times) > 0 && !times[0].IsZero() {
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
}
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
}
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
if scale.Max <= scale.Min {
return float64(plotTop+plotBottom) / 2
}
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
}
func chartSeriesBounds(values []float64) (float64, float64) {
if len(values) == 0 {
return 0, 1
}
min, max := values[0], values[0]
for _, value := range values[1:] {
if value < min {
min = value
}
if value > max {
max = value
}
}
if min == max {
if max == 0 {
return 0, 1
}
pad := math.Abs(max) * 0.1
if pad == 0 {
pad = 1
}
min -= pad
max += pad
}
if min > 0 {
pad := (max - min) * 0.2
if pad == 0 {
pad = max * 0.1
}
min -= pad
if min < 0 {
min = 0
}
max += pad
}
return min, max
}
func chartNiceTicks(min, max float64, target int) []float64 {
if min == max {
max = min + 1
}
span := max - min
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
for _, factor := range []float64{1, 2, 5, 10} {
if span/(factor*step) <= float64(target)*1.5 {
step = factor * step
break
}
}
low := math.Floor(min/step) * step
high := math.Ceil(max/step) * step
var ticks []float64
for value := low; value <= high+step*0.001; value += step {
ticks = append(ticks, math.Round(value*1e9)/1e9)
}
return ticks
}
func valueClamp(value float64, scale chartScale) float64 {
if value < scale.Min {
return scale.Min
}
if value > scale.Max {
return scale.Max
}
return value
}
func chartStatsLabel(datasets [][]float64) string {
mn, avg, mx := globalStats(datasets)
if mx <= 0 && avg <= 0 && mn <= 0 {
return ""
}
return fmt.Sprintf("min %s avg %s max %s",
chartLegendNumber(mn),
chartLegendNumber(avg),
chartLegendNumber(mx),
)
}
func gpuDisplayLabel(idx int) string {
if name := gpuModelNameByIndex(idx); name != "" {
return fmt.Sprintf("GPU %d — %s", idx, name)
}
return fmt.Sprintf("GPU %d", idx)
}
func gpuModelNameByIndex(idx int) string {
now := time.Now()
gpuLabelCache.mu.Lock()
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
gpuLabelCache.loadedAt = now
gpuLabelCache.byIndex = loadGPUModelNames()
}
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
gpuLabelCache.mu.Unlock()
return name
}
func loadGPUModelNames() map[int]string {
out := map[int]string{}
gpus, err := platform.New().ListNvidiaGPUs()
if err != nil {
return out
}
for _, gpu := range gpus {
name := strings.TrimSpace(gpu.Name)
if name != "" {
out[gpu.Index] = name
}
}
return out
}

View File

@@ -84,12 +84,12 @@ func (m *jobManager) create(id string) *jobState {
j := &jobState{}
m.jobs[id] = j
// Schedule cleanup after 30 minutes
go func() {
goRecoverOnce("job cleanup", func() {
time.Sleep(30 * time.Minute)
m.mu.Lock()
delete(m.jobs, id)
m.mu.Unlock()
}()
})
return j
}

View File

@@ -14,17 +14,17 @@ import (
)
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
// During an active SAT task window it records matching lines; on task finish
// it writes Warning status records to the component status DB.
// It supports multiple concurrent SAT tasks: a shared event window is open
// while any SAT task is running, and flushed when all tasks complete.
type kmsgWatcher struct {
mu sync.Mutex
activeWindow *kmsgWindow
statusDB *app.ComponentStatusDB
mu sync.Mutex
activeCount int // number of in-flight SAT tasks
window *kmsgWindow
statusDB *app.ComponentStatusDB
}
type kmsgWindow struct {
taskID string
target string
targets []string // SAT targets running concurrently
startedAt time.Time
seen map[kmsgEventKey]bool
events []kmsgEvent
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
// start launches the background kmsg reading goroutine.
func (w *kmsgWatcher) start() {
go w.run()
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
}
func (w *kmsgWatcher) run() {
f, err := os.Open("/dev/kmsg")
if err != nil {
slog.Warn("kmsg watcher unavailable", "err", err)
return
}
defer f.Close()
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
for {
f, err := os.Open("/dev/kmsg")
if err != nil {
slog.Warn("kmsg watcher unavailable", "err", err)
time.Sleep(30 * time.Second)
continue
}
w.mu.Lock()
if w.activeWindow != nil {
w.recordEvent(evt)
// Best-effort seek to end so we only capture events from now forward.
_, _ = f.Seek(0, io.SeekEnd)
scanner := bufio.NewScanner(f)
scanner.Buffer(make([]byte, 64*1024), 64*1024)
for scanner.Scan() {
line := scanner.Text()
evt, ok := parseKmsgLine(line)
if !ok {
continue
}
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
}
w.mu.Unlock()
}
w.mu.Unlock()
}
if err := scanner.Err(); err != nil {
slog.Warn("kmsg watcher stopped", "err", err)
if err := scanner.Err(); err != nil {
slog.Warn("kmsg watcher stopped", "err", err)
}
_ = f.Close()
time.Sleep(2 * time.Second)
}
}
@@ -85,48 +88,56 @@ func (w *kmsgWatcher) run() {
// Must be called with w.mu held.
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
if len(evt.ids) == 0 {
// Events without a device ID (e.g. MCE) — deduplicate by category.
key := kmsgEventKey{id: "", category: evt.category}
if !w.activeWindow.seen[key] {
w.activeWindow.seen[key] = true
w.activeWindow.events = append(w.activeWindow.events, evt)
if !w.window.seen[key] {
w.window.seen[key] = true
w.window.events = append(w.window.events, evt)
}
return
}
for _, id := range evt.ids {
key := kmsgEventKey{id: id, category: evt.category}
if !w.activeWindow.seen[key] {
w.activeWindow.seen[key] = true
w.activeWindow.events = append(w.activeWindow.events, evt)
if !w.window.seen[key] {
w.window.seen[key] = true
w.window.events = append(w.window.events, evt)
}
}
}
// NotifyTaskStarted opens a new event window for the given SAT task.
// NotifyTaskStarted increments the active task counter and opens a shared event window
// if this is the first task starting.
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
w.mu.Lock()
defer w.mu.Unlock()
w.activeWindow = &kmsgWindow{
taskID: taskID,
target: target,
startedAt: time.Now(),
seen: make(map[kmsgEventKey]bool),
if w.activeCount == 0 {
w.window = &kmsgWindow{
startedAt: time.Now(),
seen: make(map[kmsgEventKey]bool),
}
}
w.activeCount++
if w.window != nil {
w.window.targets = append(w.window.targets, target)
}
}
// NotifyTaskFinished closes the event window and asynchronously writes status records.
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
// it flushes the accumulated events to the status DB.
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
w.mu.Lock()
window := w.activeWindow
if window != nil && window.taskID == taskID {
w.activeWindow = nil
w.activeCount--
var window *kmsgWindow
if w.activeCount <= 0 {
w.activeCount = 0
window = w.window
w.window = nil
}
w.mu.Unlock()
if window == nil || len(window.events) == 0 {
return
}
go w.flushWindow(window)
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
}
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
@@ -164,7 +175,7 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
}
}
for key, detail := range seen {
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
w.statusDB.Record(key, source, "Warning", detail)
}
}
@@ -221,7 +232,8 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool {
switch target {
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress":
return true

View File

@@ -8,6 +8,7 @@ import (
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"bee/audit/internal/platform"
@@ -21,6 +22,13 @@ type MetricsDB struct {
db *sql.DB
}
func (m *MetricsDB) Close() error {
if m == nil || m.db == nil {
return nil
}
return m.db.Close()
}
// openMetricsDB opens (or creates) the metrics database at the given path.
func openMetricsDB(path string) (*MetricsDB, error) {
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
@@ -54,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
usage_pct REAL,
mem_usage_pct REAL,
power_w REAL,
clock_mhz REAL,
mem_clock_mhz REAL,
PRIMARY KEY (ts, gpu_index)
);
CREATE TABLE IF NOT EXISTS fan_metrics (
@@ -70,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
PRIMARY KEY (ts, name)
);
`)
if err != nil {
return err
}
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
return err
}
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
}
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
rows, err := db.Query("PRAGMA table_info(" + table + ")")
if err != nil {
return err
}
defer rows.Close()
for rows.Next() {
var cid int
var name, ctype string
var notNull, pk int
var dflt sql.NullString
if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
return err
}
if strings.EqualFold(name, column) {
return nil
}
}
if err := rows.Err(); err != nil {
return err
}
_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
return err
}
@@ -91,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
}
for _, g := range s.GPUs {
_, err = tx.Exec(
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
)
if err != nil {
return err
@@ -129,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
}
// LoadBetween returns samples in chronological order within the given time window.
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
if m == nil {
return nil, nil
}
if start.IsZero() || end.IsZero() {
return nil, nil
}
if end.Before(start) {
start, end = end, start
}
return m.loadSamples(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
start.Unix(), end.Unix(),
)
}
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(query, args...)
@@ -163,7 +222,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
}
gpuData := map[gpuKey]platform.GPUMetricRow{}
gRows, err := m.db.Query(
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
minTS, maxTS,
)
if err == nil {
@@ -171,7 +230,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
for gRows.Next() {
var ts int64
var g platform.GPUMetricRow
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
gpuData[gpuKey{ts, g.GPUIndex}] = g
}
}
@@ -283,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
func (m *MetricsDB) ExportCSV(w io.Writer) error {
rows, err := m.db.Query(`
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
g.clock_mhz, g.mem_clock_mhz
FROM sys_metrics s
LEFT JOIN gpu_metrics g ON g.ts = s.ts
ORDER BY s.ts, g.gpu_index
@@ -294,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
defer rows.Close()
cw := csv.NewWriter(w)
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
for rows.Next() {
var ts int64
var cpu, mem, pwr float64
var gpuIdx sql.NullInt64
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
continue
}
row := []string{
@@ -316,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
)
} else {
row = append(row, "", "", "", "", "")
row = append(row, "", "", "", "", "", "", "")
}
_ = cw.Write(row)
}
@@ -326,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
return cw.Error()
}
// Close closes the database.
func (m *MetricsDB) Close() { _ = m.db.Close() }
func nullFloat(v float64) sql.NullFloat64 {
return sql.NullFloat64{Float64: v, Valid: true}
}

View File

@@ -1,11 +1,13 @@
package webui
import (
"database/sql"
"path/filepath"
"testing"
"time"
"bee/audit/internal/platform"
_ "modernc.org/sqlite"
)
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
}
}
}
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
path := filepath.Join(t.TempDir(), "metrics.db")
raw, err := sql.Open("sqlite", path)
if err != nil {
t.Fatalf("sql.Open: %v", err)
}
_, err = raw.Exec(`
CREATE TABLE gpu_metrics (
ts INTEGER NOT NULL,
gpu_index INTEGER NOT NULL,
temp_c REAL,
usage_pct REAL,
mem_usage_pct REAL,
power_w REAL,
PRIMARY KEY (ts, gpu_index)
);
CREATE TABLE sys_metrics (
ts INTEGER NOT NULL,
cpu_load_pct REAL,
mem_load_pct REAL,
power_w REAL,
PRIMARY KEY (ts)
);
CREATE TABLE fan_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
rpm REAL,
PRIMARY KEY (ts, name)
);
CREATE TABLE temp_metrics (
ts INTEGER NOT NULL,
name TEXT NOT NULL,
grp TEXT NOT NULL,
celsius REAL,
PRIMARY KEY (ts, name)
);
`)
if err != nil {
t.Fatalf("create legacy schema: %v", err)
}
_ = raw.Close()
db, err := openMetricsDB(path)
if err != nil {
t.Fatalf("openMetricsDB: %v", err)
}
defer db.Close()
now := time.Unix(1_700_000_100, 0).UTC()
err = db.Write(platform.LiveMetricSample{
Timestamp: now,
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
},
})
if err != nil {
t.Fatalf("Write: %v", err)
}
samples, err := db.LoadAll()
if err != nil {
t.Fatalf("LoadAll: %v", err)
}
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
t.Fatalf("samples=%+v", samples)
}
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
t.Fatalf("ClockMHz=%v want 1410", got)
}
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
t.Fatalf("MemClockMHz=%v want 2600", got)
}
}
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
if err != nil {
t.Fatalf("openMetricsDB: %v", err)
}
defer db.Close()
base := time.Unix(1_700_000_000, 0).UTC()
for i := 0; i < 5; i++ {
if err := db.Write(platform.LiveMetricSample{
Timestamp: base.Add(time.Duration(i) * time.Minute),
CPULoadPct: float64(i),
}); err != nil {
t.Fatalf("Write(%d): %v", i, err)
}
}
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
if err != nil {
t.Fatalf("LoadBetween: %v", err)
}
if len(got) != 3 {
t.Fatalf("LoadBetween len=%d want 3", len(got))
}
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,15 +1,19 @@
package webui
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"html"
"io"
"log/slog"
"mime"
"net"
"net/http"
"os"
"path/filepath"
"runtime/debug"
"sort"
"strings"
"sync"
@@ -18,7 +22,6 @@ import (
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/runtimeenv"
gocharts "github.com/go-analyze/charts"
"reanimator/chart/viewer"
"reanimator/chart/web"
)
@@ -234,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
// SAT
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -247,6 +256,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -255,6 +265,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
// Services
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
@@ -283,6 +294,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
// System
@@ -309,11 +321,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("GET /", h.handlePage)
h.mux = mux
return mux
return recoverMiddleware(mux)
}
func (h *handler) startMetricsCollector() {
go func() {
goRecoverLoop("metrics collector", 2*time.Second, func() {
ticker := time.NewTicker(metricsCollectInterval)
defer ticker.Stop()
for range ticker.C {
@@ -324,7 +336,7 @@ func (h *handler) startMetricsCollector() {
h.feedRings(sample)
h.setLatestMetric(sample)
}
}()
})
}
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
@@ -345,7 +357,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
// ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts))
srv := &http.Server{
Addr: addr,
Handler: NewHandler(opts),
ReadHeaderTimeout: 5 * time.Second,
ReadTimeout: 30 * time.Second,
IdleTimeout: 2 * time.Minute,
}
return srv.ListenAndServe()
}
type trackingResponseWriter struct {
http.ResponseWriter
wroteHeader bool
}
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
w.wroteHeader = true
w.ResponseWriter.WriteHeader(statusCode)
}
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
w.wroteHeader = true
return w.ResponseWriter.Write(p)
}
func (w *trackingResponseWriter) Flush() {
w.wroteHeader = true
if f, ok := w.ResponseWriter.(http.Flusher); ok {
f.Flush()
}
}
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
h, ok := w.ResponseWriter.(http.Hijacker)
if !ok {
return nil, nil, fmt.Errorf("hijacking not supported")
}
return h.Hijack()
}
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
p, ok := w.ResponseWriter.(http.Pusher)
if !ok {
return http.ErrNotSupported
}
return p.Push(target, opts)
}
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
rf, ok := w.ResponseWriter.(io.ReaderFrom)
if !ok {
return io.Copy(w.ResponseWriter, r)
}
w.wroteHeader = true
return rf.ReadFrom(r)
}
func recoverMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tw := &trackingResponseWriter{ResponseWriter: w}
defer func() {
if rec := recover(); rec != nil {
slog.Error("http handler panic",
"method", r.Method,
"path", r.URL.Path,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
if !tw.wroteHeader {
http.Error(tw, "internal server error", http.StatusInternalServerError)
}
}
}()
next.ServeHTTP(tw, r)
})
}
// ── Infrastructure handlers ──────────────────────────────────────────────────
@@ -475,13 +561,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
return
}
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
timeline := metricsTimelineSegments(samples, time.Now())
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
@@ -491,14 +608,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
return nil, nil, nil, "", nil, nil, false
}
return chartDataFromSamples(path, samples)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64
var names []string
@@ -578,18 +687,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "gpu-all-clock":
title = "GPU Core Clock"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-memclock":
title = "GPU Memory Clock"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
yMin, yMax = autoBounds120(datasets...)
case strings.HasPrefix(path, "gpu/"):
rest := strings.TrimPrefix(path, "gpu/")
sub := ""
if i := strings.LastIndex(rest, "-"); i > 0 {
sub = rest[i+1:]
rest = rest[:i]
idx, sub, ok := parseGPUChartPath(path)
if !ok {
return nil, nil, nil, "", nil, nil, false
}
idx := 0
fmt.Sscanf(rest, "%d", &idx)
switch sub {
case "load":
title = fmt.Sprintf("GPU %d Load", idx)
title = gpuDisplayLabel(idx) + " Load"
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil {
@@ -600,7 +715,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
yMin = floatPtr(0)
yMax = floatPtr(100)
case "temp":
title = fmt.Sprintf("GPU %d Temperature", idx)
title = gpuDisplayLabel(idx) + " Temperature"
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil {
return nil, nil, nil, "", nil, nil, false
@@ -609,8 +724,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
names = []string{"Temp °C"}
yMin = floatPtr(0)
yMax = autoMax120(temp)
case "clock":
title = gpuDisplayLabel(idx) + " Core Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{clock}
names = []string{"Core Clock MHz"}
yMin, yMax = autoBounds120(clock)
case "memclock":
title = gpuDisplayLabel(idx) + " Memory Clock"
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if clock == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{clock}
names = []string{"Memory Clock MHz"}
yMin, yMax = autoBounds120(clock)
default:
title = fmt.Sprintf("GPU %d Power", idx)
title = gpuDisplayLabel(idx) + " Power"
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil {
return nil, nil, nil, "", nil, nil, false
@@ -627,6 +760,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
}
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
if !strings.HasPrefix(path, "gpu/") {
return 0, "", false
}
rest := strings.TrimPrefix(path, "gpu/")
if rest == "" {
return 0, "", false
}
sub = ""
if i := strings.LastIndex(rest, "-"); i > 0 {
sub = rest[i+1:]
rest = rest[:i]
}
n, err := fmt.Sscanf(rest, "%d", &idx)
if err != nil || n != 1 {
return 0, "", false
}
return idx, sub, true
}
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
labels := make([]string, len(samples))
if len(samples) == 0 {
@@ -719,7 +872,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
continue
}
datasets = append(datasets, ds)
names = append(names, fmt.Sprintf("GPU %d", idx))
names = append(names, gpuDisplayLabel(idx))
}
return datasets, names
}
@@ -852,64 +1005,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
return floatPtr(low), floatPtr(high)
}
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
n := len(labels)
if n == 0 {
n = 1
labels = []string{""}
func gpuChartLabelIndices(total, target int) []int {
if total <= 0 {
return nil
}
for i := range datasets {
if len(datasets[i]) == 0 {
datasets[i] = make([]float64, n)
}
if total == 1 {
return []int{0}
}
// Append global min/avg/max to title.
mn, avg, mx := globalStats(datasets)
if mx > 0 {
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
title,
chartLegendNumber(mn),
chartLegendNumber(avg),
chartLegendNumber(mx),
)
step := total / target
if step < 1 {
step = 1
}
title = sanitizeChartText(title)
names = sanitizeChartTexts(names)
sparse := sanitizeChartTexts(sparseLabels(labels, 6))
var indices []int
for i := 0; i < total; i += step {
indices = append(indices, i)
}
if indices[len(indices)-1] != total-1 {
indices = append(indices, total-1)
}
return indices
}
opt := gocharts.NewLineChartOptionWithData(datasets)
opt.Title = gocharts.TitleOption{Text: title}
opt.XAxis.Labels = sparse
opt.Legend = gocharts.LegendOption{SeriesNames: names}
if chartLegendVisible(len(names)) {
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
opt.Legend.OverlayChart = gocharts.Ptr(false)
} else {
opt.Legend.Show = gocharts.Ptr(false)
}
opt.Symbol = gocharts.SymbolNone
// Right padding: reserve space for the MarkLine label (library recommendation).
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
if yMin != nil || yMax != nil {
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
func chartCanvasHeightForPath(path string, seriesCount int) int {
height := chartCanvasHeight(seriesCount)
if isGPUChartPath(path) {
return height * 2
}
return height
}
// Add a single peak mark line on the series that holds the global maximum.
peakIdx, _ := globalPeakSeries(datasets)
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
}
p := gocharts.NewPainter(gocharts.PainterOptions{
OutputFormat: gocharts.ChartOutputSVG,
Width: 1400,
Height: chartCanvasHeight(len(names)),
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
if err := p.LineChart(opt); err != nil {
return nil, err
}
return p.Bytes()
func isGPUChartPath(path string) bool {
return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
}
func chartLegendVisible(seriesCount int) bool {
@@ -923,30 +1049,6 @@ func chartCanvasHeight(seriesCount int) int {
return 288
}
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
return gocharts.YAxisOption{
Min: yMin,
Max: yMax,
LabelCount: 11,
ValueFormatter: chartYAxisNumber,
}
}
// globalPeakSeries returns the index of the series containing the global maximum
// value across all datasets, and that maximum value.
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
idx = -1
for i, ds := range datasets {
for _, v := range ds {
if v > peak {
peak = v
idx = i
}
}
}
return idx, peak
}
// globalStats returns min, average, and max across all values in all datasets.
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
var sum float64
@@ -986,21 +1088,6 @@ func sanitizeChartText(s string) string {
}, s))
}
func sanitizeChartTexts(in []string) []string {
out := make([]string, len(in))
for i, s := range in {
out[i] = sanitizeChartText(s)
}
return out
}
func safeIdx(s []float64, i int) float64 {
if i < len(s) {
return s[i]
}
return 0
}
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
var datasets [][]float64
var names []string
@@ -1087,20 +1174,6 @@ func chartYAxisNumber(v float64) string {
return out
}
func sparseLabels(labels []string, n int) []string {
out := make([]string, len(labels))
step := len(labels) / n
if step < 1 {
step = 1
}
for i, l := range labels {
if i%step == 0 {
out[i] = l
}
}
return out
}
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
if h.metricsDB == nil {
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
@@ -1116,6 +1189,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store")
if strings.TrimSpace(h.opts.AuditPath) == "" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ready"))
return
}
if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.WriteHeader(http.StatusServiceUnavailable)
_, _ = w.Write([]byte("starting"))

View File

@@ -34,6 +34,49 @@ func TestChartLegendNumber(t *testing.T) {
}
}
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
panic("boom")
}))
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
handler.ServeHTTP(rec, req)
if rec.Code != http.StatusInternalServerError {
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
}
if !strings.Contains(rec.Body.String(), "internal server error") {
t.Fatalf("body=%q", rec.Body.String())
}
}
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if !sseStart(w) {
return
}
if !sseWrite(w, "tick", "ok") {
t.Fatal("expected sse write to succeed")
}
}))
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
handler.ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
t.Fatalf("content-type=%q", got)
}
body := rec.Body.String()
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
t.Fatalf("body=%q", body)
}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{
@@ -136,6 +179,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
}
}
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
samples := []platform.LiveMetricSample{
{
Timestamp: time.Now().Add(-2 * time.Minute),
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, ClockMHz: 1400},
{GPUIndex: 3, ClockMHz: 1500},
},
},
{
Timestamp: time.Now().Add(-1 * time.Minute),
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, ClockMHz: 1410},
{GPUIndex: 3, ClockMHz: 1510},
},
},
}
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
if !ok {
t.Fatal("gpu-all-clock returned ok=false")
}
if title != "GPU Core Clock" {
t.Fatalf("title=%q", title)
}
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
t.Fatalf("names=%v", names)
}
if got := datasets[1][1]; got != 1510 {
t.Fatalf("GPU 3 core clock=%v want 1510", got)
}
}
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
want := []float64{0, 480, 480, 480, 510, 510}
@@ -157,6 +233,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
if !strings.Contains(body, "el.dataset.loading === '1'") {
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
}
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
}
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
}
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
}
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
}
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
}
}
func TestChartLegendVisible(t *testing.T) {
@@ -199,6 +290,124 @@ func TestChartCanvasHeight(t *testing.T) {
}
}
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
end := start.Add(10 * time.Minute)
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
s := start.Add(offsetStart)
e := start.Add(offsetEnd)
return Task{
Name: "task",
Status: TaskDone,
StartedAt: &s,
DoneAt: &e,
}
}
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
taskWindow(1*time.Minute, 3*time.Minute),
taskWindow(2*time.Minute, 5*time.Minute),
taskWindow(7*time.Minute, 8*time.Minute),
})
if len(segments) != 5 {
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
}
wantActive := []bool{false, true, false, true, false}
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
for i, segment := range segments {
if segment.Active != wantActive[i] {
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
}
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
}
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
}
}
}
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
labels := []string{"12:00", "12:01", "12:02"}
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
svg, err := renderMetricChartSVG(
"System Power",
labels,
times,
[][]float64{{300, 320, 310}},
[]string{"Power W"},
floatPtr(0),
floatPtr(400),
360,
[]chartTimelineSegment{
{Start: start, End: start.Add(time.Minute), Active: false},
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
},
)
if err != nil {
t.Fatal(err)
}
body := string(svg)
if !strings.Contains(body, `data-role="timeline-overlay"`) {
t.Fatalf("svg missing timeline overlay: %s", body)
}
if !strings.Contains(body, `opacity="0.10"`) {
t.Fatalf("svg missing idle overlay opacity: %s", body)
}
if !strings.Contains(body, `System Power`) {
t.Fatalf("svg missing chart title: %s", body)
}
}
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
dir := t.TempDir()
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
if err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = db.db.Close() })
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
for i, sample := range []platform.LiveMetricSample{
{Timestamp: start, PowerW: 300},
{Timestamp: start.Add(time.Minute), PowerW: 320},
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
} {
if err := db.Write(sample); err != nil {
t.Fatalf("write sample %d: %v", i, err)
}
}
globalQueue.mu.Lock()
prevTasks := globalQueue.tasks
s := start.Add(30 * time.Second)
e := start.Add(90 * time.Second)
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = prevTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
h.handleMetricsChartSVG(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
body := rec.Body.String()
if !strings.Contains(body, `data-role="timeline-overlay"`) {
t.Fatalf("custom svg response missing timeline overlay: %s", body)
}
if !strings.Contains(body, `stroke-linecap="round"`) {
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
}
}
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -212,21 +421,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
}
}
func TestChartYAxisOption(t *testing.T) {
min := floatPtr(0)
max := floatPtr(100)
opt := chartYAxisOption(min, max)
if opt.Min != min || opt.Max != max {
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
}
if opt.LabelCount != 11 {
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
}
if got := opt.ValueFormatter(1000); got != "1к" {
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
}
}
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
r1 := newMetricsRing(4)
r2 := newMetricsRing(4)
@@ -275,9 +469,10 @@ func TestRootRendersDashboard(t *testing.T) {
}
handler := NewHandler(HandlerOptions{
Title: "Bee Hardware Audit",
AuditPath: path,
ExportDir: exportDir,
Title: "Bee Hardware Audit",
BuildLabel: "1.2.3",
AuditPath: path,
ExportDir: exportDir,
})
first := httptest.NewRecorder()
@@ -292,6 +487,11 @@ func TestRootRendersDashboard(t *testing.T) {
if !strings.Contains(first.Body.String(), `/viewer`) {
t.Fatalf("first body missing viewer link: %s", first.Body.String())
}
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
navIdx := strings.Index(first.Body.String(), `href="/"`)
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
}
if got := first.Header().Get("Cache-Control"); got != "no-store" {
t.Fatalf("first cache-control=%q", got)
}
@@ -329,7 +529,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) {
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
t.Fatalf("dashboard missing run audit button: %s", body)
}
if strings.Contains(body, `No audit data`) {
@@ -337,6 +537,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
}
}
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if strings.TrimSpace(rec.Body.String()) != "ready" {
t.Fatalf("body=%q want ready", rec.Body.String())
}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
@@ -359,7 +571,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
}
}
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
@@ -367,8 +579,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `id="task-log-overlay"`) {
t.Fatalf("tasks page missing log modal overlay: %s", body)
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
t.Fatalf("tasks page missing task report hint: %s", body)
}
if !strings.Contains(body, `_taskPageSize = 50`) {
t.Fatalf("tasks page missing pagination size config: %s", body)
@@ -395,6 +607,120 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body)
}
if !strings.Contains(body, `Export to USB`) {
t.Fatalf("tools page missing export to usb section: %s", body)
}
if !strings.Contains(body, `Support Bundle</button>`) {
t.Fatalf("tools page missing support bundle usb button: %s", body)
}
}
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`href="/benchmark"`,
`id="benchmark-gpu-list"`,
`/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`,
`benchmark-run-nccl`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)
}
}
}
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA GPU Targeted Stress`,
`nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Max Compute Load`,
`dcgmproftester`,
`targeted_stress remain in <a href="/validate">Validate</a>`,
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
`id="burn-gpu-list"`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("burn page missing %q: %s", needle, body)
}
}
}
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
dir := t.TempDir()
exportDir := filepath.Join(dir, "export")
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
if err := os.MkdirAll(reportDir, 0755); err != nil {
t.Fatal(err)
}
reportPath := filepath.Join(reportDir, "report.html")
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
t.Fatal(err)
}
globalQueue.mu.Lock()
origTasks := globalQueue.tasks
globalQueue.tasks = []*Task{{
ID: "task-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskDone,
CreatedAt: time.Now(),
ArtifactsDir: reportDir,
ReportHTMLPath: reportPath,
}}
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = origTasks
globalQueue.mu.Unlock()
})
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `saved report`) {
t.Fatalf("task detail page missing saved report: %s", body)
}
if !strings.Contains(body, `Back to Tasks`) {
t.Fatalf("task detail page missing back link: %s", body)
}
}
func TestViewerRendersLatestSnapshot(t *testing.T) {
@@ -518,3 +844,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
}
}
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
exportDir := filepath.Join(dir, "export")
if err := os.MkdirAll(exportDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
t.Fatal(err)
}
health := `{
"status":"PARTIAL",
"checked_at":"2026-03-16T10:00:00Z",
"export_dir":"/tmp/export",
"driver_ready":true,
"cuda_ready":false,
"network_status":"PARTIAL",
"issues":[
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
],
"tools":[
{"name":"dmidecode","ok":true},
{"name":"nvidia-smi","ok":false}
],
"services":[
{"name":"bee-web","status":"active"},
{"name":"bee-nvidia","status":"inactive"}
]
}`
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
t.Fatal(err)
}
componentStatus := `[
{
"component_key":"cpu:all",
"status":"Warning",
"error_summary":"cpu SAT: FAILED",
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
},
{
"component_key":"memory:all",
"status":"OK",
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
},
{
"component_key":"storage:nvme0n1",
"status":"Critical",
"error_summary":"storage SAT: FAILED",
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
},
{
"component_key":"pcie:gpu:nvidia",
"status":"Warning",
"error_summary":"nvidia SAT: FAILED",
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
}
]`
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
body := rec.Body.String()
for _, needle := range []string{
`Runtime Health`,
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
`Export Directory`,
`Network`,
`NVIDIA/AMD Driver`,
`CUDA / ROCm`,
`Required Utilities`,
`Bee Services`,
`<td>CPU</td>`,
`<td>Memory</td>`,
`<td>Storage</td>`,
`<td>GPU</td>`,
`CUDA runtime is not ready for GPU SAT.`,
`Missing: nvidia-smi`,
`bee-nvidia=inactive`,
`cpu SAT: FAILED`,
`storage SAT: FAILED`,
`sat:nvidia`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body)
}
}
}

View File

@@ -0,0 +1,42 @@
package webui
import (
"fmt"
"log/slog"
"runtime/debug"
"time"
)
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
go func() {
for {
if !runRecoverable(name, fn) {
return
}
if restartDelay > 0 {
time.Sleep(restartDelay)
}
}
}()
}
func goRecoverOnce(name string, fn func()) {
go func() {
_ = runRecoverable(name, fn)
}()
}
func runRecoverable(name string, fn func()) (panicked bool) {
defer func() {
if rec := recover(); rec != nil {
panicked = true
slog.Error("recovered panic",
"component", name,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
}
}()
fn()
return false
}

View File

@@ -0,0 +1,85 @@
package webui
import (
"fmt"
"html"
"net/http"
"os"
"strings"
)
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
id := r.PathValue("id")
task, ok := globalQueue.findByID(id)
if !ok {
http.NotFound(w, r)
return
}
snapshot := *task
body := renderTaskDetailPage(h.opts, snapshot)
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write([]byte(body))
}
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
title := task.Name
if strings.TrimSpace(title) == "" {
title = task.ID
}
var body strings.Builder
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
body.WriteString(`</div>`)
if report := loadTaskReportFragment(task); report != "" {
body.WriteString(report)
} else {
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
if strings.TrimSpace(task.ErrMsg) != "" {
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
}
body.WriteString(`</div></div>`)
}
if task.Status == TaskRunning || task.Status == TaskPending {
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
body.WriteString(`</div></div>`)
body.WriteString(`<script>
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log');
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
_taskDetailES.onerror = function(){ _taskDetailES.close(); };
</script>`)
}
return layoutHead(opts.Title+" — "+title) +
layoutNav("tasks", opts.BuildLabel) +
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
body.String() +
`</div></div></body></html>`
}
func loadTaskReportFragment(task Task) string {
if strings.TrimSpace(task.ReportHTMLPath) == "" {
return ""
}
data, err := os.ReadFile(task.ReportHTMLPath)
if err != nil || len(data) == 0 {
return ""
}
return string(data)
}
func taskArtifactDownloadLink(task Task, absPath string) string {
if strings.TrimSpace(absPath) == "" {
return ""
}
return fmt.Sprintf(`/export/file?path=%s`, absPath)
}

View File

@@ -0,0 +1,286 @@
package webui
import (
"encoding/json"
"fmt"
"html"
"os"
"path/filepath"
"sort"
"strings"
"time"
"bee/audit/internal/platform"
)
var taskReportMetricsDBPath = metricsDBPath
type taskReport struct {
ID string `json:"id"`
Name string `json:"name"`
Target string `json:"target"`
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"`
DurationSec int `json:"duration_sec,omitempty"`
Error string `json:"error,omitempty"`
LogFile string `json:"log_file,omitempty"`
Charts []taskReportChart `json:"charts,omitempty"`
GeneratedAt time.Time `json:"generated_at"`
}
type taskReportChart struct {
Title string `json:"title"`
File string `json:"file"`
}
type taskChartSpec struct {
Path string
File string
}
var taskDashboardChartSpecs = []taskChartSpec{
{Path: "server-load", File: "server-load.svg"},
{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
{Path: "server-power", File: "server-power.svg"},
{Path: "server-fans", File: "server-fans.svg"},
{Path: "gpu-all-load", File: "gpu-all-load.svg"},
{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
{Path: "gpu-all-power", File: "gpu-all-power.svg"},
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
}
func writeTaskReportArtifacts(t *Task) error {
if t == nil {
return nil
}
ensureTaskReportPaths(t)
if strings.TrimSpace(t.ArtifactsDir) == "" {
return nil
}
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
return err
}
start, end := taskTimeWindow(t)
samples, _ := loadTaskMetricSamples(start, end)
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
logText := ""
if data, err := os.ReadFile(t.LogPath); err == nil {
logText = string(data)
}
report := taskReport{
ID: t.ID,
Name: t.Name,
Target: t.Target,
Status: t.Status,
CreatedAt: t.CreatedAt,
StartedAt: t.StartedAt,
DoneAt: t.DoneAt,
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
Error: t.ErrMsg,
LogFile: filepath.Base(t.LogPath),
Charts: charts,
GeneratedAt: time.Now().UTC(),
}
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
return err
}
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
}
func reportDoneTime(t *Task) time.Time {
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
return *t.DoneAt
}
return time.Now()
}
func taskTimeWindow(t *Task) (time.Time, time.Time) {
if t == nil {
now := time.Now().UTC()
return now, now
}
start := t.CreatedAt.UTC()
if t.StartedAt != nil && !t.StartedAt.IsZero() {
start = t.StartedAt.UTC()
}
end := time.Now().UTC()
if t.DoneAt != nil && !t.DoneAt.IsZero() {
end = t.DoneAt.UTC()
}
if end.Before(start) {
end = start
}
return start, end
}
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
db, err := openMetricsDB(taskReportMetricsDBPath)
if err != nil {
return nil, err
}
defer db.Close()
return db.LoadBetween(start, end)
}
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
if len(samples) == 0 {
return nil, nil
}
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
var charts []taskReportChart
inline := make(map[string]string)
for _, spec := range taskDashboardChartSpecs {
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
if !ok || len(svg) == 0 {
continue
}
path := filepath.Join(dir, spec.File)
if err := os.WriteFile(path, svg, 0644); err != nil {
continue
}
charts = append(charts, taskReportChart{Title: title, File: spec.File})
inline[spec.File] = string(svg)
}
for _, idx := range taskGPUIndices(samples) {
file := fmt.Sprintf("gpu-%d-overview.svg", idx)
svg, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil || !ok || len(svg) == 0 {
continue
}
path := filepath.Join(dir, file)
if err := os.WriteFile(path, svg, 0644); err != nil {
continue
}
charts = append(charts, taskReportChart{Title: gpuDisplayLabel(idx) + " Overview", File: file})
inline[file] = string(svg)
}
return charts, inline
}
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
if !ok {
return "", nil, false
}
buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
if err != nil {
return "", nil, false
}
return title, buf, true
}
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
seen := map[int]bool{}
var out []int
for _, s := range samples {
for _, g := range s.GPUs {
if seen[g.GPUIndex] {
continue
}
seen[g.GPUIndex] = true
out = append(out, g.GPUIndex)
}
}
sort.Ints(out)
return out
}
func writeJSONFile(path string, v any) error {
data, err := json.MarshalIndent(v, "", " ")
if err != nil {
return err
}
return os.WriteFile(path, data, 0644)
}
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
b.WriteString(`<div class="grid2">`)
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
if strings.TrimSpace(report.Error) != "" {
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
}
b.WriteString(`</div></div>`)
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
b.WriteString(`</div></div></div>`)
if len(report.Charts) > 0 {
b.WriteString(`<div class="grid2">`)
for _, chart := range report.Charts {
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
b.WriteString(charts[chart.File])
b.WriteString(`</div></div>`)
}
b.WriteString(`</div>`)
} else {
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
}
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
b.WriteString(`</div></div>`)
return b.String()
}
func renderTaskStatusBadge(status string) string {
className := map[string]string{
TaskRunning: "badge-ok",
TaskPending: "badge-unknown",
TaskDone: "badge-ok",
TaskFailed: "badge-err",
TaskCancelled: "badge-unknown",
}[status]
if className == "" {
className = "badge-unknown"
}
label := strings.TrimSpace(status)
if label == "" {
label = "unknown"
}
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
}
func formatTaskTime(ts *time.Time, fallback time.Time) string {
if ts != nil && !ts.IsZero() {
return ts.Local().Format("2006-01-02 15:04:05")
}
if !fallback.IsZero() {
return fallback.Local().Format("2006-01-02 15:04:05")
}
return "n/a"
}
func formatTaskDuration(sec int) string {
if sec <= 0 {
return "n/a"
}
if sec < 60 {
return fmt.Sprintf("%ds", sec)
}
if sec < 3600 {
return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
}
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
}

View File

@@ -4,10 +4,12 @@ import (
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"os/exec"
"path/filepath"
"runtime/debug"
"sort"
"strings"
"sync"
@@ -28,22 +30,29 @@ const (
// taskNames maps target → human-readable name for validate (SAT) runs.
var taskNames = map[string]string{
"nvidia": "NVIDIA SAT",
"nvidia-stress": "NVIDIA GPU Stress",
"memory": "Memory SAT",
"storage": "Storage SAT",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
"amd-bandwidth": "AMD GPU MEM Bandwidth",
"amd-stress": "AMD GPU Burn-in",
"memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
"nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"nvidia-stress": "NVIDIA GPU Stress",
"memory": "Memory SAT",
"storage": "Storage SAT",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
"amd-bandwidth": "AMD GPU MEM Bandwidth",
"amd-stress": "AMD GPU Burn-in",
"memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
}
// burnNames maps target → human-readable name when a burn profile is set.
@@ -83,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {
// Task represents one unit of work in the queue.
type Task struct {
ID string `json:"id"`
Name string `json:"name"`
Target string `json:"target"`
Priority int `json:"priority"`
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"`
ElapsedSec int `json:"elapsed_sec,omitempty"`
ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"`
ID string `json:"id"`
Name string `json:"name"`
Target string `json:"target"`
Priority int `json:"priority"`
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"`
ElapsedSec int `json:"elapsed_sec,omitempty"`
ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"`
ArtifactsDir string `json:"artifacts_dir,omitempty"`
ReportJSONPath string `json:"report_json_path,omitempty"`
ReportHTMLPath string `json:"report_html_path,omitempty"`
// runtime fields (not serialised)
job *jobState
@@ -106,67 +118,81 @@ type taskParams struct {
DiagLevel int `json:"diag_level,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
SizeMB int `json:"size_mb,omitempty"`
Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"`
DisplayName string `json:"display_name,omitempty"`
Device string `json:"device,omitempty"` // for install
PlatformComponents []string `json:"platform_components,omitempty"`
}
type persistedTask struct {
ID string `json:"id"`
Name string `json:"name"`
Target string `json:"target"`
Priority int `json:"priority"`
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"`
ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"`
Params taskParams `json:"params,omitempty"`
ID string `json:"id"`
Name string `json:"name"`
Target string `json:"target"`
Priority int `json:"priority"`
Status string `json:"status"`
CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"`
ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"`
ArtifactsDir string `json:"artifacts_dir,omitempty"`
ReportJSONPath string `json:"report_json_path,omitempty"`
ReportHTMLPath string `json:"report_html_path,omitempty"`
Params taskParams `json:"params,omitempty"`
}
type burnPreset struct {
NvidiaDiag int
DurationSec int
}
func resolveBurnPreset(profile string) burnPreset {
switch profile {
case "overnight":
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
return burnPreset{DurationSec: 8 * 60 * 60}
case "acceptance":
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
return burnPreset{DurationSec: 60 * 60}
default:
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
return burnPreset{DurationSec: 5 * 60}
}
}
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
{LoadSec: 100, IdleSec: 10},
{LoadSec: 145, IdleSec: 15},
{LoadSec: 190, IdleSec: 20},
{LoadSec: 235, IdleSec: 25},
{LoadSec: 280, IdleSec: 30},
{LoadSec: 325, IdleSec: 35},
{LoadSec: 370, IdleSec: 40},
{LoadSec: 415, IdleSec: 45},
{LoadSec: 460, IdleSec: 50},
{LoadSec: 510, IdleSec: 0},
}
switch profile {
case "overnight":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
}}
cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
for range 8 {
cycles = append(cycles, acceptanceCycles...)
}
return platform.PlatformStressOptions{Cycles: cycles}
case "acceptance":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
}}
return platform.PlatformStressOptions{Cycles: acceptanceCycles}
default: // smoke
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 90, IdleSec: 60},
{LoadSec: 90, IdleSec: 30},
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
}}
}
}
@@ -377,7 +403,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
if !q.started {
q.loadLocked()
q.started = true
go q.worker()
goRecoverLoop("task worker", 2*time.Second, q.worker)
}
hasPending := q.nextPending() != nil
q.mu.Unlock()
@@ -392,52 +418,107 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
func (q *taskQueue) worker() {
for {
<-q.trigger
setCPUGovernor("performance")
for {
func() {
setCPUGovernor("performance")
defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
t := q.nextPending()
if t == nil {
q.mu.Unlock()
break
}
now := time.Now()
t.Status = TaskRunning
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath)
ctx, cancel := context.WithCancel(context.Background())
j.cancel = cancel
t.job = j
q.persistLocked()
q.mu.Unlock()
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
}
q.runTask(t, j, ctx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
q.mu.Lock()
now2 := time.Now()
t.DoneAt = &now2
if t.Status == TaskRunning { // not cancelled externally
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
var batch []*Task
for {
t := q.nextPending()
if t == nil {
break
}
now := time.Now()
t.Status = TaskRunning
t.StartedAt = &now
t.DoneAt = nil
t.ErrMsg = ""
j := newTaskJobState(t.LogPath)
t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked()
}
q.prune()
q.persistLocked()
q.mu.Unlock()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel
wg.Add(1)
goRecoverOnce("task "+t.Target, func() {
defer wg.Done()
defer taskCancel()
q.executeTask(t, j, taskCtx)
})
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}
}()
}
}
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
startedKmsgWatch := false
defer q.finalizeTaskRun(t, j)
defer func() {
if startedKmsgWatch && q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
setCPUGovernor("powersave")
}()
defer func() {
if rec := recover(); rec != nil {
msg := fmt.Sprintf("task panic: %v", rec)
slog.Error("task panic",
"task_id", t.ID,
"target", t.Target,
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
j.append("ERROR: " + msg)
j.finish(msg)
}
}()
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
startedKmsgWatch = true
}
q.runTask(t, j, ctx)
}
func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
q.mu.Lock()
now := time.Now()
t.DoneAt = &now
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
t.ErrMsg = ""
}
}
q.finalizeTaskArtifactPathsLocked(t)
q.persistLocked()
q.mu.Unlock()
if err := writeTaskReportArtifacts(t); err != nil {
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
}
}
@@ -479,9 +560,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break
}
diagLevel := t.params.DiagLevel
if t.params.BurnProfile != "" && diagLevel <= 0 {
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -494,6 +572,78 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
} else {
archive, err = a.RunNvidiaAcceptancePack("", j.append)
}
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")
@@ -852,10 +1002,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
}
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
if q.logsDir == "" || t.ID == "" {
return
}
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
q.ensureTaskArtifactPathsLocked(t)
}
func (q *taskQueue) loadLocked() {
@@ -872,17 +1022,20 @@ func (q *taskQueue) loadLocked() {
}
for _, pt := range persisted {
t := &Task{
ID: pt.ID,
Name: pt.Name,
Target: pt.Target,
Priority: pt.Priority,
Status: pt.Status,
CreatedAt: pt.CreatedAt,
StartedAt: pt.StartedAt,
DoneAt: pt.DoneAt,
ErrMsg: pt.ErrMsg,
LogPath: pt.LogPath,
params: pt.Params,
ID: pt.ID,
Name: pt.Name,
Target: pt.Target,
Priority: pt.Priority,
Status: pt.Status,
CreatedAt: pt.CreatedAt,
StartedAt: pt.StartedAt,
DoneAt: pt.DoneAt,
ErrMsg: pt.ErrMsg,
LogPath: pt.LogPath,
ArtifactsDir: pt.ArtifactsDir,
ReportJSONPath: pt.ReportJSONPath,
ReportHTMLPath: pt.ReportHTMLPath,
params: pt.Params,
}
q.assignTaskLogPathLocked(t)
if t.Status == TaskRunning {
@@ -913,17 +1066,20 @@ func (q *taskQueue) persistLocked() {
state := make([]persistedTask, 0, len(q.tasks))
for _, t := range q.tasks {
state = append(state, persistedTask{
ID: t.ID,
Name: t.Name,
Target: t.Target,
Priority: t.Priority,
Status: t.Status,
CreatedAt: t.CreatedAt,
StartedAt: t.StartedAt,
DoneAt: t.DoneAt,
ErrMsg: t.ErrMsg,
LogPath: t.LogPath,
Params: t.params,
ID: t.ID,
Name: t.Name,
Target: t.Target,
Priority: t.Priority,
Status: t.Status,
CreatedAt: t.CreatedAt,
StartedAt: t.StartedAt,
DoneAt: t.DoneAt,
ErrMsg: t.ErrMsg,
LogPath: t.LogPath,
ArtifactsDir: t.ArtifactsDir,
ReportJSONPath: t.ReportJSONPath,
ReportHTMLPath: t.ReportHTMLPath,
Params: t.params,
})
}
data, err := json.MarshalIndent(state, "", " ")
@@ -954,3 +1110,88 @@ func taskElapsedSec(t *Task, now time.Time) int {
}
return int(end.Sub(start).Round(time.Second) / time.Second)
}
func taskFolderStatus(status string) string {
status = strings.TrimSpace(strings.ToLower(status))
switch status {
case TaskRunning, TaskDone, TaskFailed, TaskCancelled:
return status
default:
return TaskPending
}
}
func sanitizeTaskFolderPart(s string) string {
s = strings.TrimSpace(strings.ToLower(s))
if s == "" {
return "task"
}
var b strings.Builder
lastDash := false
for _, r := range s {
isAlnum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
if isAlnum {
b.WriteRune(r)
lastDash = false
continue
}
if !lastDash {
b.WriteByte('-')
lastDash = true
}
}
out := strings.Trim(b.String(), "-")
if out == "" {
return "task"
}
return out
}
func taskArtifactsDir(root string, t *Task, status string) string {
if strings.TrimSpace(root) == "" || t == nil {
return ""
}
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
}
func ensureTaskReportPaths(t *Task) {
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
return
}
if t.LogPath == "" || filepath.Base(t.LogPath) == "task.log" {
t.LogPath = filepath.Join(t.ArtifactsDir, "task.log")
}
t.ReportJSONPath = filepath.Join(t.ArtifactsDir, "report.json")
t.ReportHTMLPath = filepath.Join(t.ArtifactsDir, "report.html")
}
func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
return
}
if strings.TrimSpace(t.ArtifactsDir) == "" {
t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
}
if t.ArtifactsDir != "" {
_ = os.MkdirAll(t.ArtifactsDir, 0755)
}
ensureTaskReportPaths(t)
}
func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
return
}
q.ensureTaskArtifactPathsLocked(t)
dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
if dstDir == "" {
return
}
if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
if _, err := os.Stat(dstDir); err != nil {
_ = os.Rename(t.ArtifactsDir, dstDir)
}
t.ArtifactsDir = dstDir
}
ensureTaskReportPaths(t)
}

View File

@@ -2,6 +2,7 @@ package webui
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
@@ -12,6 +13,7 @@ import (
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
)
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
@@ -248,15 +250,90 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
}
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
dir := t.TempDir()
metricsPath := filepath.Join(dir, "metrics.db")
prevMetricsPath := taskReportMetricsDBPath
taskReportMetricsDBPath = metricsPath
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
db, err := openMetricsDB(metricsPath)
if err != nil {
t.Fatalf("openMetricsDB: %v", err)
}
base := time.Now().UTC().Add(-45 * time.Second)
if err := db.Write(platform.LiveMetricSample{
Timestamp: base,
CPULoadPct: 42,
MemLoadPct: 35,
PowerW: 510,
}); err != nil {
t.Fatalf("Write: %v", err)
}
_ = db.Close()
q := &taskQueue{
statePath: filepath.Join(dir, "tasks-state.json"),
logsDir: filepath.Join(dir, "tasks"),
trigger: make(chan struct{}, 1),
}
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
t.Fatal(err)
}
started := time.Now().UTC().Add(-90 * time.Second)
task := &Task{
ID: "task-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskRunning,
CreatedAt: started.Add(-10 * time.Second),
StartedAt: &started,
}
q.assignTaskLogPathLocked(task)
appendJobLog(task.LogPath, "line-1")
job := newTaskJobState(task.LogPath)
job.finish("")
q.finalizeTaskRun(task, job)
if task.Status != TaskDone {
t.Fatalf("status=%q want %q", task.Status, TaskDone)
}
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
}
if _, err := os.Stat(task.ReportJSONPath); err != nil {
t.Fatalf("report json: %v", err)
}
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
t.Fatalf("report html: %v", err)
}
var report taskReport
data, err := os.ReadFile(task.ReportJSONPath)
if err != nil {
t.Fatalf("ReadFile(report.json): %v", err)
}
if err := json.Unmarshal(data, &report); err != nil {
t.Fatalf("Unmarshal(report.json): %v", err)
}
if report.ID != task.ID || report.Status != TaskDone {
t.Fatalf("report=%+v", report)
}
if len(report.Charts) == 0 {
t.Fatalf("expected charts in report, got none")
}
}
func TestResolveBurnPreset(t *testing.T) {
tests := []struct {
profile string
want burnPreset
}{
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
}
for _, tc := range tests {
if got := resolveBurnPreset(tc.profile); got != tc.want {
@@ -467,3 +544,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
t.Fatalf("unexpected error: %q", j.err)
}
}
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
dir := t.TempDir()
q := &taskQueue{
opts: &HandlerOptions{App: &app.App{}},
statePath: filepath.Join(dir, "tasks-state.json"),
logsDir: filepath.Join(dir, "tasks"),
kmsgWatcher: newKmsgWatcher(nil),
}
tk := &Task{
ID: "cpu-panic-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskRunning,
CreatedAt: time.Now(),
}
j := &jobState{}
orig := runCPUAcceptancePackCtx
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
panic("boom")
}
defer func() { runCPUAcceptancePackCtx = orig }()
q.executeTask(tk, j, context.Background())
if tk.Status != TaskFailed {
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
}
if tk.DoneAt == nil {
t.Fatal("expected done_at to be set")
}
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
t.Fatalf("task error=%q", tk.ErrMsg)
}
if !strings.Contains(j.err, "task panic: boom") {
t.Fatalf("job error=%q", j.err)
}
q.kmsgWatcher.mu.Lock()
activeCount := q.kmsgWatcher.activeCount
window := q.kmsgWatcher.window
q.kmsgWatcher.mu.Unlock()
if activeCount != 0 {
t.Fatalf("activeCount=%d want 0", activeCount)
}
if window != nil {
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
}
}

View File

@@ -0,0 +1,16 @@
#!/bin/sh
set -eu
tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
case "${tag}" in
v*)
printf '%s\n' "${tag#v}"
;;
"")
printf 'dev\n'
;;
*)
printf '%s\n' "${tag}"
;;
esac

2
bible

Submodule bible updated: 688b87e98d...1d89a4918e

View File

@@ -54,15 +54,8 @@ resolve_audit_version() {
return 0
fi
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
if [ -z "${tag}" ]; then
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
fi
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
case "${tag}" in
audit/v*)
echo "${tag#audit/v}"
return 0
;;
v*)
echo "${tag#v}"
return 0
@@ -309,6 +302,12 @@ memtest_fail() {
return 0
}
nvidia_runtime_fail() {
msg="$1"
echo "ERROR: ${msg}" >&2
exit 1
}
iso_memtest_present() {
iso_path="$1"
iso_files="$(mktemp)"
@@ -446,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ==="
}
validate_iso_nvidia_runtime() {
iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
echo "=== validating NVIDIA runtime in ISO ==="
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
squashfs_tmp="$(mktemp)"
squashfs_list="$(mktemp)"
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
}
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
}
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
}
rm -f "$squashfs_tmp" "$squashfs_list"
echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() {
grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1
@@ -1151,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi
fi
validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -30,6 +30,7 @@ systemctl enable bee-preflight.service
systemctl enable bee-audit.service
systemctl enable bee-web.service
systemctl enable bee-sshsetup.service
systemctl enable bee-selfheal.timer
systemctl enable ssh.service
systemctl enable lightdm.service 2>/dev/null || true
systemctl enable qemu-guest-agent.service 2>/dev/null || true
@@ -58,6 +59,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
if [ "$GPU_VENDOR" = "nvidia" ]; then
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
# NVIDIA DCGM (Data Center GPU Manager).
# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
# NVIDIA max-compute recipe. The smoketest/runtime contract treats
# dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
ok "dcgmi found: $p"
else
fail "dcgmi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
ok "nv-hostengine found: $p"
else
fail "nv-hostengine: NOT FOUND"
fi
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
DCGM_PROFTESTER="$p"
break
fi
done
if [ -n "$DCGM_PROFTESTER" ]; then
ok "dcgmproftester found: $DCGM_PROFTESTER"
else
fail "dcgmproftester: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi
done
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
ok "nvbandwidth found: $p"
else
warn "nvbandwidth: NOT FOUND"
fi
echo ""
echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia"
@@ -171,6 +202,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
fi
done
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
ok "timer active: bee-selfheal.timer"
else
fail "timer NOT active: bee-selfheal.timer"
fi
echo ""
echo "-- runtime health --"
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then

View File

@@ -1,7 +1,6 @@
[Unit]
Description=Bee: hardware audit
After=bee-preflight.service bee-network.service bee-nvidia.service
Before=bee-web.service
[Service]
Type=oneshot

View File

@@ -0,0 +1,9 @@
[Unit]
Description=Bee: periodic runtime self-heal
After=bee-web.service bee-audit.service bee-preflight.service
[Service]
Type=oneshot
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
StandardOutput=journal
StandardError=journal

View File

@@ -0,0 +1,11 @@
[Unit]
Description=Bee: run self-heal checks periodically
[Timer]
OnBootSec=45sec
OnUnitActiveSec=60sec
AccuracySec=15sec
Unit=bee-selfheal.service
[Install]
WantedBy=timers.target

View File

@@ -1,12 +1,12 @@
[Unit]
Description=Bee: hardware audit web viewer
After=bee-audit.service
StartLimitIntervalSec=0
[Service]
Type=simple
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
Restart=always
RestartSec=2
RestartSec=3
StandardOutput=journal
StandardError=journal
LimitMEMLOCK=infinity

View File

@@ -1,10 +1,11 @@
#!/bin/sh
set -eu
SECONDS=300
DURATION_SEC=300
DEVICES=""
EXCLUDE=""
FORMAT=""
TEST_SLICE_SECONDS=300
JOHN_DIR="/usr/local/lib/bee/john/run"
JOHN_BIN="${JOHN_DIR}/john"
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
while [ "$#" -gt 0 ]; do
case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -189,14 +190,51 @@ CHOSEN_FORMAT=$(choose_format) || {
exit 1
}
echo "format=${CHOSEN_FORMAT}"
run_john_loop() {
opencl_id="$1"
deadline="$2"
round=0
while :; do
now=$(date +%s)
remaining=$((deadline - now))
if [ "${remaining}" -le 0 ]; then
break
fi
round=$((round + 1))
slice="${remaining}"
if [ "${slice}" -gt "${TEST_SLICE_SECONDS}" ]; then
slice="${TEST_SLICE_SECONDS}"
fi
echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" || return 1
done
}
PIDS=""
cleanup() {
rc=$?
trap - EXIT INT TERM
for pid in ${PIDS}; do
kill "${pid}" 2>/dev/null || true
done
for pid in ${PIDS}; do
wait "${pid}" 2>/dev/null || true
done
exit "${rc}"
}
trap cleanup EXIT INT TERM
echo "format=${CHOSEN_FORMAT}"
echo "target_seconds=${DURATION_SEC}"
echo "slice_seconds=${TEST_SLICE_SECONDS}"
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
_first=1
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
[ "${_first}" = "1" ] || sleep 3
_first=0
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
PIDS="${PIDS} $!"
run_john_loop "${opencl_id}" "${DEADLINE}" &
pid=$!
PIDS="${PIDS} ${pid}"
done
FAIL=0
for pid in ${PIDS}; do

View File

@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
log "ldconfig refreshed"
# Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
# "group is empty" even when GPUs and modules are present.
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
if command -v nv-hostengine >/dev/null 2>&1; then
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "nv-hostengine already running — skipping"
else
if command -v pkill >/dev/null 2>&1; then
pkill -x nv-hostengine >/dev/null 2>&1 || true
tries=0
while pgrep -x nv-hostengine >/dev/null 2>&1; do
tries=$((tries + 1))
if [ "${tries}" -ge 10 ]; then
log "WARN: nv-hostengine is still running after restart request"
break
fi
sleep 1
done
if pgrep -x nv-hostengine >/dev/null 2>&1; then
log "WARN: keeping existing nv-hostengine process"
else
log "nv-hostengine restarted"
fi
else
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
fi
fi
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
nv-hostengine
log "nv-hostengine started"
fi

View File

@@ -0,0 +1,99 @@
#!/bin/bash
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
set -u
LOG_PREFIX="bee-selfheal"
EXPORT_DIR="/appdata/bee/export"
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
LOCK_DIR="/run/bee-selfheal.lock"
log() {
echo "[${LOG_PREFIX}] $*"
}
have_nvidia_gpu() {
lspci -nn 2>/dev/null | grep -qi '10de:'
}
service_active() {
systemctl is-active --quiet "$1" 2>/dev/null
}
restart_service() {
local svc="$1"
if systemctl restart "$svc" >/dev/null 2>&1; then
log "restarted ${svc}"
return 0
fi
log "WARN: failed to restart ${svc}"
return 1
}
file_ready() {
[ -s "$1" ]
}
artifact_state() {
local path="$1"
if [ -s "${path}" ]; then
echo "ready"
return 0
fi
if [ -e "${path}.tmp" ]; then
echo "interrupted"
return 0
fi
echo "missing"
}
web_healthy() {
bash -c 'exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3' \
>/dev/null 2>&1
}
mkdir -p "${EXPORT_DIR}" /run
if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
log "another self-heal run is already active"
exit 0
fi
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
log "start"
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
log "NVIDIA GPU detected but /dev/nvidia0 is missing"
restart_service bee-nvidia.service || true
fi
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
if [ "${runtime_state}" != "ready" ]; then
if [ "${runtime_state}" = "interrupted" ]; then
log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
else
log "runtime-health.json missing or empty"
fi
restart_service bee-preflight.service || true
fi
audit_state="$(artifact_state "${AUDIT_JSON}")"
if [ "${audit_state}" != "ready" ]; then
if [ "${audit_state}" = "interrupted" ]; then
log "bee-audit.json.tmp exists — interrupted audit write detected"
else
log "bee-audit.json missing or empty"
fi
restart_service bee-audit.service || true
fi
if ! service_active bee-web.service; then
log "bee-web.service is not active"
restart_service bee-web.service || true
elif ! web_healthy; then
log "bee-web health check failed"
restart_service bee-web.service || true
fi
log "done"