Compare commits

..

6 Commits
v5 ... v5.4

27 changed files with 3069 additions and 709 deletions

View File

@@ -1,9 +1,10 @@
LISTEN ?= :8080
AUDIT_PATH ?=
EXPORT_DIR ?= $(CURDIR)/.tmp/export
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
GO_LDFLAGS := -X main.Version=$(VERSION)
RUN_ARGS := web --listen $(LISTEN)
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
ifneq ($(AUDIT_PATH),)
RUN_ARGS += --audit-path $(AUDIT_PATH)
endif
@@ -11,6 +12,7 @@ endif
.PHONY: run build test
run:
mkdir -p $(EXPORT_DIR)
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
build:

View File

@@ -87,7 +87,7 @@ func printRootUsage(w io.Writer) {
bee preflight --output stdout|file:<path>
bee export --target <device>
bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight]
bee version
@@ -296,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() {

View File

@@ -115,7 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
type satRunner interface {
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -528,6 +533,13 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
}
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
}
@@ -543,6 +555,34 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
@@ -893,6 +933,12 @@ func latestSATSummaries() []string {
prefix string
}{
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
{label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"},

View File

@@ -123,6 +123,11 @@ type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
@@ -147,6 +152,41 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPowerFn != nil {
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPulseFn != nil {
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaBandwidthFn != nil {
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
if f.runNvidiaStressFn != nil {
return f.runNvidiaStressFn(baseDir, opts)

View File

@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
// Map SAT target to component keys.
switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)

View File

@@ -274,9 +274,6 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
}
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
if !opts.RunNCCL {
opts.RunNCCL = true
}
return opts
}

View File

@@ -41,6 +41,21 @@ func TestResolveBenchmarkProfile(t *testing.T) {
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
Profile: "stability",
RunNCCL: false,
})
if opts.Profile != NvidiaBenchmarkProfileStability {
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
}
if opts.RunNCCL {
t.Fatalf("RunNCCL should stay false when explicitly disabled")
}
}
func TestParseBenchmarkBurnLog(t *testing.T) {
t.Parallel()

View File

@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
}
log("Verifying live medium now served from RAM...")
status := s.LiveBootSource()
if err := verifyInstallToRAMStatus(status); err != nil {
return err
}
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
log("Done. Installation media can be safely disconnected.")
return nil
}
func verifyInstallToRAMStatus(status LiveBootSource) error {
if status.InRAM {
return nil
}
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
}
func describeLiveBootSource(status LiveBootSource) string {
source := strings.TrimSpace(status.Device)
if source == "" {
source = strings.TrimSpace(status.Source)
}
if source == "" {
source = "unknown source"
}
switch strings.TrimSpace(status.Kind) {
case "ram":
return "RAM"
case "usb":
return "USB (" + source + ")"
case "cdrom":
return "CD-ROM (" + source + ")"
case "disk":
return "disk (" + source + ")"
default:
return source
}
}
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
in, err := os.Open(src)
if err != nil {

View File

@@ -3,6 +3,8 @@ package platform
import "testing"
func TestInferLiveBootKind(t *testing.T) {
t.Parallel()
tests := []struct {
name string
fsType string
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
{name: "unknown", source: "overlay", want: "unknown"},
}
for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
if got != tc.want {
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
})
}
}
func TestVerifyInstallToRAMStatus(t *testing.T) {
t.Parallel()
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
t.Fatalf("expected success for RAM-backed status, got %v", err)
}
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
if err == nil {
t.Fatal("expected verification failure when media is still on USB")
}
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
t.Fatalf("error=%q", got)
}
}
func TestDescribeLiveBootSource(t *testing.T) {
t.Parallel()
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
t.Fatalf("got %q want RAM", got)
}
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
t.Fatalf("got %q want /run/live/medium", got)
}
}

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia":
tools = append(tools, s.CheckTools([]string{
"nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh",
"bee-gpu-burn",
"bee-john-gpu-stress",
"bee-nccl-gpu-stress",
"all_reduce_perf",
})...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd":
tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools
}
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
for _, candidate := range candidates {
path, err := exec.LookPath(candidate)
if err == nil {
return ToolStatus{Name: display, Path: path, OK: true}
}
}
return ToolStatus{Name: display}
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod")

View File

@@ -12,11 +12,11 @@ import (
"os"
"os/exec"
"path/filepath"
"syscall"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
)
@@ -38,6 +38,12 @@ var (
"/opt/rocm/bin/rvs",
"/opt/rocm-*/bin/rvs",
}
dcgmProfTesterCandidates = []string{
"dcgmproftester",
"dcgmproftester13",
"dcgmproftester12",
"dcgmproftester11",
}
)
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -76,15 +82,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
Name string
MemoryMB int
Index int `json:"index"`
Name string `json:"name"`
MemoryMB int `json:"memory_mb"`
}
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
Index int
Name string
Index int `json:"index"`
Name string `json:"name"`
}
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
@@ -277,6 +283,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}, logFunc)
}
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}
@@ -293,6 +373,23 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
}
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
if len(gpuIndices) > 0 {
return dedupeSortedIndices(gpuIndices), nil
@@ -473,6 +570,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
}
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
args := []string{"dcgmi", "diag", "-r", name}
if durationSec > 0 {
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
}
if len(gpuIndices) > 0 {
args = append(args, "-i", joinIndexList(gpuIndices))
}
return args
}
func normalizeNvidiaBurnDuration(durationSec int) int {
if durationSec <= 0 {
return 300
}
return durationSec
}
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
@@ -642,6 +764,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
}
if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") ||
strings.Contains(text, "not found in path") ||
strings.Contains(text, "invalid opcode") ||
strings.Contains(text, "unknown command") ||
strings.Contains(text, "not implemented") ||
@@ -748,6 +871,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
for _, candidate := range dcgmProfTesterCandidates {
if path, err := satLookPath(candidate); err == nil {
return append([]string{path}, args...), nil
}
}
return nil, errors.New("dcgmproftester not found in PATH")
}
func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil {
return nil

View File

@@ -195,6 +195,53 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
}
}
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
switch file {
case "dcgmproftester13":
return "/usr/bin/dcgmproftester13", nil
default:
return "", exec.ErrNotFound
}
}
t.Cleanup(func() { satLookPath = oldLookPath })
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
if err != nil {
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
}
if len(cmd) != 4 {
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
}
if cmd[0] != "/usr/bin/dcgmproftester13" {
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
}
}
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel()

View File

@@ -10,17 +10,30 @@ import (
func (s *System) ListBeeServices() ([]string, error) {
seen := map[string]bool{}
var out []string
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
for _, pattern := range []string{
"/etc/systemd/system/bee-*.service",
"/lib/systemd/system/bee-*.service",
"/etc/systemd/system/bee-*.timer",
"/lib/systemd/system/bee-*.timer",
} {
matches, err := filepath.Glob(pattern)
if err != nil {
return nil, err
}
for _, match := range matches {
name := strings.TrimSuffix(filepath.Base(match), ".service")
base := filepath.Base(match)
name := base
if strings.HasSuffix(base, ".service") {
name = strings.TrimSuffix(base, ".service")
}
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
if strings.HasSuffix(name, "@") {
continue
}
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
continue
}
if !seen[name] {
seen[name] = true
out = append(out, name)

View File

@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
}
type RemovableTarget struct {
Device string
FSType string
Size string
Label string
Model string
Mountpoint string
Device string `json:"device"`
FSType string `json:"fs_type"`
Size string `json:"size"`
Label string `json:"label"`
Model string `json:"model"`
Mountpoint string `json:"mountpoint"`
}
type ToolStatus struct {

View File

@@ -0,0 +1,31 @@
package platform
import (
"encoding/json"
"strings"
"testing"
)
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
t.Parallel()
data, err := json.Marshal(RemovableTarget{
Device: "/dev/sdb1",
FSType: "exfat",
Size: "1.8T",
Label: "USB",
Model: "Flash",
})
if err != nil {
t.Fatalf("marshal: %v", err)
}
raw := string(data)
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
if !strings.Contains(raw, key) {
t.Fatalf("json missing key %s: %s", key, raw)
}
}
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
t.Fatalf("json still contains Go field names: %s", raw)
}
}

View File

@@ -232,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
}
}
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return
}
}
runNCCL := true
if body.RunNCCL != nil {
runNCCL = *body.RunNCCL
}
t := &Task{
ID: newJobID("benchmark-nvidia"),
Name: taskDisplayName("nvidia-benchmark", "", ""),
Target: "nvidia-benchmark",
Priority: 15,
Status: TaskPending,
CreatedAt: time.Now(),
params: taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
DisplayName: body.DisplayName,
},
}
if strings.TrimSpace(body.DisplayName) != "" {
t.Name = body.DisplayName
}
globalQueue.enqueue(t)
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
}
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
id := r.URL.Query().Get("job_id")
if id == "" {
@@ -491,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
// ── GPU presence ──────────────────────────────────────────────────────────────
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
gpus, err := h.opts.App.ListNvidiaGPUs()
if err != nil {
writeError(w, http.StatusInternalServerError, err.Error())
return
}
if gpus == nil {
gpus = []platform.NvidiaGPU{}
}
writeJSON(w, gpus)
}
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -516,14 +580,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
_, amdErr := os.Stat("/dev/kfd")
nvidiaUp := nvidiaErr == nil
amdUp := amdErr == nil
_, dcgmErr := exec.LookPath("dcgmi")
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
_, johnErr := exec.LookPath("bee-john-gpu-stress")
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
writeJSON(w, []toolEntry{
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
{ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
{ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
{ID: "rvs", Available: amdUp, Vendor: "amd"},
})
}
func lookPathAny(names ...string) error {
for _, name := range names {
if _, err := exec.LookPath(name); err == nil {
return nil
}
}
return exec.ErrNotFound
}
// ── System ────────────────────────────────────────────────────────────────────
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -562,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "memtester", "stress-ng", "nvtop",
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode",
}

View File

@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
}
}
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
globalQueue.mu.Lock()
originalTasks := globalQueue.tasks
globalQueue.tasks = nil
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = originalTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req)
if rec.Code != 200 {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
if len(globalQueue.tasks) != 1 {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
}
task := globalQueue.tasks[0]
if task.Target != "nvidia-benchmark" {
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
}
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
t.Fatalf("gpu indices=%v want [1 3]", got)
}
if task.params.RunNCCL {
t.Fatal("RunNCCL should reflect explicit false from request")
}
}
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
h := &handler{}

View File

@@ -0,0 +1,713 @@
package webui
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"time"
"bee/audit/internal/platform"
)
type chartTimelineSegment struct {
Start time.Time
End time.Time
Active bool
}
type chartScale struct {
Min float64
Max float64
Ticks []float64
}
type chartLayout struct {
Width int
Height int
PlotLeft int
PlotRight int
PlotTop int
PlotBottom int
}
type metricChartSeries struct {
Name string
AxisTitle string
Color string
Values []float64
}
var metricChartPalette = []string{
"#5794f2",
"#73bf69",
"#f2cc0c",
"#ff9830",
"#f2495c",
"#b877d9",
"#56d2f7",
"#8ab8ff",
"#9adf8f",
"#ffbe5c",
}
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
if pointCount == 0 {
pointCount = 1
labels = []string{""}
times = []time.Time{time.Time{}}
}
if len(labels) < pointCount {
padded := make([]string, pointCount)
copy(padded, labels)
labels = padded
}
if len(times) < pointCount {
times = synthesizeChartTimes(times, pointCount)
}
for i := range datasets {
if len(datasets[i]) == 0 {
datasets[i] = make([]float64, pointCount)
}
}
mn, avg, mx := globalStats(datasets)
if mx > 0 {
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
title,
chartLegendNumber(mn),
chartLegendNumber(avg),
chartLegendNumber(mx),
)
}
legendItems := []metricChartSeries{}
for i, name := range names {
color := metricChartPalette[i%len(metricChartPalette)]
values := make([]float64, pointCount)
if i < len(datasets) {
copy(values, coalesceDataset(datasets[i], pointCount))
}
legendItems = append(legendItems, metricChartSeries{
Name: name,
Color: color,
Values: values,
})
}
scale := singleAxisChartScale(datasets, yMin, yMax)
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
start, end := chartTimeBounds(times)
var b strings.Builder
writeSVGOpen(&b, layout.Width, layout.Height)
writeChartFrame(&b, title, layout.Width, layout.Height)
writeTimelineIdleSpans(&b, layout, start, end, timeline)
writeVerticalGrid(&b, layout, times, pointCount, 8)
writeHorizontalGrid(&b, layout, scale)
writeTimelineBoundaries(&b, layout, start, end, timeline)
writePlotBorder(&b, layout)
writeSingleAxisY(&b, layout, scale)
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
for _, item := range legendItems {
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
}
writeLegend(&b, layout, legendItems)
writeSVGClose(&b)
return []byte(b.String()), nil
}
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
memClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if temp == nil && power == nil && coreClock == nil && memClock == nil {
return nil, false, nil
}
labels := sampleTimeLabels(samples)
times := sampleTimes(samples)
svg, err := drawGPUOverviewChartSVG(
fmt.Sprintf("GPU %d Overview", idx),
labels,
times,
[]metricChartSeries{
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
{Name: "Memory Clock MHz", Values: coalesceDataset(memClock, len(labels)), Color: "#5794f2", AxisTitle: "Memory MHz"},
},
timeline,
)
if err != nil {
return nil, false, err
}
return svg, true, nil
}
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
if len(series) != 4 {
return nil, fmt.Errorf("gpu overview requires 4 series, got %d", len(series))
}
const (
width = 1400
height = 840
plotLeft = 180
plotRight = 1220
plotTop = 96
plotBottom = 660
)
const (
leftOuterAxis = 72
leftInnerAxis = 132
rightInnerAxis = 1268
rightOuterAxis = 1328
)
layout := chartLayout{
Width: width,
Height: height,
PlotLeft: plotLeft,
PlotRight: plotRight,
PlotTop: plotTop,
PlotBottom: plotBottom,
}
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis, rightOuterAxis}
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
if pointCount == 0 {
pointCount = 1
labels = []string{""}
times = []time.Time{time.Time{}}
}
if len(labels) < pointCount {
padded := make([]string, pointCount)
copy(padded, labels)
labels = padded
}
if len(times) < pointCount {
times = synthesizeChartTimes(times, pointCount)
}
for i := range series {
if len(series[i].Values) == 0 {
series[i].Values = make([]float64, pointCount)
}
}
scales := make([]chartScale, len(series))
for i := range series {
min, max := chartSeriesBounds(series[i].Values)
ticks := chartNiceTicks(min, max, 8)
scales[i] = chartScale{
Min: ticks[0],
Max: ticks[len(ticks)-1],
Ticks: ticks,
}
}
start, end := chartTimeBounds(times)
var b strings.Builder
writeSVGOpen(&b, width, height)
writeChartFrame(&b, title, width, height)
writeTimelineIdleSpans(&b, layout, start, end, timeline)
writeVerticalGrid(&b, layout, times, pointCount, 8)
writeHorizontalGrid(&b, layout, scales[0])
writeTimelineBoundaries(&b, layout, start, end, timeline)
writePlotBorder(&b, layout)
for i, axisLineX := range axisX {
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
for _, tick := range scales[i].Ticks {
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
label := sanitizeChartText(chartYAxisNumber(tick))
if i < 2 {
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX+6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX-8, y, series[i].Color, label)
continue
}
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX-6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX+8, y, series[i].Color, label)
}
}
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
for i := range series {
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
}
writeLegend(&b, layout, series)
writeSVGClose(&b)
return []byte(b.String()), nil
}
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
if len(samples) == 0 {
return nil
}
times := sampleTimes(samples)
start, end := chartTimeBounds(times)
if start.IsZero() || end.IsZero() {
return nil
}
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
}
func snapshotTaskHistory() []Task {
globalQueue.mu.Lock()
defer globalQueue.mu.Unlock()
out := make([]Task, len(globalQueue.tasks))
for i, t := range globalQueue.tasks {
out[i] = *t
}
return out
}
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
if start.IsZero() || end.IsZero() {
return nil
}
if end.Before(start) {
start, end = end, start
}
type interval struct {
start time.Time
end time.Time
}
active := make([]interval, 0, len(tasks))
for _, task := range tasks {
if task.StartedAt == nil {
continue
}
intervalStart := task.StartedAt.UTC()
intervalEnd := now.UTC()
if task.DoneAt != nil {
intervalEnd = task.DoneAt.UTC()
}
if !intervalEnd.After(intervalStart) {
continue
}
if intervalEnd.Before(start) || intervalStart.After(end) {
continue
}
if intervalStart.Before(start) {
intervalStart = start
}
if intervalEnd.After(end) {
intervalEnd = end
}
active = append(active, interval{start: intervalStart, end: intervalEnd})
}
sort.Slice(active, func(i, j int) bool {
if active[i].start.Equal(active[j].start) {
return active[i].end.Before(active[j].end)
}
return active[i].start.Before(active[j].start)
})
merged := make([]interval, 0, len(active))
for _, span := range active {
if len(merged) == 0 {
merged = append(merged, span)
continue
}
last := &merged[len(merged)-1]
if !span.start.After(last.end) {
if span.end.After(last.end) {
last.end = span.end
}
continue
}
merged = append(merged, span)
}
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
cursor := start
for _, span := range merged {
if span.start.After(cursor) {
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
}
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
cursor = span.end
}
if cursor.Before(end) {
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
}
if len(segments) == 0 {
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
}
return segments
}
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
times := make([]time.Time, 0, len(samples))
for _, sample := range samples {
times = append(times, sample.Timestamp)
}
return times
}
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
min, max := 0.0, 1.0
if yMin != nil && yMax != nil {
min, max = *yMin, *yMax
} else {
min, max = chartSeriesBounds(flattenDatasets(datasets))
if yMin != nil {
min = *yMin
}
if yMax != nil {
max = *yMax
}
}
ticks := chartNiceTicks(min, max, 8)
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
}
func flattenDatasets(datasets [][]float64) []float64 {
total := 0
for _, ds := range datasets {
total += len(ds)
}
out := make([]float64, 0, total)
for _, ds := range datasets {
out = append(out, ds...)
}
return out
}
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
legendRows := 0
if chartLegendVisible(seriesCount) && seriesCount > 0 {
cols := 4
if seriesCount < cols {
cols = seriesCount
}
legendRows = (seriesCount + cols - 1) / cols
}
legendHeight := 0
if legendRows > 0 {
legendHeight = legendRows*24 + 24
}
return chartLayout{
Width: 1400,
Height: canvasHeight,
PlotLeft: 96,
PlotRight: 1352,
PlotTop: 72,
PlotBottom: canvasHeight - 60 - legendHeight,
}
}
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
if len(times) == 0 {
return time.Time{}, time.Time{}
}
start := times[0].UTC()
end := start
for _, ts := range times[1:] {
t := ts.UTC()
if t.Before(start) {
start = t
}
if t.After(end) {
end = t
}
}
return start, end
}
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
if count <= 0 {
return nil
}
if len(times) == count {
return times
}
if len(times) == 1 {
out := make([]time.Time, count)
for i := range out {
out[i] = times[0].Add(time.Duration(i) * time.Minute)
}
return out
}
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
out := make([]time.Time, count)
for i := range out {
out[i] = base.Add(time.Duration(i) * time.Minute)
}
return out
}
func writeSVGOpen(b *strings.Builder, width, height int) {
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
}
func writeSVGClose(b *strings.Builder) {
b.WriteString("</svg>\n")
}
func writeChartFrame(b *strings.Builder, title string, width, height int) {
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
width/2, sanitizeChartText(title))
}
func writePlotBorder(b *strings.Builder, layout chartLayout) {
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
}
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
for _, tick := range scale.Ticks {
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
layout.PlotLeft, y, layout.PlotRight, y)
}
b.WriteString(`</g>` + "\n")
}
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
if pointCount <= 0 {
return
}
start, end := chartTimeBounds(times)
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
for _, idx := range gpuChartLabelIndices(pointCount, target) {
ts := chartPointTime(times, idx)
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
x, layout.PlotTop, x, layout.PlotBottom)
}
b.WriteString(`</g>` + "\n")
}
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
for _, tick := range scale.Ticks {
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
layout.PlotLeft, y, layout.PlotLeft-6, y)
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
}
}
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
pointCount := len(labels)
if len(times) > pointCount {
pointCount = len(times)
}
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
for _, idx := range gpuChartLabelIndices(pointCount, target) {
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
label := ""
if idx < len(labels) {
label = labels[idx]
}
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
}
b.WriteString(`</g>` + "\n")
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
}
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
if len(values) == 0 {
return
}
var points strings.Builder
for idx, value := range values {
if idx > 0 {
points.WriteByte(' ')
}
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
points.WriteByte(',')
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
}
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
points.String(), color)
if len(values) == 1 {
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
}
}
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
if !chartLegendVisible(len(series)) || len(series) == 0 {
return
}
cols := 4
if len(series) < cols {
cols = len(series)
}
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
baseY := layout.PlotBottom + 74
for i, item := range series {
row := i / cols
col := i % cols
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
y := float64(baseY + row*24)
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
x, y, x+28, y, item.Color)
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
x+38, y+4, sanitizeChartText(item.Name))
}
}
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
if len(segments) == 0 {
return
}
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
for _, segment := range segments {
if segment.Active || !segment.End.After(segment.Start) {
continue
}
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
}
b.WriteString(`</g>` + "\n")
}
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
if len(segments) == 0 {
return
}
seen := map[int]bool{}
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
for i, segment := range segments {
if i > 0 {
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
if !seen[x] {
seen[x] = true
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
}
}
if i < len(segments)-1 {
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
if !seen[x] {
seen[x] = true
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
}
}
}
b.WriteString(`</g>` + "\n")
}
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
if !end.After(start) {
return float64(left+right) / 2
}
if ts.Before(start) {
ts = start
}
if ts.After(end) {
ts = end
}
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
return float64(left) + ratio*float64(right-left)
}
func chartPointTime(times []time.Time, idx int) time.Time {
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
return times[idx].UTC()
}
if len(times) > 0 && !times[0].IsZero() {
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
}
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
}
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
if scale.Max <= scale.Min {
return float64(plotTop+plotBottom) / 2
}
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
}
func chartSeriesBounds(values []float64) (float64, float64) {
if len(values) == 0 {
return 0, 1
}
min, max := values[0], values[0]
for _, value := range values[1:] {
if value < min {
min = value
}
if value > max {
max = value
}
}
if min == max {
if max == 0 {
return 0, 1
}
pad := math.Abs(max) * 0.1
if pad == 0 {
pad = 1
}
min -= pad
max += pad
}
if min > 0 {
pad := (max - min) * 0.2
if pad == 0 {
pad = max * 0.1
}
min -= pad
if min < 0 {
min = 0
}
max += pad
}
return min, max
}
func chartNiceTicks(min, max float64, target int) []float64 {
if min == max {
max = min + 1
}
span := max - min
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
for _, factor := range []float64{1, 2, 5, 10} {
if span/(factor*step) <= float64(target)*1.5 {
step = factor * step
break
}
}
low := math.Floor(min/step) * step
high := math.Ceil(max/step) * step
var ticks []float64
for value := low; value <= high+step*0.001; value += step {
ticks = append(ticks, math.Round(value*1e9)/1e9)
}
return ticks
}
func valueClamp(value float64, scale chartScale) float64 {
if value < scale.Min {
return scale.Min
}
if value > scale.Max {
return scale.Max
}
return value
}

View File

@@ -232,7 +232,8 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool {
switch target {
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress":
return true

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +1,20 @@
package webui
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"html"
"io"
"log/slog"
"math"
"mime"
"net"
"net/http"
"os"
"path/filepath"
"runtime/debug"
"sort"
"strconv"
"strings"
"sync"
"time"
@@ -21,7 +22,6 @@ import (
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/runtimeenv"
gocharts "github.com/go-analyze/charts"
"reanimator/chart/viewer"
"reanimator/chart/web"
)
@@ -237,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
// SAT
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -250,6 +256,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
@@ -286,6 +293,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
// System
@@ -373,6 +381,38 @@ func (w *trackingResponseWriter) Write(p []byte) (int, error) {
return w.ResponseWriter.Write(p)
}
func (w *trackingResponseWriter) Flush() {
w.wroteHeader = true
if f, ok := w.ResponseWriter.(http.Flusher); ok {
f.Flush()
}
}
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
h, ok := w.ResponseWriter.(http.Hijacker)
if !ok {
return nil, nil, fmt.Errorf("hijacking not supported")
}
return h.Hijack()
}
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
p, ok := w.ResponseWriter.(http.Pusher)
if !ok {
return http.ErrNotSupported
}
return p.Push(target, opts)
}
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
rf, ok := w.ResponseWriter.(io.ReaderFrom)
if !ok {
return io.Copy(w.ResponseWriter, r)
}
w.wroteHeader = true
return rf.ReadFrom(r)
}
func recoverMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tw := &trackingResponseWriter{ResponseWriter: w}
@@ -520,13 +560,14 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
return
}
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
timeline := metricsTimelineSegments(samples, time.Now())
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
buf, ok, err := renderGPUOverviewChartSVG(idx, samples)
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
@@ -540,13 +581,23 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
return
}
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
if !ok {
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
return
}
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
buf, err := renderMetricChartSVG(
title,
labels,
sampleTimes(samples),
datasets,
names,
yMin,
yMax,
chartCanvasHeightForPath(path, len(names)),
timeline,
)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
@@ -556,14 +607,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
return nil, nil, nil, "", nil, nil, false
}
return chartDataFromSamples(path, samples)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64
var names []string
@@ -961,247 +1004,6 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
return floatPtr(low), floatPtr(high)
}
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample) ([]byte, bool, error) {
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
memClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
if temp == nil && power == nil && coreClock == nil && memClock == nil {
return nil, false, nil
}
labels := sampleTimeLabels(samples)
svg, err := drawGPUOverviewChartSVG(
fmt.Sprintf("GPU %d Overview", idx),
labels,
[]gpuOverviewSeries{
{Name: "Temp C", Values: coalesceDataset(temp, len(samples)), Color: "#f05a5a", AxisTitle: "Temp C"},
{Name: "Power W", Values: coalesceDataset(power, len(samples)), Color: "#ffb357", AxisTitle: "Power W"},
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(samples)), Color: "#73bf69", AxisTitle: "Core MHz"},
{Name: "Memory Clock MHz", Values: coalesceDataset(memClock, len(samples)), Color: "#5794f2", AxisTitle: "Memory MHz"},
},
)
if err != nil {
return nil, false, err
}
return svg, true, nil
}
type gpuOverviewSeries struct {
Name string
AxisTitle string
Color string
Values []float64
}
func drawGPUOverviewChartSVG(title string, labels []string, series []gpuOverviewSeries) ([]byte, error) {
if len(series) != 4 {
return nil, fmt.Errorf("gpu overview requires 4 series, got %d", len(series))
}
const (
width = 1400
height = 420
plotLeft = 180
plotRight = 1220
plotTop = 74
plotBottom = 292
)
const (
leftOuterAxis = 72
leftInnerAxis = 132
rightInnerAxis = 1268
rightOuterAxis = 1328
)
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis, rightOuterAxis}
plotWidth := plotRight - plotLeft
plotHeight := plotBottom - plotTop
pointCount := len(labels)
if pointCount == 0 {
pointCount = 1
labels = []string{""}
}
for i := range series {
if len(series[i].Values) == 0 {
series[i].Values = make([]float64, pointCount)
}
}
type axisScale struct {
Min float64
Max float64
Ticks []float64
}
scales := make([]axisScale, len(series))
for i := range series {
min, max := gpuChartSeriesBounds(series[i].Values)
ticks := gpuChartNiceTicks(min, max, 8)
scales[i] = axisScale{
Min: ticks[0],
Max: ticks[len(ticks)-1],
Ticks: ticks,
}
}
xFor := func(index int) float64 {
if pointCount <= 1 {
return float64(plotLeft + plotWidth/2)
}
return float64(plotLeft) + float64(index)*float64(plotWidth)/float64(pointCount-1)
}
yFor := func(value float64, scale axisScale) float64 {
if scale.Max <= scale.Min {
return float64(plotTop + plotHeight/2)
}
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotHeight)
}
var b strings.Builder
b.WriteString(fmt.Sprintf(`<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`, width, height, width, height))
b.WriteString("\n")
b.WriteString(`<rect width="100%" height="100%" rx="10" ry="10" fill="#111217" stroke="#2f3440"/>` + "\n")
b.WriteString(`<text x="700" y="28" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#f5f7fa">` + sanitizeChartText(title) + `</text>` + "\n")
b.WriteString(`<g stroke="#2f3440" stroke-width="1">` + "\n")
for _, tick := range scales[0].Ticks {
y := yFor(tick, scales[0])
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n", plotLeft, y, plotRight, y)
}
for _, idx := range gpuChartLabelIndices(pointCount, 8) {
x := xFor(idx)
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n", x, plotTop, x, plotBottom)
}
b.WriteString("</g>\n")
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#454c5c" stroke-width="1"/>`+"\n",
plotLeft, plotTop, plotWidth, plotHeight)
for i, axisLineX := range axisX {
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, plotTop, axisLineX, plotBottom, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
axisLineX, 52, series[i].Color, sanitizeChartText(series[i].AxisTitle))
for _, tick := range scales[i].Ticks {
y := yFor(tick, scales[i])
label := sanitizeChartText(gpuChartFormatTick(tick))
if i < 2 {
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX+6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX-8, y, series[i].Color, label)
continue
}
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
axisLineX, y, axisLineX-6, y, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
axisLineX+8, y, series[i].Color, label)
}
}
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#c8d0d8" text-anchor="middle">` + "\n")
for _, idx := range gpuChartLabelIndices(pointCount, 8) {
x := xFor(idx)
fmt.Fprintf(&b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, plotBottom+22, sanitizeChartText(labels[idx]))
}
b.WriteString(`</g>` + "\n")
b.WriteString(`<text x="700" y="338" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#c8d0d8">Time</text>` + "\n")
for i := range series {
var points strings.Builder
for j, value := range series[i].Values {
if j > 0 {
points.WriteByte(' ')
}
points.WriteString(strconv.FormatFloat(xFor(j), 'f', 1, 64))
points.WriteByte(',')
points.WriteString(strconv.FormatFloat(yFor(value, scales[i]), 'f', 1, 64))
}
fmt.Fprintf(&b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2"/>`+"\n",
points.String(), series[i].Color)
if len(series[i].Values) == 1 {
fmt.Fprintf(&b, `<circle cx="%.1f" cy="%.1f" r="3" fill="%s"/>`+"\n",
xFor(0), yFor(series[i].Values[0], scales[i]), series[i].Color)
}
}
const legendY = 372
legendX := []int{190, 470, 790, 1090}
for i := range series {
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="3"/>`+"\n",
legendX[i], legendY, legendX[i]+28, legendY, series[i].Color)
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="12" fill="#f5f7fa">%s</text>`+"\n",
legendX[i]+38, legendY+4, sanitizeChartText(series[i].Name))
}
b.WriteString("</svg>\n")
return []byte(b.String()), nil
}
func gpuChartSeriesBounds(values []float64) (float64, float64) {
if len(values) == 0 {
return 0, 1
}
min, max := values[0], values[0]
for _, value := range values[1:] {
if value < min {
min = value
}
if value > max {
max = value
}
}
if min == max {
if max == 0 {
return 0, 1
}
pad := math.Abs(max) * 0.1
if pad == 0 {
pad = 1
}
min -= pad
max += pad
}
if min > 0 {
pad := (max - min) * 0.2
if pad == 0 {
pad = max * 0.1
}
min -= pad
if min < 0 {
min = 0
}
max += pad
}
return min, max
}
func gpuChartNiceTicks(min, max float64, target int) []float64 {
if min == max {
max = min + 1
}
span := max - min
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
for _, factor := range []float64{1, 2, 5, 10} {
if span/(factor*step) <= float64(target)*1.5 {
step = factor * step
break
}
}
low := math.Floor(min/step) * step
high := math.Ceil(max/step) * step
var ticks []float64
for value := low; value <= high+step*0.001; value += step {
ticks = append(ticks, math.Round(value*1e9)/1e9)
}
return ticks
}
func gpuChartFormatTick(value float64) string {
if value == math.Trunc(value) {
return strconv.Itoa(int(value))
}
return strconv.FormatFloat(value, 'f', 1, 64)
}
func gpuChartLabelIndices(total, target int) []int {
if total <= 0 {
return nil
@@ -1223,64 +1025,16 @@ func gpuChartLabelIndices(total, target int) []int {
return indices
}
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
n := len(labels)
if n == 0 {
n = 1
labels = []string{""}
func chartCanvasHeightForPath(path string, seriesCount int) int {
height := chartCanvasHeight(seriesCount)
if isGPUChartPath(path) {
return height * 2
}
for i := range datasets {
if len(datasets[i]) == 0 {
datasets[i] = make([]float64, n)
}
}
// Append global min/avg/max to title.
mn, avg, mx := globalStats(datasets)
if mx > 0 {
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
title,
chartLegendNumber(mn),
chartLegendNumber(avg),
chartLegendNumber(mx),
)
}
title = sanitizeChartText(title)
names = sanitizeChartTexts(names)
sparse := sanitizeChartTexts(sparseLabels(labels, 6))
return height
}
opt := gocharts.NewLineChartOptionWithData(datasets)
opt.Title = gocharts.TitleOption{Text: title}
opt.XAxis.Labels = sparse
opt.Legend = gocharts.LegendOption{SeriesNames: names}
if chartLegendVisible(len(names)) {
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
opt.Legend.OverlayChart = gocharts.Ptr(false)
} else {
opt.Legend.Show = gocharts.Ptr(false)
}
opt.Symbol = gocharts.SymbolNone
// Right padding: reserve space for the MarkLine label (library recommendation).
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
if yMin != nil || yMax != nil {
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
}
// Add a single peak mark line on the series that holds the global maximum.
peakIdx, _ := globalPeakSeries(datasets)
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
}
p := gocharts.NewPainter(gocharts.PainterOptions{
OutputFormat: gocharts.ChartOutputSVG,
Width: 1400,
Height: chartCanvasHeight(len(names)),
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
if err := p.LineChart(opt); err != nil {
return nil, err
}
return p.Bytes()
func isGPUChartPath(path string) bool {
return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
}
func chartLegendVisible(seriesCount int) bool {
@@ -1294,30 +1048,6 @@ func chartCanvasHeight(seriesCount int) int {
return 288
}
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
return gocharts.YAxisOption{
Min: yMin,
Max: yMax,
LabelCount: 11,
ValueFormatter: chartYAxisNumber,
}
}
// globalPeakSeries returns the index of the series containing the global maximum
// value across all datasets, and that maximum value.
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
idx = -1
for i, ds := range datasets {
for _, v := range ds {
if v > peak {
peak = v
idx = i
}
}
}
return idx, peak
}
// globalStats returns min, average, and max across all values in all datasets.
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
var sum float64
@@ -1357,21 +1087,6 @@ func sanitizeChartText(s string) string {
}, s))
}
func sanitizeChartTexts(in []string) []string {
out := make([]string, len(in))
for i, s := range in {
out[i] = sanitizeChartText(s)
}
return out
}
func safeIdx(s []float64, i int) float64 {
if i < len(s) {
return s[i]
}
return 0
}
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
var datasets [][]float64
var names []string
@@ -1458,20 +1173,6 @@ func chartYAxisNumber(v float64) string {
return out
}
func sparseLabels(labels []string, n int) []string {
out := make([]string, len(labels))
step := len(labels) / n
if step < 1 {
step = 1
}
for i, l := range labels {
if i%step == 0 {
out[i] = l
}
}
return out
}
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
if h.metricsDB == nil {
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
@@ -1487,6 +1188,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store")
if strings.TrimSpace(h.opts.AuditPath) == "" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ready"))
return
}
if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.WriteHeader(http.StatusServiceUnavailable)
_, _ = w.Write([]byte("starting"))

View File

@@ -51,6 +51,32 @@ func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
}
}
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if !sseStart(w) {
return
}
if !sseWrite(w, "tick", "ok") {
t.Fatal("expected sse write to succeed")
}
}))
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
handler.ServeHTTP(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
t.Fatalf("content-type=%q", got)
}
body := rec.Body.String()
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
t.Fatalf("body=%q", body)
}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{
@@ -278,6 +304,124 @@ func TestChartCanvasHeight(t *testing.T) {
}
}
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
end := start.Add(10 * time.Minute)
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
s := start.Add(offsetStart)
e := start.Add(offsetEnd)
return Task{
Name: "task",
Status: TaskDone,
StartedAt: &s,
DoneAt: &e,
}
}
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
taskWindow(1*time.Minute, 3*time.Minute),
taskWindow(2*time.Minute, 5*time.Minute),
taskWindow(7*time.Minute, 8*time.Minute),
})
if len(segments) != 5 {
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
}
wantActive := []bool{false, true, false, true, false}
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
for i, segment := range segments {
if segment.Active != wantActive[i] {
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
}
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
}
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
}
}
}
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
labels := []string{"12:00", "12:01", "12:02"}
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
svg, err := renderMetricChartSVG(
"System Power",
labels,
times,
[][]float64{{300, 320, 310}},
[]string{"Power W"},
floatPtr(0),
floatPtr(400),
360,
[]chartTimelineSegment{
{Start: start, End: start.Add(time.Minute), Active: false},
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
},
)
if err != nil {
t.Fatal(err)
}
body := string(svg)
if !strings.Contains(body, `data-role="timeline-overlay"`) {
t.Fatalf("svg missing timeline overlay: %s", body)
}
if !strings.Contains(body, `opacity="0.10"`) {
t.Fatalf("svg missing idle overlay opacity: %s", body)
}
if !strings.Contains(body, `System Power`) {
t.Fatalf("svg missing chart title: %s", body)
}
}
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
dir := t.TempDir()
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
if err != nil {
t.Fatal(err)
}
t.Cleanup(func() { _ = db.db.Close() })
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
for i, sample := range []platform.LiveMetricSample{
{Timestamp: start, PowerW: 300},
{Timestamp: start.Add(time.Minute), PowerW: 320},
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
} {
if err := db.Write(sample); err != nil {
t.Fatalf("write sample %d: %v", i, err)
}
}
globalQueue.mu.Lock()
prevTasks := globalQueue.tasks
s := start.Add(30 * time.Second)
e := start.Add(90 * time.Second)
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
globalQueue.mu.Unlock()
t.Cleanup(func() {
globalQueue.mu.Lock()
globalQueue.tasks = prevTasks
globalQueue.mu.Unlock()
})
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
rec := httptest.NewRecorder()
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
h.handleMetricsChartSVG(rec, req)
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
body := rec.Body.String()
if !strings.Contains(body, `data-role="timeline-overlay"`) {
t.Fatalf("custom svg response missing timeline overlay: %s", body)
}
if !strings.Contains(body, `stroke-linecap="round"`) {
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
}
}
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
want := []float64{4200, 4200, 4200, 4300, 4300}
@@ -291,21 +435,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
}
}
func TestChartYAxisOption(t *testing.T) {
min := floatPtr(0)
max := floatPtr(100)
opt := chartYAxisOption(min, max)
if opt.Min != min || opt.Max != max {
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
}
if opt.LabelCount != 11 {
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
}
if got := opt.ValueFormatter(1000); got != "1к" {
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
}
}
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
r1 := newMetricsRing(4)
r2 := newMetricsRing(4)
@@ -414,7 +543,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) {
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
t.Fatalf("dashboard missing run audit button: %s", body)
}
if strings.Contains(body, `No audit data`) {
@@ -422,6 +551,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
}
}
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if strings.TrimSpace(rec.Body.String()) != "ready" {
t.Fatalf("body=%q want ready", rec.Body.String())
}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
@@ -488,6 +629,68 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
}
}
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`href="/benchmark"`,
`id="benchmark-gpu-list"`,
`/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`,
`benchmark-run-nccl`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body)
}
}
}
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA GPU Targeted Stress`,
`nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Max Compute Load`,
`dcgmproftester`,
`targeted_stress remain in <a href="/validate">Validate</a>`,
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
`id="burn-gpu-list"`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("burn page missing %q: %s", needle, body)
}
}
}
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
@@ -643,3 +846,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
}
}
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
exportDir := filepath.Join(dir, "export")
if err := os.MkdirAll(exportDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
t.Fatal(err)
}
health := `{
"status":"PARTIAL",
"checked_at":"2026-03-16T10:00:00Z",
"export_dir":"/tmp/export",
"driver_ready":true,
"cuda_ready":false,
"network_status":"PARTIAL",
"issues":[
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
],
"tools":[
{"name":"dmidecode","ok":true},
{"name":"nvidia-smi","ok":false}
],
"services":[
{"name":"bee-web","status":"active"},
{"name":"bee-nvidia","status":"inactive"}
]
}`
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
t.Fatal(err)
}
componentStatus := `[
{
"component_key":"cpu:all",
"status":"Warning",
"error_summary":"cpu SAT: FAILED",
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
},
{
"component_key":"memory:all",
"status":"OK",
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
},
{
"component_key":"storage:nvme0n1",
"status":"Critical",
"error_summary":"storage SAT: FAILED",
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
},
{
"component_key":"pcie:gpu:nvidia",
"status":"Warning",
"error_summary":"nvidia SAT: FAILED",
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
}
]`
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
body := rec.Body.String()
for _, needle := range []string{
`Runtime Health`,
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
`Export Directory`,
`Network`,
`NVIDIA/AMD Driver`,
`CUDA / ROCm`,
`Required Utilities`,
`Bee Services`,
`<td>CPU</td>`,
`<td>Memory</td>`,
`<td>Storage</td>`,
`<td>GPU</td>`,
`CUDA runtime is not ready for GPU SAT.`,
`Missing: nvidia-smi`,
`bee-nvidia=inactive`,
`cpu SAT: FAILED`,
`storage SAT: FAILED`,
`sat:nvidia`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body)
}
}
}

View File

@@ -31,6 +31,13 @@ const (
// taskNames maps target → human-readable name for validate (SAT) runs.
var taskNames = map[string]string{
"nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"nvidia-stress": "NVIDIA GPU Stress",
"memory": "Memory SAT",
"storage": "Storage SAT",
@@ -108,8 +115,11 @@ type taskParams struct {
DiagLevel int `json:"diag_level,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
SizeMB int `json:"size_mb,omitempty"`
Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"`
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
RunNCCL bool `json:"run_nccl,omitempty"`
DisplayName string `json:"display_name,omitempty"`
Device string `json:"device,omitempty"` // for install
PlatformComponents []string `json:"platform_components,omitempty"`
@@ -130,45 +140,53 @@ type persistedTask struct {
}
type burnPreset struct {
NvidiaDiag int
DurationSec int
}
func resolveBurnPreset(profile string) burnPreset {
switch profile {
case "overnight":
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
return burnPreset{DurationSec: 8 * 60 * 60}
case "acceptance":
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
return burnPreset{DurationSec: 60 * 60}
default:
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
return burnPreset{DurationSec: 5 * 60}
}
}
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
{LoadSec: 100, IdleSec: 10},
{LoadSec: 145, IdleSec: 15},
{LoadSec: 190, IdleSec: 20},
{LoadSec: 235, IdleSec: 25},
{LoadSec: 280, IdleSec: 30},
{LoadSec: 325, IdleSec: 35},
{LoadSec: 370, IdleSec: 40},
{LoadSec: 415, IdleSec: 45},
{LoadSec: 460, IdleSec: 50},
{LoadSec: 510, IdleSec: 0},
}
switch profile {
case "overnight":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
}}
cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
for range 8 {
cycles = append(cycles, acceptanceCycles...)
}
return platform.PlatformStressOptions{Cycles: cycles}
case "acceptance":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
}}
return platform.PlatformStressOptions{Cycles: acceptanceCycles}
default: // smoke
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 90, IdleSec: 60},
{LoadSec: 90, IdleSec: 30},
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
}}
}
}
@@ -429,6 +447,31 @@ func (q *taskQueue) worker() {
wg.Add(1)
goRecoverOnce("task "+t.Target, func() {
defer wg.Done()
defer taskCancel()
q.executeTask(t, j, taskCtx)
})
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}
}()
}
}
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
startedKmsgWatch := false
defer q.finalizeTaskRun(t, j)
defer func() {
if startedKmsgWatch && q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
}()
defer func() {
if rec := recover(); rec != nil {
msg := fmt.Sprintf("task panic: %v", rec)
@@ -445,40 +488,28 @@ func (q *taskQueue) worker() {
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
startedKmsgWatch = true
}
q.runTask(t, j, taskCtx)
if q.kmsgWatcher != nil {
q.kmsgWatcher.NotifyTaskFinished(t.ID)
}
q.runTask(t, j, ctx)
}
func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
q.mu.Lock()
now2 := time.Now()
t.DoneAt = &now2
defer q.mu.Unlock()
now := time.Now()
t.DoneAt = &now
if t.Status == TaskRunning {
if j.err != "" {
t.Status = TaskFailed
t.ErrMsg = j.err
} else {
t.Status = TaskDone
t.ErrMsg = ""
}
}
q.persistLocked()
q.mu.Unlock()
})
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock()
q.prune()
q.persistLocked()
q.mu.Unlock()
}
}()
}
}
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
@@ -519,9 +550,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break
}
diagLevel := t.params.DiagLevel
if t.params.BurnProfile != "" && diagLevel <= 0 {
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -534,6 +562,78 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
} else {
archive, err = a.RunNvidiaAcceptancePack("", j.append)
}
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
SizeMB: t.params.SizeMB,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")

View File

@@ -253,10 +253,10 @@ func TestResolveBurnPreset(t *testing.T) {
profile string
want burnPreset
}{
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
}
for _, tc := range tests {
if got := resolveBurnPreset(tc.profile); got != tc.want {
@@ -467,3 +467,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
t.Fatalf("unexpected error: %q", j.err)
}
}
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
dir := t.TempDir()
q := &taskQueue{
opts: &HandlerOptions{App: &app.App{}},
statePath: filepath.Join(dir, "tasks-state.json"),
logsDir: filepath.Join(dir, "tasks"),
kmsgWatcher: newKmsgWatcher(nil),
}
tk := &Task{
ID: "cpu-panic-1",
Name: "CPU SAT",
Target: "cpu",
Status: TaskRunning,
CreatedAt: time.Now(),
}
j := &jobState{}
orig := runCPUAcceptancePackCtx
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
panic("boom")
}
defer func() { runCPUAcceptancePackCtx = orig }()
q.executeTask(tk, j, context.Background())
if tk.Status != TaskFailed {
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
}
if tk.DoneAt == nil {
t.Fatal("expected done_at to be set")
}
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
t.Fatalf("task error=%q", tk.ErrMsg)
}
if !strings.Contains(j.err, "task panic: boom") {
t.Fatalf("job error=%q", j.err)
}
q.kmsgWatcher.mu.Lock()
activeCount := q.kmsgWatcher.activeCount
window := q.kmsgWatcher.window
q.kmsgWatcher.mu.Unlock()
if activeCount != 0 {
t.Fatalf("activeCount=%d want 0", activeCount)
}
if window != nil {
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
}
}

View File

@@ -302,6 +302,12 @@ memtest_fail() {
return 0
}
nvidia_runtime_fail() {
msg="$1"
echo "ERROR: ${msg}" >&2
exit 1
}
iso_memtest_present() {
iso_path="$1"
iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ==="
}
validate_iso_nvidia_runtime() {
iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
echo "=== validating NVIDIA runtime in ISO ==="
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
squashfs_tmp="$(mktemp)"
squashfs_list="$(mktemp)"
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
}
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
}
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
}
rm -f "$squashfs_tmp" "$squashfs_list"
echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() {
grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi
fi
validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
# NVIDIA DCGM (Data Center GPU Manager).
# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
# NVIDIA max-compute recipe. The smoketest/runtime contract treats
# dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
ok "dcgmi found: $p"
else
fail "dcgmi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
ok "nv-hostengine found: $p"
else
fail "nv-hostengine: NOT FOUND"
fi
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
DCGM_PROFTESTER="$p"
break
fi
done
if [ -n "$DCGM_PROFTESTER" ]; then
ok "dcgmproftester found: $DCGM_PROFTESTER"
else
fail "dcgmproftester: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi
done
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
ok "nvbandwidth found: $p"
else
warn "nvbandwidth: NOT FOUND"
fi
echo ""
echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia"