Compare commits

...

2 Commits
v5.2 ... v5.4

Author SHA1 Message Date
33e0a5bef2 Refine validate UI and runtime health table 2026-04-05 16:24:45 +03:00
38e79143eb Refine burn UI and NVIDIA stress flows 2026-04-05 13:43:43 +03:00
18 changed files with 1535 additions and 276 deletions

View File

@@ -1,9 +1,10 @@
LISTEN ?= :8080 LISTEN ?= :8080
AUDIT_PATH ?= AUDIT_PATH ?=
EXPORT_DIR ?= $(CURDIR)/.tmp/export
VERSION ?= $(shell sh ./scripts/resolve-version.sh) VERSION ?= $(shell sh ./scripts/resolve-version.sh)
GO_LDFLAGS := -X main.Version=$(VERSION) GO_LDFLAGS := -X main.Version=$(VERSION)
RUN_ARGS := web --listen $(LISTEN) RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
ifneq ($(AUDIT_PATH),) ifneq ($(AUDIT_PATH),)
RUN_ARGS += --audit-path $(AUDIT_PATH) RUN_ARGS += --audit-path $(AUDIT_PATH)
endif endif
@@ -11,6 +12,7 @@ endif
.PHONY: run build test .PHONY: run build test
run: run:
mkdir -p $(EXPORT_DIR)
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS) go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
build: build:

View File

@@ -87,7 +87,7 @@ func printRootUsage(w io.Writer) {
bee preflight --output stdout|file:<path> bee preflight --output stdout|file:<path>
bee export --target <device> bee export --target <device>
bee support-bundle --output stdout|file:<path> bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>] bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight] bee benchmark nvidia [--profile standard|stability|overnight]
bee version bee version
@@ -296,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError) fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr) fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80") listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot") auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles") exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title") title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() { fs.Usage = func() {

View File

@@ -115,7 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
type satRunner interface { type satRunner interface {
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -528,6 +533,13 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
} }
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
} }
@@ -543,6 +555,34 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
@@ -893,6 +933,12 @@ func latestSATSummaries() []string {
prefix string prefix string
}{ }{
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, {label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
{label: "Memory SAT", prefix: "memory-"}, {label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"}, {label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"}, {label: "CPU SAT", prefix: "cpu-"},

View File

@@ -120,16 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
} }
type fakeSAT struct { type fakeSAT struct {
runNvidiaFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runMemoryFn func(string) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error)
runStorageFn func(string) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
runCPUFn func(string, int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error)
detectVendorFn func() string runNvidiaBandwidthFn func(string, []int) (string, error)
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runAMDPackFn func(string) (string, error) runMemoryFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -147,6 +152,41 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPowerFn != nil {
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPulseFn != nil {
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaBandwidthFn != nil {
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
if f.runNvidiaStressFn != nil { if f.runNvidiaStressFn != nil {
return f.runNvidiaStressFn(baseDir, opts) return f.runNvidiaStressFn(baseDir, opts)

View File

@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
// ComponentStatusRecord holds the current and historical health of one hardware component. // ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct { type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"` ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown" Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"` LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"` LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"` ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"` History []ComponentStatusEntry `json:"history"`
} }
// ComponentStatusEntry is one observation written to a component's history. // ComponentStatusEntry is one observation written to a component's history.
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
// Map SAT target to component keys. // Map SAT target to component keys.
switch target { switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth": case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress": case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall) db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia": case "nvidia":
tools = append(tools, s.CheckTools([]string{ tools = append(tools, s.CheckTools([]string{
"nvidia-smi", "nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh", "nvidia-bug-report.sh",
"bee-gpu-burn", "bee-gpu-burn",
"bee-john-gpu-stress", "bee-john-gpu-stress",
"bee-nccl-gpu-stress", "bee-nccl-gpu-stress",
"all_reduce_perf", "all_reduce_perf",
})...) })...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd": case "amd":
tool := ToolStatus{Name: "rocm-smi"} tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 { if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools return tools
} }
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
for _, candidate := range candidates {
path, err := exec.LookPath(candidate)
if err == nil {
return ToolStatus{Name: display, Path: path, OK: true}
}
}
return ToolStatus{Name: display}
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) { func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod") lsmodText := commandText("lsmod")

View File

@@ -38,6 +38,12 @@ var (
"/opt/rocm/bin/rvs", "/opt/rocm/bin/rvs",
"/opt/rocm-*/bin/rvs", "/opt/rocm-*/bin/rvs",
} }
dcgmProfTesterCandidates = []string{
"dcgmproftester",
"dcgmproftester13",
"dcgmproftester12",
"dcgmproftester11",
}
) )
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil). // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -277,6 +283,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}, logFunc) }, logFunc)
} }
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc) return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
} }
@@ -293,6 +373,23 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc) return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
} }
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
if len(gpuIndices) > 0 { if len(gpuIndices) > 0 {
return dedupeSortedIndices(gpuIndices), nil return dedupeSortedIndices(gpuIndices), nil
@@ -473,6 +570,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
} }
} }
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
args := []string{"dcgmi", "diag", "-r", name}
if durationSec > 0 {
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
}
if len(gpuIndices) > 0 {
args = append(args, "-i", joinIndexList(gpuIndices))
}
return args
}
func normalizeNvidiaBurnDuration(durationSec int) int {
if durationSec <= 0 {
return 300
}
return durationSec
}
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) { func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
@@ -642,6 +764,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
} }
if strings.Contains(text, "unsupported") || if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") || strings.Contains(text, "not supported") ||
strings.Contains(text, "not found in path") ||
strings.Contains(text, "invalid opcode") || strings.Contains(text, "invalid opcode") ||
strings.Contains(text, "unknown command") || strings.Contains(text, "unknown command") ||
strings.Contains(text, "not implemented") || strings.Contains(text, "not implemented") ||
@@ -748,6 +871,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm") return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
} }
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
for _, candidate := range dcgmProfTesterCandidates {
if path, err := satLookPath(candidate); err == nil {
return append([]string{path}, args...), nil
}
}
return nil, errors.New("dcgmproftester not found in PATH")
}
func ensureAMDRuntimeReady() error { func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil { if _, err := os.Stat("/dev/kfd"); err == nil {
return nil return nil

View File

@@ -195,6 +195,53 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
} }
} }
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
switch file {
case "dcgmproftester13":
return "/usr/bin/dcgmproftester13", nil
default:
return "", exec.ErrNotFound
}
}
t.Cleanup(func() { satLookPath = oldLookPath })
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
if err != nil {
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
}
if len(cmd) != 4 {
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
}
if cmd[0] != "/usr/bin/dcgmproftester13" {
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
}
}
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) { func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -581,15 +581,32 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
nvidiaUp := nvidiaErr == nil nvidiaUp := nvidiaErr == nil
amdUp := amdErr == nil amdUp := amdErr == nil
_, dcgmErr := exec.LookPath("dcgmi") _, dcgmErr := exec.LookPath("dcgmi")
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
_, johnErr := exec.LookPath("bee-john-gpu-stress")
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
writeJSON(w, []toolEntry{ writeJSON(w, []toolEntry{
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
{ID: "dcgm", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"}, {ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
{ID: "rvs", Available: amdUp, Vendor: "amd"}, {ID: "rvs", Available: amdUp, Vendor: "amd"},
}) })
} }
func lookPathAny(names ...string) error {
for _, name := range names {
if _, err := exec.LookPath(name); err == nil {
return nil
}
}
return exec.ErrNotFound
}
// ── System ──────────────────────────────────────────────────────────────────── // ── System ────────────────────────────────────────────────────────────────────
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -628,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{ var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool", "dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "memtester", "stress-ng", "nvtop", "nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode", "mstflint", "qrencode",
} }

View File

@@ -232,7 +232,8 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests. // isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool { func isSATTarget(target string) bool {
switch target { switch target {
case "nvidia", "nvidia-benchmark", "nvidia-stress", "memory", "memory-stress", "storage", case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress": "platform-stress":
return true return true

File diff suppressed because it is too large Load Diff

View File

@@ -237,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
// SAT // SAT
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia")) mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress")) mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory")) mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage")) mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -1182,6 +1188,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) { func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store") w.Header().Set("Cache-Control", "no-store")
if strings.TrimSpace(h.opts.AuditPath) == "" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ready"))
return
}
if _, err := os.Stat(h.opts.AuditPath); err != nil { if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.WriteHeader(http.StatusServiceUnavailable) w.WriteHeader(http.StatusServiceUnavailable)
_, _ = w.Write([]byte("starting")) _, _ = w.Write([]byte("starting"))

View File

@@ -543,7 +543,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
t.Fatalf("status=%d", rec.Code) t.Fatalf("status=%d", rec.Code)
} }
body := rec.Body.String() body := rec.Body.String()
if !strings.Contains(body, `Run Audit`) { if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
t.Fatalf("dashboard missing run audit button: %s", body) t.Fatalf("dashboard missing run audit button: %s", body)
} }
if strings.Contains(body, `No audit data`) { if strings.Contains(body, `No audit data`) {
@@ -551,6 +551,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
} }
} }
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if strings.TrimSpace(rec.Body.String()) != "ready" {
t.Fatalf("body=%q want ready", rec.Body.String())
}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) { func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")
@@ -638,7 +650,27 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
} }
} }
func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T) { func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA GPU Targeted Stress`,
`nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
@@ -647,10 +679,11 @@ func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T)
} }
body := rec.Body.String() body := rec.Body.String()
for _, needle := range []string{ for _, needle := range []string{
`DCGM Diagnostics (Official NVIDIA)`, `NVIDIA Max Compute Load`,
`NCCL all_reduce_perf (Interconnect)`, `dcgmproftester`,
`DCGM is the official NVIDIA diagnostic path`, `targeted_stress remain in <a href="/validate">Validate</a>`,
`burn-gpu-dcgm`, `NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
`id="burn-gpu-list"`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("burn page missing %q: %s", needle, body) t.Fatalf("burn page missing %q: %s", needle, body)
@@ -813,3 +846,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body) t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
} }
} }
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")
exportDir := filepath.Join(dir, "export")
if err := os.MkdirAll(exportDir, 0755); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
t.Fatal(err)
}
health := `{
"status":"PARTIAL",
"checked_at":"2026-03-16T10:00:00Z",
"export_dir":"/tmp/export",
"driver_ready":true,
"cuda_ready":false,
"network_status":"PARTIAL",
"issues":[
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
],
"tools":[
{"name":"dmidecode","ok":true},
{"name":"nvidia-smi","ok":false}
],
"services":[
{"name":"bee-web","status":"active"},
{"name":"bee-nvidia","status":"inactive"}
]
}`
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
t.Fatal(err)
}
componentStatus := `[
{
"component_key":"cpu:all",
"status":"Warning",
"error_summary":"cpu SAT: FAILED",
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
},
{
"component_key":"memory:all",
"status":"OK",
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
},
{
"component_key":"storage:nvme0n1",
"status":"Critical",
"error_summary":"storage SAT: FAILED",
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
},
{
"component_key":"pcie:gpu:nvidia",
"status":"Warning",
"error_summary":"nvidia SAT: FAILED",
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
}
]`
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
t.Fatal(err)
}
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
body := rec.Body.String()
for _, needle := range []string{
`Runtime Health`,
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
`Export Directory`,
`Network`,
`NVIDIA/AMD Driver`,
`CUDA / ROCm`,
`Required Utilities`,
`Bee Services`,
`<td>CPU</td>`,
`<td>Memory</td>`,
`<td>Storage</td>`,
`<td>GPU</td>`,
`CUDA runtime is not ready for GPU SAT.`,
`Missing: nvidia-smi`,
`bee-nvidia=inactive`,
`cpu SAT: FAILED`,
`storage SAT: FAILED`,
`sat:nvidia`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body)
}
}
}

View File

@@ -30,23 +30,29 @@ const (
// taskNames maps target → human-readable name for validate (SAT) runs. // taskNames maps target → human-readable name for validate (SAT) runs.
var taskNames = map[string]string{ var taskNames = map[string]string{
"nvidia": "NVIDIA SAT", "nvidia": "NVIDIA SAT",
"nvidia-benchmark": "NVIDIA Benchmark", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-stress": "NVIDIA GPU Stress", "nvidia-benchmark": "NVIDIA Benchmark",
"memory": "Memory SAT", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"storage": "Storage SAT", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"cpu": "CPU SAT", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
"amd": "AMD GPU SAT", "nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"amd-mem": "AMD GPU MEM Integrity", "nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"amd-bandwidth": "AMD GPU MEM Bandwidth", "nvidia-stress": "NVIDIA GPU Stress",
"amd-stress": "AMD GPU Burn-in", "memory": "Memory SAT",
"memory-stress": "Memory Burn-in", "storage": "Storage SAT",
"sat-stress": "SAT Stress (stressapptest)", "cpu": "CPU SAT",
"platform-stress": "Platform Thermal Cycling", "amd": "AMD GPU SAT",
"audit": "Audit", "amd-mem": "AMD GPU MEM Integrity",
"support-bundle": "Support Bundle", "amd-bandwidth": "AMD GPU MEM Bandwidth",
"install": "Install to Disk", "amd-stress": "AMD GPU Burn-in",
"install-to-ram": "Install to RAM", "memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
} }
// burnNames maps target → human-readable name when a burn profile is set. // burnNames maps target → human-readable name when a burn profile is set.
@@ -134,45 +140,53 @@ type persistedTask struct {
} }
type burnPreset struct { type burnPreset struct {
NvidiaDiag int
DurationSec int DurationSec int
} }
func resolveBurnPreset(profile string) burnPreset { func resolveBurnPreset(profile string) burnPreset {
switch profile { switch profile {
case "overnight": case "overnight":
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60} return burnPreset{DurationSec: 8 * 60 * 60}
case "acceptance": case "acceptance":
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60} return burnPreset{DurationSec: 60 * 60}
default: default:
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60} return burnPreset{DurationSec: 5 * 60}
} }
} }
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
{LoadSec: 100, IdleSec: 10},
{LoadSec: 145, IdleSec: 15},
{LoadSec: 190, IdleSec: 20},
{LoadSec: 235, IdleSec: 25},
{LoadSec: 280, IdleSec: 30},
{LoadSec: 325, IdleSec: 35},
{LoadSec: 370, IdleSec: 40},
{LoadSec: 415, IdleSec: 45},
{LoadSec: 460, IdleSec: 50},
{LoadSec: 510, IdleSec: 0},
}
switch profile { switch profile {
case "overnight": case "overnight":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
{LoadSec: 600, IdleSec: 120}, for range 8 {
{LoadSec: 600, IdleSec: 60}, cycles = append(cycles, acceptanceCycles...)
{LoadSec: 600, IdleSec: 30}, }
{LoadSec: 600, IdleSec: 120}, return platform.PlatformStressOptions{Cycles: cycles}
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
}}
case "acceptance": case "acceptance":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: acceptanceCycles}
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
}}
default: // smoke default: // smoke
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 90, IdleSec: 60}, {LoadSec: 85, IdleSec: 5},
{LoadSec: 90, IdleSec: 30}, {LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
}} }}
} }
} }
@@ -536,9 +550,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break break
} }
diagLevel := t.params.DiagLevel diagLevel := t.params.DiagLevel
if t.params.BurnProfile != "" && diagLevel <= 0 {
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 { if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions( result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append, ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -551,6 +562,16 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
} else { } else {
archive, err = a.RunNvidiaAcceptancePack("", j.append) archive, err = a.RunNvidiaAcceptancePack("", j.append)
} }
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark": case "nvidia-benchmark":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -563,6 +584,56 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
ExcludeGPUIndices: t.params.ExcludeGPUIndices, ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL, RunNCCL: t.params.RunNCCL,
}, j.append) }, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -253,10 +253,10 @@ func TestResolveBurnPreset(t *testing.T) {
profile string profile string
want burnPreset want burnPreset
}{ }{
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}}, {profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}}, {profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "", want: burnPreset{DurationSec: 5 * 60}},
} }
for _, tc := range tests { for _, tc := range tests {
if got := resolveBurnPreset(tc.profile); got != tc.want { if got := resolveBurnPreset(tc.profile); got != tc.want {

View File

@@ -302,6 +302,12 @@ memtest_fail() {
return 0 return 0
} }
nvidia_runtime_fail() {
msg="$1"
echo "ERROR: ${msg}" >&2
exit 1
}
iso_memtest_present() { iso_memtest_present() {
iso_path="$1" iso_path="$1"
iso_files="$(mktemp)" iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ===" echo "=== memtest validation OK ==="
} }
validate_iso_nvidia_runtime() {
iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
echo "=== validating NVIDIA runtime in ISO ==="
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
squashfs_tmp="$(mktemp)"
squashfs_list="$(mktemp)"
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
}
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
}
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
}
rm -f "$squashfs_tmp" "$squashfs_list"
echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() { append_memtest_grub_entry() {
grub_cfg="$1" grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1 [ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi fi
fi fi
validate_iso_memtest "$ISO_RAW" validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"
echo "" echo ""
echo "=== done (${BEE_GPU_VENDOR}) ===" echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing. # NVIDIA DCGM (Data Center GPU Manager).
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace, # Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
# so install the CUDA 13 build plus proprietary diagnostic components explicitly. # NVIDIA max-compute recipe. The smoketest/runtime contract treats
# dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND" fail "nvidia-smi: NOT FOUND"
fi fi
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
ok "dcgmi found: $p"
else
fail "dcgmi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
ok "nv-hostengine found: $p"
else
fail "nv-hostengine: NOT FOUND"
fi
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
DCGM_PROFTESTER="$p"
break
fi
done
if [ -n "$DCGM_PROFTESTER" ]; then
ok "dcgmproftester found: $DCGM_PROFTESTER"
else
fail "dcgmproftester: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p" ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi fi
done done
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
ok "nvbandwidth found: $p"
else
warn "nvbandwidth: NOT FOUND"
fi
echo "" echo ""
echo "-- NVIDIA modules --" echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia" KO_DIR="/usr/local/lib/nvidia"