Compare commits

..

1 Commits
v5.2 ... v5.3

Author SHA1 Message Date
38e79143eb Refine burn UI and NVIDIA stress flows 2026-04-05 13:43:43 +03:00
18 changed files with 825 additions and 229 deletions

View File

@@ -1,9 +1,10 @@
LISTEN ?= :8080 LISTEN ?= :8080
AUDIT_PATH ?= AUDIT_PATH ?=
EXPORT_DIR ?= $(CURDIR)/.tmp/export
VERSION ?= $(shell sh ./scripts/resolve-version.sh) VERSION ?= $(shell sh ./scripts/resolve-version.sh)
GO_LDFLAGS := -X main.Version=$(VERSION) GO_LDFLAGS := -X main.Version=$(VERSION)
RUN_ARGS := web --listen $(LISTEN) RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
ifneq ($(AUDIT_PATH),) ifneq ($(AUDIT_PATH),)
RUN_ARGS += --audit-path $(AUDIT_PATH) RUN_ARGS += --audit-path $(AUDIT_PATH)
endif endif
@@ -11,6 +12,7 @@ endif
.PHONY: run build test .PHONY: run build test
run: run:
mkdir -p $(EXPORT_DIR)
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS) go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
build: build:

View File

@@ -87,7 +87,7 @@ func printRootUsage(w io.Writer) {
bee preflight --output stdout|file:<path> bee preflight --output stdout|file:<path>
bee export --target <device> bee export --target <device>
bee support-bundle --output stdout|file:<path> bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
bee sat nvidia|memory|storage|cpu [--duration <seconds>] bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight] bee benchmark nvidia [--profile standard|stability|overnight]
bee version bee version
@@ -296,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError) fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr) fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80") listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot") auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles") exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title") title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() { fs.Usage = func() {

View File

@@ -115,7 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
type satRunner interface { type satRunner interface {
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
@@ -528,6 +533,13 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
} }
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
} }
@@ -543,6 +555,34 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
@@ -893,6 +933,12 @@ func latestSATSummaries() []string {
prefix string prefix string
}{ }{
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, {label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
{label: "Memory SAT", prefix: "memory-"}, {label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"}, {label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"}, {label: "CPU SAT", prefix: "cpu-"},

View File

@@ -120,16 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
} }
type fakeSAT struct { type fakeSAT struct {
runNvidiaFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runMemoryFn func(string) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error)
runStorageFn func(string) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error)
runCPUFn func(string, int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error)
detectVendorFn func() string runNvidiaBandwidthFn func(string, []int) (string, error)
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runAMDPackFn func(string) (string, error) runMemoryFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
detectVendorFn func() string
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
runAMDPackFn func(string) (string, error)
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
@@ -147,6 +152,41 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPowerFn != nil {
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaPulseFn != nil {
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaBandwidthFn != nil {
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
if f.runNvidiaStressFn != nil { if f.runNvidiaStressFn != nil {
return f.runNvidiaStressFn(baseDir, opts) return f.runNvidiaStressFn(baseDir, opts)

View File

@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
// ComponentStatusRecord holds the current and historical health of one hardware component. // ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct { type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"` ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown" Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"` LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"` LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"` ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"` History []ComponentStatusEntry `json:"history"`
} }
// ComponentStatusEntry is one observation written to a component's history. // ComponentStatusEntry is one observation written to a component's history.
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
// Map SAT target to component keys. // Map SAT target to component keys.
switch target { switch target {
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth": case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress": case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall) db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)

View File

@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
case "nvidia": case "nvidia":
tools = append(tools, s.CheckTools([]string{ tools = append(tools, s.CheckTools([]string{
"nvidia-smi", "nvidia-smi",
"dcgmi",
"nv-hostengine",
"nvidia-bug-report.sh", "nvidia-bug-report.sh",
"bee-gpu-burn", "bee-gpu-burn",
"bee-john-gpu-stress", "bee-john-gpu-stress",
"bee-nccl-gpu-stress", "bee-nccl-gpu-stress",
"all_reduce_perf", "all_reduce_perf",
})...) })...)
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
case "amd": case "amd":
tool := ToolStatus{Name: "rocm-smi"} tool := ToolStatus{Name: "rocm-smi"}
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 { if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
@@ -155,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
return tools return tools
} }
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
for _, candidate := range candidates {
path, err := exec.LookPath(candidate)
if err == nil {
return ToolStatus{Name: display, Path: path, OK: true}
}
}
return ToolStatus{Name: display}
}
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) { func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
lsmodText := commandText("lsmod") lsmodText := commandText("lsmod")

View File

@@ -38,6 +38,12 @@ var (
"/opt/rocm/bin/rvs", "/opt/rocm/bin/rvs",
"/opt/rocm-*/bin/rvs", "/opt/rocm-*/bin/rvs",
} }
dcgmProfTesterCandidates = []string{
"dcgmproftester",
"dcgmproftester13",
"dcgmproftester12",
"dcgmproftester11",
}
) )
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil). // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
@@ -277,6 +283,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}, logFunc) }, logFunc)
} }
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc) return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
} }
@@ -293,6 +373,23 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc) return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
} }
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
if len(gpuIndices) > 0 { if len(gpuIndices) > 0 {
return dedupeSortedIndices(gpuIndices), nil return dedupeSortedIndices(gpuIndices), nil
@@ -473,6 +570,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
} }
} }
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
args := []string{"dcgmi", "diag", "-r", name}
if durationSec > 0 {
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
}
if len(gpuIndices) > 0 {
args = append(args, "-i", joinIndexList(gpuIndices))
}
return args
}
func normalizeNvidiaBurnDuration(durationSec int) int {
if durationSec <= 0 {
return 300
}
return durationSec
}
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
if len(gpuIndices) == 0 {
return nil
}
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) { func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
@@ -642,6 +764,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
} }
if strings.Contains(text, "unsupported") || if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") || strings.Contains(text, "not supported") ||
strings.Contains(text, "not found in path") ||
strings.Contains(text, "invalid opcode") || strings.Contains(text, "invalid opcode") ||
strings.Contains(text, "unknown command") || strings.Contains(text, "unknown command") ||
strings.Contains(text, "not implemented") || strings.Contains(text, "not implemented") ||
@@ -748,6 +871,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm") return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
} }
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
for _, candidate := range dcgmProfTesterCandidates {
if path, err := satLookPath(candidate); err == nil {
return append([]string{path}, args...), nil
}
}
return nil, errors.New("dcgmproftester not found in PATH")
}
func ensureAMDRuntimeReady() error { func ensureAMDRuntimeReady() error {
if _, err := os.Stat("/dev/kfd"); err == nil { if _, err := os.Stat("/dev/kfd"); err == nil {
return nil return nil

View File

@@ -195,6 +195,53 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
} }
} }
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
oldLookPath := satLookPath
satLookPath = func(file string) (string, error) {
switch file {
case "dcgmproftester13":
return "/usr/bin/dcgmproftester13", nil
default:
return "", exec.ErrNotFound
}
}
t.Cleanup(func() { satLookPath = oldLookPath })
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
if err != nil {
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
}
if len(cmd) != 4 {
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
}
if cmd[0] != "/usr/bin/dcgmproftester13" {
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
}
}
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 1 {
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
}
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
}
}
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) { func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -581,15 +581,32 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
nvidiaUp := nvidiaErr == nil nvidiaUp := nvidiaErr == nil
amdUp := amdErr == nil amdUp := amdErr == nil
_, dcgmErr := exec.LookPath("dcgmi") _, dcgmErr := exec.LookPath("dcgmi")
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
_, johnErr := exec.LookPath("bee-john-gpu-stress")
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
writeJSON(w, []toolEntry{ writeJSON(w, []toolEntry{
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
{ID: "dcgm", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"}, {ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"}, {ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
{ID: "rvs", Available: amdUp, Vendor: "amd"}, {ID: "rvs", Available: amdUp, Vendor: "amd"},
}) })
} }
func lookPathAny(names ...string) error {
for _, name := range names {
if _, err := exec.LookPath(name); err == nil {
return nil
}
}
return exec.ErrNotFound
}
// ── System ──────────────────────────────────────────────────────────────────── // ── System ────────────────────────────────────────────────────────────────────
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
@@ -628,7 +645,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
var standardTools = []string{ var standardTools = []string{
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool", "dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
"nvidia-smi", "memtester", "stress-ng", "nvtop", "nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
"mstflint", "qrencode", "mstflint", "qrencode",
} }

View File

@@ -232,7 +232,8 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests. // isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool { func isSATTarget(target string) bool {
switch target { switch target {
case "nvidia", "nvidia-benchmark", "nvidia-stress", "memory", "memory-stress", "storage", case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress": "platform-stress":
return true return true

View File

@@ -689,7 +689,7 @@ func renderValidate() string {
</div> </div>
<div class="grid3"> <div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>`) + ` + renderSATCard("nvidia", "NVIDIA GPU", `<div class="form-row"><label>Diag Level</label><select id="sat-nvidia-level"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button id="sat-btn-nvidia-targeted-stress" class="btn" type="button" onclick="runSAT('nvidia-targeted-stress')">Targeted Stress (dcgmi diag targeted_stress)</button></div><p style="color:var(--muted);font-size:12px;margin:0">Official DCGM `+"targeted_stress"+` stays in Validate as a controlled diagnostic load, not a max-burn recipe.</p>`) +
renderSATCard("memory", "Memory", "") + renderSATCard("memory", "Memory", "") +
renderSATCard("storage", "Storage", "") + renderSATCard("storage", "Storage", "") +
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) + renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
@@ -708,9 +708,10 @@ let satES = null;
function runSAT(target) { function runSAT(target) {
if (satES) { satES.close(); satES = null; } if (satES) { satES.close(); satES = null; }
const body = {}; const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target); body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1; if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'nvidia-targeted-stress') body.duration = 300;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60; if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target; document.getElementById('sat-title').textContent = '— ' + target;
@@ -727,7 +728,7 @@ function runSAT(target) {
} }
function runAllSAT() { function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth']; const targets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
const total = targets.length * cycles; const total = targets.length * cycles;
let enqueued = 0; let enqueued = 0;
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
@@ -739,9 +740,10 @@ function runAllSAT() {
const btn = document.getElementById('sat-btn-' + target); const btn = document.getElementById('sat-btn-' + target);
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; } if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
const body = {}; const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; const labels = {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target); body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1; if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'nvidia-targeted-stress') body.duration = 300;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60; if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)}) fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
.then(r=>r.json()).then(()=>{ .then(r=>r.json()).then(()=>{
@@ -756,6 +758,7 @@ function runAllSAT() {
<script> <script>
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => { fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected'); if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
@@ -976,62 +979,91 @@ benchmarkLoadGPUs();
// ── Burn ────────────────────────────────────────────────────────────────────── // ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string { func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div> return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">
<div class="card-head">Burn Profile</div> <div class="card-head">Burn Profile</div>
<div class="card-body" style="display:flex;align-items:center;gap:16px;flex-wrap:wrap"> <div class="card-body burn-profile-body">
<div class="form-row" style="margin:0;max-width:380px"><label>Preset</label><select id="burn-profile"> <div class="burn-profile-col">
<option value="smoke" selected>Smoke — quick check (~5 min)</option> <div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
<option value="acceptance">Acceptance — 1 hour</option> <label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — quick check (~5 min)</span></label>
<option value="overnight">Overnight8 hours</option> <label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance1 hour</span></label>
</select></div> <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
<button class="btn btn-primary" onclick="runAll()">&#9654; Run All</button> </div>
<div class="burn-profile-col burn-profile-action">
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
</div>
<div class="burn-profile-col burn-profile-action">
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
</div>
</div>
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span> <span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
</div> </div>
</div> </div>
<div class="grid3" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card">
<div class="card-head">GPU Stress</div>
<div class="card-body"> <div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">NVIDIA tools run on all discovered GPUs. DCGM is the official NVIDIA diagnostic path. NCCL exercises multi-GPU fabric and is not a full compute burn.</p> <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
<div id="gpu-tools-list"> <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label> <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
<label class="cb-row"><input type="checkbox" id="burn-gpu-dcgm" value="dcgm" disabled><span>DCGM Diagnostics (Official NVIDIA) <span class="cb-note" id="note-dcgm"></span></span></label> <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf (Interconnect) <span class="cb-note" id="note-nccl"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
</div> </div>
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">&#9654; Run GPU Stress</button> <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
</div> </div>
</div> </div>
<div class="card"> <div class="burn-section">Core Burn Paths</div>
<div class="card-head">Compute Stress</div> <div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card-body"> <div class="card burn-card">
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
</div>
</div>
<div class="card burn-card">
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p> <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label> <label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label> <label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label> <label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
<button class="btn btn-primary" style="margin-top:10px" onclick="runComputeStress()">&#9654; Run Compute Stress</button> </div>
</div>
</div>
<div class="burn-section">GPU-Specific Tests</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card burn-card">
<div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
</div> </div>
</div> </div>
<div class="card"> <div class="card burn-card">
<div class="card-head">Platform Thermal Cycling</div> <div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
<div class="card-body"> <div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Repeated load+idle cycles. Detects cooling recovery failures and GPU throttle. Smoke: 2×90s. Acceptance: 4×300s.</p> <p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
<p style="font-size:12px;font-weight:600;margin:0 0 6px">Load components:</p> <label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-pt-cpu" checked><span>CPU (stressapptest)</span></label> <label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-pt-nvidia" disabled><span>NVIDIA GPU <span class="cb-note" id="note-pt-nvidia"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-pt-amd" disabled><span>AMD GPU <span class="cb-note" id="note-pt-amd"></span></span></label>
<button class="btn btn-primary" style="margin-top:10px" onclick="runPlatformStress()">&#9654; Run Thermal Cycling</button>
</div> </div>
</div> </div>
</div> </div>
<div id="bi-output" style="display:none;margin-top:16px" class="card"> <div id="bi-output" style="display:none;margin-top:16px" class="card">
@@ -1040,154 +1072,224 @@ func renderBurn() string {
</div> </div>
<style> <style>
.cb-row { display:flex; align-items:center; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; } .cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; flex-shrink:0; } .cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; } .cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; } .cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
.cb-note { font-size:11px; color:var(--muted); font-style:italic; } .cb-note { font-size:11px; color:var(--muted); font-style:italic; }
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.burn-profile-col { min-width:0; }
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
.burn-grid { align-items:stretch; }
.burn-card { height:100%; display:flex; flex-direction:column; }
.burn-card-body { flex:1; display:flex; flex-direction:column; }
.card-head-actions { justify-content:space-between; }
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
</style> </style>
<script> <script>
let biES = null; let biES = null;
function profile() { return document.getElementById('burn-profile').value || 'smoke'; } function burnProfile() {
const selected = document.querySelector('input[name="burn-profile"]:checked');
return selected ? selected.value : 'smoke';
}
function enqueueTask(target, extra) { function burnSelectedGPUIndices() {
const body = Object.assign({ profile: profile() }, extra || {}); return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
return fetch('/api/sat/'+target+'/run', { .filter(function(el) { return el.checked && !el.disabled; })
method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify(body) .map(function(el) { return parseInt(el.value, 10); })
}).then(r => r.json()); .filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function burnUpdateSelectionNote() {
const note = document.getElementById('burn-selection-note');
const selected = burnSelectedGPUIndices();
if (!selected.length) {
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
return;
}
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
}
function burnRenderGPUList(gpus) {
const root = document.getElementById('burn-gpu-list');
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
burnUpdateSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="burn-gpu-row">'
+ '<input class="burn-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="burnUpdateSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
burnUpdateSelectionNote();
}
function burnSelectAll() {
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
burnUpdateSelectionNote();
}
function burnSelectNone() {
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
burnUpdateSelectionNote();
}
function burnLoadGPUs() {
fetch('/api/gpu/nvidia').then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
}).then(function(gpus) {
burnRenderGPUList(gpus);
}).catch(function(err) {
document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
burnUpdateSelectionNote();
});
}
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
if (useSelectedNvidia) {
const selected = burnSelectedGPUIndices();
if (!selected.length) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
}
body.gpu_indices = selected;
}
return fetch('/api/sat/' + target + '/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
});
} }
function streamTask(taskId, label) { function streamTask(taskId, label) {
if (biES) { biES.close(); biES = null; } if (biES) { biES.close(); biES = null; }
document.getElementById('bi-output').style.display = 'block'; document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— ' + label + ' [' + profile() + ']'; document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
const term = document.getElementById('bi-terminal'); const term = document.getElementById('bi-terminal');
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n'; term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
biES = new EventSource('/api/tasks/'+taskId+'/stream'); biES = new EventSource('/api/tasks/' + taskId + '/stream');
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop = term.scrollHeight; }; biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
biES.addEventListener('done', e => { biES.addEventListener('done', function(e) {
biES.close(); biES = null; biES.close();
term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; biES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
}); });
} }
function runGPUStress() { function runBurnTaskSet(tasks, statusElId) {
const tasks = [ const enabled = tasks.filter(function(t) {
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
];
tasks.filter(t => {
const el = document.getElementById(t.id); const el = document.getElementById(t.id);
return el && el.checked && !el.disabled; return el && el.checked && !el.disabled;
}).forEach(t => {
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); });
}); });
} const status = statusElId ? document.getElementById(statusElId) : null;
if (status) status.textContent = '';
function runComputeStress() { if (!enabled.length) {
const tasks = [ if (status) status.textContent = 'No tasks selected.';
{id:'burn-cpu', target:'cpu'}, return;
{id:'burn-mem-stress', target:'memory-stress'}, }
{id:'burn-sat-stress', target:'sat-stress'}, enabled.forEach(function(t) {
]; enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
let last = null; .then(function(d) {
tasks.filter(t => { if (status) status.textContent = enabled.length + ' task(s) queued.';
const el = document.getElementById(t.id); streamTask(d.task_id, t.label);
return el && el.checked; })
}).forEach(t => { .catch(function(err) {
enqueueTask(t.target).then(d => { last = d; streamTask(d.task_id, t.target); }); if (status) status.textContent = 'Error: ' + err.message;
const term = document.getElementById('bi-terminal');
document.getElementById('bi-output').style.display = 'block';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}); });
} }
function runPlatformStress() { function runPlatformStress() {
const comps = []; const comps = [];
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu'); const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
const nv = document.getElementById('burn-pt-nvidia'); const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
if (nv && nv.checked && !nv.disabled) comps.push('gpu'); const hasChecked = function(ids) {
const am = document.getElementById('burn-pt-amd'); return ids.some(function(id) {
if (am && am.checked && !am.disabled) comps.push('gpu'); const el = document.getElementById(id);
return el && el.checked && !el.disabled;
});
};
if (hasChecked(computeIDs)) comps.push('cpu');
if (hasChecked(gpuIDs)) comps.push('gpu');
if (!comps.length) {
const status = document.getElementById('burn-all-status');
if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
return;
}
const extra = comps.length > 0 ? {platform_components: comps} : {}; const extra = comps.length > 0 ? {platform_components: comps} : {};
enqueueTask('platform-stress', extra).then(d => streamTask(d.task_id, 'platform-stress')); enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
} streamTask(d.task_id, 'Platform Thermal Cycling');
function runAll() {
const status = document.getElementById('burn-all-status');
status.textContent = 'Enqueuing...';
let count = 0;
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
// GPU tests
const gpuTasks = [
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
];
gpuTasks.filter(t => { const el = document.getElementById(t.id); return el && el.checked && !el.disabled; }).forEach(t => {
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); done(); });
}); });
// Compute tests
[{id:'burn-cpu',target:'cpu'},{id:'burn-mem-stress',target:'memory-stress'},{id:'burn-sat-stress',target:'sat-stress'}]
.filter(t => { const el = document.getElementById(t.id); return el && el.checked; })
.forEach(t => enqueueTask(t.target).then(d => { streamTask(d.task_id, t.target); done(); }));
// Platform
const comps = [];
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
const nv = document.getElementById('burn-pt-nvidia');
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
const am = document.getElementById('burn-pt-amd');
if (am && am.checked && !am.disabled) comps.push('gpu');
const ptExtra = comps.length > 0 ? {platform_components: comps} : {};
enqueueTask('platform-stress', ptExtra).then(d => { streamTask(d.task_id, 'platform-stress'); done(); });
} }
// Load GPU tool availability function runAllBurnTasks() {
fetch('/api/gpu/tools').then(r => r.json()).then(tools => { const status = document.getElementById('burn-all-status');
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','dcgm':'burn-gpu-dcgm','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'}; const all = [
const noteMap = {'bee-gpu-burn':'note-bee','dcgm':'note-dcgm','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'}; {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
tools.forEach(t => { {id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
const cb = document.getElementById(nvidiaMap[t.id]); {id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
const note = document.getElementById(noteMap[t.id]); {id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
];
status.textContent = 'Enqueuing...';
runBurnTaskSet(all, 'burn-all-status');
}
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
const map = {
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
};
tools.forEach(function(t) {
const spec = map[t.id];
if (!spec) return;
const cb = document.getElementById(spec.cb);
const note = document.getElementById(spec.note);
if (!cb) return; if (!cb) return;
if (t.available) { if (t.available) {
cb.disabled = false; cb.disabled = false;
if (t.id === 'bee-gpu-burn' || t.id === 'dcgm') cb.checked = true; } else if (note) {
} else { note.textContent = '— ' + spec.reason;
let reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
if (t.id === 'dcgm' && t.vendor === 'nvidia') reason = 'dcgmi not available or NVIDIA driver not running';
if (t.id === 'nccl' && t.vendor === 'nvidia') reason = 'NCCL interconnect tool unavailable or NVIDIA driver not running';
if (note) note.textContent = '— ' + reason;
} }
}); });
}).catch(() => {}); }).catch(function() {});
// Load GPU presence for platform thermal cycling burnLoadGPUs();
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
const nvCb = document.getElementById('burn-pt-nvidia');
const amCb = document.getElementById('burn-pt-amd');
const nvNote = document.getElementById('note-pt-nvidia');
const amNote = document.getElementById('note-pt-amd');
if (gp.nvidia) {
nvCb.disabled = false;
nvCb.checked = true;
} else {
if (nvNote) nvNote.textContent = '— NVIDIA driver not running';
}
if (gp.amd) {
amCb.disabled = false;
amCb.checked = true;
} else {
if (amNote) amNote.textContent = '— AMD driver not running';
}
}).catch(() => {});
</script>` </script>`
} }

View File

@@ -237,6 +237,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
// SAT // SAT
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia")) mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress")) mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory")) mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage")) mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
@@ -1182,6 +1188,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) { func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Cache-Control", "no-store") w.Header().Set("Cache-Control", "no-store")
if strings.TrimSpace(h.opts.AuditPath) == "" {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ready"))
return
}
if _, err := os.Stat(h.opts.AuditPath); err != nil { if _, err := os.Stat(h.opts.AuditPath); err != nil {
w.WriteHeader(http.StatusServiceUnavailable) w.WriteHeader(http.StatusServiceUnavailable)
_, _ = w.Write([]byte("starting")) _, _ = w.Write([]byte("starting"))

View File

@@ -551,6 +551,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
} }
} }
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
}
if strings.TrimSpace(rec.Body.String()) != "ready" {
t.Fatalf("body=%q want ready", rec.Body.String())
}
}
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) { func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")
@@ -638,7 +650,26 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
} }
} }
func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T) { func TestValidatePageRendersNvidiaTargetedStressAction(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`Targeted Stress`,
`nvidia-targeted-stress`,
`Official DCGM`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil)) handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
@@ -647,10 +678,11 @@ func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T)
} }
body := rec.Body.String() body := rec.Body.String()
for _, needle := range []string{ for _, needle := range []string{
`DCGM Diagnostics (Official NVIDIA)`, `NVIDIA Max Compute Load`,
`NCCL all_reduce_perf (Interconnect)`, `dcgmproftester`,
`DCGM is the official NVIDIA diagnostic path`, `targeted_stress remain in <a href="/validate">Validate</a>`,
`burn-gpu-dcgm`, `NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
`id="burn-gpu-list"`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("burn page missing %q: %s", needle, body) t.Fatalf("burn page missing %q: %s", needle, body)

View File

@@ -30,23 +30,29 @@ const (
// taskNames maps target → human-readable name for validate (SAT) runs. // taskNames maps target → human-readable name for validate (SAT) runs.
var taskNames = map[string]string{ var taskNames = map[string]string{
"nvidia": "NVIDIA SAT", "nvidia": "NVIDIA SAT",
"nvidia-benchmark": "NVIDIA Benchmark", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-stress": "NVIDIA GPU Stress", "nvidia-benchmark": "NVIDIA Benchmark",
"memory": "Memory SAT", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"storage": "Storage SAT", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"cpu": "CPU SAT", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
"amd": "AMD GPU SAT", "nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"amd-mem": "AMD GPU MEM Integrity", "nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"amd-bandwidth": "AMD GPU MEM Bandwidth", "nvidia-stress": "NVIDIA GPU Stress",
"amd-stress": "AMD GPU Burn-in", "memory": "Memory SAT",
"memory-stress": "Memory Burn-in", "storage": "Storage SAT",
"sat-stress": "SAT Stress (stressapptest)", "cpu": "CPU SAT",
"platform-stress": "Platform Thermal Cycling", "amd": "AMD GPU SAT",
"audit": "Audit", "amd-mem": "AMD GPU MEM Integrity",
"support-bundle": "Support Bundle", "amd-bandwidth": "AMD GPU MEM Bandwidth",
"install": "Install to Disk", "amd-stress": "AMD GPU Burn-in",
"install-to-ram": "Install to RAM", "memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"support-bundle": "Support Bundle",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
} }
// burnNames maps target → human-readable name when a burn profile is set. // burnNames maps target → human-readable name when a burn profile is set.
@@ -134,45 +140,53 @@ type persistedTask struct {
} }
type burnPreset struct { type burnPreset struct {
NvidiaDiag int
DurationSec int DurationSec int
} }
func resolveBurnPreset(profile string) burnPreset { func resolveBurnPreset(profile string) burnPreset {
switch profile { switch profile {
case "overnight": case "overnight":
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60} return burnPreset{DurationSec: 8 * 60 * 60}
case "acceptance": case "acceptance":
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60} return burnPreset{DurationSec: 60 * 60}
default: default:
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60} return burnPreset{DurationSec: 5 * 60}
} }
} }
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5},
{LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
{LoadSec: 100, IdleSec: 10},
{LoadSec: 145, IdleSec: 15},
{LoadSec: 190, IdleSec: 20},
{LoadSec: 235, IdleSec: 25},
{LoadSec: 280, IdleSec: 30},
{LoadSec: 325, IdleSec: 35},
{LoadSec: 370, IdleSec: 40},
{LoadSec: 415, IdleSec: 45},
{LoadSec: 460, IdleSec: 50},
{LoadSec: 510, IdleSec: 0},
}
switch profile { switch profile {
case "overnight": case "overnight":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
{LoadSec: 600, IdleSec: 120}, for range 8 {
{LoadSec: 600, IdleSec: 60}, cycles = append(cycles, acceptanceCycles...)
{LoadSec: 600, IdleSec: 30}, }
{LoadSec: 600, IdleSec: 120}, return platform.PlatformStressOptions{Cycles: cycles}
{LoadSec: 600, IdleSec: 60},
{LoadSec: 600, IdleSec: 30},
{LoadSec: 600, IdleSec: 120},
{LoadSec: 600, IdleSec: 60},
}}
case "acceptance": case "acceptance":
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: acceptanceCycles}
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
{LoadSec: 300, IdleSec: 60},
{LoadSec: 300, IdleSec: 30},
}}
default: // smoke default: // smoke
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{ return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
{LoadSec: 90, IdleSec: 60}, {LoadSec: 85, IdleSec: 5},
{LoadSec: 90, IdleSec: 30}, {LoadSec: 80, IdleSec: 10},
{LoadSec: 55, IdleSec: 5},
{LoadSec: 60, IdleSec: 0},
}} }}
} }
} }
@@ -536,9 +550,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break break
} }
diagLevel := t.params.DiagLevel diagLevel := t.params.DiagLevel
if t.params.BurnProfile != "" && diagLevel <= 0 {
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
}
if len(t.params.GPUIndices) > 0 || diagLevel > 0 { if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
result, e := a.RunNvidiaAcceptancePackWithOptions( result, e := a.RunNvidiaAcceptancePackWithOptions(
ctx, "", diagLevel, t.params.GPUIndices, j.append, ctx, "", diagLevel, t.params.GPUIndices, j.append,
@@ -551,6 +562,16 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
} else { } else {
archive, err = a.RunNvidiaAcceptancePack("", j.append) archive, err = a.RunNvidiaAcceptancePack("", j.append)
} }
case "nvidia-targeted-stress":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark": case "nvidia-benchmark":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -563,6 +584,56 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
ExcludeGPUIndices: t.params.ExcludeGPUIndices, ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RunNCCL: t.params.RunNCCL, RunNCCL: t.params.RunNCCL,
}, j.append) }, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-targeted-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-pulse":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-bandwidth":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-interconnect":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
case "nvidia-stress": case "nvidia-stress":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -253,10 +253,10 @@ func TestResolveBurnPreset(t *testing.T) {
profile string profile string
want burnPreset want burnPreset
}{ }{
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}}, {profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}}, {profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}}, {profile: "", want: burnPreset{DurationSec: 5 * 60}},
} }
for _, tc := range tests { for _, tc := range tests {
if got := resolveBurnPreset(tc.profile); got != tc.want { if got := resolveBurnPreset(tc.profile); got != tc.want {

View File

@@ -302,6 +302,12 @@ memtest_fail() {
return 0 return 0
} }
nvidia_runtime_fail() {
msg="$1"
echo "ERROR: ${msg}" >&2
exit 1
}
iso_memtest_present() { iso_memtest_present() {
iso_path="$1" iso_path="$1"
iso_files="$(mktemp)" iso_files="$(mktemp)"
@@ -439,6 +445,44 @@ validate_iso_memtest() {
echo "=== memtest validation OK ===" echo "=== memtest validation OK ==="
} }
validate_iso_nvidia_runtime() {
iso_path="$1"
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
echo "=== validating NVIDIA runtime in ISO ==="
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
squashfs_tmp="$(mktemp)"
squashfs_list="$(mktemp)"
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
}
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
}
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
}
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
rm -f "$squashfs_tmp" "$squashfs_list"
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
}
rm -f "$squashfs_tmp" "$squashfs_list"
echo "=== NVIDIA runtime validation OK ==="
}
append_memtest_grub_entry() { append_memtest_grub_entry() {
grub_cfg="$1" grub_cfg="$1"
[ -f "$grub_cfg" ] || return 1 [ -f "$grub_cfg" ] || return 1
@@ -1144,6 +1188,7 @@ if [ -f "$ISO_RAW" ]; then
fi fi
fi fi
validate_iso_memtest "$ISO_RAW" validate_iso_memtest "$ISO_RAW"
validate_iso_nvidia_runtime "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"
echo "" echo ""
echo "=== done (${BEE_GPU_VENDOR}) ===" echo "=== done (${BEE_GPU_VENDOR}) ==="

View File

@@ -1,6 +1,10 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing. # NVIDIA DCGM (Data Center GPU Manager).
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace, # Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
# so install the CUDA 13 build plus proprietary diagnostic components explicitly. # NVIDIA max-compute recipe. The smoketest/runtime contract treats
# dcgmproftester as required in the LiveCD.
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
# explicitly.
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%% datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%

View File

@@ -52,6 +52,31 @@ else
fail "nvidia-smi: NOT FOUND" fail "nvidia-smi: NOT FOUND"
fi fi
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
ok "dcgmi found: $p"
else
fail "dcgmi: NOT FOUND"
fi
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
ok "nv-hostengine found: $p"
else
fail "nv-hostengine: NOT FOUND"
fi
DCGM_PROFTESTER=""
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
DCGM_PROFTESTER="$p"
break
fi
done
if [ -n "$DCGM_PROFTESTER" ]; then
ok "dcgmproftester found: $DCGM_PROFTESTER"
else
fail "dcgmproftester: NOT FOUND"
fi
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p" ok "$tool found: $p"
@@ -60,6 +85,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
fi fi
done done
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
ok "nvbandwidth found: $p"
else
warn "nvbandwidth: NOT FOUND"
fi
echo "" echo ""
echo "-- NVIDIA modules --" echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia" KO_DIR="/usr/local/lib/nvidia"