diff --git a/audit/internal/platform/nvidia_stress.go b/audit/internal/platform/nvidia_stress.go
index bcaea2a..62bc0c5 100644
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -16,7 +16,7 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
 		return "", err
 	}
 
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
+	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
 		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
 		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
 		job,
@@ -24,6 +24,17 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
 	}, logFunc)
 }
 
+func nvidiaStressArchivePrefix(loader string) string {
+	switch strings.TrimSpace(strings.ToLower(loader)) {
+	case NvidiaStressLoaderJohn:
+		return "gpu-nvidia-john"
+	case NvidiaStressLoaderNCCL:
+		return "gpu-nvidia-nccl"
+	default:
+		return "gpu-nvidia-burn"
+	}
+}
+
 func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
 	if err != nil {
diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go
index 0f77523..6d6df7b 100644
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -162,6 +162,25 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
 	}
 }
 
+func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		loader string
+		want   string
+	}{
+		{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
+		{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
+		{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
+		{loader: "", want: "gpu-nvidia-burn"},
+	}
+	for _, tt := range tests {
+		if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
+			t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
+		}
+	}
+}
+
 func TestEnvIntFallback(t *testing.T) {
 	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
 	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go
index fe66958..dd2e19e 100644
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -183,15 +183,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
 			_ = json.NewDecoder(r.Body).Decode(&body)
 		}
 
-		name := taskNames[target]
-		if body.Profile != "" {
-			if n, ok := burnNames[target]; ok {
-				name = n
-			}
-		}
-		if name == "" {
-			name = target
-		}
+		name := taskDisplayName(target, body.Profile, body.Loader)
 		t := &Task{
 			ID: newJobID("sat-" + target),
 			Name: name,
diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go
index 692f6e2..4f8cebc 100644
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -8,6 +8,7 @@ import (
 	"os"
 	"path/filepath"
 	"sort"
+	"strings"
 	"sync"
 	"time"
 
@@ -51,6 +52,33 @@ var burnNames = map[string]string{
 	"amd": "AMD GPU Burn-in",
 }
 
+func nvidiaStressTaskName(loader string) string {
+	switch strings.TrimSpace(strings.ToLower(loader)) {
+	case platform.NvidiaStressLoaderJohn:
+		return "NVIDIA GPU Stress (John/OpenCL)"
+	case platform.NvidiaStressLoaderNCCL:
+		return "NVIDIA GPU Stress (NCCL)"
+	default:
+		return "NVIDIA GPU Stress (bee-gpu-burn)"
+	}
+}
+
+func taskDisplayName(target, profile, loader string) string {
+	name := taskNames[target]
+	if profile != "" {
+		if n, ok := burnNames[target]; ok {
+			name = n
+		}
+	}
+	if target == "nvidia-stress" {
+		name = nvidiaStressTaskName(loader)
+	}
+	if name == "" {
+		name = target
+	}
+	return name
+}
+
 // Task represents one unit of work in the queue.
 type Task struct {
 	ID string `json:"id"`
diff --git a/audit/internal/webui/tasks_test.go b/audit/internal/webui/tasks_test.go
index b3ca986..e9a085a 100644
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -95,6 +95,23 @@ func TestResolveBurnPreset(t *testing.T) {
 	}
 }
 
+func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
+	tests := []struct {
+		loader string
+		want   string
+	}{
+		{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
+		{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
+		{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
+	}
+	for _, tc := range tests {
+		if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
+			t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
+		}
+	}
+}
+
 func TestRunTaskHonorsCancel(t *testing.T) {
 	t.Parallel()
 
diff --git a/iso/overlay/usr/local/bin/bee-john-gpu-stress b/iso/overlay/usr/local/bin/bee-john-gpu-stress
index 66aa778..9960ee2 100644
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -7,6 +7,8 @@ EXCLUDE=""
 FORMAT=""
 JOHN_DIR="/usr/local/lib/bee/john/run"
 JOHN_BIN="${JOHN_DIR}/john"
+export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
+export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 
 usage() {
     echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
@@ -24,6 +26,21 @@ contains_csv() {
 }
 
 show_opencl_diagnostics() {
+    echo "-- OpenCL ICD vendors --" >&2
+    if [ -d /etc/OpenCL/vendors ]; then
+        ls -l /etc/OpenCL/vendors >&2 || true
+        for icd in /etc/OpenCL/vendors/*.icd; do
+            [ -f "${icd}" ] || continue
+            echo " file: ${icd}" >&2
+            sed 's/^/ /' "${icd}" >&2 || true
+        done
+    else
+        echo " /etc/OpenCL/vendors is missing" >&2
+    fi
+    echo "-- NVIDIA device nodes --" >&2
+    ls -l /dev/nvidia* >&2 || true
+    echo "-- ldconfig OpenCL/NVIDIA --" >&2
+    ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' >&2 || true
     if command -v clinfo >/dev/null 2>&1; then
         echo "-- clinfo -l --" >&2
         clinfo -l >&2 || true
@@ -32,6 +49,17 @@ show_opencl_diagnostics() {
     ./john --list=opencl-devices >&2 || true
 }
 
+refresh_nvidia_runtime() {
+    if [ "$(id -u)" != "0" ]; then
+        return 1
+    fi
+    if command -v bee-nvidia-load >/dev/null 2>&1; then
+        bee-nvidia-load >/dev/null 2>&1 || true
+    fi
+    ldconfig >/dev/null 2>&1 || true
+    return 0
+}
+
 ensure_nvidia_uvm() {
     if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
         return 0
@@ -61,6 +89,13 @@ ensure_opencl_ready() {
         return 0
     fi
 
+    if refresh_nvidia_runtime; then
+        out=$(./john --list=opencl-devices 2>&1 || true)
+        if echo "${out}" | grep -q "Device #"; then
+            return 0
+        fi
+    fi
+
     if ensure_nvidia_uvm; then
         out=$(./john --list=opencl-devices 2>&1 || true)
         if echo "${out}" | grep -q "Device #"; then