package platform import ( "context" "errors" "os" "os/exec" "path/filepath" "strings" "testing" "time" ) func TestStorageSATCommands(t *testing.T) { t.Parallel() nvme := storageSATCommands("/dev/nvme0n1", false) if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" { t.Fatalf("unexpected nvme commands: %#v", nvme) } sata := storageSATCommands("/dev/sda", false) if len(sata) != 2 || sata[0].cmd[0] != "smartctl" { t.Fatalf("unexpected sata commands: %#v", sata) } } func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) { t.Parallel() jobs := nvidiaSATJobs() if len(jobs) != 6 { t.Fatalf("jobs=%d want 6", len(jobs)) } if got := jobs[0].cmd[0]; got != "nvidia-smi" { t.Fatalf("preflight command=%q want nvidia-smi", got) } if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" { t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1") } if got := jobs[5].cmd[0]; got != "bee-gpu-burn" { t.Fatalf("gpu stress command=%q want bee-gpu-burn", got) } if got := jobs[4].cmd[1]; got != "--output-file" { t.Fatalf("bug report flag=%q want --output-file", got) } } func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) { t.Parallel() cfg := amdStressRVSConfig(123) if !strings.Contains(cfg, "module: gst") { t.Fatalf("config missing gst module:\n%s", cfg) } if strings.Contains(cfg, "module: mem") { t.Fatalf("config should not include mem module:\n%s", cfg) } if !strings.Contains(cfg, "copy_matrix: false") { t.Fatalf("config should use copy_matrix=false:\n%s", cfg) } if strings.Count(cfg, "duration: 123000") != 1 { t.Fatalf("config should apply duration once:\n%s", cfg) } for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} { if !strings.Contains(cfg, field) { t.Fatalf("config missing %s:\n%s", field, cfg) } } } func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) { t.Parallel() jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf") if len(jobs) != 4 { t.Fatalf("jobs=%d want 4", len(jobs)) } if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" { t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got) } if got := jobs[2].cmd[0]; got != "rvs" { t.Fatalf("jobs[2]=%q want rvs", got) } if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" { t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got) } } func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) { jobs := nvidiaSATJobs() got := jobs[5].cmd want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"} if len(got) != len(want) { t.Fatalf("cmd len=%d want %d", len(got), len(want)) } for i := range want { if got[i] != want[i] { t.Fatalf("cmd[%d]=%q want %q", i, got[i], want[i]) } } } func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) { jobs := nvidiaDCGMJobs(3, []int{2, 0}) if len(jobs) != 5 { t.Fatalf("jobs=%d want 5", len(jobs)) } if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" { t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1") } if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" { t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0") } } func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) { t.Parallel() oldExecCommand := satExecCommand satExecCommand = func(name string, args ...string) *exec.Cmd { if name == "nvidia-smi" { return exec.Command("sh", "-c", "printf '0\n1\n2\n'") } return exec.Command(name, args...) } t.Cleanup(func() { satExecCommand = oldExecCommand }) job, err := buildNvidiaStressJob(NvidiaStressOptions{ DurationSec: 600, Loader: NvidiaStressLoaderJohn, ExcludeGPUIndices: []int{1}, }) if err != nil { t.Fatalf("buildNvidiaStressJob error: %v", err) } wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"} if len(job.cmd) != len(wantCmd) { t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd) } for i := range wantCmd { if job.cmd[i] != wantCmd[i] { t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i]) } } if got := joinIndexList(job.gpuIndices); got != "0,2" { t.Fatalf("gpuIndices=%q want 0,2", got) } } func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) { t.Parallel() oldExecCommand := satExecCommand satExecCommand = func(name string, args ...string) *exec.Cmd { if name == "nvidia-smi" { return exec.Command("sh", "-c", "printf '0\n1\n2\n'") } return exec.Command(name, args...) } t.Cleanup(func() { satExecCommand = oldExecCommand }) job, err := buildNvidiaStressJob(NvidiaStressOptions{ DurationSec: 120, Loader: NvidiaStressLoaderNCCL, GPUIndices: []int{2, 0}, }) if err != nil { t.Fatalf("buildNvidiaStressJob error: %v", err) } wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"} if len(job.cmd) != len(wantCmd) { t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd) } for i := range wantCmd { if job.cmd[i] != wantCmd[i] { t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i]) } } if got := joinIndexList(job.gpuIndices); got != "0,2" { t.Fatalf("gpuIndices=%q want 0,2", got) } } func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) { t.Parallel() oldExecCommand := satExecCommand satExecCommand = func(name string, args ...string) *exec.Cmd { if name == "nvidia-smi" { return exec.Command("sh", "-c", "printf '2\n0\n1\n'") } return exec.Command(name, args...) } t.Cleanup(func() { satExecCommand = oldExecCommand }) got, err := resolveDCGMGPUIndices(nil) if err != nil { t.Fatalf("resolveDCGMGPUIndices error: %v", err) } if want := "0,1,2"; joinIndexList(got) != want { t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want) } } func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) { t.Parallel() got, err := resolveDCGMGPUIndices([]int{3, 1, 3}) if err != nil { t.Fatalf("resolveDCGMGPUIndices error: %v", err) } if want := "1,3"; joinIndexList(got) != want { t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want) } } func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) { t.Parallel() got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n") if len(got) != 2 { t.Fatalf("len=%d want 2", len(got)) } if got[0].NeedsReset { t.Fatalf("gpu0 unexpectedly marked reset-required") } if !got[1].NeedsReset { t.Fatalf("gpu1 should be marked reset-required: %#v", got[1]) } } func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) { oldExecCommand := satExecCommand satExecCommand = func(name string, args ...string) *exec.Cmd { if name == "nvidia-smi" { return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'") } return exec.Command(name, args...) } t.Cleanup(func() { satExecCommand = oldExecCommand }) msg, err := checkNvidiaJobHealth([]int{1}) if err == nil { t.Fatal("expected health check error") } if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") { t.Fatalf("unexpected message: %q", msg) } } func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) { dir := t.TempDir() oldExecCommand := satExecCommand satExecCommand = func(name string, args ...string) *exec.Cmd { if name == "nvidia-smi" { return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'") } return exec.Command(name, args...) } t.Cleanup(func() { satExecCommand = oldExecCommand }) perGPU := map[int]*nvidiaGPUStatusFile{ 0: {Index: 0, RunStatus: "OK"}, 1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"}, } if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil { t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err) } raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt")) if err != nil { t.Fatalf("ReadFile gpu-1-status.txt: %v", err) } text := string(raw) if !strings.Contains(text, "run_status=FAILED") { t.Fatalf("missing run status:\n%s", text) } if !strings.Contains(text, "health_status=RESET_REQUIRED") { t.Fatalf("missing health status:\n%s", text) } if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") { t.Fatalf("missing failing job:\n%s", text) } } func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) { oldLookPath := satLookPath satLookPath = func(file string) (string, error) { switch file { case "dcgmproftester13": return "/usr/bin/dcgmproftester13", nil default: return "", exec.ErrNotFound } } t.Cleanup(func() { satLookPath = oldLookPath }) cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004") if err != nil { t.Fatalf("resolveDCGMProfTesterCommand error: %v", err) } if len(cmd) != 4 { t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd) } if cmd[0] != "/usr/bin/dcgmproftester13" { t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0]) } } func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) { cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1}) want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"} if len(cmd) != len(want) { t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd) } for i := range want { if cmd[i] != want[i] { t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i]) } } } func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) { env := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) if len(env) != 2 { t.Fatalf("env len=%d want 2 (%v)", len(env), env) } if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" { t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0]) } if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" { t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1]) } } func TestNvidiaStressArchivePrefixByLoader(t *testing.T) { t.Parallel() tests := []struct { loader string want string }{ {loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"}, {loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"}, {loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"}, {loader: "", want: "gpu-nvidia-burn"}, } for _, tt := range tests { if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want { t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want) } } } func TestEnvIntFallback(t *testing.T) { os.Unsetenv("BEE_MEMTESTER_SIZE_MB") if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 { t.Fatalf("got %d want 123", got) } t.Setenv("BEE_MEMTESTER_SIZE_MB", "bad") if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 { t.Fatalf("got %d want 123", got) } t.Setenv("BEE_MEMTESTER_SIZE_MB", "256") if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 256 { t.Fatalf("got %d want 256", got) } } func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) { oldFreeMemBytes := satFreeMemBytes satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 } t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) if got := memoryStressSizeArg(); got != "65536M" { t.Fatalf("sizeArg=%q want 65536M", got) } } func TestMemoryStressSizeArgRespectsOverride(t *testing.T) { oldFreeMemBytes := satFreeMemBytes satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 } t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096") if got := memoryStressSizeArg(); got != "4096M" { t.Fatalf("sizeArg=%q want 4096M", got) } } func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) { oldFreeMemBytes := satFreeMemBytes satFreeMemBytes = func() int64 { return 0 } t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) if got := memoryStressSizeArg(); got != "80%" { t.Fatalf("sizeArg=%q want 80%%", got) } } func TestClassifySATResult(t *testing.T) { tests := []struct { name string job string out string err error status string }{ {name: "ok", job: "memtester", out: "done", err: nil, status: "OK"}, {name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"}, {name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"}, {name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"}, {name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got, _ := classifySATResult(tt.job, []byte(tt.out), tt.err) if got != tt.status { t.Fatalf("status=%q want %q", got, tt.status) } }) } } func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) { dir := t.TempDir() ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) done := make(chan struct{}) go func() { time.Sleep(100 * time.Millisecond) cancel() close(done) }() archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{ {name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}}, }, nil) <-done if !errors.Is(err, context.Canceled) { t.Fatalf("err=%v want context.Canceled", err) } if archive != "" { t.Fatalf("archive=%q want empty", archive) } matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz")) if globErr != nil { t.Fatalf("Glob error: %v", globErr) } if len(matches) != 0 { t.Fatalf("archives=%v want none", matches) } } func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) { t.Parallel() raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n" got := parseStorageDevices(raw) want := []string{"/dev/nvme0n1", "/dev/sdb"} if len(got) != len(want) { t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got) } for i := range want { if got[i] != want[i] { t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i]) } } } func TestResolveROCmSMICommandFromPATH(t *testing.T) { t.Setenv("PATH", t.TempDir()) toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi") if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil { t.Fatalf("write rocm-smi: %v", err) } cmd, err := resolveROCmSMICommand("--showproductname") if err != nil { t.Fatalf("resolveROCmSMICommand error: %v", err) } if len(cmd) != 2 { t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd) } if cmd[0] != toolPath { t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath) } } func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) { oldLookPath := satLookPath satLookPath = func(file string) (string, error) { if file == "stress-ng" { return "/usr/bin/stress-ng", nil } return "", exec.ErrNotFound } t.Cleanup(func() { satLookPath = oldLookPath }) cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"}) if err != nil { t.Fatalf("resolveSATCommand error: %v", err) } if len(cmd) != 3 { t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd) } if cmd[0] != "/usr/bin/stress-ng" { t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0]) } } func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) { oldLookPath := satLookPath satLookPath = func(file string) (string, error) { return "", exec.ErrNotFound } t.Cleanup(func() { satLookPath = oldLookPath }) _, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"}) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), "stress-ng not found in PATH") { t.Fatalf("error=%q", err) } } func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) { tmp := t.TempDir() execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi") if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil { t.Fatalf("mkdir: %v", err) } if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil { t.Fatalf("write rocm-smi: %v", err) } oldGlob := rocmSMIExecutableGlobs oldScriptGlobs := rocmSMIScriptGlobs rocmSMIExecutableGlobs = []string{execPath} rocmSMIScriptGlobs = nil t.Cleanup(func() { rocmSMIExecutableGlobs = oldGlob rocmSMIScriptGlobs = oldScriptGlobs }) t.Setenv("PATH", "") cmd, err := resolveROCmSMICommand("--showallinfo") if err != nil { t.Fatalf("resolveROCmSMICommand error: %v", err) } if len(cmd) != 2 { t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd) } if cmd[0] != execPath { t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath) } } func TestRunROCmSMIReportsMissingCommand(t *testing.T) { oldLookPath := satLookPath oldExecGlobs := rocmSMIExecutableGlobs oldScriptGlobs := rocmSMIScriptGlobs satLookPath = func(string) (string, error) { return "", exec.ErrNotFound } rocmSMIExecutableGlobs = nil rocmSMIScriptGlobs = nil t.Cleanup(func() { satLookPath = oldLookPath rocmSMIExecutableGlobs = oldExecGlobs rocmSMIScriptGlobs = oldScriptGlobs }) if _, err := runROCmSMI("--showproductname"); err == nil { t.Fatal("expected missing rocm-smi error") } }