diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 76183e1..b7f3d2d 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -146,28 +146,43 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
 	if err := ensureAMDRuntimeReady(); err != nil {
 		return "", err
 	}
-	// Write RVS GST config to a temp file
-	rvsCfg := fmt.Sprintf(`actions:
+	// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
+	rvsCfg := amdStressRVSConfig(seconds)
+	cfgFile := "/tmp/bee-amd-gst.conf"
+	// Fail fast: if the write fails, rvs would be pointed at a missing or stale config.
+	if err := os.WriteFile(cfgFile, []byte(rvsCfg), 0644); err != nil {
+		return "", fmt.Errorf("write RVS config %q: %w", cfgFile, err)
+	}
+
+	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
+}
+
+// amdStressRVSConfig renders the RVS GST action config for a run of the given
+// length. The duration field is written as seconds*1000 and copy_matrix is enabled.
+func amdStressRVSConfig(seconds int) string {
+	return fmt.Sprintf(`actions:
 - name: gst_stress
   device: all
   module: gst
   parallel: true
   duration: %d
-  copy_matrix: false
+  copy_matrix: true
   target_stress: 90
   matrix_size_a: 8640
   matrix_size_b: 8640
   matrix_size_c: 8640
 `, seconds*1000)
-	cfgFile := "/tmp/bee-amd-gst.conf"
-	_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
+}
 
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
+// amdStressJobs returns the ordered commands of the AMD stress pack: rocm-smi,
+// rocm-bandwidth-test, the rvs run driven by cfgFile, and a closing rocm-smi report.
+func amdStressJobs(seconds int, cfgFile string) []satJob {
+	return []satJob{
 		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
 		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
 		{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
-		{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
-	}, logFunc)
+		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
+	}
 }
 
 // ListNvidiaGPUs returns GPUs visible to nvidia-smi.
diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go
index be99f39..fa3fed3 100644
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"testing"
 )
 
@@ -38,6 +39,41 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
 	}
 }
 
+// TestAMDStressConfigEnablesVRAMTraffic checks the rendered RVS config text.
+func TestAMDStressConfigEnablesVRAMTraffic(t *testing.T) {
+	t.Parallel()
+
+	cfg := amdStressRVSConfig(123)
+	if !strings.Contains(cfg, "copy_matrix: true") {
+		t.Fatalf("config missing VRAM copy path:\n%s", cfg)
+	}
+	if !strings.Contains(cfg, "duration: 123000") {
+		t.Fatalf("config missing millisecond duration:\n%s", cfg)
+	}
+}
+
+// TestAMDStressJobsIncludeBandwidthAndGST checks the job order and the rvs config argument.
+func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
+	t.Parallel()
+
+	jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
+	if len(jobs) != 4 {
+		t.Fatalf("jobs=%d want 4", len(jobs))
+	}
+	if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
+		t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
+	}
+	if got := jobs[2].cmd[0]; got != "rvs" {
+		t.Fatalf("jobs[2]=%q want rvs", got)
+	}
+	if len(jobs[2].cmd) < 3 {
+		t.Fatalf("jobs[2].cmd=%v want rvs -c <cfg>", jobs[2].cmd)
+	}
+	if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
+		t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
+	}
+}
+
 func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
 	t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
 	t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go
index c60a9cc..05c41f2 100644
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -598,7 +598,7 @@ func renderBurn() string {
AMD GPU Stress
-

Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.

+

Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate rocm-bandwidth-test snapshot. Missing tools reported as UNSUPPORTED.

Memory Stress