fix(amd-stress): include VRAM load in GST burn
This commit is contained in:
@@ -146,28 +146,36 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
|||||||
if err := ensureAMDRuntimeReady(); err != nil {
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
// Write RVS GST config to a temp file
|
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||||
rvsCfg := fmt.Sprintf(`actions:
|
rvsCfg := amdStressRVSConfig(seconds)
|
||||||
|
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// amdStressRVSConfig renders the RVS configuration text used for the AMD GPU
// stress run.
//
// The config declares a single action named gst_stress that uses the gst
// module on all devices with parallel set to true. copy_matrix is set to true
// so that the same GST run drives VRAM traffic in addition to compute.
// seconds is multiplied by 1000 before it is written into the duration field;
// the accompanying test refers to this as a millisecond duration.
func amdStressRVSConfig(seconds int) string {
|
||||||
|
return fmt.Sprintf(`actions:
|
||||||
- name: gst_stress
|
- name: gst_stress
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: %d
|
duration: %d
|
||||||
copy_matrix: false
|
copy_matrix: true
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
matrix_size_b: 8640
|
matrix_size_b: 8640
|
||||||
matrix_size_c: 8640
|
matrix_size_c: 8640
|
||||||
`, seconds*1000)
|
`, seconds*1000)
|
||||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
}
|
||||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
|
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||||
|
return []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||||
}, logFunc)
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -38,6 +39,36 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAMDStressConfigEnablesVRAMTraffic(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cfg := amdStressRVSConfig(123)
|
||||||
|
if !strings.Contains(cfg, "copy_matrix: true") {
|
||||||
|
t.Fatalf("config missing VRAM copy path:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(cfg, "duration: 123000") {
|
||||||
|
t.Fatalf("config missing millisecond duration:\n%s", cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||||
|
if len(jobs) != 4 {
|
||||||
|
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||||
|
}
|
||||||
|
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||||
|
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||||
|
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||||
|
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||||
|
|||||||
@@ -598,7 +598,7 @@ func renderBurn() string {
|
|||||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||||
|
|||||||
Reference in New Issue
Block a user