feat: AMD GPU compute stress via rocm-validation-suite GST (GEMM)
- Add rocm-validation-suite, rocblas, rocrand, hip-runtime-amd, hipblaslt, comgr to ISO (~700MB, needed for HIP compute)
- RunAMDStressPack: run RVS GST (SGEMM ~31 TFLOPS/GPU) + bandwidth test
- Add rvs symlink in chroot setup hook
- Pin all new package versions in VERSIONS

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -142,13 +142,27 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Write RVS GST config to a temp file
|
||||
rvsCfg := fmt.Sprintf(`actions:
|
||||
- name: gst_stress
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: %d
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`, seconds*1000)
|
||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
|
||||
"bash", "-lc",
|
||||
fmt.Sprintf("end=$((SECONDS+%d)); while [ \"$SECONDS\" -lt \"$end\" ]; do rocm-smi --showtemp --showpower --csv; sleep 1; done", seconds),
|
||||
}},
|
||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user