feat: AMD GPU compute stress via rocm-validation-suite GST (GEMM)

- Add rocm-validation-suite, rocblas, rocrand, hip-runtime-amd,
  hipblaslt, comgr to ISO (~700MB, needed for HIP compute)
- RunAMDStressPack: run RVS GST (SGEMM ~31 TFLOPS/GPU) + bandwidth test
- Add rvs symlink in chroot setup hook
- Pin all new package versions in VERSIONS

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-29 10:56:32 +03:00
parent e69e9109da
commit a03312c286
4 changed files with 32 additions and 7 deletions

View File

@@ -47,7 +47,7 @@ chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
udevadm control --reload-rules 2>/dev/null || true
# rocm symlinks (packages install to /opt/rocm-*/bin/)
for tool in rocm-smi rocm-bandwidth-test; do
for tool in rocm-smi rocm-bandwidth-test rvs; do
if [ ! -e /usr/local/bin/${tool} ]; then
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}

View File

@@ -75,10 +75,15 @@ firmware-qlogic
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
datacenter-gpu-manager=1:%%DCGM_VERSION%%
# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/6.3.4 jammy)
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%%
# AMD ROCm bandwidth test — used for GPU burn-in stress
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
rocblas=%%ROCBLAS_VERSION%%
rocrand=%%ROCRAND_VERSION%%
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
hipblaslt=%%HIPBLASLT_VERSION%%
comgr=%%COMGR_VERSION%%
# glibc compat helpers (for any external binaries that need it)
libc6