diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS index 6c46761..1248ebf 100644 --- a/iso/builder/VERSIONS +++ b/iso/builder/VERSIONS @@ -8,5 +8,8 @@ NCCL_TESTS_VERSION=2.13.10 NVCC_VERSION=12.8 CUBLAS_VERSION=13.0.2.14-1 CUDA_USERSPACE_VERSION=13.0.96-1 +DCGM_VERSION=3.3.9 +ROCM_VERSION=6.3.4 +ROCM_SMI_VERSION=7.4.0.60304-76~22.04 GO_VERSION=1.24.0 AUDIT_VERSION=1.0.0 diff --git a/iso/builder/build-nccl-tests.sh b/iso/builder/build-nccl-tests.sh index 7014a02..4bebe05 100755 --- a/iso/builder/build-nccl-tests.sh +++ b/iso/builder/build-nccl-tests.sh @@ -16,11 +16,13 @@ NCCL_TESTS_VERSION="$1" NCCL_VERSION="$2" NCCL_CUDA_VERSION="$3" DIST_DIR="$4" +NVCC_VERSION="${5:-}" +DEBIAN_VERSION="${6:-12}" -[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 "; exit 1; } -[ -n "$NCCL_VERSION" ] || { echo "usage: $0 "; exit 1; } -[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 "; exit 1; } -[ -n "$DIST_DIR" ] || { echo "usage: $0 "; exit 1; } +[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 [nvcc-version] [debian-version]"; exit 1; } +[ -n "$NCCL_VERSION" ] || { echo "usage: $0 [nvcc-version] [debian-version]"; exit 1; } +[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 [nvcc-version] [debian-version]"; exit 1; } +[ -n "$DIST_DIR" ] || { echo "usage: $0 [nvcc-version] [debian-version]"; exit 1; } echo "=== nccl-tests ${NCCL_TESTS_VERSION} ===" @@ -34,15 +36,16 @@ if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then exit 0 fi -# Resolve nvcc path (cuda-nvcc-12-8 installs to /usr/local/cuda-12.8/bin/nvcc) +# Resolve nvcc path: package cuda-nvcc-X-Y installs to /usr/local/cuda-X.Y/bin/nvcc (the directory keeps the dotted version, so it is used verbatim) +NVCC_VERSION_PATH="${NVCC_VERSION}"
NVCC="" -for candidate in nvcc /usr/local/cuda-12.8/bin/nvcc /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do +for candidate in nvcc "/usr/local/cuda-${NVCC_VERSION_PATH}/bin/nvcc" /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do if command -v "$candidate" >/dev/null 2>&1 || [ -x "$candidate" ]; then NVCC="$candidate" break fi done -[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0"; exit 1; } +[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-$(echo "${NVCC_VERSION:-X.Y}" | tr '.' '-')"; exit 1; } echo "nvcc: $NVCC" # Determine CUDA_HOME from nvcc location @@ -50,7 +53,7 @@ CUDA_HOME="$(dirname "$(dirname "$NVCC")")" echo "CUDA_HOME: $CUDA_HOME" # Download libnccl-dev for nccl.h -REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64" +REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64" DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb" DEV_URL="${REPO_BASE}/${DEV_PKG}" diff --git a/iso/builder/build.sh b/iso/builder/build.sh index 354125e..ee5334e 100755 --- a/iso/builder/build.sh +++ b/iso/builder/build.sh @@ -337,7 +337,9 @@ sh "${BUILDER_DIR}/build-nccl-tests.sh" \ "${NCCL_TESTS_VERSION}" \ "${NCCL_VERSION}" \ "${NCCL_CUDA_VERSION}" \ - "${DIST_DIR}" + "${DIST_DIR}" \ + "${NVCC_VERSION}" \ + "${DEBIAN_VERSION}" NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}" cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf" @@ -371,6 +373,14 @@ if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd" fi +# --- substitute version placeholders in package list and ROCm apt source --- +sed -i \ + -e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \ + -e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \ + -e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \ + "${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \ +
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" + # --- sync overlay into live-build includes.chroot --- LB_DIR="${BUILD_WORK_DIR}" LB_INCLUDES="${LB_DIR}/config/includes.chroot" diff --git a/iso/builder/config/archives/nvidia-cuda.key.chroot b/iso/builder/config/archives/nvidia-cuda.key.chroot new file mode 100644 index 0000000..8aabe48 --- /dev/null +++ b/iso/builder/config/archives/nvidia-cuda.key.chroot @@ -0,0 +1,29 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.22 (GNU/Linux) + +mQINBGJYmlEBEAC6nJmeqByeReM+MSy4palACCnfOg4pOxffrrkldxz4jrDOZNK4 +q8KG+ZbXrkdP0e9qTFRvZzN+A6Jw3ySfoiKXRBw5l2Zp81AYkghV641OpWNjZOyL +syKEtST9LR1ttHv1ZI71pj8NVG/EnpimZPOblEJ1OpibJJCXLrbn+qcJ8JNuGTSK +6v2aLBmhR8VR/aSJpmkg7fFjcGklweTI8+Ibj72HuY9JRD/+dtUoSh7z037mWo56 +ee02lPFRD0pHOEAlLSXxFO/SDqRVMhcgHk0a8roCF+9h5Ni7ZUyxlGK/uHkqN7ED +/U/ATpGKgvk4t23eTpdRC8FXAlBZQyf/xnhQXsyF/z7+RV5CL0o1zk1LKgo+5K32 +5ka5uZb6JSIrEPUaCPEMXu6EEY8zSFnCrRS/Vjkfvc9ViYZWzJ387WTjAhMdS7wd +PmdDWw2ASGUP4FrfCireSZiFX+ZAOspKpZdh0P5iR5XSx14XDt3jNK2EQQboaJAD +uqksItatOEYNu4JsCbc24roJvJtGhpjTnq1/dyoy6K433afU0DS2ZPLthLpGqeyK +MKNY7a2WjxhRmCSu5Zok/fGKcO62XF8a3eSj4NzCRv8LM6mG1Oekz6Zz+tdxHg19 +ufHO0et7AKE5q+5VjE438Xpl4UWbM/Voj6VPJ9uzywDcnZXpeOqeTQh2pQARAQAB +tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCOQQTAQIAIwUCYlia +UQIbAwcLCQgHAwIBBhUIAgkKCwQWAgMBAh4BAheAAAoJEKS0aZY7+GPM1y4QALKh +BqSozrYbe341Qu7SyxHQgjRCGi4YhI3bHCMj5F6vEOHnwiFH6YmFkxCYtqcGjca6 +iw7cCYMow/hgKLAPwkwSJ84EYpGLWx62+20rMM4OuZwauSUcY/kE2WgnQ74zbh3+ +MHs56zntJFfJ9G+NYidvwDWeZn5HIzR4CtxaxRgpiykg0s3ps6X0U+vuVcLnutBF +7r81astvlVQERFbce/6KqHK+yj843Qrhb3JEolUoOETK06nD25bVtnAxe0QEyA90 +9MpRNLfR6BdjPpxqhphDcMOhJfyubAroQUxG/7S+Yw+mtEqHrL/dz9iEYqodYiSo +zfi0b+HFI59sRkTfOBDBwb3kcARExwnvLJmqijiVqWkoJ3H67oA0XJN2nelucw+A +Hb+Jt9BWjyzKWlLFDnVHdGicyRJ0I8yqi32w8hGeXmu3tU58VWJrkXEXadBftmci +pemb6oZ/r5SCkW6kxr2PsNWcJoebUdynyOQGbVwpMtJAnjOYp0ObKOANbcIg+tsi +kyCIO5TiY3ADbBDPCeZK8xdcugXoW5WFwACGC0z+Cn0mtw8z3VGIPAMSCYmLusgW 
+t2+EpikwrP2inNp5Pc+YdczRAsa4s30Jpyv/UHEG5P9GKnvofaxJgnU56lJIRPzF +iCUGy6cVI0Fq777X/ME1K6A/bzZ4vRYNx8rUmVE5 +=DO7z +-----END PGP PUBLIC KEY BLOCK----- diff --git a/iso/builder/config/archives/nvidia-cuda.list.chroot b/iso/builder/config/archives/nvidia-cuda.list.chroot new file mode 100644 index 0000000..a23bac8 --- /dev/null +++ b/iso/builder/config/archives/nvidia-cuda.list.chroot @@ -0,0 +1 @@ +deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ / diff --git a/iso/builder/config/archives/rocm.key.chroot b/iso/builder/config/archives/rocm.key.chroot new file mode 100644 index 0000000..122e477 Binary files /dev/null and b/iso/builder/config/archives/rocm.key.chroot differ diff --git a/iso/builder/config/archives/rocm.list.chroot b/iso/builder/config/archives/rocm.list.chroot new file mode 100644 index 0000000..78f19e8 --- /dev/null +++ b/iso/builder/config/archives/rocm.list.chroot @@ -0,0 +1 @@ +deb https://repo.radeon.com/rocm/apt/%%ROCM_VERSION%% jammy main diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index 594120a..aefd2c8 100755 --- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -46,6 +46,12 @@ chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true # Reload udev rules udevadm control --reload-rules 2>/dev/null || true +# rocm-smi symlink (package installs to /opt/rocm-*/bin/rocm-smi; that entry may itself be a symlink, so match both regular files and links) +if [ !
-e /usr/local/bin/rocm-smi ]; then + smi_path="$(find /opt -path '*/bin/rocm-smi' \( -type f -o -type l \) 2>/dev/null | sort | tail -1)" + [ -n "${smi_path}" ] && ln -sf "${smi_path}" /usr/local/bin/rocm-smi +fi + # Create export directory mkdir -p /appdata/bee/export diff --git a/iso/builder/config/hooks/normal/9001-amd-rocm.hook.chroot b/iso/builder/config/hooks/normal/9001-amd-rocm.hook.chroot deleted file mode 100755 index de38de1..0000000 --- a/iso/builder/config/hooks/normal/9001-amd-rocm.hook.chroot +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/sh -# 9001-amd-rocm.hook.chroot — install AMD ROCm SMI tool for Instinct GPU monitoring. -# Runs inside the live-build chroot. Adds AMD's apt repository and installs -# rocm-smi-lib which provides the `rocm-smi` CLI (analogous to nvidia-smi). -# -# AMD does NOT publish Debian Bookworm packages. The repo uses Ubuntu codenames -# (jammy/noble). We use jammy (Ubuntu 22.04) — its packages install cleanly on -# Debian 12 (Bookworm) due to compatible glibc/libstdc++. -# Tried versions newest-first; falls back if a point release is missing. - -set -e - -# Ubuntu codename to use for the AMD repo (Debian has no AMD packages). -ROCM_UBUNTU_DIST="jammy" - -# ROCm point-releases to try newest-first. AMD drops old point releases -# from the repo, so we walk backwards until one responds 200.
-ROCM_CANDIDATES="6.3.4 6.3.3 6.3.2 6.3.1 6.3 6.2.4 6.2.3 6.2.2 6.2.1 6.2" - -ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg" -ROCM_LIST="/etc/apt/sources.list.d/rocm.list" -APT_UPDATED=0 - -mkdir -p /etc/apt/keyrings - -ensure_tool() { - tool="$1" - pkg="$2" - if command -v "${tool}" >/dev/null 2>&1; then - return 0 - fi - if [ "${APT_UPDATED}" -eq 0 ]; then - apt-get update -qq - APT_UPDATED=1 - fi - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}" -} - -ensure_cert_bundle() { - if [ -s /etc/ssl/certs/ca-certificates.crt ]; then - return 0 - fi - if [ "${APT_UPDATED}" -eq 0 ]; then - apt-get update -qq - APT_UPDATED=1 - fi - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates -} - -# live-build chroot may not include fetch/signing tools yet -if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then - echo "WARN: failed to install wget/gpg/ca-certificates prerequisites — skipping ROCm install" - exit 0 -fi - -# Download and import AMD GPG key -if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \ - | gpg --dearmor --yes --output "${ROCM_KEYRING}"; then - echo "WARN: failed to fetch AMD ROCm GPG key — skipping ROCm install" - exit 0 -fi - -# Try each ROCm version until apt-get update succeeds. -# AMD repo uses Ubuntu codenames; bookworm is not published — use jammy. -ROCM_VERSION="" -for candidate in ${ROCM_CANDIDATES}; do - cat > "${ROCM_LIST}" </dev/null; then - ROCM_VERSION="${candidate}" - echo "=== AMD ROCm ${ROCM_VERSION} (${ROCM_UBUNTU_DIST}): repository available ===" - break - fi - echo "WARN: ROCm ${candidate} not available, trying next..." 
- rm -f "${ROCM_LIST}" -done - -if [ -z "${ROCM_VERSION}" ]; then - echo "WARN: no ROCm apt repository available — skipping ROCm install" - rm -f "${ROCM_KEYRING}" - exit 0 -fi - -# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring -if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib; then - echo "=== AMD ROCm: rocm-smi-lib installed ===" - if [ -x /opt/rocm/bin/rocm-smi ]; then - ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi - else - smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)" - if [ -n "${smi_path}" ]; then - ln -sf "${smi_path}" /usr/local/bin/rocm-smi - fi - fi - rocm-smi --version 2>/dev/null || true -else - echo "WARN: rocm-smi-lib install failed — AMD GPU monitoring unavailable" -fi - -# Clean up apt lists to keep ISO size down -rm -f "${ROCM_LIST}" -apt-get clean diff --git a/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot b/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot deleted file mode 100755 index 4ef15af..0000000 --- a/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/sh -# 9002-nvidia-dcgm.hook.chroot — install NVIDIA DCGM inside the live-build chroot. -# DCGM (Data Center GPU Manager) provides dcgmi diag for acceptance testing. -# Adds NVIDIA's CUDA apt repository (debian12/x86_64) and installs datacenter-gpu-manager. 
- -set -e - -NVIDIA_KEYRING="/usr/share/keyrings/nvidia-cuda.gpg" -NVIDIA_LIST="/etc/apt/sources.list.d/nvidia-cuda.list" -NVIDIA_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub" -NVIDIA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/" -APT_UPDATED=0 - -mkdir -p /usr/share/keyrings /etc/apt/sources.list.d - -ensure_tool() { - tool="$1" - pkg="$2" - if command -v "${tool}" >/dev/null 2>&1; then - return 0 - fi - if [ "${APT_UPDATED}" -eq 0 ]; then - apt-get update -qq - APT_UPDATED=1 - fi - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}" -} - -ensure_cert_bundle() { - if [ -s /etc/ssl/certs/ca-certificates.crt ]; then - return 0 - fi - if [ "${APT_UPDATED}" -eq 0 ]; then - apt-get update -qq - APT_UPDATED=1 - fi - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates -} - -if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then - echo "WARN: prerequisites missing — skipping DCGM install" - exit 0 -fi - -# Download and import NVIDIA GPG key -if ! 
wget -qO- "${NVIDIA_KEY_URL}" | gpg --dearmor --yes --output "${NVIDIA_KEYRING}"; then - echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install" - exit 0 -fi - -cat > "${NVIDIA_LIST}" </dev/null || true -else - echo "WARN: datacenter-gpu-manager install failed — DCGM unavailable" -fi - -# Clean up apt lists to keep ISO size down -rm -f "${NVIDIA_LIST}" -apt-get clean diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index f201446..f178eac 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -70,5 +70,11 @@ firmware-bnx2x firmware-cavium firmware-qlogic +# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing +datacenter-gpu-manager=1:%%DCGM_VERSION%% + +# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/%%ROCM_VERSION%% jammy) +rocm-smi-lib=%%ROCM_SMI_VERSION%% + # glibc compat helpers (for any external binaries that need it) libc6