refactor(iso): replace chroot hooks for DCGM/ROCm with live-build apt sources

Move datacenter-gpu-manager and rocm-smi-lib from dynamic chroot hooks
into live-build's config/archives mechanism so lb caches the .deb files
in cache/packages.chroot/ between builds, eliminating repeated 900+ MB
downloads. Versions pinned via VERSIONS and substituted into package
lists at build time.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-28 13:01:10 +03:00
parent acfd2010d7
commit 911745e4da
11 changed files with 68 additions and 178 deletions

View File

@@ -8,5 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
NVCC_VERSION=12.8
CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1
DCGM_VERSION=3.3.9
ROCM_VERSION=6.3.4
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
GO_VERSION=1.24.0
AUDIT_VERSION=1.0.0

View File

@@ -16,11 +16,13 @@ NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"
NVCC_VERSION="${5:-}"
DEBIAN_VERSION="${6:-12}"
[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"; exit 1; }
[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="
@@ -34,15 +36,16 @@ if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
exit 0
fi
# Resolve nvcc path (cuda-nvcc-12-8 installs to /usr/local/cuda-12.8/bin/nvcc)
# Resolve nvcc path (cuda-nvcc-X-Y installs to /usr/local/cuda-X.Y/bin/nvcc)
NVCC_VERSION_PATH="$(echo "${NVCC_VERSION}" | tr '.' '.')"
NVCC=""
for candidate in nvcc /usr/local/cuda-12.8/bin/nvcc /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do
for candidate in nvcc "/usr/local/cuda-${NVCC_VERSION_PATH}/bin/nvcc" /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do
if command -v "$candidate" >/dev/null 2>&1 || [ -x "$candidate" ]; then
NVCC="$candidate"
break
fi
done
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0"; exit 1; }
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-$(echo "${NVCC_VERSION}" | tr '.' '-')"; exit 1; }
echo "nvcc: $NVCC"
# Determine CUDA_HOME from nvcc location
@@ -50,7 +53,7 @@ CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"
# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"

View File

@@ -337,7 +337,9 @@ sh "${BUILDER_DIR}/build-nccl-tests.sh" \
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
"${DIST_DIR}" \
"${NVCC_VERSION}" \
"${DEBIAN_VERSION}"
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
@@ -371,6 +373,14 @@ if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
fi
# --- substitute version placeholders in package list ---
sed -i \
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
# --- sync overlay into live-build includes.chroot ---
LB_DIR="${BUILD_WORK_DIR}"
LB_INCLUDES="${LB_DIR}/config/includes.chroot"

View File

@@ -0,0 +1,29 @@
-----BEGIN PGP PUBLIC KEY BLOCK-----
Version: GnuPG v2.0.22 (GNU/Linux)
mQINBGJYmlEBEAC6nJmeqByeReM+MSy4palACCnfOg4pOxffrrkldxz4jrDOZNK4
q8KG+ZbXrkdP0e9qTFRvZzN+A6Jw3ySfoiKXRBw5l2Zp81AYkghV641OpWNjZOyL
syKEtST9LR1ttHv1ZI71pj8NVG/EnpimZPOblEJ1OpibJJCXLrbn+qcJ8JNuGTSK
6v2aLBmhR8VR/aSJpmkg7fFjcGklweTI8+Ibj72HuY9JRD/+dtUoSh7z037mWo56
ee02lPFRD0pHOEAlLSXxFO/SDqRVMhcgHk0a8roCF+9h5Ni7ZUyxlGK/uHkqN7ED
/U/ATpGKgvk4t23eTpdRC8FXAlBZQyf/xnhQXsyF/z7+RV5CL0o1zk1LKgo+5K32
5ka5uZb6JSIrEPUaCPEMXu6EEY8zSFnCrRS/Vjkfvc9ViYZWzJ387WTjAhMdS7wd
PmdDWw2ASGUP4FrfCireSZiFX+ZAOspKpZdh0P5iR5XSx14XDt3jNK2EQQboaJAD
uqksItatOEYNu4JsCbc24roJvJtGhpjTnq1/dyoy6K433afU0DS2ZPLthLpGqeyK
MKNY7a2WjxhRmCSu5Zok/fGKcO62XF8a3eSj4NzCRv8LM6mG1Oekz6Zz+tdxHg19
ufHO0et7AKE5q+5VjE438Xpl4UWbM/Voj6VPJ9uzywDcnZXpeOqeTQh2pQARAQAB
tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCOQQTAQIAIwUCYlia
UQIbAwcLCQgHAwIBBhUIAgkKCwQWAgMBAh4BAheAAAoJEKS0aZY7+GPM1y4QALKh
BqSozrYbe341Qu7SyxHQgjRCGi4YhI3bHCMj5F6vEOHnwiFH6YmFkxCYtqcGjca6
iw7cCYMow/hgKLAPwkwSJ84EYpGLWx62+20rMM4OuZwauSUcY/kE2WgnQ74zbh3+
MHs56zntJFfJ9G+NYidvwDWeZn5HIzR4CtxaxRgpiykg0s3ps6X0U+vuVcLnutBF
7r81astvlVQERFbce/6KqHK+yj843Qrhb3JEolUoOETK06nD25bVtnAxe0QEyA90
9MpRNLfR6BdjPpxqhphDcMOhJfyubAroQUxG/7S+Yw+mtEqHrL/dz9iEYqodYiSo
zfi0b+HFI59sRkTfOBDBwb3kcARExwnvLJmqijiVqWkoJ3H67oA0XJN2nelucw+A
Hb+Jt9BWjyzKWlLFDnVHdGicyRJ0I8yqi32w8hGeXmu3tU58VWJrkXEXadBftmci
pemb6oZ/r5SCkW6kxr2PsNWcJoebUdynyOQGbVwpMtJAnjOYp0ObKOANbcIg+tsi
kyCIO5TiY3ADbBDPCeZK8xdcugXoW5WFwACGC0z+Cn0mtw8z3VGIPAMSCYmLusgW
t2+EpikwrP2inNp5Pc+YdczRAsa4s30Jpyv/UHEG5P9GKnvofaxJgnU56lJIRPzF
iCUGy6cVI0Fq777X/ME1K6A/bzZ4vRYNx8rUmVE5
=DO7z
-----END PGP PUBLIC KEY BLOCK-----

View File

@@ -0,0 +1 @@
deb https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /

Binary file not shown.

View File

@@ -0,0 +1 @@
deb https://repo.radeon.com/rocm/apt/%%ROCM_VERSION%% jammy main

View File

@@ -46,6 +46,12 @@ chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
# Reload udev rules
udevadm control --reload-rules 2>/dev/null || true
# rocm-smi symlink (package installs to /opt/rocm-*/bin/rocm-smi)
if [ ! -e /usr/local/bin/rocm-smi ]; then
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
[ -n "${smi_path}" ] && ln -sf "${smi_path}" /usr/local/bin/rocm-smi
fi
# Create export directory
mkdir -p /appdata/bee/export

View File

@@ -1,103 +0,0 @@
#!/bin/sh
# 9001-amd-rocm.hook.chroot — install AMD ROCm SMI tool for Instinct GPU monitoring.
# Runs inside the live-build chroot. Adds AMD's apt repository and installs
# rocm-smi-lib which provides the `rocm-smi` CLI (analogous to nvidia-smi).
#
# AMD does NOT publish Debian Bookworm packages. The repo uses Ubuntu codenames
# (jammy/noble). We use jammy (Ubuntu 22.04) — its packages install cleanly on
# Debian 12 (Bookworm) due to compatible glibc/libstdc++.
# Tried versions newest-first; falls back if a point release is missing.
set -e
# Ubuntu codename to use for the AMD repo (Debian has no AMD packages).
ROCM_UBUNTU_DIST="jammy"
# ROCm point-releases to try newest-first. AMD drops old point releases
# from the repo, so we walk backwards until one responds 200.
ROCM_CANDIDATES="6.3.4 6.3.3 6.3.2 6.3.1 6.3 6.2.4 6.2.3 6.2.2 6.2.1 6.2"
ROCM_KEYRING="/etc/apt/keyrings/rocm.gpg"
ROCM_LIST="/etc/apt/sources.list.d/rocm.list"
APT_UPDATED=0
mkdir -p /etc/apt/keyrings
ensure_tool() {
tool="$1"
pkg="$2"
if command -v "${tool}" >/dev/null 2>&1; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
}
ensure_cert_bundle() {
if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
}
# live-build chroot may not include fetch/signing tools yet
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
echo "WARN: failed to install wget/gpg/ca-certificates prerequisites — skipping ROCm install"
exit 0
fi
# Download and import AMD GPG key
if ! wget -qO- "https://repo.radeon.com/rocm/rocm.gpg.key" \
| gpg --dearmor --yes --output "${ROCM_KEYRING}"; then
echo "WARN: failed to fetch AMD ROCm GPG key — skipping ROCm install"
exit 0
fi
# Try each ROCm version until apt-get update succeeds.
# AMD repo uses Ubuntu codenames; bookworm is not published — use jammy.
ROCM_VERSION=""
for candidate in ${ROCM_CANDIDATES}; do
cat > "${ROCM_LIST}" <<EOF
deb [arch=amd64 signed-by=${ROCM_KEYRING}] https://repo.radeon.com/rocm/apt/${candidate} ${ROCM_UBUNTU_DIST} main
EOF
if apt-get update -qq 2>/dev/null; then
ROCM_VERSION="${candidate}"
echo "=== AMD ROCm ${ROCM_VERSION} (${ROCM_UBUNTU_DIST}): repository available ==="
break
fi
echo "WARN: ROCm ${candidate} not available, trying next..."
rm -f "${ROCM_LIST}"
done
if [ -z "${ROCM_VERSION}" ]; then
echo "WARN: no ROCm apt repository available — skipping ROCm install"
rm -f "${ROCM_KEYRING}"
exit 0
fi
# rocm-smi-lib provides the rocm-smi CLI tool for GPU monitoring
if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib; then
echo "=== AMD ROCm: rocm-smi-lib installed ==="
if [ -x /opt/rocm/bin/rocm-smi ]; then
ln -sf /opt/rocm/bin/rocm-smi /usr/local/bin/rocm-smi
else
smi_path="$(find /opt -path '*/bin/rocm-smi' -type f 2>/dev/null | sort | tail -1)"
if [ -n "${smi_path}" ]; then
ln -sf "${smi_path}" /usr/local/bin/rocm-smi
fi
fi
rocm-smi --version 2>/dev/null || true
else
echo "WARN: rocm-smi-lib install failed — AMD GPU monitoring unavailable"
fi
# Clean up apt lists to keep ISO size down
rm -f "${ROCM_LIST}"
apt-get clean

View File

@@ -1,66 +0,0 @@
#!/bin/sh
# 9002-nvidia-dcgm.hook.chroot — install NVIDIA DCGM inside the live-build chroot.
# DCGM (Data Center GPU Manager) provides dcgmi diag for acceptance testing.
# Adds NVIDIA's CUDA apt repository (debian12/x86_64) and installs datacenter-gpu-manager.
set -e
NVIDIA_KEYRING="/usr/share/keyrings/nvidia-cuda.gpg"
NVIDIA_LIST="/etc/apt/sources.list.d/nvidia-cuda.list"
NVIDIA_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub"
NVIDIA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/"
APT_UPDATED=0
mkdir -p /usr/share/keyrings /etc/apt/sources.list.d
ensure_tool() {
tool="$1"
pkg="$2"
if command -v "${tool}" >/dev/null 2>&1; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
}
ensure_cert_bundle() {
if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
return 0
fi
if [ "${APT_UPDATED}" -eq 0 ]; then
apt-get update -qq
APT_UPDATED=1
fi
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
}
if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
echo "WARN: prerequisites missing — skipping DCGM install"
exit 0
fi
# Download and import NVIDIA GPG key
if ! wget -qO- "${NVIDIA_KEY_URL}" | gpg --dearmor --yes --output "${NVIDIA_KEYRING}"; then
echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install"
exit 0
fi
cat > "${NVIDIA_LIST}" <<EOF
deb [signed-by=${NVIDIA_KEYRING}] ${NVIDIA_REPO} /
EOF
apt-get update -qq
if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends datacenter-gpu-manager; then
echo "=== DCGM: datacenter-gpu-manager installed ==="
dcgmi --version 2>/dev/null || true
else
echo "WARN: datacenter-gpu-manager install failed — DCGM unavailable"
fi
# Clean up apt lists to keep ISO size down
rm -f "${NVIDIA_LIST}"
apt-get clean

View File

@@ -70,5 +70,11 @@ firmware-bnx2x
firmware-cavium
firmware-qlogic
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
datacenter-gpu-manager=1:%%DCGM_VERSION%%
# AMD ROCm SMI — GPU monitoring for Instinct cards (repo: rocm/apt/6.3.4 jammy)
rocm-smi-lib=%%ROCM_SMI_VERSION%%
# glibc compat helpers (for any external binaries that need it)
libc6