diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS
index 870f25a..0ed59b7 100644
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -1,5 +1,7 @@
 DEBIAN_VERSION=12
 DEBIAN_KERNEL_ABI=6.1.0-43
 NVIDIA_DRIVER_VERSION=590.48.01
+NCCL_VERSION=2.26.2-1
+NCCL_CUDA_VERSION=12.8
 GO_VERSION=1.24.0
 AUDIT_VERSION=1.0.0
diff --git a/iso/builder/build-nccl.sh b/iso/builder/build-nccl.sh
new file mode 100755
index 0000000..61751ce
--- /dev/null
+++ b/iso/builder/build-nccl.sh
@@ -0,0 +1,144 @@
+#!/bin/sh
+# build-nccl.sh — download and extract NCCL shared library for the LiveCD.
+#
+# Usage: build-nccl.sh <nccl-version> <nccl-cuda-version> <dist-dir>
+#
+# Downloads libnccl2 .deb from NVIDIA's CUDA apt repository (Debian 12, x86_64)
+# and extracts the shared library. Transport security via HTTPS. Package
+# integrity is checked only when NCCL_SHA256 is set (expected sha256 of the
+# .deb, e.g. taken from NVIDIA's Packages index); otherwise the download is
+# trusted as-is.
+#
+# Output is cached in DIST_DIR/nccl-<version>+cuda<cuda-version>/ so subsequent
+# builds are instant unless NCCL_VERSION or NCCL_CUDA_VERSION changes.
+#
+# Environment:
+#   BEE_CACHE_DIR — root of the download cache (default: DIST_DIR/cache)
+#   NCCL_SHA256   — optional expected sha256 (lowercase hex) of the .deb
+#
+# Output layout:
+#   $CACHE_DIR/lib/ — libnccl.so.* files
+
+set -e
+
+usage() {
+    echo "usage: $0 <nccl-version> <nccl-cuda-version> <dist-dir>" >&2
+    exit 1
+}
+
+NCCL_VERSION="$1"
+NCCL_CUDA_VERSION="$2"
+DIST_DIR="$3"
+
+[ -n "$NCCL_VERSION" ] || usage
+[ -n "$NCCL_CUDA_VERSION" ] || usage
+[ -n "$DIST_DIR" ] || usage
+
+# The script changes directory later, so resolve DIST_DIR to an absolute path.
+mkdir -p "$DIST_DIR"
+DIST_DIR=$(cd "$DIST_DIR" && pwd)
+
+echo "=== NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
+
+CACHE_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
+CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
+DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-downloads"
+
+# Cache hit when at least one libnccl.so.* exists (glob test, no ls parsing).
+for lib in "$CACHE_DIR/lib/"libnccl.so.*; do
+    if [ -e "$lib" ]; then
+        echo "=== NCCL cached, skipping download ==="
+        echo "cache: $CACHE_DIR"
+        echo "libs: $(ls "$CACHE_DIR/lib/" | wc -l) files"
+        exit 0
+    fi
+done
+
+REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
+PKG_NAME="libnccl2_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
+PKG_URL="${REPO_BASE}/${PKG_NAME}"
+
+mkdir -p "$DOWNLOAD_CACHE_DIR"
+DOWNLOAD_CACHE_DIR=$(cd "$DOWNLOAD_CACHE_DIR" && pwd)
+DEB_FILE="${DOWNLOAD_CACHE_DIR}/${PKG_NAME}"
+
+echo "=== downloading NCCL package ==="
+echo "URL: ${PKG_URL}"
+if ! wget --show-progress -O "$DEB_FILE" "$PKG_URL"; then
+    # wget -O creates the output file even when the transfer fails.
+    rm -f "$DEB_FILE"
+    echo "ERROR: download failed: ${PKG_URL}" >&2
+    exit 1
+fi
+
+if [ -n "${NCCL_SHA256:-}" ]; then
+    actual_sha256=$(sha256sum "$DEB_FILE" | cut -d' ' -f1)
+    if [ "$actual_sha256" != "$NCCL_SHA256" ]; then
+        rm -f "$DEB_FILE"
+        echo "ERROR: sha256 mismatch for ${PKG_NAME}" >&2
+        echo "  expected: ${NCCL_SHA256}" >&2
+        echo "  actual:   ${actual_sha256}" >&2
+        exit 1
+    fi
+fi
+
+echo "=== extracting NCCL libraries ==="
+EXTRACT_TMP=$(mktemp -d)
+# Libraries are staged next to CACHE_DIR and renamed into place at the end, so
+# an interrupted run never leaves a partial cache that would look like a hit.
+STAGE_DIR="${CACHE_DIR}.partial.$$"
+cleanup() { rm -rf "$EXTRACT_TMP" "$STAGE_DIR"; }
+trap cleanup EXIT
+# Signal handlers must exit, otherwise execution resumes after the handler.
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+# .deb is an ar archive; data.tar.* contains the actual files
+cd "$EXTRACT_TMP"
+ar x "$DEB_FILE"
+
+# Extract data tarball (xz, gz, or zst)
+DATA_TAR=""
+for t in data.tar.*; do
+    if [ -e "$t" ]; then
+        DATA_TAR="$t"
+        break
+    fi
+done
+[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in .deb" >&2; exit 1; }
+tar xf "$DATA_TAR"
+
+# Library lands in ./usr/lib/x86_64-linux-gnu/ or ./usr/lib/
+mkdir -p "$STAGE_DIR/lib"
+find . -name 'libnccl.so.*' ! -type d > nccl-libs.list
+found=0
+while IFS= read -r f; do
+    cp "$f" "$STAGE_DIR/lib/"
+    found=$((found + 1))
+done < nccl-libs.list
+
+[ "$found" -gt 0 ] || { echo "ERROR: libnccl.so.* not found in package" >&2; exit 1; }
+
+# Create soname symlinks: libnccl.so.2 -> libnccl.so.<full-version>, libnccl.so -> libnccl.so.2
+versioned=""
+for v in "$STAGE_DIR/lib/libnccl.so."[0-9][0-9.]*; do
+    if [ -e "$v" ]; then
+        versioned="$v"
+        break
+    fi
+done
+if [ -n "$versioned" ]; then
+    base=$(basename "$versioned")
+    ln -sf "$base" "$STAGE_DIR/lib/libnccl.so.2" \
+        || echo "WARNING: could not create libnccl.so.2 symlink" >&2
+    ln -sf "libnccl.so.2" "$STAGE_DIR/lib/libnccl.so" \
+        || echo "WARNING: could not create libnccl.so symlink" >&2
+fi
+
+# Publish the staged directory in one rename.
+rm -rf "$CACHE_DIR"
+mv "$STAGE_DIR" "$CACHE_DIR"
+
+echo "=== NCCL extraction complete ==="
+echo "cache: $CACHE_DIR"
+ls -lh "$CACHE_DIR/lib/"
diff --git a/iso/builder/build.sh b/iso/builder/build.sh
index 380738c..a8efd0a 100755
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -186,6 +186,17 @@ if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>
     echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
 fi
 
+# --- build / download NCCL ---
+echo ""
+echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
+sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}"
+
+NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
+
+# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs (-P keeps soname symlinks as symlinks)
+cp -P "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
+echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
+
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
@@ -198,6 +209,8 @@ GIT_COMMIT=${GIT_COMMIT}
 DEBIAN_VERSION=${DEBIAN_VERSION}
 DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
 NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
+NCCL_VERSION=${NCCL_VERSION}
+NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 EOF
 
 # Patch motd with build info