NVIDIA's CUDA repo for Debian 12 only has NCCL packages for cuda13.x, not cuda12.x. Update to the latest available: 2.28.9-1+cuda13.0. Also pass sha256 from VERSIONS into build-nccl.sh for integrity check. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
95 lines
3.2 KiB
Bash
Executable File
95 lines
3.2 KiB
Bash
Executable File
#!/bin/sh
# build-nccl.sh — download and extract the NCCL shared library for the LiveCD.
#
# Downloads the libnccl2 .deb from NVIDIA's CUDA apt repository (Debian 12,
# x86_64) and extracts the shared library. When a sha256 is supplied, the
# package is verified against it before anything is extracted; without one a
# warning is printed and the package is used unverified.
#
# Usage:
#   build-nccl.sh <nccl-version> <cuda-version> <dist-dir> [sha256]
#
# Environment:
#   BEE_CACHE_DIR   optional root of the download cache
#                   (default: <dist-dir>/cache)
#
# Output is cached in DIST_DIR/nccl-<version>+cuda<cuda>/ so subsequent builds
# are instant unless NCCL_VERSION or NCCL_CUDA_VERSION changes.
#
# Output layout:
#   $CACHE_DIR/lib/ — libnccl.so.* files
#
# Exit status: 0 on success (or cache hit), 1 on usage error, sha256 mismatch,
# or a malformed package; any failing tool aborts the script via `set -e`.

set -eu

# Print the usage line to stderr and abort.
usage() {
    echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]" >&2
    exit 1
}

# Print an error message to stderr and abort.
die() {
    echo "ERROR: $*" >&2
    exit 1
}

# Echo $1 as an absolute path (relative paths are anchored at the current
# working directory). Needed because extraction runs from another directory.
abs_path() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *)  printf '%s\n' "$PWD/$1" ;;
    esac
}

# Echo the lowercase hex sha256 digest of file $1.
sha256_of() {
    sha256sum "$1" | awk '{print $1}'
}

# Succeed if directory $1 contains at least one libnccl.so.* entry.
has_nccl_libs() {
    for _lib in "$1"/libnccl.so.*; do
        # An unmatched glob stays literal, so test that the entry really exists.
        if [ -e "$_lib" ] || [ -L "$_lib" ]; then
            return 0
        fi
    done
    return 1
}

NCCL_VERSION="${1:-}"
NCCL_CUDA_VERSION="${2:-}"
DIST_DIR="${3:-}"
EXPECTED_SHA256="${4:-}"

[ -n "$NCCL_VERSION" ] || usage
[ -n "$NCCL_CUDA_VERSION" ] || usage
[ -n "$DIST_DIR" ] || usage

echo "=== NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="

# Resolve every path up front so a relative <dist-dir> or BEE_CACHE_DIR keeps
# pointing at the same place while `ar` runs inside the temporary directory.
DIST_DIR=$(abs_path "$DIST_DIR")
CACHE_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
CACHE_ROOT=$(abs_path "${BEE_CACHE_DIR:-${DIST_DIR}/cache}")
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-downloads"

if [ -d "$CACHE_DIR/lib" ] && has_nccl_libs "$CACHE_DIR/lib"; then
    echo "=== NCCL cached, skipping download ==="
    echo "cache: $CACHE_DIR"
    echo "libs: $(ls "$CACHE_DIR/lib/" | wc -l) files"
    exit 0
fi

REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
PKG_NAME="libnccl2_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
PKG_URL="${REPO_BASE}/${PKG_NAME}"

mkdir -p "$DOWNLOAD_CACHE_DIR"
DEB_FILE="${DOWNLOAD_CACHE_DIR}/${PKG_NAME}"

# Scratch locations removed on every exit path. INT/TERM are turned into a
# real exit: a plain signal trap in sh would run the handler and then resume
# the script with its working files already deleted.
EXTRACT_TMP=""
STAGE_DIR=""
cleanup() {
    [ -z "$EXTRACT_TMP" ] || rm -rf "$EXTRACT_TMP"
    [ -z "$STAGE_DIR" ] || rm -rf "$STAGE_DIR"
    rm -f "${DEB_FILE}.part"
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# A previously downloaded package is reused only when its digest can be
# confirmed; without an expected sha256 there is no way to trust it.
verified=0
if [ -n "$EXPECTED_SHA256" ] && [ -f "$DEB_FILE" ] \
    && [ "$(sha256_of "$DEB_FILE")" = "$EXPECTED_SHA256" ]; then
    echo "=== reusing cached NCCL package ==="
    echo "sha256 OK"
    verified=1
else
    echo "=== downloading NCCL package ==="
    echo "URL: ${PKG_URL}"
    # Download next to the final name and rename afterwards so an aborted
    # transfer never leaves a truncated .deb under the real filename.
    wget --show-progress -O "${DEB_FILE}.part" "$PKG_URL"
    mv -f "${DEB_FILE}.part" "$DEB_FILE"
fi

if [ "$verified" -eq 0 ]; then
    if [ -n "$EXPECTED_SHA256" ]; then
        echo "=== verifying sha256 ==="
        ACTUAL_SHA256=$(sha256_of "$DEB_FILE")
        if [ "$ACTUAL_SHA256" != "$EXPECTED_SHA256" ]; then
            echo "ERROR: sha256 mismatch" >&2
            echo " expected: $EXPECTED_SHA256" >&2
            echo " actual: $ACTUAL_SHA256" >&2
            rm -f "$DEB_FILE"
            exit 1
        fi
        echo "sha256 OK"
    else
        echo "WARNING: no sha256 supplied; package integrity not verified" >&2
    fi
fi

echo "=== extracting NCCL libraries ==="
EXTRACT_TMP=$(mktemp -d)

# .deb is an ar archive; data.tar.* contains the actual files.
# `ar x` can only unpack into the current directory, so confine the cd to a
# subshell and leave the script's own working directory untouched.
(cd "$EXTRACT_TMP" && ar x "$DEB_FILE")

# Locate the data tarball (uncompressed, xz, gz, or zst).
DATA_TAR=""
for candidate in "$EXTRACT_TMP"/data.tar "$EXTRACT_TMP"/data.tar.*; do
    if [ -f "$candidate" ]; then
        DATA_TAR=$candidate
        break
    fi
done
[ -n "$DATA_TAR" ] || die "data.tar.* not found in .deb"
tar -x -f "$DATA_TAR" -C "$EXTRACT_TMP"

# Assemble the library directory in a staging area and publish it with a
# single rename, so an interrupted run can never leave a half-filled lib/
# that the cache check above would accept on the next build.
mkdir -p "$CACHE_DIR"
STAGE_DIR="${CACHE_DIR}/lib.partial.$$"
rm -rf "$STAGE_DIR"
mkdir "$STAGE_DIR"

# Library lands in ./usr/lib/x86_64-linux-gnu/ or ./usr/lib/.
# cp (without -R) follows symlinks, so packaged soname links become real
# copies here and are re-created as links below. The `-exec … +` form makes
# find exit non-zero if cp fails, which `set -e` then turns into an abort.
find "$EXTRACT_TMP" -name 'libnccl.so.*' ! -type d \
    -exec sh -c 'dest=$1; shift; cp "$@" "$dest"/' sh "$STAGE_DIR" {} +

has_nccl_libs "$STAGE_DIR" || die "libnccl.so.* not found in package"

# Create soname symlinks: libnccl.so.2 -> libnccl.so.<full>, libnccl.so -> libnccl.so.2
versioned=""
for candidate in "$STAGE_DIR"/libnccl.so.[0-9][0-9.]*; do
    if [ -f "$candidate" ]; then
        versioned=$candidate
        break
    fi
done
if [ -n "$versioned" ]; then
    base=${versioned##*/}
    # Best effort, as before, but failures are now reported instead of hidden.
    ln -sf "$base" "$STAGE_DIR/libnccl.so.2" \
        || echo "WARNING: could not create libnccl.so.2 symlink" >&2
    ln -sf "libnccl.so.2" "$STAGE_DIR/libnccl.so" \
        || echo "WARNING: could not create libnccl.so symlink" >&2
fi

# Publish: drop any stale/partial lib/ from an earlier run, then rename.
rm -rf "${CACHE_DIR:?}/lib"
mv "$STAGE_DIR" "$CACHE_DIR/lib"
STAGE_DIR=""

echo "=== NCCL extraction complete ==="
echo "cache: $CACHE_DIR"
ls -lh "$CACHE_DIR/lib/"