Files
bee/iso/builder/build-nccl.sh
Mikhail Chusavitin 2abe2ce3aa fix(iso): fix NCCL version to 2.28.9+cuda13.0, add sha256 verification
NVIDIA's CUDA repo for Debian 12 only has NCCL packages for cuda13.x,
not cuda12.x. Update to the latest available: 2.28.9-1+cuda13.0.
Also pass sha256 from VERSIONS into build-nccl.sh for integrity check.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 12:04:03 +03:00

95 lines
3.2 KiB
Bash
Executable File

#!/bin/sh
# build-nccl.sh — download and extract NCCL shared library for the LiveCD.
#
# Downloads libnccl2 .deb from NVIDIA's CUDA apt repository (Debian 12, x86_64)
# and extracts the shared library. Package integrity verified via sha256.
#
# Output is cached in DIST_DIR/nccl-<version>+cuda<cuda>/ so subsequent builds
# are instant unless NCCL_VERSION or NCCL_CUDA_VERSION changes.
#
# Output layout:
# $CACHE_DIR/lib/ — libnccl.so.* files
set -e
NCCL_VERSION="$1"
NCCL_CUDA_VERSION="$2"
DIST_DIR="$3"
EXPECTED_SHA256="$4"
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
echo "=== NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
CACHE_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-downloads"
if [ -d "$CACHE_DIR/lib" ] && [ "$(ls "$CACHE_DIR/lib/"libnccl.so.* 2>/dev/null | wc -l)" -gt 0 ]; then
echo "=== NCCL cached, skipping download ==="
echo "cache: $CACHE_DIR"
echo "libs: $(ls "$CACHE_DIR/lib/" | wc -l) files"
exit 0
fi
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
PKG_NAME="libnccl2_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
PKG_URL="${REPO_BASE}/${PKG_NAME}"
mkdir -p "$DOWNLOAD_CACHE_DIR"
DEB_FILE="${DOWNLOAD_CACHE_DIR}/${PKG_NAME}"
echo "=== downloading NCCL package ==="
echo "URL: ${PKG_URL}"
wget --show-progress -O "$DEB_FILE" "$PKG_URL"
if [ -n "$EXPECTED_SHA256" ]; then
echo "=== verifying sha256 ==="
ACTUAL_SHA256=$(sha256sum "$DEB_FILE" | awk '{print $1}')
if [ "$ACTUAL_SHA256" != "$EXPECTED_SHA256" ]; then
echo "ERROR: sha256 mismatch"
echo " expected: $EXPECTED_SHA256"
echo " actual: $ACTUAL_SHA256"
rm -f "$DEB_FILE"
exit 1
fi
echo "sha256 OK"
fi
echo "=== extracting NCCL libraries ==="
EXTRACT_TMP=$(mktemp -d)
trap 'rm -rf "$EXTRACT_TMP"' EXIT INT TERM
# .deb is an ar archive; data.tar.* contains the actual files
cd "$EXTRACT_TMP"
ar x "$DEB_FILE"
# Extract data tarball (xz, gz, or zst)
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in .deb"; exit 1; }
tar xf "$DATA_TAR"
# Library lands in ./usr/lib/x86_64-linux-gnu/ or ./usr/lib/
mkdir -p "$CACHE_DIR/lib"
found=0
for f in $(find . -name 'libnccl.so.*' -not -type d 2>/dev/null); do
cp "$f" "$CACHE_DIR/lib/"
found=$((found + 1))
done
[ "$found" -gt 0 ] || { echo "ERROR: libnccl.so.* not found in package"; exit 1; }
# Create soname symlinks: libnccl.so.2 -> libnccl.so.<full>, libnccl.so -> libnccl.so.2
versioned=$(ls "$CACHE_DIR/lib/libnccl.so."[0-9][0-9.]* 2>/dev/null | head -1)
if [ -n "$versioned" ]; then
base=$(basename "$versioned")
ln -sf "$base" "$CACHE_DIR/lib/libnccl.so.2" 2>/dev/null || true
ln -sf "libnccl.so.2" "$CACHE_DIR/lib/libnccl.so" 2>/dev/null || true
fi
echo "=== NCCL extraction complete ==="
echo "cache: $CACHE_DIR"
ls -lh "$CACHE_DIR/lib/"