#!/bin/sh
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
#
# Usage:
#   build-nccl-tests.sh <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>
#
# Downloads the nccl-tests source from GitHub, downloads the libnccl-dev .deb
# for nccl.h, and compiles all_reduce_perf with nvcc.
#
# Output is cached in DIST_DIR/nccl-tests-<nccl-tests-version>/ so subsequent
# builds are instant unless NCCL_TESTS_VERSION changes.
#
# Environment:
#   BEE_CACHE_DIR  optional root of the download cache (default: DIST_DIR/cache)
#
# Output layout:
#   $CACHE_DIR/bin/all_reduce_perf

set -e

# Print an error message to stderr and abort.
die() {
    printf 'ERROR: %s\n' "$*" >&2
    exit 1
}

# Print the usage line to stderr and abort.
usage() {
    printf 'usage: %s <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>\n' "$0" >&2
    exit 1
}

# Print $1 as an absolute path. The script later changes directory into
# temporary dirs, so every path it keeps using must not depend on the cwd.
abs_path() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *)  printf '%s/%s\n' "$PWD" "$1" ;;
    esac
}

# Print the absolute path of the first usable nvcc among the candidates.
# A bare command name is looked up on PATH; anything containing a slash must
# be an executable regular file. Returns non-zero when nothing matches.
find_nvcc() {
    for nvcc_candidate in "$@"; do
        case "$nvcc_candidate" in
            */*)
                if [ -f "$nvcc_candidate" ] && [ -x "$nvcc_candidate" ]; then
                    printf '%s\n' "$nvcc_candidate"
                    return 0
                fi
                ;;
            *)
                if nvcc_found=$(command -v "$nvcc_candidate" 2>/dev/null); then
                    # Only accept a real path (not an alias/function name), so
                    # that CUDA_HOME can be derived from it.
                    case "$nvcc_found" in
                        /*)
                            printf '%s\n' "$nvcc_found"
                            return 0
                            ;;
                    esac
                fi
                ;;
        esac
    done
    return 1
}

# Download URL $1 to file $2. The data goes to a ".part" file first and is
# renamed only on success, so a failed or interrupted download never leaves a
# truncated file that a later run would mistake for a valid cache entry.
fetch() {
    fetch_url="$1"
    fetch_dest="$2"
    PARTIAL_FILE="${fetch_dest}.part.$$"
    if ! wget --show-progress -O "$PARTIAL_FILE" "$fetch_url"; then
        rm -f "$PARTIAL_FILE"
        PARTIAL_FILE=""
        die "download failed: $fetch_url"
    fi
    mv "$PARTIAL_FILE" "$fetch_dest"
    PARTIAL_FILE=""
}

# Remove temporary directories and any half-written file. Runs on every exit.
cleanup() {
    # Leave the temp dirs before deleting them.
    cd / 2>/dev/null || :
    [ -z "$PARTIAL_FILE" ] || rm -f "$PARTIAL_FILE"
    [ -z "$NCCL_INCLUDE_TMP" ] || rm -rf "$NCCL_INCLUDE_TMP"
    [ -z "$BUILD_TMP" ] || rm -rf "$BUILD_TMP"
}

NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"

[ -n "$NCCL_TESTS_VERSION" ] || usage
[ -n "$NCCL_VERSION" ] || usage
[ -n "$NCCL_CUDA_VERSION" ] || usage
[ -n "$DIST_DIR" ] || usage

echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="

DIST_DIR=$(abs_path "$DIST_DIR")
CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
CACHE_ROOT=$(abs_path "${BEE_CACHE_DIR:-${DIST_DIR}/cache}")
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
fi

PARTIAL_FILE=""
NCCL_INCLUDE_TMP=""
BUILD_TMP=""
trap cleanup EXIT
# A trapped signal does not terminate the shell by itself: exit explicitly
# (128 + signal number) so the EXIT trap cleans up and the build stops.
trap 'exit 130' INT
trap 'exit 143' TERM

# Resolve nvcc: PATH first, then the toolkit matching the requested CUDA
# version (assuming the /usr/local/cuda-<version> layout), then the
# historical fallback locations.
NVCC=$(find_nvcc \
    nvcc \
    "/usr/local/cuda-${NCCL_CUDA_VERSION}/bin/nvcc" \
    "/usr/local/cuda-${NCCL_CUDA_VERSION%%.*}/bin/nvcc" \
    /usr/local/cuda-12.8/bin/nvcc \
    /usr/local/cuda-12/bin/nvcc \
    /usr/local/cuda/bin/nvcc) \
    || die "nvcc not found — install cuda-nvcc-13-0"
echo "nvcc: $NVCC"

# Determine CUDA_HOME from the nvcc location (<CUDA_HOME>/bin/nvcc).
# NVCC is always absolute here, so this can never collapse to ".".
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"

# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"

mkdir -p "$DOWNLOAD_CACHE_DIR"
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"

if [ ! -f "$DEV_DEB" ]; then
    echo "=== downloading libnccl-dev ==="
    fetch "$DEV_URL" "$DEV_DEB"
fi

# Extract nccl.h from libnccl-dev
NCCL_INCLUDE_TMP=$(mktemp -d)
cd "$NCCL_INCLUDE_TMP"
ar x "$DEV_DEB"

# The payload member is data.tar.<compression>; take the first match.
DATA_TAR=""
for deb_member in data.tar.*; do
    if [ -f "$deb_member" ]; then
        DATA_TAR="$deb_member"
        break
    fi
done
[ -n "$DATA_TAR" ] || die "data.tar.* not found in libnccl-dev .deb"
tar xf "$DATA_TAR"

# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -n 1)
[ -n "$NCCL_H" ] || die "nccl.h not found in libnccl-dev package"
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
echo "nccl.h: $NCCL_H"

# libnccl.so comes from the already-built NCCL cache (build-nccl.sh ran first)
NCCL_LIB_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}/lib"
[ -d "$NCCL_LIB_DIR" ] \
    || die "NCCL lib dir not found at $NCCL_LIB_DIR — run build-nccl.sh first"
echo "nccl lib: $NCCL_LIB_DIR"

# Download nccl-tests source
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"

if [ ! -f "$SRC_TAR" ]; then
    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
    fetch "$SRC_URL" "$SRC_TAR"
fi

# Extract and build
BUILD_TMP=$(mktemp -d)
cd "$BUILD_TMP"
tar xf "$SRC_TAR"

# The archive unpacks into a single nccl-tests-<something> directory.
SRC_DIR=""
for src_candidate in nccl-tests-*; do
    if [ -d "$src_candidate" ]; then
        SRC_DIR="$src_candidate"
        break
    fi
done
[ -n "$SRC_DIR" ] || die "source directory not found in archive"
cd "$SRC_DIR"

echo "=== building all_reduce_perf ==="

# Pick gencode based on the actual nvcc version:
#   CUDA 12.x — sm_70, sm_80, sm_86, sm_90, sm_100
#   CUDA 13+  — sm_90 and sm_100 only (older architectures are not targeted)
# NOTE(review): compute_100 presumably needs a recent 12.x nvcc (earlier
# revisions of this script referenced cuda-nvcc-12-8) — confirm.
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null \
    | sed -n 's/.*release \([0-9][0-9]*\).*/\1/p' \
    | head -n 1)
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"

# A missing or non-numeric version makes the test fail quietly and selects
# the CUDA 12 branch.
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
    GENCODE="-gencode=arch=compute_90,code=sm_90"
    GENCODE="$GENCODE -gencode=arch=compute_100,code=sm_100"
    echo "gencode: sm_90 sm_100 (CUDA 13+)"
else
    GENCODE="-gencode=arch=compute_70,code=sm_70"
    GENCODE="$GENCODE -gencode=arch=compute_80,code=sm_80"
    GENCODE="$GENCODE -gencode=arch=compute_86,code=sm_86"
    GENCODE="$GENCODE -gencode=arch=compute_90,code=sm_90"
    GENCODE="$GENCODE -gencode=arch=compute_100,code=sm_100"
    echo "gencode: sm_70..sm_100 (CUDA 12)"
fi

# LIBRARY_PATH lets the linker find libnccl in the pre-built NCCL cache;
# NCCL_HOME points at the parent of the extracted include directory.
LIBRARY_PATH="$NCCL_LIB_DIR${LIBRARY_PATH:+:$LIBRARY_PATH}" \
make MPI=0 \
    NVCC="$NVCC" \
    CUDA_HOME="$CUDA_HOME" \
    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
    NCCL_LIB="$NCCL_LIB_DIR" \
    NVCC_GENCODE="$GENCODE" \
    BUILDDIR="./build"

[ -f "./build/all_reduce_perf" ] || die "all_reduce_perf not found after build"

# Install via a staging name and rename, so an interrupted copy is never
# mistaken for a finished binary by the cache check at the top.
mkdir -p "${CACHE_DIR}/bin"
PARTIAL_FILE="${CACHE_DIR}/bin/all_reduce_perf.part.$$"
cp "./build/all_reduce_perf" "$PARTIAL_FILE"
chmod +x "$PARTIAL_FILE"
mv "$PARTIAL_FILE" "${CACHE_DIR}/bin/all_reduce_perf"
PARTIAL_FILE=""

echo "=== nccl-tests build complete ==="
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"