139 lines
5.2 KiB
Bash
Executable File
139 lines
5.2 KiB
Bash
Executable File
#!/bin/sh
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
#
# Usage:
#   build-nccl-tests.sh <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>
#
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
#
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
# are instant unless NCCL_TESTS_VERSION changes.
#
# Environment:
#   BEE_CACHE_DIR   optional root of the download cache
#                   (default: DIST_DIR/cache)
#
# Output layout:
#   $CACHE_DIR/bin/all_reduce_perf

set -e

NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"

# All four positional arguments are mandatory; an empty one is treated the
# same as a missing one. Diagnostics go to stderr, exit status is 1.
for required_arg in "$NCCL_TESTS_VERSION" "$NCCL_VERSION" "$NCCL_CUDA_VERSION" "$DIST_DIR"; do
    if [ -z "$required_arg" ]; then
        echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>" >&2
        exit 1
    fi
done

# The script later changes into scratch directories while still using paths
# derived from DIST_DIR, so a relative DIST_DIR must be anchored to the
# directory the script was started from.
case "$DIST_DIR" in
    /*) ;;
    *) DIST_DIR="$(pwd)/${DIST_DIR}" ;;
esac

echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="

# abs_path PATH — print PATH unchanged when it is absolute, otherwise print it
# prefixed with the current working directory.
abs_path() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *) printf '%s\n' "$(pwd)/$1" ;;
    esac
}

# Later steps cd into scratch directories, so every location used after this
# point has to be absolute (DIST_DIR or BEE_CACHE_DIR may be given relative).
CACHE_DIR="$(abs_path "${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}")"
CACHE_ROOT="$(abs_path "${BEE_CACHE_DIR:-${DIST_DIR}/cache}")"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

# A previously built binary for this nccl-tests version short-circuits the
# whole build.
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
fi

# Resolve nvcc to an absolute path. PATH is consulted first, then the
# versioned toolkit prefixes (cuda-nvcc-X-Y installs to
# /usr/local/cuda-X.Y/bin/nvcc). The original CUDA 12 candidates keep their
# order; the CUDA 13 prefixes and the prefix matching the requested
# <cuda-version> are tried afterwards.
NVCC=""
for candidate in \
    nvcc \
    /usr/local/cuda-12.8/bin/nvcc \
    /usr/local/cuda-12/bin/nvcc \
    /usr/local/cuda/bin/nvcc \
    "/usr/local/cuda-${NCCL_CUDA_VERSION}/bin/nvcc" \
    /usr/local/cuda-13.0/bin/nvcc \
    /usr/local/cuda-13/bin/nvcc
do
    # command -v turns a bare name into the path found on PATH; keeping the
    # bare name would make the CUDA_HOME computation below yield ".".
    resolved=$(command -v "$candidate" 2>/dev/null) || resolved=""
    case "$resolved" in
        /*)
            if [ -x "$resolved" ]; then
                NVCC="$resolved"
                break
            fi
            ;;
    esac
done
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0" >&2; exit 1; }
echo "nvcc: $NVCC"

# Determine CUDA_HOME from nvcc location (<CUDA_HOME>/bin/nvcc)
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"

# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"

mkdir -p "$DOWNLOAD_CACHE_DIR"
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"

# -s instead of -f: an empty file left behind by an earlier failed download
# must not count as a cache hit.
if [ ! -s "$DEV_DEB" ]; then
    echo "=== downloading libnccl-dev ==="
    # wget -O creates/truncates its target before any data arrives, so fetch
    # into a temporary name and only rename into the cache on success.
    DEV_DEB_PART="${DEV_DEB}.part.$$"
    if wget --show-progress -O "$DEV_DEB_PART" "$DEV_URL"; then
        mv -f -- "$DEV_DEB_PART" "$DEV_DEB"
    else
        rm -f -- "$DEV_DEB_PART"
        echo "ERROR: failed to download $DEV_URL" >&2
        exit 1
    fi
fi

# Extract nccl.h from libnccl-dev
NCCL_INCLUDE_TMP=$(mktemp -d)

# Remove the scratch directories. BUILD_TMP is assigned later in the script,
# so it may still be empty when this runs.
cleanup_tmp_dirs() {
    for scratch_dir in "${NCCL_INCLUDE_TMP:-}" "${BUILD_TMP:-}"; do
        if [ -n "$scratch_dir" ]; then
            rm -rf -- "$scratch_dir"
        fi
    done
}
trap cleanup_tmp_dirs EXIT
# A signal handler that only cleaned up would return control to the script,
# which would then keep running inside deleted directories. Exit instead; the
# EXIT trap above performs the cleanup.
trap 'exit 130' INT
trap 'exit 143' TERM

cd "$NCCL_INCLUDE_TMP"
ar x "$DEV_DEB"

# The payload member of a .deb is data.tar.<compression>; take the first match.
DATA_TAR=""
for member in data.tar.*; do
    if [ -f "$member" ]; then
        DATA_TAR="$member"
        break
    fi
done
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb" >&2; exit 1; }
tar xf "$DATA_TAR"

# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -n 1)
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package" >&2; exit 1; }
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
echo "nccl.h: $NCCL_H"

# libnccl.so comes from the already-built NCCL cache (build-nccl.sh ran first)
NCCL_LIB_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}/lib"
if [ ! -d "$NCCL_LIB_DIR" ]; then
    echo "ERROR: NCCL lib dir not found at $NCCL_LIB_DIR — run build-nccl.sh first" >&2
    exit 1
fi
echo "nccl lib: $NCCL_LIB_DIR"

# Download nccl-tests source
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"

# -s instead of -f: an empty file left behind by an earlier failed download
# must not count as a cache hit.
if [ ! -s "$SRC_TAR" ]; then
    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
    # wget -O creates/truncates its target before any data arrives, so fetch
    # into a temporary name and only rename into the cache on success.
    SRC_TAR_PART="${SRC_TAR}.part.$$"
    if wget --show-progress -O "$SRC_TAR_PART" "$SRC_URL"; then
        mv -f -- "$SRC_TAR_PART" "$SRC_TAR"
    else
        rm -f -- "$SRC_TAR_PART"
        echo "ERROR: failed to download $SRC_URL" >&2
        exit 1
    fi
fi

# Extract and build
# BUILD_TMP is named in the cleanup trap installed earlier in this script, so
# it is removed when the script exits.
BUILD_TMP=$(mktemp -d)
cd "$BUILD_TMP"
tar xf "$SRC_TAR"

# The archive unpacks into a single nccl-tests-<something> directory; pick the
# first directory matching that pattern.
SRC_DIR=""
for entry in nccl-tests-*; do
    if [ -d "$entry" ]; then
        SRC_DIR="$entry"
        break
    fi
done
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive" >&2; exit 1; }
cd "$SRC_DIR"

echo "=== building all_reduce_perf ==="

# select_gencode MAJOR — set GENCODE (nvcc -gencode flags) and GENCODE_DESC
# (the human-readable summary that gets logged) for the given nvcc major
# version:
#   MAJOR >= 13          — sm_90 and sm_100 only
#   anything else        — sm_70, sm_80, sm_86, sm_90, sm_100
# An empty or non-numeric MAJOR falls into the second case.
# NOTE(review): the previous comment here said CUDA 13 dropped
# Pascal/Volta/Ampere; confirm against the CUDA 13 release notes whether
# sm_80/sm_86 really have to be omitted.
select_gencode() {
    # The numeric test prints an error for non-numeric input; silence it and
    # let the non-zero status select the else branch.
    if [ "${1:-0}" -ge 13 ] 2>/dev/null; then
        GENCODE="-gencode=arch=compute_90,code=sm_90"
        GENCODE="$GENCODE -gencode=arch=compute_100,code=sm_100"
        GENCODE_DESC="sm_90 sm_100 (CUDA 13+)"
    else
        GENCODE="-gencode=arch=compute_70,code=sm_70"
        GENCODE="$GENCODE -gencode=arch=compute_80,code=sm_80"
        GENCODE="$GENCODE -gencode=arch=compute_86,code=sm_86"
        GENCODE="$GENCODE -gencode=arch=compute_90,code=sm_90"
        GENCODE="$GENCODE -gencode=arch=compute_100,code=sm_100"
        GENCODE_DESC="sm_70..sm_100 (CUDA 12)"
    fi
}

# Pick gencode based on the actual nvcc version, taken from the
# "release <major>.<minor>" part of `nvcc --version`.
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null \
    | sed -n 's/.*release \([0-9][0-9]*\).*/\1/p' \
    | head -n 1)
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
select_gencode "$NVCC_MAJOR"
echo "gencode: $GENCODE_DESC"

# Build with the nccl-tests Makefile. LIBRARY_PATH is extended (for this
# command only) so the linker can find libnccl in the NCCL cache; NCCL_HOME is
# the parent of the directory that holds the extracted nccl.h.
LIBRARY_PATH="$NCCL_LIB_DIR${LIBRARY_PATH:+:$LIBRARY_PATH}" \
make MPI=0 \
    NVCC="$NVCC" \
    CUDA_HOME="$CUDA_HOME" \
    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
    NCCL_LIB="$NCCL_LIB_DIR" \
    NVCC_GENCODE="$GENCODE" \
    BUILDDIR="./build"

[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build" >&2; exit 1; }

# Install into the cache under a temporary name and rename at the end: the
# cache-hit check only looks for the final file name, so an interrupted copy
# must never leave a truncated binary there.
mkdir -p "${CACHE_DIR}/bin"
STAGED_BIN="${CACHE_DIR}/bin/.all_reduce_perf.$$"
if cp "./build/all_reduce_perf" "$STAGED_BIN" && chmod +x "$STAGED_BIN"; then
    mv -f -- "$STAGED_BIN" "${CACHE_DIR}/bin/all_reduce_perf"
else
    rm -f -- "$STAGED_BIN"
    echo "ERROR: failed to install all_reduce_perf into ${CACHE_DIR}/bin" >&2
    exit 1
fi

echo "=== nccl-tests build complete ==="
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"