fix(nccl-tests): pin /usr/local/cuda→12.8 symlink, auto-detect gencode by nvcc version

This commit is contained in:
2026-03-26 23:54:07 +03:00
parent 1f43b4e050
commit 837a1fb981
2 changed files with 19 additions and 7 deletions

View File

@@ -37,7 +37,8 @@ https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
> /etc/apt/sources.list.d/cuda.list \
&& apt-get update -qq \
&& apt-get install -y cuda-nvcc-12-8 \
&& rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/* \
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
RUN arch="$(dpkg --print-architecture)" \
&& case "$arch" in \

View File

@@ -101,12 +101,23 @@ SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
cd "$SRC_DIR"
echo "=== building all_reduce_perf ==="
# CUDA 12.8 supports Volta through Blackwell (sm_70..sm_100).
GENCODE="-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_86,code=sm_86 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_100,code=sm_100"
# Pick gencode based on the actual nvcc version:
#   CUDA 12.x — Volta..Blackwell  (sm_70..sm_100; sm_100 itself needs nvcc >= 12.8)
#   CUDA 13.x — Ampere..Blackwell (sm_80..sm_100)
# CUDA 13 removed offline compilation for Maxwell/Pascal/Volta only. Ampere
# (sm_80/sm_86) is still supported and must stay in the list, otherwise the
# resulting binary has no kernel image for A100/A10/A40-class GPUs.

# Print the -gencode flags (space-separated, no trailing newline) for the
# nvcc major version passed as $1.
# A missing or non-numeric argument is treated as "older than 13", i.e. the
# full CUDA 12 list is printed, matching the fallback used when nvcc could
# not be queried.
gencode_for_nvcc_major() {
  case "${1:-}" in
    '' | *[!0-9]*) set -- 0 ;;
  esac
  # Volta (sm_70) can only be targeted by toolkits older than CUDA 13.
  if [ "$1" -lt 13 ]; then
    printf '%s ' "-gencode=arch=compute_70,code=sm_70"
  fi
  printf '%s' "-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_86,code=sm_86 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_100,code=sm_100"
}

# "|| true": a failed probe must fall through to the default below instead of
# aborting the script when it runs under `set -e -o pipefail`.
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null | grep -oE 'release [0-9]+' | awk '{print $2}' | head -1) || true
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
GENCODE=$(gencode_for_nvcc_major "${NVCC_MAJOR:-}")
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
  echo "gencode: sm_80..sm_100 (CUDA 13+)"
else
  echo "gencode: sm_70..sm_100 (CUDA 12)"
fi
make MPI=0 \
NVCC="$NVCC" \
CUDA_HOME="$CUDA_HOME" \