fix(nccl-tests): pin /usr/local/cuda→12.8 symlink, auto-detect gencode by nvcc version
This commit is contained in:
@@ -37,7 +37,8 @@ https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
|||||||
> /etc/apt/sources.list.d/cuda.list \
|
> /etc/apt/sources.list.d/cuda.list \
|
||||||
&& apt-get update -qq \
|
&& apt-get update -qq \
|
||||||
&& apt-get install -y cuda-nvcc-12-8 \
|
&& apt-get install -y cuda-nvcc-12-8 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
|
||||||
|
|
||||||
RUN arch="$(dpkg --print-architecture)" \
|
RUN arch="$(dpkg --print-architecture)" \
|
||||||
&& case "$arch" in \
|
&& case "$arch" in \
|
||||||
|
|||||||
@@ -101,12 +101,23 @@ SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
|
|||||||
cd "$SRC_DIR"
|
cd "$SRC_DIR"
|
||||||
|
|
||||||
echo "=== building all_reduce_perf ==="
|
echo "=== building all_reduce_perf ==="
|
||||||
# CUDA 12.8 supports Volta through Blackwell (sm_70..sm_100).
|
# Pick gencode based on the actual nvcc version:
|
||||||
GENCODE="-gencode=arch=compute_70,code=sm_70 \
|
# CUDA 12.x — Volta..Blackwell (sm_70..sm_100; sm_100 requires nvcc 12.8 or newer)
|
||||||
-gencode=arch=compute_80,code=sm_80 \
|
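# CUDA 13.x — build Hopper..Blackwell only (sm_90..sm_100); CUDA 13 removed the pre-Turing targets (Maxwell/Pascal/Volta), and Ampere is still supported by nvcc but intentionally not built here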
|
||||||
-gencode=arch=compute_86,code=sm_86 \
|
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null | grep -oE 'release [0-9]+' | awk '{print $2}' | head -1)
|
||||||
-gencode=arch=compute_90,code=sm_90 \
|
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
|
||||||
-gencode=arch=compute_100,code=sm_100"
|
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
|
||||||
|
GENCODE="-gencode=arch=compute_90,code=sm_90 \
|
||||||
|
-gencode=arch=compute_100,code=sm_100"
|
||||||
|
echo "gencode: sm_90 sm_100 (CUDA 13+)"
|
||||||
|
else
|
||||||
|
GENCODE="-gencode=arch=compute_70,code=sm_70 \
|
||||||
|
-gencode=arch=compute_80,code=sm_80 \
|
||||||
|
-gencode=arch=compute_86,code=sm_86 \
|
||||||
|
-gencode=arch=compute_90,code=sm_90 \
|
||||||
|
-gencode=arch=compute_100,code=sm_100"
|
||||||
|
echo "gencode: sm_70..sm_100 (CUDA 12)"
|
||||||
|
fi
|
||||||
make MPI=0 \
|
make MPI=0 \
|
||||||
NVCC="$NVCC" \
|
NVCC="$NVCC" \
|
||||||
CUDA_HOME="$CUDA_HOME" \
|
CUDA_HOME="$CUDA_HOME" \
|
||||||
|
|||||||
Reference in New Issue
Block a user