feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen [N] hotkey

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -197,7 +197,8 @@ rm -f \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
# --- inject authorized_keys for SSH access ---
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
@@ -298,6 +299,20 @@ echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- build nccl-tests ---
echo ""
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
echo "=== all_reduce_perf injected ==="
# --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)"
@@ -314,6 +329,7 @@ NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
EOF
# Patch motd with build info