feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing
- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen ([N] hotkey)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,6 +81,7 @@ type satRunner interface {
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -498,6 +499,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
|
||||
@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -121,6 +121,24 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
})
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
)
|
||||
case actionRunFanStress:
|
||||
return m.startGPUStressTest()
|
||||
case actionRunNCCLTests:
|
||||
m.busy = true
|
||||
m.busyTitle = "NCCL bandwidth test"
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.ncclCancel = cancel
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunNCCLTestsResult(ctx)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
|
||||
}
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
case actionRunFanStress:
|
||||
case actionRunFanStress, actionRunNCCLTests:
|
||||
return screenBurnInTests
|
||||
default:
|
||||
return screenMain
|
||||
|
||||
@@ -8,12 +8,13 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
burnCurGPUStress = 0
|
||||
burnCurModeQuick = 1
|
||||
burnCurModeStd = 2
|
||||
burnCurModeExpr = 3
|
||||
burnCurRun = 4
|
||||
burnCurTotal = 5
|
||||
burnCurGPUStress = 0
|
||||
burnCurModeQuick = 1
|
||||
burnCurModeStd = 2
|
||||
burnCurModeExpr = 3
|
||||
burnCurRun = 4
|
||||
burnCurNCCLTests = 5
|
||||
burnCurTotal = 6
|
||||
)
|
||||
|
||||
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
||||
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.burnRunSelected()
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
case burnCurNCCLTests:
|
||||
return m.burnRunNCCL()
|
||||
}
|
||||
case "f", "F", "r", "R":
|
||||
return m.burnRunSelected()
|
||||
case "n", "N":
|
||||
return m.burnRunNCCL()
|
||||
case "1":
|
||||
m.burnMode = 0
|
||||
case "2":
|
||||
@@ -70,6 +75,13 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
|
||||
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
|
||||
m.pendingAction = actionRunNCCLTests
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderBurnInTests(m model) string {
|
||||
var b strings.Builder
|
||||
|
||||
@@ -110,8 +122,15 @@ func renderBurnInTests(m model) string {
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
pfx = " "
|
||||
if m.burnCursor == burnCurNCCLTests {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
|
||||
m.hcInitialized = true
|
||||
m.hcSel = [4]bool{true, true, true, true}
|
||||
|
||||
next, cmd := m.hcRunSingle(hcGPU)
|
||||
next, _ := m.hcRunSingle(hcGPU)
|
||||
got := next.(model)
|
||||
|
||||
if cmd == nil {
|
||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
||||
}
|
||||
if got.screen != screenNvidiaSATSetup {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ const (
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
actionRunNCCLTests actionKind = "run_nccl_tests"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
@@ -98,6 +99,9 @@ type model struct {
|
||||
nvidiaSATCancel func()
|
||||
nvidiaSATAborted bool
|
||||
|
||||
// NCCL tests running
|
||||
ncclCancel func()
|
||||
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
|
||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||
case actionRunAMDGPUSAT:
|
||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||
case actionRunNCCLTests:
|
||||
return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
|
||||
case actionRunFanStress:
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
|
||||
@@ -26,6 +26,19 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||
&& rm /tmp/cuda-keyring.gpg \
|
||||
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||
> /etc/apt/sources.list.d/cuda.list \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -y cuda-nvcc-13-0 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN arch="$(dpkg --print-architecture)" \
|
||||
&& case "$arch" in \
|
||||
amd64) goarch=amd64 ;; \
|
||||
|
||||
@@ -4,6 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
GO_VERSION=1.24.0
|
||||
|
||||
114
iso/builder/build-nccl-tests.sh
Executable file
114
iso/builder/build-nccl-tests.sh
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/bin/sh
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
#
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
#
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
# are instant unless NCCL_TESTS_VERSION changes.
#
# Usage:
#   build-nccl-tests.sh <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>
#
# Environment:
#   BEE_CACHE_DIR  optional root for the download cache (default: $DIST_DIR/cache)
#
# Output layout:
#   $CACHE_DIR/bin/all_reduce_perf

set -e

NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"

# usage — print the invocation synopsis and abort.
usage() {
    echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"
    exit 1
}

[ -n "$NCCL_TESTS_VERSION" ] || usage
[ -n "$NCCL_VERSION" ] || usage
[ -n "$NCCL_CUDA_VERSION" ] || usage
[ -n "$DIST_DIR" ] || usage

echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="

CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
fi

# The build changes directory into temporary trees, so make every path that is
# used afterwards absolute; a relative DIST_DIR or BEE_CACHE_DIR would
# otherwise stop resolving after the first cd.
mkdir -p "$DIST_DIR" "$DOWNLOAD_CACHE_DIR"
DIST_DIR="$(cd "$DIST_DIR" && pwd)"
DOWNLOAD_CACHE_DIR="$(cd "$DOWNLOAD_CACHE_DIR" && pwd)"
CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"

# Temporary directories are removed on every exit path. INT/TERM handlers call
# exit explicitly: a handler that only cleans up would let the script carry on
# running after the signal.
NCCL_INCLUDE_TMP=""
BUILD_TMP=""
cleanup() {
    # Step out of the trees that are about to be deleted.
    cd / 2>/dev/null || true
    if [ -n "$NCCL_INCLUDE_TMP" ]; then
        rm -rf "$NCCL_INCLUDE_TMP"
    fi
    if [ -n "$BUILD_TMP" ]; then
        rm -rf "$BUILD_TMP"
    fi
    return 0
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# download URL DEST — fetch URL into DEST.
# The transfer goes to DEST.part first and is renamed only on success, so a
# failed or interrupted download never leaves a truncated file that a later
# run would mistake for a cache hit.
download() {
    rm -f "$2.part"
    if ! wget --show-progress -O "$2.part" "$1"; then
        rm -f "$2.part"
        echo "ERROR: download failed: $1"
        exit 1
    fi
    mv "$2.part" "$2"
}

# Resolve nvcc path (cuda-nvcc-13-0 installs to /usr/local/cuda-13.0/bin/nvcc).
# A PATH hit is expanded to its full path: keeping the bare name "nvcc" would
# make the CUDA_HOME derivation below collapse to ".".
NVCC=""
for candidate in nvcc /usr/local/cuda-13.0/bin/nvcc /usr/local/cuda/bin/nvcc; do
    if resolved="$(command -v "$candidate" 2>/dev/null)" && [ -n "$resolved" ]; then
        NVCC="$resolved"
        break
    elif [ -x "$candidate" ]; then
        NVCC="$candidate"
        break
    fi
done
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0"; exit 1; }
echo "nvcc: $NVCC"

# Determine CUDA_HOME from nvcc location (two levels up from the binary),
# following symlinks where readlink -f is available so a symlinked nvcc still
# points at the real toolkit root.
NVCC_REAL="$(readlink -f "$NVCC" 2>/dev/null || echo "$NVCC")"
CUDA_HOME="$(dirname "$(dirname "$NVCC_REAL")")"
echo "CUDA_HOME: $CUDA_HOME"

# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"

if [ ! -f "$DEV_DEB" ]; then
    echo "=== downloading libnccl-dev ==="
    download "$DEV_URL" "$DEV_DEB"
fi

# Extract nccl.h from libnccl-dev
NCCL_INCLUDE_TMP=$(mktemp -d)

cd "$NCCL_INCLUDE_TMP"
ar x "$DEV_DEB"
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
tar xf "$DATA_TAR"

# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
echo "nccl.h: $NCCL_H"

# Download nccl-tests source
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"

if [ ! -f "$SRC_TAR" ]; then
    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
    download "$SRC_URL" "$SRC_TAR"
fi

# Extract and build
BUILD_TMP=$(mktemp -d)
cd "$BUILD_TMP"
tar xf "$SRC_TAR"
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
cd "$SRC_DIR"

echo "=== building all_reduce_perf ==="
make MPI=0 \
    NVCC="$NVCC" \
    CUDA_HOME="$CUDA_HOME" \
    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
    BUILDDIR="./build" \
    all_reduce_perf

[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }

# Install through a temporary name and rename into place: the cache check at
# the top only tests for the final file, so it must never exist half-written.
mkdir -p "${CACHE_DIR}/bin"
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf.tmp"
chmod +x "${CACHE_DIR}/bin/all_reduce_perf.tmp"
mv "${CACHE_DIR}/bin/all_reduce_perf.tmp" "${CACHE_DIR}/bin/all_reduce_perf"

echo "=== nccl-tests build complete ==="
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
@@ -197,7 +197,8 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||
@@ -298,6 +299,20 @@ echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
@@ -314,6 +329,7 @@ NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
EOF
|
||||
|
||||
# Patch motd with build info
|
||||
|
||||
Reference in New Issue
Block a user