feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: add an NCCL bandwidth test entry, with an [N] hotkey, to the Burn-in Tests screen

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -81,6 +81,7 @@ type satRunner interface {
ListAMDGPUs() ([]platform.AMDGPUInfo, error) ListAMDGPUs() ([]platform.AMDGPUInfo, error)
RunAMDAcceptancePack(baseDir string) (string, error) RunAMDAcceptancePack(baseDir string) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
} }
type runtimeChecker interface { type runtimeChecker interface {
@@ -498,6 +499,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
return a.sat.RunFanStressTest(ctx, baseDir, opts) return a.sat.RunFanStressTest(ctx, baseDir, opts)
} }
// RunNCCLTestsResult runs the NCCL bandwidth test under DefaultSATBaseDir and
// packages the outcome as an ActionResult for display.
//
// The body always starts with the results path reported by the runner. Any
// error other than context.Canceled is appended on a separate "ERROR:" line.
// The runner's error (including context.Canceled) is returned unchanged.
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
	resultPath, runErr := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)

	result := ActionResult{
		Title: "NCCL bandwidth test",
		Body:  "Results: " + resultPath,
	}

	switch runErr {
	case nil, context.Canceled:
		// Success or user cancellation: nothing to add to the body.
	default:
		result.Body += "\nERROR: " + runErr.Error()
	}
	return result, runErr
}
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) { func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
path, err := a.RunFanStressTest(ctx, "", opts) path, err := a.RunFanStressTest(ctx, "", opts)
body := formatFanStressResult(path) body := formatFanStressResult(path)

View File

@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
return "", nil return "", nil
} }
// RunNCCLTests is the fakeSAT stand-in for the NCCL bandwidth test: it does no
// work and reports an empty results path with a nil error.
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
	var resultPath string
	return resultPath, nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -121,6 +121,24 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
return gpus, nil return gpus, nil
} }
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
//
// The pack consists of two jobs: a full `nvidia-smi -q` dump followed by
// all_reduce_perf, which is given the detected GPU count via -g. The jobs are
// executed by runAcceptancePackCtx under baseDir with the pack name
// "nccl-tests"; its results are returned unchanged.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
	gpuCount := ncclDetectGPUCount(ctx)
	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-all-reduce-perf.log", cmd: []string{
			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
			"-g", strconv.Itoa(gpuCount), "--iters", "20",
		}},
	})
}

// ncclDetectGPUCount asks nvidia-smi for one GPU index per line and returns
// the number of non-blank lines. Detection is best-effort: when nvidia-smi is
// missing, fails, or prints nothing, it returns 1. The command is bound to
// ctx so cancellation also interrupts a hung nvidia-smi.
func ncclDetectGPUCount(ctx context.Context) int {
	// The error is deliberately ignored: Output still returns whatever was
	// written to stdout, and an empty result falls back to a single GPU below.
	out, _ := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()

	count := 0
	for _, line := range strings.Split(string(out), "\n") {
		// Skip blank lines so trailing newlines or stray empty rows are not
		// counted as GPUs.
		if strings.TrimSpace(line) != "" {
			count++
		}
	}
	if count < 1 {
		return 1
	}
	return count
}
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs()) return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
} }

View File

@@ -1,6 +1,7 @@
package tui package tui
import ( import (
"context"
"time" "time"
"bee/audit/internal/platform" "bee/audit/internal/platform"
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
) )
case actionRunFanStress: case actionRunFanStress:
return m.startGPUStressTest() return m.startGPUStressTest()
case actionRunNCCLTests:
m.busy = true
m.busyTitle = "NCCL bandwidth test"
ctx, cancel := context.WithCancel(context.Background())
m.ncclCancel = cancel
return m, func() tea.Msg {
result, err := m.app.RunNCCLTestsResult(ctx)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
}
} }
case "ctrl+c": case "ctrl+c":
return m, tea.Quit return m, tea.Quit
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
return screenExportTargets return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT: case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck return screenHealthCheck
case actionRunFanStress: case actionRunFanStress, actionRunNCCLTests:
return screenBurnInTests return screenBurnInTests
default: default:
return screenMain return screenMain

View File

@@ -13,7 +13,8 @@ const (
burnCurModeStd = 2 burnCurModeStd = 2
burnCurModeExpr = 3 burnCurModeExpr = 3
burnCurRun = 4 burnCurRun = 4
burnCurTotal = 5 burnCurNCCLTests = 5
burnCurTotal = 6
) )
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) { func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.burnRunSelected() return m.burnRunSelected()
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr: case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick m.burnMode = m.burnCursor - burnCurModeQuick
case burnCurNCCLTests:
return m.burnRunNCCL()
} }
case "f", "F", "r", "R": case "f", "F", "r", "R":
return m.burnRunSelected() return m.burnRunSelected()
case "n", "N":
return m.burnRunNCCL()
case "1": case "1":
m.burnMode = 0 m.burnMode = 0
case "2": case "2":
@@ -70,6 +75,13 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
return m.hcRunFanStress() return m.hcRunFanStress()
} }
// burnRunNCCL routes the NCCL bandwidth test through the confirmation screen.
// It returns a copy of the model with the pending action set to
// actionRunNCCLTests, the screen switched to screenConfirm and the cursor
// reset to 0. No command is returned.
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
	next := m
	next.cursor = 0
	next.screen = screenConfirm
	next.pendingAction = actionRunNCCLTests
	return next, nil
}
func renderBurnInTests(m model) string { func renderBurnInTests(m model) string {
var b strings.Builder var b strings.Builder
@@ -110,8 +122,15 @@ func renderBurnInTests(m model) string {
} }
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx) fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
fmt.Fprintln(&b)
pfx = " "
if m.burnCursor == burnCurNCCLTests {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
fmt.Fprintln(&b) fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────") fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back") fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
return b.String() return b.String()
} }

View File

@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
m.hcInitialized = true m.hcInitialized = true
m.hcSel = [4]bool{true, true, true, true} m.hcSel = [4]bool{true, true, true, true}
next, cmd := m.hcRunSingle(hcGPU) next, _ := m.hcRunSingle(hcGPU)
got := next.(model) got := next.(model)
if cmd == nil {
t.Fatal("expected non-nil cmd (GPU list loader)")
}
if got.screen != screenNvidiaSATSetup { if got.screen != screenNvidiaSATSetup {
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup) t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
} }

View File

@@ -44,6 +44,7 @@ const (
actionRunCPUSAT actionKind = "run_cpu_sat" actionRunCPUSAT actionKind = "run_cpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat" actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress" actionRunFanStress actionKind = "run_fan_stress"
actionRunNCCLTests actionKind = "run_nccl_tests"
) )
type model struct { type model struct {
@@ -98,6 +99,9 @@ type model struct {
nvidiaSATCancel func() nvidiaSATCancel func()
nvidiaSATAborted bool nvidiaSATAborted bool
// NCCL tests running
ncclCancel func()
// GPU Platform Stress Test running // GPU Platform Stress Test running
gpuStressCancel func() gpuStressCancel func()
gpuStressAborted bool gpuStressAborted bool
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode] return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
case actionRunAMDGPUSAT: case actionRunAMDGPUSAT:
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?" return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
case actionRunNCCLTests:
return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
case actionRunFanStress: case actionRunFanStress:
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"} modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" + return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +

View File

@@ -26,6 +26,19 @@ RUN apt-get update -qq && apt-get install -y \
linux-headers-amd64 \ linux-headers-amd64 \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests).
# Steps performed by this single layer:
#   1. download the repo signing key (3bf863cc.pub) to a temp file,
#   2. dearmor it into /usr/share/keyrings/nvidia-cuda.gpg and drop the temp file,
#   3. register the debian12/x86_64 CUDA repo, pinned to that keyring via signed-by,
#   4. install cuda-nvcc-13-0 and clear the apt lists.
RUN wget -qO /tmp/cuda-keyring.gpg \
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
> /usr/share/keyrings/nvidia-cuda.gpg \
&& rm /tmp/cuda-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
> /etc/apt/sources.list.d/cuda.list \
&& apt-get update -qq \
&& apt-get install -y cuda-nvcc-13-0 \
&& rm -rf /var/lib/apt/lists/*
RUN arch="$(dpkg --print-architecture)" \ RUN arch="$(dpkg --print-architecture)" \
&& case "$arch" in \ && case "$arch" in \
amd64) goarch=amd64 ;; \ amd64) goarch=amd64 ;; \

View File

@@ -4,6 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
NCCL_VERSION=2.28.9-1 NCCL_VERSION=2.28.9-1
NCCL_CUDA_VERSION=13.0 NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
NCCL_TESTS_VERSION=2.13.10
CUBLAS_VERSION=13.0.2.14-1 CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1 CUDA_USERSPACE_VERSION=13.0.96-1
GO_VERSION=1.24.0 GO_VERSION=1.24.0

114
iso/builder/build-nccl-tests.sh Executable file
View File

@@ -0,0 +1,114 @@
#!/bin/sh
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
#
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
#
# Usage:
#   build-nccl-tests.sh <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>
#
# Environment:
#   BEE_CACHE_DIR  optional root for the download cache
#                  (default: <dist-dir>/cache)
#
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
# are instant unless NCCL_TESTS_VERSION changes.
#
# Output layout:
#   $CACHE_DIR/bin/all_reduce_perf

set -e

# usage — print the invocation synopsis and abort.
usage() {
    echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>"
    exit 1
}

# fetch URL DEST — download URL to DEST via a ".part" file that is renamed
# only after wget succeeds, so a failed or interrupted download never leaves
# a truncated file that a later run would mistake for a valid cache entry.
fetch() {
    fetch_url="$1"
    fetch_dest="$2"
    rm -f "${fetch_dest}.part"
    if ! wget --show-progress -O "${fetch_dest}.part" "$fetch_url"; then
        rm -f "${fetch_dest}.part"
        echo "ERROR: download failed: ${fetch_url}"
        exit 1
    fi
    mv "${fetch_dest}.part" "$fetch_dest"
}

# Temp directories are created later; start empty so cleanup is safe at any point.
NCCL_INCLUDE_TMP=""
BUILD_TMP=""

# cleanup — remove whichever temp directories have been created so far.
cleanup() {
    # Leave the temp directories before deleting them.
    cd /
    [ -z "$NCCL_INCLUDE_TMP" ] || rm -rf "$NCCL_INCLUDE_TMP"
    [ -z "$BUILD_TMP" ] || rm -rf "$BUILD_TMP"
}
trap cleanup EXIT
# A signal handler that returns lets the script carry on; exit explicitly so
# the build stops (the EXIT trap above still performs the cleanup).
trap 'exit 130' INT
trap 'exit 143' TERM

NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"

[ -n "$NCCL_TESTS_VERSION" ] || usage
[ -n "$NCCL_VERSION" ] || usage
[ -n "$NCCL_CUDA_VERSION" ] || usage
[ -n "$DIST_DIR" ] || usage

# The script changes directory below, so anchor relative paths now.
case "$DIST_DIR" in
    /*) ;;
    *) DIST_DIR="$(pwd)/${DIST_DIR}" ;;
esac

echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="

CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
case "$CACHE_ROOT" in
    /*) ;;
    *) CACHE_ROOT="$(pwd)/${CACHE_ROOT}" ;;
esac
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
fi

# Resolve nvcc path (cuda-nvcc-13-0 installs to /usr/local/cuda-13.0/bin/nvcc).
# command -v prints the full path for a PATH lookup; keeping that path (rather
# than the bare word "nvcc") is what makes the CUDA_HOME derivation below valid.
NVCC=""
for candidate in nvcc /usr/local/cuda-13.0/bin/nvcc /usr/local/cuda/bin/nvcc; do
    resolved="$(command -v "$candidate" 2>/dev/null || true)"
    if [ -n "$resolved" ] && [ -x "$resolved" ]; then
        NVCC="$resolved"
        break
    fi
done
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0"; exit 1; }
echo "nvcc: $NVCC"

# Determine CUDA_HOME from nvcc location (<CUDA_HOME>/bin/nvcc)
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"

# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"

mkdir -p "$DOWNLOAD_CACHE_DIR"
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"

if [ ! -f "$DEV_DEB" ]; then
    echo "=== downloading libnccl-dev ==="
    fetch "$DEV_URL" "$DEV_DEB"
fi

# Extract nccl.h from libnccl-dev
NCCL_INCLUDE_TMP=$(mktemp -d)
cd "$NCCL_INCLUDE_TMP"
ar x "$DEV_DEB"
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
tar xf "$DATA_TAR"

# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
echo "nccl.h: $NCCL_H"

# Download nccl-tests source
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"

if [ ! -f "$SRC_TAR" ]; then
    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
    fetch "$SRC_URL" "$SRC_TAR"
fi

# Extract and build
BUILD_TMP=$(mktemp -d)
cd "$BUILD_TMP"
tar xf "$SRC_TAR"
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
cd "$SRC_DIR"

echo "=== building all_reduce_perf ==="
make MPI=0 \
    NVCC="$NVCC" \
    CUDA_HOME="$CUDA_HOME" \
    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
    BUILDDIR="./build" \
    all_reduce_perf

[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }

# Publish into the cache with a rename so the cache-hit check above can never
# see a half-copied binary.
mkdir -p "${CACHE_DIR}/bin"
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf.tmp"
chmod +x "${CACHE_DIR}/bin/all_reduce_perf.tmp"
mv "${CACHE_DIR}/bin/all_reduce_perf.tmp" "${CACHE_DIR}/bin/all_reduce_perf"

echo "=== nccl-tests build complete ==="
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"

View File

@@ -197,7 +197,8 @@ rm -f \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
# --- inject authorized_keys for SSH access --- # --- inject authorized_keys for SSH access ---
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
@@ -298,6 +299,20 @@ echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ===" echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- build nccl-tests ---
# Runs build-nccl-tests.sh with the pinned nccl-tests, NCCL and CUDA versions,
# then copies the resulting all_reduce_perf from the per-version directory
# under DIST_DIR into the overlay's /usr/local/bin and marks it executable.
echo ""
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
# Must match the CACHE_DIR layout used by build-nccl-tests.sh.
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
echo "=== all_reduce_perf injected ==="
# --- embed build metadata --- # --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc" mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)" BUILD_DATE="$(date +%Y-%m-%d)"
@@ -314,6 +329,7 @@ NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION} NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION} CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION} CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
EOF EOF
# Patch motd with build info # Patch motd with build info