feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing
- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen [N] hotkey

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -81,6 +81,7 @@ type satRunner interface {
|
|||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
|
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -498,6 +499,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
|||||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
|
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
|
||||||
|
body := "Results: " + path
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
body := formatFanStressResult(path)
|
body := formatFanStressResult(path)
|
||||||
|
|||||||
@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -121,6 +121,24 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||||
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
|
||||||
|
// detect GPU count
|
||||||
|
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||||
|
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||||
|
if gpuCount < 1 {
|
||||||
|
gpuCount = 1
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
|
}},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package tui
|
package tui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
)
|
)
|
||||||
case actionRunFanStress:
|
case actionRunFanStress:
|
||||||
return m.startGPUStressTest()
|
return m.startGPUStressTest()
|
||||||
|
case actionRunNCCLTests:
|
||||||
|
m.busy = true
|
||||||
|
m.busyTitle = "NCCL bandwidth test"
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
m.ncclCancel = cancel
|
||||||
|
return m, func() tea.Msg {
|
||||||
|
result, err := m.app.RunNCCLTestsResult(ctx)
|
||||||
|
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
case "ctrl+c":
|
case "ctrl+c":
|
||||||
return m, tea.Quit
|
return m, tea.Quit
|
||||||
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
|
|||||||
return screenExportTargets
|
return screenExportTargets
|
||||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||||
return screenHealthCheck
|
return screenHealthCheck
|
||||||
case actionRunFanStress:
|
case actionRunFanStress, actionRunNCCLTests:
|
||||||
return screenBurnInTests
|
return screenBurnInTests
|
||||||
default:
|
default:
|
||||||
return screenMain
|
return screenMain
|
||||||
|
|||||||
@@ -8,12 +8,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Cursor positions on the Burn-in Tests screen, numbered from 0 in the order
// listed. burnCurTotal is one past the last position.
const (
	burnCurGPUStress = iota
	burnCurModeQuick
	burnCurModeStd
	burnCurModeExpr
	burnCurRun
	burnCurNCCLTests
	burnCurTotal
)
|
||||||
|
|
||||||
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
||||||
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
return m.burnRunSelected()
|
return m.burnRunSelected()
|
||||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||||
|
case burnCurNCCLTests:
|
||||||
|
return m.burnRunNCCL()
|
||||||
}
|
}
|
||||||
case "f", "F", "r", "R":
|
case "f", "F", "r", "R":
|
||||||
return m.burnRunSelected()
|
return m.burnRunSelected()
|
||||||
|
case "n", "N":
|
||||||
|
return m.burnRunNCCL()
|
||||||
case "1":
|
case "1":
|
||||||
m.burnMode = 0
|
m.burnMode = 0
|
||||||
case "2":
|
case "2":
|
||||||
@@ -70,6 +75,13 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
|
|||||||
return m.hcRunFanStress()
|
return m.hcRunFanStress()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
|
||||||
|
m.pendingAction = actionRunNCCLTests
|
||||||
|
m.screen = screenConfirm
|
||||||
|
m.cursor = 0
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
func renderBurnInTests(m model) string {
|
func renderBurnInTests(m model) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
|
||||||
@@ -110,8 +122,15 @@ func renderBurnInTests(m model) string {
|
|||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
||||||
|
|
||||||
|
fmt.Fprintln(&b)
|
||||||
|
pfx = " "
|
||||||
|
if m.burnCursor == burnCurNCCLTests {
|
||||||
|
pfx = "> "
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
|
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
|
|||||||
m.hcInitialized = true
|
m.hcInitialized = true
|
||||||
m.hcSel = [4]bool{true, true, true, true}
|
m.hcSel = [4]bool{true, true, true, true}
|
||||||
|
|
||||||
next, cmd := m.hcRunSingle(hcGPU)
|
next, _ := m.hcRunSingle(hcGPU)
|
||||||
got := next.(model)
|
got := next.(model)
|
||||||
|
|
||||||
if cmd == nil {
|
|
||||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
|
||||||
}
|
|
||||||
if got.screen != screenNvidiaSATSetup {
|
if got.screen != screenNvidiaSATSetup {
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ const (
|
|||||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||||
actionRunFanStress actionKind = "run_fan_stress"
|
actionRunFanStress actionKind = "run_fan_stress"
|
||||||
|
actionRunNCCLTests actionKind = "run_nccl_tests"
|
||||||
)
|
)
|
||||||
|
|
||||||
type model struct {
|
type model struct {
|
||||||
@@ -98,6 +99,9 @@ type model struct {
|
|||||||
nvidiaSATCancel func()
|
nvidiaSATCancel func()
|
||||||
nvidiaSATAborted bool
|
nvidiaSATAborted bool
|
||||||
|
|
||||||
|
// NCCL tests running
|
||||||
|
ncclCancel func()
|
||||||
|
|
||||||
// GPU Platform Stress Test running
|
// GPU Platform Stress Test running
|
||||||
gpuStressCancel func()
|
gpuStressCancel func()
|
||||||
gpuStressAborted bool
|
gpuStressAborted bool
|
||||||
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
|
|||||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||||
case actionRunAMDGPUSAT:
|
case actionRunAMDGPUSAT:
|
||||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||||
|
case actionRunNCCLTests:
|
||||||
|
return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
|
||||||
case actionRunFanStress:
|
case actionRunFanStress:
|
||||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||||
|
|||||||
@@ -26,6 +26,19 @@ RUN apt-get update -qq && apt-get install -y \
|
|||||||
linux-headers-amd64 \
|
linux-headers-amd64 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||||
|
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||||
|
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||||
|
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||||
|
&& rm /tmp/cuda-keyring.gpg \
|
||||||
|
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||||
|
> /etc/apt/sources.list.d/cuda.list \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -y cuda-nvcc-13-0 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN arch="$(dpkg --print-architecture)" \
|
RUN arch="$(dpkg --print-architecture)" \
|
||||||
&& case "$arch" in \
|
&& case "$arch" in \
|
||||||
amd64) goarch=amd64 ;; \
|
amd64) goarch=amd64 ;; \
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
|||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
|
|||||||
114
iso/builder/build-nccl-tests.sh
Executable file
114
iso/builder/build-nccl-tests.sh
Executable file
@@ -0,0 +1,114 @@
|
|||||||
|
#!/bin/sh
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
#
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
#
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
# are instant unless NCCL_TESTS_VERSION changes.
#
# Downloaded archives are kept in
# ${BEE_CACHE_DIR:-DIST_DIR/cache}/nccl-tests-downloads and reused.
#
# Usage:
#   build-nccl-tests.sh <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>
#
# Output layout:
#   $CACHE_DIR/bin/all_reduce_perf

set -e

# Print the usage line to stderr and abort.
usage() {
    echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir>" >&2
    exit 1
}

# fetch URL DEST — download URL to DEST via a temporary ".part" file so that
# an interrupted or failed download is never mistaken for a cached one.
fetch() {
    fetch_tmp="$2.part"
    rm -f "$fetch_tmp"
    if ! wget --show-progress -O "$fetch_tmp" "$1"; then
        rm -f "$fetch_tmp"
        echo "ERROR: download failed: $1" >&2
        return 1
    fi
    mv "$fetch_tmp" "$2"
}

NCCL_TESTS_VERSION="$1"
NCCL_VERSION="$2"
NCCL_CUDA_VERSION="$3"
DIST_DIR="$4"

for required in "$NCCL_TESTS_VERSION" "$NCCL_VERSION" "$NCCL_CUDA_VERSION" "$DIST_DIR"; do
    [ -n "$required" ] || usage
done

echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="

# The script changes directory later on, so make DIST_DIR absolute up front;
# a relative path would otherwise resolve inside the temporary build dirs.
mkdir -p "$DIST_DIR"
DIST_DIR="$(cd "$DIST_DIR" && pwd)"

CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
fi

# Resolve nvcc path (cuda-nvcc-13-0 installs to /usr/local/cuda-13.0/bin/nvcc).
# The resolved path reported by `command -v` is stored rather than the bare
# name, because CUDA_HOME is derived from the path below.
NVCC=""
for candidate in nvcc /usr/local/cuda-13.0/bin/nvcc /usr/local/cuda/bin/nvcc; do
    if resolved="$(command -v "$candidate" 2>/dev/null)" && [ -n "$resolved" ]; then
        NVCC="$resolved"
        break
    fi
done
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0" >&2; exit 1; }
echo "nvcc: $NVCC"

# Determine CUDA_HOME from nvcc location (<CUDA_HOME>/bin/nvcc)
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
echo "CUDA_HOME: $CUDA_HOME"

# Download libnccl-dev for nccl.h
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
DEV_URL="${REPO_BASE}/${DEV_PKG}"

mkdir -p "$DOWNLOAD_CACHE_DIR"
# BEE_CACHE_DIR may be relative too; pin the download dir before any cd.
DOWNLOAD_CACHE_DIR="$(cd "$DOWNLOAD_CACHE_DIR" && pwd)"
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"

if [ ! -f "$DEV_DEB" ]; then
    echo "=== downloading libnccl-dev ==="
    fetch "$DEV_URL" "$DEV_DEB"
fi

# Temporary directories are removed on every exit path. INT/TERM must exit
# explicitly: a trap handler that only cleans up would let the script carry
# on inside a deleted directory.
NCCL_INCLUDE_TMP=""
BUILD_TMP=""
cleanup() {
    cd /
    [ -z "$NCCL_INCLUDE_TMP" ] || rm -rf "$NCCL_INCLUDE_TMP"
    [ -z "$BUILD_TMP" ] || rm -rf "$BUILD_TMP"
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Extract nccl.h from libnccl-dev
NCCL_INCLUDE_TMP=$(mktemp -d)

cd "$NCCL_INCLUDE_TMP"
ar x "$DEV_DEB"
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb" >&2; exit 1; }
tar xf "$DATA_TAR"

# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package" >&2; exit 1; }
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
echo "nccl.h: $NCCL_H"

# Download nccl-tests source
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"

if [ ! -f "$SRC_TAR" ]; then
    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
    fetch "$SRC_URL" "$SRC_TAR"
fi

# Extract and build
BUILD_TMP=$(mktemp -d)
cd "$BUILD_TMP"
tar xf "$SRC_TAR"
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive" >&2; exit 1; }
cd "$SRC_DIR"

echo "=== building all_reduce_perf ==="
# NOTE(review): NCCL_HOME is the parent of the directory holding nccl.h;
# confirm the nccl-tests Makefile also locates the NCCL library from there.
make MPI=0 \
    NVCC="$NVCC" \
    CUDA_HOME="$CUDA_HOME" \
    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
    BUILDDIR="./build" \
    all_reduce_perf

[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build" >&2; exit 1; }

mkdir -p "${CACHE_DIR}/bin"
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"

echo "=== nccl-tests build complete ==="
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
@@ -197,7 +197,8 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
|
||||||
# --- inject authorized_keys for SSH access ---
|
# --- inject authorized_keys for SSH access ---
|
||||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||||
@@ -298,6 +299,20 @@ echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/
|
|||||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# --- build nccl-tests ---
|
||||||
|
echo ""
|
||||||
|
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||||
|
"${NCCL_TESTS_VERSION}" \
|
||||||
|
"${NCCL_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}"
|
||||||
|
|
||||||
|
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
@@ -314,6 +329,7 @@ NCCL_VERSION=${NCCL_VERSION}
|
|||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
|
|||||||
Reference in New Issue
Block a user