diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go
index a0a173d..a2c6a65 100644
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -81,6 +81,7 @@ type satRunner interface {
 	ListAMDGPUs() ([]platform.AMDGPUInfo, error)
 	RunAMDAcceptancePack(baseDir string) (string, error)
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
+	RunNCCLTests(ctx context.Context, baseDir string) (string, error)
 }
 
 type runtimeChecker interface {
@@ -498,6 +499,19 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
 	return a.sat.RunFanStressTest(ctx, baseDir, opts)
 }
 
+// RunNCCLTestsResult runs the NCCL bandwidth test under DefaultSATBaseDir and
+// wraps the outcome in an ActionResult. The body always carries the results
+// path; any error other than context.Canceled is appended to it. The error
+// from the run is returned unchanged.
+func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
+	path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)
+	body := "Results: " + path
+	if err != nil && err != context.Canceled {
+		body += "\nERROR: " + err.Error()
+	}
+	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
+}
+
 func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
 	path, err := a.RunFanStressTest(ctx, "", opts)
 	body := formatFanStressResult(path)
diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go
index 67df2fd..c15ac45 100644
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
 	return "", nil
 }
 
+func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
+	return "", nil
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
 
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 9b0d274..e16b490 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -121,6 +121,35 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
 	return gpus, nil
 }
 
+// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// Measures collective communication bandwidth over NVLink/PCIe.
+//
+// The GPU count passed to all_reduce_perf (-g) comes from nvidia-smi. Detection
+// is best-effort: if the query fails or lists nothing, one GPU is assumed.
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
+	// One index is printed per GPU, so count the non-empty lines. The query is
+	// bound to ctx so a cancelled run does not wait on nvidia-smi.
+	gpuCount := 0
+	out, err := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
+	if err == nil {
+		for _, line := range strings.Split(string(out), "\n") {
+			if strings.TrimSpace(line) != "" {
+				gpuCount++
+			}
+		}
+	}
+	if gpuCount < 1 {
+		gpuCount = 1
+	}
+	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
+		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+		{name: "02-all-reduce-perf.log", cmd: []string{
+			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
+			"-g", strconv.Itoa(gpuCount), "--iters", "20",
+		}},
+	})
+}
+
 func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
 	return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
 }
diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go
index 936d5c5..3ecaf9e 100644
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -1,6 +1,7 @@
 package tui
 
 import (
+	"context"
 	"time"
 
 	"bee/audit/internal/platform"
@@ -140,6 +141,18 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			)
 		case actionRunFanStress:
 			return m.startGPUStressTest()
+		case actionRunNCCLTests:
+			m.busy = true
+			m.busyTitle = "NCCL bandwidth test"
+			ctx, cancel := context.WithCancel(context.Background())
+			m.ncclCancel = cancel
+			return m, func() tea.Msg {
+				// Release the context once the run ends; calling cancel
+				// again afterwards is a no-op.
+				defer cancel()
+				result, err := m.app.RunNCCLTestsResult(ctx)
+				return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
+			}
 		}
 	case "ctrl+c":
 		return m, tea.Quit
@@ -153,7 +166,7 @@ func (m model) confirmCancelTarget() screen {
 		return screenExportTargets
 	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
 		return screenHealthCheck
-	case actionRunFanStress:
+	case actionRunFanStress, actionRunNCCLTests:
 		return screenBurnInTests
 	default:
 		return screenMain
diff --git a/audit/internal/tui/screen_burn_in.go b/audit/internal/tui/screen_burn_in.go
index 7013962..07cc956 100644
--- a/audit/internal/tui/screen_burn_in.go
+++ b/audit/internal/tui/screen_burn_in.go
@@ -8,12 +8,13 @@ import (
 )
 
 const (
-	burnCurGPUStress = 0
-	burnCurModeQuick = 1
-	burnCurModeStd   = 2
-	burnCurModeExpr  = 3
-	burnCurRun       = 4
-	burnCurTotal     = 5
+	burnCurGPUStress = iota
+	burnCurModeQuick
+	burnCurModeStd
+	burnCurModeExpr
+	burnCurRun
+	burnCurNCCLTests
+	burnCurTotal
 )
 
 func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			return m.burnRunSelected()
 		case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
 			m.burnMode = m.burnCursor - burnCurModeQuick
+		case burnCurNCCLTests:
+			return m.burnRunNCCL()
 		}
 	case "f", "F", "r", "R":
 		return m.burnRunSelected()
+	case "n", "N":
+		return m.burnRunNCCL()
 	case "1":
 		m.burnMode = 0
 	case "2":
@@ -70,6 +75,14 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
 	return m.hcRunFanStress()
 }
 
+// burnRunNCCL queues the NCCL bandwidth test and switches to the confirm screen.
+func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
+	m.pendingAction = actionRunNCCLTests
+	m.screen = screenConfirm
+	m.cursor = 0
+	return m, nil
+}
+
 func renderBurnInTests(m model) string {
 	var b strings.Builder
 
@@ -110,8 +123,15 @@ func renderBurnInTests(m model) string {
 	}
 	fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
 
+	fmt.Fprintln(&b)
+	pfx = " "
+	if m.burnCursor == burnCurNCCLTests {
+		pfx = "> "
+	}
+	fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
+
 	fmt.Fprintln(&b)
 	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
-	fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
+	fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
 	return b.String()
 }
diff --git a/audit/internal/tui/tui_test.go b/audit/internal/tui/tui_test.go
index b71b451..a12f2ba 100644
--- a/audit/internal/tui/tui_test.go
+++ b/audit/internal/tui/tui_test.go
@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
 	m.hcInitialized = true
 	m.hcSel = [4]bool{true, true, true, true}
 
-	next, cmd := m.hcRunSingle(hcGPU)
+	next, _ := m.hcRunSingle(hcGPU)
 	got := next.(model)
 
-	if cmd == nil {
-		t.Fatal("expected non-nil cmd (GPU list loader)")
-	}
 	if got.screen != screenNvidiaSATSetup {
 		t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
 	}
diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go
index a094611..ce22517 100644
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -44,6 +44,7 @@ const (
 	actionRunCPUSAT    actionKind = "run_cpu_sat"
 	actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
 	actionRunFanStress actionKind = "run_fan_stress"
+	actionRunNCCLTests actionKind = "run_nccl_tests"
 )
 
 type model struct {
@@ -98,6 +99,9 @@ type model struct {
 	nvidiaSATCancel  func()
 	nvidiaSATAborted bool
 
+	// NCCL tests running
+	ncclCancel func()
+
 	// GPU Platform Stress Test running
 	gpuStressCancel  func()
 	gpuStressAborted bool
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
 		return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
 	case actionRunAMDGPUSAT:
 		return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
+	case actionRunNCCLTests:
+		return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
 	case actionRunFanStress:
 		modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
 		return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
diff --git a/iso/builder/Dockerfile b/iso/builder/Dockerfile
index 13c6f8a..4a225ba 100644
--- a/iso/builder/Dockerfile
+++ b/iso/builder/Dockerfile
@@ -26,6 +26,19 @@ RUN apt-get update -qq && apt-get install -y \
     linux-headers-amd64 \
     && rm -rf /var/lib/apt/lists/*
 
+# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
+RUN wget -qO /tmp/cuda-keyring.gpg \
+        https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
+    && gpg --dearmor < /tmp/cuda-keyring.gpg \
+        > /usr/share/keyrings/nvidia-cuda.gpg \
+    && rm /tmp/cuda-keyring.gpg \
+    && echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
+https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
+        > /etc/apt/sources.list.d/cuda.list \
+    && apt-get update -qq \
+    && apt-get install -y cuda-nvcc-13-0 \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN arch="$(dpkg --print-architecture)" \
     && case "$arch" in \
          amd64) goarch=amd64 ;; \
diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS
index db45732..cfe40ee 100644
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -4,6 +4,7 @@ NVIDIA_DRIVER_VERSION=590.48.01
 NCCL_VERSION=2.28.9-1
 NCCL_CUDA_VERSION=13.0
 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
+NCCL_TESTS_VERSION=2.13.10
 CUBLAS_VERSION=13.0.2.14-1
 CUDA_USERSPACE_VERSION=13.0.96-1
 GO_VERSION=1.24.0
diff --git a/iso/builder/build-nccl-tests.sh b/iso/builder/build-nccl-tests.sh
new file mode 100755
index 0000000..da4a4dc
--- /dev/null
+++ b/iso/builder/build-nccl-tests.sh
@@ -0,0 +1,130 @@
+#!/bin/sh
+# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
+#
+# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
+# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
+#
+# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
+# are instant unless NCCL_TESTS_VERSION changes.
+#
+# Output layout:
+#   $CACHE_DIR/bin/all_reduce_perf
+
+set -e
+
+USAGE="usage: $0 <nccl-tests-version> <nccl-version> <nccl-cuda-version> <dist-dir>"
+
+NCCL_TESTS_VERSION="$1"
+NCCL_VERSION="$2"
+NCCL_CUDA_VERSION="$3"
+DIST_DIR="$4"
+
+[ -n "$NCCL_TESTS_VERSION" ] || { echo "$USAGE"; exit 1; }
+[ -n "$NCCL_VERSION" ] || { echo "$USAGE"; exit 1; }
+[ -n "$NCCL_CUDA_VERSION" ] || { echo "$USAGE"; exit 1; }
+[ -n "$DIST_DIR" ] || { echo "$USAGE"; exit 1; }
+
+echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="
+
+CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
+CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
+DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
+
+if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
+    echo "=== nccl-tests cached, skipping build ==="
+    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
+    exit 0
+fi
+
+# Resolve nvcc to an absolute path (cuda-nvcc-13-0 installs to
+# /usr/local/cuda-13.0/bin/nvcc). A bare "nvcc" found via PATH would make the
+# dirname-based CUDA_HOME below collapse to ".", so keep what command -v prints.
+NVCC=""
+for candidate in nvcc /usr/local/cuda-13.0/bin/nvcc /usr/local/cuda/bin/nvcc; do
+    if resolved="$(command -v "$candidate" 2>/dev/null)" && [ -n "$resolved" ]; then
+        NVCC="$resolved"
+        break
+    elif [ -x "$candidate" ]; then
+        NVCC="$candidate"
+        break
+    fi
+done
+[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-13-0"; exit 1; }
+echo "nvcc: $NVCC"
+
+# Determine CUDA_HOME from nvcc location
+CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
+echo "CUDA_HOME: $CUDA_HOME"
+
+# Download libnccl-dev for nccl.h
+REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
+DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
+DEV_URL="${REPO_BASE}/${DEV_PKG}"
+
+mkdir -p "$DOWNLOAD_CACHE_DIR"
+DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"
+
+# Download to a .part file and rename on success so an interrupted transfer is
+# never mistaken for a cached package on the next run.
+if [ ! -f "$DEV_DEB" ]; then
+    echo "=== downloading libnccl-dev ==="
+    wget --show-progress -O "${DEV_DEB}.part" "$DEV_URL"
+    mv "${DEV_DEB}.part" "$DEV_DEB"
+fi
+
+# Extract nccl.h from libnccl-dev. BUILD_TMP is declared up front because the
+# cleanup trap references it before the build directory exists.
+BUILD_TMP=""
+NCCL_INCLUDE_TMP=$(mktemp -d)
+trap 'rm -rf "$NCCL_INCLUDE_TMP" "$BUILD_TMP"' EXIT
+# A signal trap that does not exit lets the script carry on; exit explicitly
+# (the EXIT trap above still removes the temp dirs).
+trap 'exit 1' INT TERM
+
+cd "$NCCL_INCLUDE_TMP"
+ar x "$DEV_DEB"
+DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
+[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
+tar xf "$DATA_TAR"
+
+# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
+NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
+[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
+NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
+echo "nccl.h: $NCCL_H"
+
+# Download nccl-tests source
+SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
+SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"
+
+if [ ! -f "$SRC_TAR" ]; then
+    echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
+    wget --show-progress -O "${SRC_TAR}.part" "$SRC_URL"
+    mv "${SRC_TAR}.part" "$SRC_TAR"
+fi
+
+# Extract and build
+BUILD_TMP=$(mktemp -d)
+cd "$BUILD_TMP"
+tar xf "$SRC_TAR"
+SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
+[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
+cd "$SRC_DIR"
+
+echo "=== building all_reduce_perf ==="
+make MPI=0 \
+    NVCC="$NVCC" \
+    CUDA_HOME="$CUDA_HOME" \
+    NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
+    BUILDDIR="./build" \
+    all_reduce_perf
+
+[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }
+
+mkdir -p "${CACHE_DIR}/bin"
+cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
+chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
+
+echo "=== nccl-tests build complete ==="
+echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
+ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
diff --git a/iso/builder/build.sh b/iso/builder/build.sh
index e70784c..0b2775b 100755
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -197,7 +197,8 @@ rm -f \
     "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
     "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
     "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
-    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
+    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
+    "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
 
 # --- inject authorized_keys for SSH access ---
 AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
@@ -298,6 +299,20 @@ echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/
 cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
 echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
 
+# --- build nccl-tests ---
+echo ""
+echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
+sh "${BUILDER_DIR}/build-nccl-tests.sh" \
+    "${NCCL_TESTS_VERSION}" \
+    "${NCCL_VERSION}" \
+    "${NCCL_CUDA_VERSION}" \
+    "${DIST_DIR}"
+
+NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
+cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
+chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
+echo "=== all_reduce_perf injected ==="
+
 # --- embed build metadata ---
 mkdir -p "${OVERLAY_STAGE_DIR}/etc"
 BUILD_DATE="$(date +%Y-%m-%d)"
@@ -314,6 +329,7 @@ NCCL_VERSION=${NCCL_VERSION}
 NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
 CUBLAS_VERSION=${CUBLAS_VERSION}
 CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
+NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
 EOF
 
 # Patch motd with build info