feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: add NCCL bandwidth test entry to the Burn-in Tests screen ([N] hotkey)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -81,6 +81,7 @@ type satRunner interface {
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
RunAMDAcceptancePack(baseDir string) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
}
type runtimeChecker interface {
@@ -498,6 +499,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
return a.sat.RunFanStressTest(ctx, baseDir, opts)
}
// RunNCCLTestsResult invokes a.sat.RunNCCLTests with DefaultSATBaseDir and
// packages the outcome as an ActionResult titled "NCCL bandwidth test".
//
// The body always begins with the results path returned by the runner. Any
// error other than context.Canceled is appended on a separate "ERROR:" line.
// The runner's error (including context.Canceled) is returned unchanged.
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
	resultsPath, runErr := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)

	result := ActionResult{
		Title: "NCCL bandwidth test",
		Body:  "Results: " + resultsPath,
	}

	switch runErr {
	case nil, context.Canceled:
		// Success or cancellation: no error text is added to the body.
	default:
		result.Body += "\nERROR: " + runErr.Error()
	}
	return result, runErr
}
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
path, err := a.RunFanStressTest(ctx, "", opts)
body := formatFanStressResult(path)

View File

@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
return "", nil
}
// RunNCCLTests is a test stub: it ignores both arguments and returns an empty
// path with a nil error.
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
return "", nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -121,6 +121,24 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
return gpus, nil
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
//
// The GPU count handed to all_reduce_perf via -g is detected by counting the
// non-blank lines nvidia-smi prints for --query-gpu=index. Detection is
// best-effort: if nvidia-smi fails or prints nothing, a single GPU is assumed.
//
// The return values are those of runAcceptancePackCtx for the "nccl-tests"
// pack, which runs "nvidia-smi -q" followed by all_reduce_perf.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
	// Bind the probe to ctx so that cancelling the test also stops nvidia-smi.
	// The error is intentionally ignored: a failed probe leaves out empty and
	// the count falls back to 1 below.
	out, _ := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()

	// Count only lines with content; splitting an empty string still yields
	// one (empty) element, so len(strings.Split(...)) cannot detect "no GPUs".
	gpuCount := 0
	for _, line := range strings.Split(string(out), "\n") {
		if strings.TrimSpace(line) != "" {
			gpuCount++
		}
	}
	if gpuCount < 1 {
		gpuCount = 1
	}

	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-all-reduce-perf.log", cmd: []string{
			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
			"-g", strconv.Itoa(gpuCount), "--iters", "20",
		}},
	})
}
// RunNvidiaAcceptancePack passes baseDir, the pack name "gpu-nvidia" and the
// job list from nvidiaSATJobs to runAcceptancePack, and returns its result
// unchanged.
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
	jobs := nvidiaSATJobs()
	return runAcceptancePack(baseDir, "gpu-nvidia", jobs)
}

View File

@@ -1,6 +1,7 @@
package tui
import (
"context"
"time"
"bee/audit/internal/platform"
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
)
case actionRunFanStress:
return m.startGPUStressTest()
case actionRunNCCLTests:
m.busy = true
m.busyTitle = "NCCL bandwidth test"
ctx, cancel := context.WithCancel(context.Background())
m.ncclCancel = cancel
return m, func() tea.Msg {
result, err := m.app.RunNCCLTestsResult(ctx)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
}
}
case "ctrl+c":
return m, tea.Quit
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck
case actionRunFanStress:
case actionRunFanStress, actionRunNCCLTests:
return screenBurnInTests
default:
return screenMain

View File

@@ -8,12 +8,13 @@ import (
)
const (
burnCurGPUStress = 0
burnCurModeQuick = 1
burnCurModeStd = 2
burnCurModeExpr = 3
burnCurRun = 4
burnCurTotal = 5
burnCurGPUStress = 0
burnCurModeQuick = 1
burnCurModeStd = 2
burnCurModeExpr = 3
burnCurRun = 4
burnCurNCCLTests = 5
burnCurTotal = 6
)
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.burnRunSelected()
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
m.burnMode = m.burnCursor - burnCurModeQuick
case burnCurNCCLTests:
return m.burnRunNCCL()
}
case "f", "F", "r", "R":
return m.burnRunSelected()
case "n", "N":
return m.burnRunNCCL()
case "1":
m.burnMode = 0
case "2":
@@ -70,6 +75,13 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
return m.hcRunFanStress()
}
// burnRunNCCL returns a copy of the model switched to the confirmation screen
// with the NCCL test recorded as the pending action and the cursor reset to 0.
// No command is issued.
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
	m.cursor = 0
	m.screen = screenConfirm
	m.pendingAction = actionRunNCCLTests
	return m, nil
}
func renderBurnInTests(m model) string {
var b strings.Builder
@@ -110,8 +122,15 @@ func renderBurnInTests(m model) string {
}
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
fmt.Fprintln(&b)
pfx = " "
if m.burnCursor == burnCurNCCLTests {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
return b.String()
}

View File

@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
m.hcInitialized = true
m.hcSel = [4]bool{true, true, true, true}
next, cmd := m.hcRunSingle(hcGPU)
next, _ := m.hcRunSingle(hcGPU)
got := next.(model)
if cmd == nil {
t.Fatal("expected non-nil cmd (GPU list loader)")
}
if got.screen != screenNvidiaSATSetup {
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
}

View File

@@ -44,6 +44,7 @@ const (
actionRunCPUSAT actionKind = "run_cpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress"
actionRunNCCLTests actionKind = "run_nccl_tests"
)
type model struct {
@@ -98,6 +99,9 @@ type model struct {
nvidiaSATCancel func()
nvidiaSATAborted bool
// NCCL tests running
ncclCancel func()
// GPU Platform Stress Test running
gpuStressCancel func()
gpuStressAborted bool
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
case actionRunAMDGPUSAT:
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
case actionRunNCCLTests:
return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
case actionRunFanStress:
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +