feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing
- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen with [N] hotkey

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
)
|
||||
case actionRunFanStress:
|
||||
return m.startGPUStressTest()
|
||||
case actionRunNCCLTests:
|
||||
m.busy = true
|
||||
m.busyTitle = "NCCL bandwidth test"
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.ncclCancel = cancel
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunNCCLTestsResult(ctx)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
|
||||
}
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
case actionRunFanStress:
|
||||
case actionRunFanStress, actionRunNCCLTests:
|
||||
return screenBurnInTests
|
||||
default:
|
||||
return screenMain
|
||||
|
||||
@@ -8,12 +8,13 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
burnCurGPUStress = 0
|
||||
burnCurModeQuick = 1
|
||||
burnCurModeStd = 2
|
||||
burnCurModeExpr = 3
|
||||
burnCurRun = 4
|
||||
burnCurTotal = 5
|
||||
burnCurGPUStress = 0
|
||||
burnCurModeQuick = 1
|
||||
burnCurModeStd = 2
|
||||
burnCurModeExpr = 3
|
||||
burnCurRun = 4
|
||||
burnCurNCCLTests = 5
|
||||
burnCurTotal = 6
|
||||
)
|
||||
|
||||
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
||||
@@ -48,9 +49,13 @@ func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.burnRunSelected()
|
||||
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
||||
m.burnMode = m.burnCursor - burnCurModeQuick
|
||||
case burnCurNCCLTests:
|
||||
return m.burnRunNCCL()
|
||||
}
|
||||
case "f", "F", "r", "R":
|
||||
return m.burnRunSelected()
|
||||
case "n", "N":
|
||||
return m.burnRunNCCL()
|
||||
case "1":
|
||||
m.burnMode = 0
|
||||
case "2":
|
||||
@@ -70,6 +75,13 @@ func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
|
||||
// burnRunNCCL marks the NCCL bandwidth test as the pending action and
// switches the model to the confirmation screen with the cursor reset to 0.
// It returns the updated model and a nil command; the run itself is issued
// by the actionRunNCCLTests case in updateConfirm.
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
|
||||
m.pendingAction = actionRunNCCLTests
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderBurnInTests(m model) string {
|
||||
var b strings.Builder
|
||||
|
||||
@@ -110,8 +122,15 @@ func renderBurnInTests(m model) string {
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
pfx = " "
|
||||
if m.burnCursor == burnCurNCCLTests {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [Esc] back")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
|
||||
m.hcInitialized = true
|
||||
m.hcSel = [4]bool{true, true, true, true}
|
||||
|
||||
next, cmd := m.hcRunSingle(hcGPU)
|
||||
next, _ := m.hcRunSingle(hcGPU)
|
||||
got := next.(model)
|
||||
|
||||
if cmd == nil {
|
||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
||||
}
|
||||
if got.screen != screenNvidiaSATSetup {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ const (
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
actionRunNCCLTests actionKind = "run_nccl_tests"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
@@ -98,6 +99,9 @@ type model struct {
|
||||
nvidiaSATCancel func()
|
||||
nvidiaSATAborted bool
|
||||
|
||||
// NCCL tests running
|
||||
ncclCancel func()
|
||||
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
@@ -202,6 +206,8 @@ func (m model) confirmBody() (string, string) {
|
||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||
case actionRunAMDGPUSAT:
|
||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||
case actionRunNCCLTests:
|
||||
return "NCCL bandwidth test", "Run all_reduce_perf across all GPUs?\n\nMeasures collective bandwidth over NVLink/PCIe.\nRequires 2+ GPUs for meaningful results."
|
||||
case actionRunFanStress:
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "GPU Platform Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
|
||||
Reference in New Issue
Block a user