feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: installs cuda-nvcc-13-0 from the NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev (for nccl.h) and builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: adds an NCCL bandwidth test entry to the Burn-in Tests screen ([N] hotkey)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -1,6 +1,7 @@
package tui
import (
"context"
"time"
"bee/audit/internal/platform"
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
)
case actionRunFanStress:
return m.startGPUStressTest()
case actionRunNCCLTests:
m.busy = true
m.busyTitle = "NCCL bandwidth test"
ctx, cancel := context.WithCancel(context.Background())
m.ncclCancel = cancel
return m, func() tea.Msg {
result, err := m.app.RunNCCLTestsResult(ctx)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
}
}
case "ctrl+c":
return m, tea.Quit
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
return screenHealthCheck
case actionRunFanStress:
case actionRunFanStress, actionRunNCCLTests:
return screenBurnInTests
default:
return screenMain