feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing
- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen, [N] hotkey

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -140,6 +141,15 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
)
|
||||
case actionRunFanStress:
|
||||
return m.startGPUStressTest()
|
||||
case actionRunNCCLTests:
|
||||
m.busy = true
|
||||
m.busyTitle = "NCCL bandwidth test"
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.ncclCancel = cancel
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunNCCLTestsResult(ctx)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenBurnInTests}
|
||||
}
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -153,7 +163,7 @@ func (m model) confirmCancelTarget() screen {
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
case actionRunFanStress:
|
||||
case actionRunFanStress, actionRunNCCLTests:
|
||||
return screenBurnInTests
|
||||
default:
|
||||
return screenMain
|
||||
|
||||
Reference in New Issue
Block a user