feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: NCCL bandwidth test entry in Burn-in Tests screen [N] hotkey

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -268,12 +268,9 @@ func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
m.hcInitialized = true
m.hcSel = [4]bool{true, true, true, true}
next, cmd := m.hcRunSingle(hcGPU)
next, _ := m.hcRunSingle(hcGPU)
got := next.(model)
if cmd == nil {
t.Fatal("expected non-nil cmd (GPU list loader)")
}
if got.screen != screenNvidiaSATSetup {
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
}