feat(nccl): add nccl-tests all_reduce_perf for GPU bandwidth testing

- Dockerfile: install cuda-nvcc-13-0 from NVIDIA repo for compilation
- build-nccl-tests.sh: downloads libnccl-dev for nccl.h, builds all_reduce_perf
- build.sh: runs nccl-tests build, injects binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects GPU count, runs all_reduce_perf
- TUI: add an NCCL bandwidth test entry to the Burn-in Tests screen ([N] hotkey)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:22:19 +03:00
parent eea98e6d76
commit 5644231f9a
11 changed files with 221 additions and 13 deletions

View File

@@ -81,6 +81,7 @@ type satRunner interface {
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
RunAMDAcceptancePack(baseDir string) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunNCCLTests(ctx context.Context, baseDir string) (string, error)
}
type runtimeChecker interface {
@@ -498,6 +499,15 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
return a.sat.RunFanStressTest(ctx, baseDir, opts)
}
// RunNCCLTestsResult invokes RunNCCLTests on the SAT runner with
// DefaultSATBaseDir and packages the outcome as an ActionResult titled
// "NCCL bandwidth test".
//
// The body always begins with "Results: " followed by the string the runner
// returned. Any error other than context.Canceled is appended on a separate
// "ERROR: " line. The runner's error is returned to the caller unchanged in
// every case, including cancellation.
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
	resultPath, runErr := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir)

	result := ActionResult{
		Title: "NCCL bandwidth test",
		Body:  "Results: " + resultPath,
	}

	switch runErr {
	case nil, context.Canceled:
		// Success or cancellation: no error text is added to the body.
		// NOTE(review): this is a direct comparison, so an error that merely
		// wraps context.Canceled falls through to the default branch.
	default:
		result.Body += "\nERROR: " + runErr.Error()
	}

	return result, runErr
}
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
path, err := a.RunFanStressTest(ctx, "", opts)
body := formatFanStressResult(path)

View File

@@ -174,6 +174,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
return "", nil
}
// RunNCCLTests is the fakeSAT stub for the NCCL run. It ignores both
// arguments and always reports an empty result string with a nil error.
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string) (string, error) {
	var noPath string
	return noPath, nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()