diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go
index a66979f..817907c 100644
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
- RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+ RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
}
type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
}
+// RunNCCLTests forwards an NCCL test run for gpuIndices to the SAT runner.
+// A baseDir that is empty or only whitespace is replaced by DefaultSATBaseDir.
+// The runner's returned string and error are passed back unchanged.
+func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if strings.TrimSpace(dir) == "" {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunNCCLTests(ctx, dir, gpuIndices, logFunc)
+}
+
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
- path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
+ path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()
diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go
index 57c3385..2000e8b 100644
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
+ runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil
}
-func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
+// RunNCCLTests is the test double for the SAT runner's NCCL entry point.
+// When runNCCLFn is set it receives baseDir and gpuIndices and its result is
+// returned; otherwise the call returns "" and a nil error. The context and
+// log callback are ignored.
+func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
+ if f.runNCCLFn != nil {
+ return f.runNCCLFn(baseDir, gpuIndices)
+ }
return "", nil
}
+// TestRunNCCLTestsPassesSelectedGPUs checks that App.RunNCCLTests hands the
+// caller's base directory and GPU selection to the SAT runner unchanged and
+// returns the path the runner produced.
+func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
+	t.Parallel()
+
+	const (
+		wantPath    = "/tmp/nccl-tests.tar.gz"
+		wantBaseDir = "/tmp/sat"
+	)
+
+	var seenDir string
+	var seenGPUs []int
+	record := func(baseDir string, gpuIndices []int) (string, error) {
+		seenDir = baseDir
+		// Copy the indices so the assertions do not alias the caller's slice.
+		seenGPUs = append([]int(nil), gpuIndices...)
+		return wantPath, nil
+	}
+	a := &App{sat: fakeSAT{runNCCLFn: record}}
+
+	path, err := a.RunNCCLTests(context.Background(), wantBaseDir, []int{3, 1}, nil)
+	if err != nil {
+		t.Fatalf("RunNCCLTests error: %v", err)
+	}
+	if path != wantPath {
+		t.Fatalf("path=%q want %q", path, wantPath)
+	}
+	if seenDir != wantBaseDir {
+		t.Fatalf("baseDir=%q want %q", seenDir, wantBaseDir)
+	}
+	// Order matters: the selection must arrive as [3 1], not sorted.
+	if len(seenGPUs) != 2 || seenGPUs[0] != 3 || seenGPUs[1] != 1 {
+		t.Fatalf("gpuIndices=%v want [3 1]", seenGPUs)
+	}
+}
+
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index ac8bcc8..a47e474 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err
}
-// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
+// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
-func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
- // detect GPU count
- out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
- gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
+func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
+ selected, err := resolveDCGMGPUIndices(gpuIndices)
+ if err != nil {
+ return "", err
+ }
+ gpuCount := len(selected)
if gpuCount < 1 {
gpuCount = 1
}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
- }},
+ }, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc)
}
diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go
index 71ae50b..06ad947 100644
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
}
}
+// TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth checks the argv
+// built for the "nvbandwidth" diag with a zero duration and GPUs 2 and 0: it
+// must be exactly `dcgmi diag -r nvbandwidth -i 2,0`, with no duration argument.
+func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
+	expected := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
+	got := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
+	// Compare lengths first so the element loop below cannot index out of range.
+	if gotLen, wantLen := len(got), len(expected); gotLen != wantLen {
+		t.Fatalf("cmd len=%d want %d (%v)", gotLen, wantLen, got)
+	}
+	for idx, arg := range expected {
+		if got[idx] != arg {
+			t.Fatalf("cmd[%d]=%q want %q", idx, got[idx], arg)
+		}
+	}
+}
+
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 {
diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go
index 1b6ab2b..67942c1 100644
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`all_reduce_perf (NCCL tests)`,
- `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).
Only runs in Stress mode. Switch mode above to enable in Run All.
`, + `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`, )) + `` + `nvbandwidth`,
- `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.Only runs in Stress mode. Switch mode above to enable in Run All.
`, + `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`, )) + `