diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index a66979f..817907c 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -146,7 +146,7 @@ type satRunner interface { RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) - RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) + RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) } type runtimeChecker interface { @@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) } +func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc) +} + func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { - path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) + path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil) body := "Results: " + path if err != nil && err != context.Canceled { body += "\nERROR: " + err.Error() diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 57c3385..2000e8b 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -128,6 +128,7 @@ type fakeSAT struct { runNvidiaPowerFn func(string, int, []int) (string, error) runNvidiaPulseFn func(string, int, []int) (string, error) runNvidiaBandwidthFn func(string, []int) (string, error) + runNCCLFn func(string, []int) (string, error) runNvidiaTargetedStressFn func(string, int, []int) (string, error) runMemoryFn func(string) (string, error) runStorageFn func(string) (string, error) @@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf return "", nil } -func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { +func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) { + if f.runNCCLFn != nil { + return f.runNCCLFn(baseDir, gpuIndices) + } return "", nil } +func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) { + t.Parallel() + + var gotBaseDir string + var gotGPUIndices []int + a := &App{ + sat: fakeSAT{ + runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) { + gotBaseDir = baseDir + gotGPUIndices = append([]int(nil), gpuIndices...) + return "/tmp/nccl-tests.tar.gz", nil + }, + }, + } + + path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil) + if err != nil { + t.Fatalf("RunNCCLTests error: %v", err) + } + if path != "/tmp/nccl-tests.tar.gz" { + t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz") + } + if gotBaseDir != "/tmp/sat" { + t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat") + } + if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 { + t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices) + } +} + func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { t.Parallel() diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index ac8bcc8..a47e474 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) { return string(raw), err } -// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs. +// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs. // Measures collective communication bandwidth over NVLink/PCIe. -func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - // detect GPU count - out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output() - gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n")) +func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { + selected, err := resolveDCGMGPUIndices(gpuIndices) + if err != nil { + return "", err + } + gpuCount := len(selected) if gpuCount < 1 { gpuCount = 1 } @@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func( satJob{name: "02-all-reduce-perf.log", cmd: []string{ "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(gpuCount), "--iters", "20", - }}, + }, env: nvidiaVisibleDevicesEnv(selected)}, ), logFunc) } diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go index 71ae50b..06ad947 100644 --- a/audit/internal/platform/sat_test.go +++ b/audit/internal/platform/sat_test.go @@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) { } } +func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) { + cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0}) + want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"} + if len(cmd) != len(want) { + t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd) + } + for i := range want { + if cmd[i] != want[i] { + t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i]) + } + } +} + func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) { env := nvidiaVisibleDevicesEnv([]int{0, 2, 4}) if len(env) != 2 { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 1b6ab2b..67942c1 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, `all_reduce_perf (NCCL tests)`, - `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + `Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`, )) + `` + `
` + @@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, `nvbandwidth`, - `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + `Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`, )) + `
` + ` @@ -1527,8 +1527,6 @@ function satModeChanged() { {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'}, {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'}, {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'}, - {card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'}, - {card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'}, ].forEach(function(item) { const card = document.getElementById(item.card); if (card) { @@ -1776,7 +1774,7 @@ function runAllSAT() { const cycles = 1; const status = document.getElementById('sat-all-status'); status.textContent = 'Enqueuing...'; - const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; + const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse']; const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const activeTargets = baseTargets.filter(target => { if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; @@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string { -`+`
`+renderBenchmarkResultsCard(opts.ExportDir)+`
`+` +` + `
` + renderBenchmarkResultsCard(opts.ExportDir) + `
` + `