Move NCCL and NVBandwidth into validate mode

This commit is contained in:
Mikhail Chusavitin
2026-04-16 11:02:30 +03:00
parent f74976ec4c
commit b4280941f5
7 changed files with 91 additions and 25 deletions

View File

@@ -146,7 +146,7 @@ type satRunner interface {
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
}
type runtimeChecker interface {
@@ -744,8 +744,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()

View File

@@ -128,6 +128,7 @@ type fakeSAT struct {
runNvidiaPowerFn func(string, int, []int) (string, error)
runNvidiaPulseFn func(string, int, []int) (string, error)
runNvidiaBandwidthFn func(string, []int) (string, error)
runNCCLFn func(string, []int) (string, error)
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
@@ -287,10 +288,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
return "", nil
}
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
if f.runNCCLFn != nil {
return f.runNCCLFn(baseDir, gpuIndices)
}
return "", nil
}
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
t.Parallel()
var gotBaseDir string
var gotGPUIndices []int
a := &App{
sat: fakeSAT{
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
gotBaseDir = baseDir
gotGPUIndices = append([]int(nil), gpuIndices...)
return "/tmp/nccl-tests.tar.gz", nil
},
},
}
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
if err != nil {
t.Fatalf("RunNCCLTests error: %v", err)
}
if path != "/tmp/nccl-tests.tar.gz" {
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
}
if gotBaseDir != "/tmp/sat" {
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
}
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
}
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -366,12 +366,14 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
return string(raw), err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
// detect GPU count
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil {
return "", err
}
gpuCount := len(selected)
if gpuCount < 1 {
gpuCount = 1
}
@@ -380,7 +382,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
}, env: nvidiaVisibleDevicesEnv(selected)},
), logFunc)
}

View File

@@ -321,6 +321,19 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
}
}
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
if len(cmd) != len(want) {
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
}
for i := range want {
if cmd[i] != want[i] {
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
}
}
}
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
if len(env) != 2 {

View File

@@ -1481,7 +1481,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress. Uses all selected GPUs simultaneously (requires ≥2) and is kept short so it fits the Validate flow.`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
@@ -1489,7 +1489,7 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
`Runs in Validate and Stress across all selected GPUs simultaneously. Intended to stay short enough for Validate.`,
)) +
`</div>` +
`</div>
@@ -1527,8 +1527,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
@@ -1776,7 +1774,7 @@ function runAllSAT() {
const cycles = 1;
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
@@ -2082,7 +2080,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2517,7 +2515,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px">

View File

@@ -744,6 +744,26 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
}
}
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
if rec.Code != http.StatusOK {
t.Fatalf("status=%d", rec.Code)
}
body := rec.Body.String()
for _, needle := range []string{
`NVIDIA Interconnect (NCCL)`,
`Runs in Validate and Stress.`,
`NVIDIA Bandwidth (NVBandwidth)`,
`Intended to stay short enough for Validate.`,
} {
if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body)
}
}
}
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder()

View File

@@ -736,15 +736,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur,
Loader: platform.NvidiaStressLoaderNCCL,
GPUIndices: t.params.GPUIndices,
}, j.append)
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
case "nvidia-stress":
if a == nil {
err = fmt.Errorf("app not configured")