feat: v3.10 GPU stress and NCCL burn updates

2026-03-31 11:22:27 +03:00
parent 6dee8f3509
commit 06b4afd263
12 changed files with 423 additions and 88 deletions
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -61,6 +61,20 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
 			collectGPU: true,
 			gpuIndices: selected,
 		}, nil
+	case NvidiaStressLoaderNCCL:
+		cmd := []string{
+			"bee-nccl-gpu-stress",
+			"--seconds", strconv.Itoa(opts.DurationSec),
+		}
+		if len(selected) > 0 {
+			cmd = append(cmd, "--devices", joinIndexList(selected))
+		}
+		return satJob{
+			name:       "03-bee-nccl-gpu-stress.log",
+			cmd:        cmd,
+			collectGPU: true,
+			gpuIndices: selected,
+		}, nil
 	default:
 		return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
 	}
@@ -78,6 +92,8 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
 		opts.Loader = NvidiaStressLoaderBuiltin
 	case NvidiaStressLoaderJohn:
 		opts.Loader = NvidiaStressLoaderJohn
+	case NvidiaStressLoaderNCCL:
+		opts.Loader = NvidiaStressLoaderNCCL
 	default:
 		opts.Loader = NvidiaStressLoaderBuiltin
 	}
--- a/audit/internal/platform/runtime.go
+++ b/audit/internal/platform/runtime.go
@@ -138,6 +138,8 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
 			"nvidia-bug-report.sh",
 			"bee-gpu-burn",
 			"bee-john-gpu-stress",
+			"bee-nccl-gpu-stress",
+			"all_reduce_perf",
 		})...)
 	case "amd":
 		tool := ToolStatus{Name: "rocm-smi"}
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -128,6 +128,40 @@ func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
 	}
 }

+func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
+	t.Parallel()
+
+	oldExecCommand := satExecCommand
+	satExecCommand = func(name string, args ...string) *exec.Cmd {
+		if name == "nvidia-smi" {
+			return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
+		}
+		return exec.Command(name, args...)
+	}
+	t.Cleanup(func() { satExecCommand = oldExecCommand })
+
+	job, err := buildNvidiaStressJob(NvidiaStressOptions{
+		DurationSec: 120,
+		Loader:      NvidiaStressLoaderNCCL,
+		GPUIndices:  []int{2, 0},
+	})
+	if err != nil {
+		t.Fatalf("buildNvidiaStressJob error: %v", err)
+	}
+	wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
+	if len(job.cmd) != len(wantCmd) {
+		t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
+	}
+	for i := range wantCmd {
+		if job.cmd[i] != wantCmd[i] {
+			t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
+		}
+	}
+	if got := joinIndexList(job.gpuIndices); got != "0,2" {
+		t.Fatalf("gpuIndices=%q want 0,2", got)
+	}
+}
+
 func TestEnvIntFallback(t *testing.T) {
 	os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
 	if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -54,6 +54,7 @@ type ToolStatus struct {
 const (
 	NvidiaStressLoaderBuiltin = "builtin"
 	NvidiaStressLoaderJohn    = "john"
+	NvidiaStressLoaderNCCL    = "nccl"
 )

 type NvidiaStressOptions struct {
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -669,9 +669,9 @@ func renderBurn() string {
 </div></div>
 <div class="grid3">
 <div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
-<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
+<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="nccl">NCCL all_reduce_perf</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
 <div class="form-row"><label>Exclude GPU indices</label><input type="text" id="nvidia-stress-exclude" placeholder="e.g. 1,3"></div>
-<p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. Use exclusions only when one or more cards must be skipped.</p>
+<p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. <code>NCCL all_reduce_perf</code> is useful for multi-GPU / interconnect load. Use exclusions only when one or more cards must be skipped.</p>
 <button id="sat-btn-nvidia-stress" class="btn btn-primary" onclick="runBurnIn('nvidia-stress')">&#9654; Start NVIDIA Stress</button>
 </div></div>
 <div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
--- a/2
+++ b/2
--- a/internal/chart
+++ b/internal/chart
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -32,6 +32,10 @@ typedef void *CUstream;
 #define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
+#define MAX_STRESS_STREAMS 16
+#define MAX_CUBLAS_PROFILES 5
+#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
+#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)

 static const char *ptx_source =
    ".version 6.0\n"
@@ -211,14 +215,12 @@ static double now_seconds(void) {
    return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
 }

-#if HAVE_CUBLASLT_HEADERS
 static size_t round_down_size(size_t value, size_t multiple) {
    if (multiple == 0 || value < multiple) {
        return value;
    }
    return value - (value % multiple);
 }
-#endif

 static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
    int cc_major = 0;
@@ -271,6 +273,42 @@ static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested
    return requested_bytes;
 }

+static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) {
+    int stream_count = 1;
+    if (!have_streams || mp_count <= 0 || planned_profiles <= 0) {
+        return 1;
+    }
+
+    stream_count = mp_count / 8;
+    if (stream_count < 2) {
+        stream_count = 2;
+    }
+    if (stream_count > MAX_STRESS_STREAMS) {
+        stream_count = MAX_STRESS_STREAMS;
+    }
+
+    while (stream_count > 1) {
+        size_t per_stream_budget = total_budget / ((size_t)planned_profiles * (size_t)stream_count);
+        if (per_stream_budget >= MIN_STREAM_BUDGET_BYTES) {
+            break;
+        }
+        stream_count--;
+    }
+    return stream_count;
+}
+
+static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
+    if (!api->cuStreamDestroy) {
+        return;
+    }
+    for (int i = 0; i < count; i++) {
+        if (streams[i]) {
+            api->cuStreamDestroy(streams[i]);
+            streams[i] = NULL;
+        }
+    }
+}
+
 #if HAVE_CUBLASLT_HEADERS
 static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
    size_t len = strlen(buf);
@@ -293,12 +331,19 @@ static int run_ptx_fallback(struct cuda_api *api,
                            int size_mb,
                            struct stress_report *report) {
    CUcontext ctx = NULL;
-    CUdeviceptr device_mem = 0;
    CUmodule module = NULL;
    CUfunction kernel = NULL;
    uint32_t sample[256];
-    uint32_t words = 0;
+    CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0};
+    CUstream streams[MAX_STRESS_STREAMS] = {0};
+    uint32_t words[MAX_STRESS_STREAMS] = {0};
+    uint32_t rounds[MAX_STRESS_STREAMS] = {0};
+    void *params[MAX_STRESS_STREAMS][3];
+    size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0};
    unsigned long iterations = 0;
+    int mp_count = 0;
+    int stream_count = 1;
+    int launches_per_wave = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -311,64 +356,102 @@ static int run_ptx_fallback(struct cuda_api *api,
        return 0;
    }

-    size_t bytes = (size_t)size_mb * 1024u * 1024u;
-    if (bytes < 4u * 1024u * 1024u) {
-        bytes = 4u * 1024u * 1024u;
+    size_t requested_bytes = (size_t)size_mb * 1024u * 1024u;
+    if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) {
+        requested_bytes = MIN_PROFILE_BUDGET_BYTES;
    }
-    if (bytes > (size_t)1024u * 1024u * 1024u) {
-        bytes = (size_t)1024u * 1024u * 1024u;
+    size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes);
+    if (total_bytes < MIN_PROFILE_BUDGET_BYTES) {
+        total_bytes = MIN_PROFILE_BUDGET_BYTES;
    }
-    words = (uint32_t)(bytes / sizeof(uint32_t));
+    report->buffer_mb = (int)(total_bytes / (1024u * 1024u));

-    if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (query_multiprocessor_count(api, dev, &mp_count) &&
+        api->cuStreamCreate &&
+        api->cuStreamDestroy) {
+        stream_count = choose_stream_count(mp_count, 1, total_bytes, 1);
    }
-    if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (stream_count > 1) {
+        int created = 0;
+        for (; created < stream_count; created++) {
+            if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) {
+                destroy_streams(api, streams, created);
+                stream_count = 1;
+                break;
+            }
+        }
    }
+    report->stream_count = stream_count;
+
+    for (int lane = 0; lane < stream_count; lane++) {
+        size_t slice = total_bytes / (size_t)stream_count;
+        if (lane == stream_count - 1) {
+            slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count));
+        }
+        slice = round_down_size(slice, sizeof(uint32_t));
+        if (slice < MIN_PROFILE_BUDGET_BYTES) {
+            slice = MIN_PROFILE_BUDGET_BYTES;
+        }
+        bytes_per_stream[lane] = slice;
+        words[lane] = (uint32_t)(slice / sizeof(uint32_t));
+
+        if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) {
+            goto fail;
+        }
+        if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) {
+            goto fail;
+        }
+        rounds[lane] = 2048;
+        params[lane][0] = &device_mem[lane];
+        params[lane][1] = &words[lane];
+        params[lane][2] = &rounds[lane];
+    }
+
    if (!check_rc(api,
                  "cuModuleLoadDataEx",
                  api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+        goto fail;
    }
    if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+        goto fail;
    }

    unsigned int threads = 256;
-    unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
-    uint32_t rounds = 1024;
-    void *params[] = {&device_mem, &words, &rounds};

    double start = now_seconds();
    double deadline = start + (double)seconds;
    while (now_seconds() < deadline) {
-        if (!check_rc(api,
-                      "cuLaunchKernel",
-                      api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
-            api->cuMemFree(device_mem);
-            api->cuCtxDestroy(ctx);
-            return 0;
+        launches_per_wave = 0;
+        for (int lane = 0; lane < stream_count; lane++) {
+            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+            if (!check_rc(api,
+                          "cuLaunchKernel",
+                          api->cuLaunchKernel(kernel,
+                                              blocks,
+                                              1,
+                                              1,
+                                              threads,
+                                              1,
+                                              1,
+                                              0,
+                                              streams[lane],
+                                              params[lane],
+                                              NULL))) {
+                goto fail;
+            }
+            launches_per_wave++;
        }
-        iterations++;
+        if (launches_per_wave <= 0) {
+            goto fail;
+        }
+        if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
+            goto fail;
+        }
+        iterations += (unsigned long)launches_per_wave;
    }

-    if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
-    }
-    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
-        api->cuMemFree(device_mem);
-        api->cuCtxDestroy(ctx);
-        return 0;
+    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
+        goto fail;
    }

    for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
@@ -377,12 +460,33 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "profile_int32_fallback=OK iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
+             size_mb,
+             report->buffer_mb,
+             report->stream_count,
+             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

-    api->cuMemFree(device_mem);
+    for (int lane = 0; lane < stream_count; lane++) {
+        if (device_mem[lane]) {
+            api->cuMemFree(device_mem[lane]);
+        }
+    }
+    destroy_streams(api, streams, stream_count);
    api->cuCtxDestroy(ctx);
    return 1;
+
+fail:
+    for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) {
+        if (device_mem[lane]) {
+            api->cuMemFree(device_mem[lane]);
+        }
+    }
+    destroy_streams(api, streams, MAX_STRESS_STREAMS);
+    if (ctx) {
+        api->cuCtxDestroy(ctx);
+    }
+    return 0;
 }

 #if HAVE_CUBLASLT_HEADERS
@@ -469,6 +573,7 @@ struct profile_desc {

 struct prepared_profile {
    struct profile_desc desc;
+    CUstream stream;
    cublasLtMatmulDesc_t op_desc;
    cublasLtMatrixLayout_t a_layout;
    cublasLtMatrixLayout_t b_layout;
@@ -668,8 +773,8 @@ static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, in
    if (dim < (uint64_t)multiple) {
        dim = (uint64_t)multiple;
    }
-    if (dim > 8192u) {
-        dim = 8192u;
+    if (dim > 65536u) {
+        dim = 65536u;
    }
    return dim;
 }
@@ -755,10 +860,12 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           cublasLtHandle_t handle,
                           struct cuda_api *cuda,
                           const struct profile_desc *desc,
+                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
    memset(out, 0, sizeof(*out));
    out->desc = *desc;
+    out->stream = stream;

    size_t bytes_per_cell = 0;
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
@@ -986,7 +1093,7 @@ static int run_cublas_profile(cublasLtHandle_t handle,
                                               &profile->heuristic.algo,
                                               (void *)(uintptr_t)profile->workspace_dev,
                                               profile->workspace_size,
-                                               (cudaStream_t)0));
+                                               profile->stream));
 }

 static int run_cublaslt_stress(struct cuda_api *cuda,
@@ -998,13 +1105,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                               int size_mb,
                               struct stress_report *report) {
    struct cublaslt_api cublas;
-    struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
+    struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
    cublasLtHandle_t handle = NULL;
    CUcontext ctx = NULL;
+    CUstream streams[MAX_STRESS_STREAMS] = {0};
    uint16_t sample[256];
    int cc = cc_major * 10 + cc_minor;
    int planned = 0;
    int active = 0;
+    int mp_count = 0;
+    int stream_count = 1;
+    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
+    int prepared_count = 0;
+    int wave_launches = 0;
+    size_t requested_budget = 0;
+    size_t total_budget = 0;
+    size_t per_profile_budget = 0;

    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1037,16 +1153,45 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

-    size_t total_budget = (size_t)size_mb * 1024u * 1024u;
-    if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
-        total_budget = (size_t)planned * 4u * 1024u * 1024u;
+    requested_budget = (size_t)size_mb * 1024u * 1024u;
+    if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
    }
-    size_t per_profile_budget = total_budget / (size_t)planned;
-    if (per_profile_budget < 4u * 1024u * 1024u) {
-        per_profile_budget = 4u * 1024u * 1024u;
+    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
+    if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
    }
+    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
+        cuda->cuStreamCreate &&
+        cuda->cuStreamDestroy) {
+        stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
+    }
+    if (stream_count > 1) {
+        int created = 0;
+        for (; created < stream_count; created++) {
+            if (!check_rc(cuda, "cuStreamCreate", cuda->cuStreamCreate(&streams[created], 0))) {
+                destroy_streams(cuda, streams, created);
+                stream_count = 1;
+                break;
+            }
+        }
+    }
+    report->stream_count = stream_count;
+    per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
+    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
+        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
+    }
+    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
+    append_detail(report->details,
+                  sizeof(report->details),
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  size_mb,
+                  report->buffer_mb,
+                  report->stream_count,
+                  mp_count,
+                  per_profile_budget / (1024u * 1024u));

-    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+    for (int i = 0; i < profile_count; i++) {
        const struct profile_desc *desc = &k_profiles[i];
        if (!(desc->enabled && cc >= desc->min_cc)) {
            append_detail(report->details,
@@ -1056,30 +1201,45 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
-        if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
-            active++;
-            append_detail(report->details,
-                          sizeof(report->details),
-                          "%s=READY dim=%llux%llux%llu block=%s\n",
-                          desc->name,
-                          (unsigned long long)prepared[i].m,
-                          (unsigned long long)prepared[i].n,
-                          (unsigned long long)prepared[i].k,
-                          desc->block_label);
-        } else {
-            append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
+        for (int lane = 0; lane < stream_count; lane++) {
+            CUstream stream = streams[lane];
+            if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
+                break;
+            }
+            if (prepare_profile(&cublas, handle, cuda, desc, stream, per_profile_budget, &prepared[prepared_count])) {
+                active++;
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
+                              desc->name,
+                              lane,
+                              (unsigned long long)prepared[prepared_count].m,
+                              (unsigned long long)prepared[prepared_count].n,
+                              (unsigned long long)prepared[prepared_count].k,
+                              desc->block_label,
+                              lane);
+                prepared_count++;
+            } else {
+                append_detail(report->details,
+                              sizeof(report->details),
+                              "%s[%d]=SKIPPED unsupported\n",
+                              desc->name,
+                              lane);
+            }
        }
    }

    if (active <= 0) {
        cublas.cublasLtDestroy(handle);
+        destroy_streams(cuda, streams, stream_count);
        cuda->cuCtxDestroy(ctx);
        return 0;
    }

    double deadline = now_seconds() + (double)seconds;
    while (now_seconds() < deadline) {
-        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+        wave_launches = 0;
+        for (int i = 0; i < prepared_count; i++) {
            if (!prepared[i].ready) {
                continue;
            }
@@ -1088,31 +1248,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                              sizeof(report->details),
                              "%s=FAILED runtime\n",
                              prepared[i].desc.name);
-                for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
+                for (int j = 0; j < prepared_count; j++) {
                    destroy_profile(&cublas, cuda, &prepared[j]);
                }
                cublas.cublasLtDestroy(handle);
+                destroy_streams(cuda, streams, stream_count);
                cuda->cuCtxDestroy(ctx);
                return 0;
            }
            prepared[i].iterations++;
            report->iterations++;
-            if (now_seconds() >= deadline) {
-                break;
+            wave_launches++;
+        }
+        if (wave_launches <= 0) {
+            break;
+        }
+        if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
+            for (int i = 0; i < prepared_count; i++) {
+                destroy_profile(&cublas, cuda, &prepared[i]);
            }
+            cublas.cublasLtDestroy(handle);
+            destroy_streams(cuda, streams, stream_count);
+            cuda->cuCtxDestroy(ctx);
+            return 0;
        }
    }

-    if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
-        for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
-            destroy_profile(&cublas, cuda, &prepared[i]);
-        }
-        cublas.cublasLtDestroy(handle);
-        cuda->cuCtxDestroy(ctx);
-        return 0;
-    }
-
-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
            continue;
        }
@@ -1123,7 +1285,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                      prepared[i].iterations);
    }

-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        if (prepared[i].ready) {
            if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
                for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
@@ -1134,10 +1296,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }

-    for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
+    for (int i = 0; i < prepared_count; i++) {
        destroy_profile(&cublas, cuda, &prepared[i]);
    }
    cublas.cublasLtDestroy(handle);
+    destroy_streams(cuda, streams, stream_count);
    cuda->cuCtxDestroy(ctx);
    return 1;
 }
@@ -1229,6 +1392,7 @@ int main(int argc, char **argv) {
    printf("backend=%s\n", report.backend);
    printf("duration_s=%d\n", seconds);
    printf("buffer_mb=%d\n", report.buffer_mb);
+    printf("streams=%d\n", report.stream_count);
    printf("iterations=%lu\n", report.iterations);
    printf("checksum=%llu\n", (unsigned long long)report.checksum);
    if (report.details[0] != '\0') {
--- a/iso/builder/build-nccl-tests.sh
+++ b/iso/builder/build-nccl-tests.sh
@@ -9,6 +9,7 @@
 #
 # Output layout:
 #   $CACHE_DIR/bin/all_reduce_perf
+#   $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests

 set -e

@@ -30,7 +31,7 @@ CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
 CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
 DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"

-if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
+if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
    echo "=== nccl-tests cached, skipping build ==="
    echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
    exit 0
@@ -52,6 +53,23 @@ echo "nvcc: $NVCC"
 CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
 echo "CUDA_HOME: $CUDA_HOME"

+find_cudart_dir() {
+    for dir in \
+        "${CUDA_HOME}/targets/x86_64-linux/lib" \
+        "${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
+        "${CUDA_HOME}/lib64" \
+        "${CUDA_HOME}/lib"; do
+        if [ -d "$dir" ] && find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q .; then
+            printf '%s\n' "$dir"
+            return 0
+        fi
+    done
+    return 1
+}
+
+CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
+echo "cudart dir: $CUDART_DIR"
+
 # Download libnccl-dev for nccl.h
 REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
 DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
@@ -136,6 +154,11 @@ mkdir -p "${CACHE_DIR}/bin"
 cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
 chmod +x "${CACHE_DIR}/bin/all_reduce_perf"

+mkdir -p "${CACHE_DIR}/lib"
+find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
+[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
+
 echo "=== nccl-tests build complete ==="
 echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
 ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
+ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -246,6 +246,7 @@ rm -f \
    "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
+    "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
    "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
    "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
    "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
@@ -302,6 +303,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
+    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
    ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
 fi

@@ -380,6 +382,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
    cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
    chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
+    cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
    echo "=== all_reduce_perf injected ==="

    echo ""
--- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
+++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot
@@ -62,6 +62,7 @@ if [ "$GPU_VENDOR" = "nvidia" ]; then
    chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
    chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
    chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
+    chmod +x /usr/local/bin/bee-nccl-gpu-stress 2>/dev/null || true
 fi

 # Reload udev rules
--- a/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-nccl-gpu-stress
@@ -0,0 +1,91 @@
+#!/bin/sh
+set -eu
+
+SECONDS=300
+DEVICES=""
+EXCLUDE=""
+MIN_BYTES="512M"
+MAX_BYTES="4G"
+FACTOR="2"
+ITERS="20"
+ALL_REDUCE_BIN="/usr/local/bin/all_reduce_perf"
+
+usage() {
+    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3]" >&2
+    exit 2
+}
+
+normalize_list() {
+    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
+}
+
+contains_csv() {
+    needle="$1"
+    haystack="${2:-}"
+    echo ",${haystack}," | grep -q ",${needle},"
+}
+
+while [ "$#" -gt 0 ]; do
+    case "$1" in
+        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
+        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
+        *) usage ;;
+    esac
+done
+
+[ -x "${ALL_REDUCE_BIN}" ] || { echo "all_reduce_perf not found: ${ALL_REDUCE_BIN}" >&2; exit 1; }
+
+ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
+[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
+
+DEVICES=$(normalize_list "${DEVICES}")
+EXCLUDE=$(normalize_list "${EXCLUDE}")
+SELECTED="${DEVICES}"
+if [ -z "${SELECTED}" ]; then
+    SELECTED="${ALL_DEVICES}"
+fi
+
+FINAL=""
+for id in $(echo "${SELECTED}" | tr ',' ' '); do
+    [ -n "${id}" ] || continue
+    if contains_csv "${id}" "${EXCLUDE}"; then
+        continue
+    fi
+    if [ -z "${FINAL}" ]; then
+        FINAL="${id}"
+    else
+        FINAL="${FINAL},${id}"
+    fi
+done
+
+[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
+
+GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | awk '{print $1}')
+[ "${GPU_COUNT}" -gt 0 ] || { echo "selected GPU count is zero" >&2; exit 1; }
+
+echo "loader=nccl"
+echo "selected_gpus=${FINAL}"
+echo "gpu_count=${GPU_COUNT}"
+echo "range=${MIN_BYTES}..${MAX_BYTES}"
+echo "iters=${ITERS}"
+
+deadline=$(( $(date +%s) + SECONDS ))
+round=0
+
+while :; do
+    now=$(date +%s)
+    if [ "${now}" -ge "${deadline}" ]; then
+        break
+    fi
+    round=$((round + 1))
+    remaining=$((deadline - now))
+    echo "round=${round} remaining_sec=${remaining}"
+    CUDA_VISIBLE_DEVICES="${FINAL}" \
+        "${ALL_REDUCE_BIN}" \
+        -b "${MIN_BYTES}" \
+        -e "${MAX_BYTES}" \
+        -f "${FACTOR}" \
+        -g "${GPU_COUNT}" \
+        --iters "${ITERS}"
+done