fix(stress): keep platform burn responsive under load

2026-03-31 22:28:26 +03:00
parent ea660500c9
commit c9ee078622
3 changed files with 141 additions and 41 deletions
@@ -36,6 +36,7 @@ typedef void *CUstream;
 #define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define STRESS_LAUNCH_DEPTH 8

 static const char *ptx_source =
    ".version 6.0\n"
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
    double deadline = start + (double)seconds;
    while (now_seconds() < deadline) {
        launches_per_wave = 0;
-        for (int lane = 0; lane < stream_count; lane++) {
-            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
-            if (!check_rc(api,
-                          "cuLaunchKernel",
-                          api->cuLaunchKernel(kernel,
-                                              blocks,
-                                              1,
-                                              1,
-                                              threads,
-                                              1,
-                                              1,
-                                              0,
-                                              streams[lane],
-                                              params[lane],
-                                              NULL))) {
-                goto fail;
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && (depth == 0 || now_seconds() < deadline); depth++) {
+            int launched_this_batch = 0; /* depth 0 skips the deadline re-check: the enclosing while just tested it, and an empty wave would hit goto fail */
+            for (int lane = 0; lane < stream_count; lane++) {
+                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+                if (!check_rc(api,
+                              "cuLaunchKernel",
+                              api->cuLaunchKernel(kernel,
+                                                  blocks,
+                                                  1,
+                                                  1,
+                                                  threads,
+                                                  1,
+                                                  1,
+                                                  0,
+                                                  streams[lane],
+                                                  params[lane],
+                                                  NULL))) {
+                    goto fail;
+                }
+                launches_per_wave++;
+                launched_this_batch++;
+            }
+            if (launched_this_batch <= 0) { /* no lane launched anything; stop queuing further batches */
+                break;
            }
-            launches_per_wave++;
        }
        if (launches_per_wave <= 0) {
            goto fail;
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
    report->iterations = iterations;
    snprintf(report->details,
             sizeof(report->details),
-             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
             size_mb,
             report->buffer_mb,
             report->stream_count,
+             STRESS_LAUNCH_DEPTH,
             bytes_per_stream[0] / (1024u * 1024u),
             iterations);

@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
+                  STRESS_LAUNCH_DEPTH,
                  mp_count,
                  per_profile_budget / (1024u * 1024u));

@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    double deadline = now_seconds() + (double)seconds;
    while (now_seconds() < deadline) {
        wave_launches = 0;
-        for (int i = 0; i < prepared_count; i++) {
-            if (!prepared[i].ready) {
-                continue;
-            }
-            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                append_detail(report->details,
-                              sizeof(report->details),
-                              "%s=FAILED runtime\n",
-                              prepared[i].desc.name);
-                for (int j = 0; j < prepared_count; j++) {
-                    destroy_profile(&cublas, cuda, &prepared[j]);
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
+            int launched_this_batch = 0;
+            for (int i = 0; i < prepared_count; i++) {
+                if (!prepared[i].ready) {
+                    continue;
                }
-                cublas.cublasLtDestroy(handle);
-                destroy_streams(cuda, streams, stream_count);
-                cuda->cuCtxDestroy(ctx);
-                return 0;
+                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                    append_detail(report->details,
+                                  sizeof(report->details),
+                                  "%s=FAILED runtime\n",
+                                  prepared[i].desc.name);
+                    for (int j = 0; j < prepared_count; j++) {
+                        destroy_profile(&cublas, cuda, &prepared[j]);
+                    }
+                    cublas.cublasLtDestroy(handle);
+                    destroy_streams(cuda, streams, stream_count);
+                    cuda->cuCtxDestroy(ctx);
+                    return 0;
+                }
+                prepared[i].iterations++;
+                report->iterations++;
+                wave_launches++;
+                launched_this_batch++;
+            }
+            if (launched_this_batch <= 0) {
+                break;
            }
-            prepared[i].iterations++;
-            report->iterations++;
-            wave_launches++;
        }
        if (wave_launches <= 0) {
            break;