From fa6d905a108ea9c3c0a9c55811915f1b9e79f4a6 Mon Sep 17 00:00:00 2001
From: Michael Chus <mike@mchus.pro>
Date: Thu, 16 Apr 2026 00:05:47 +0300
Subject: [PATCH] Tune bee-gpu-burn single-precision benchmark phases

---
 iso/builder/bee-gpu-stress.c | 329 +++++++++++++++++++----------------
 1 file changed, 182 insertions(+), 147 deletions(-)

diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c
index 01cba57..5160034 100644
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -35,6 +35,8 @@ typedef void *CUstream;
 #define MAX_STRESS_STREAMS 16
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define MAX_SINGLE_PRECISION_STREAMS 4
+#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
 
 static const char *ptx_source =
     ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
     return stream_count;
 }
 
+static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
+    if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
+        return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
+    }
+    return profile_budget_bytes;
+}
+
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
     if (!api->cuStreamDestroy) {
         return;
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                            CUstream stream,
                            size_t profile_budget_bytes,
                            struct prepared_profile *out) {
-    memset(out, 0, sizeof(*out));
-    out->desc = *desc;
-    out->stream = stream;
-
     size_t bytes_per_cell = 0;
+    size_t attempt_budget = profile_budget_bytes;
+
     bytes_per_cell += bytes_for_elements(desc->a_type, 1);
     bytes_per_cell += bytes_for_elements(desc->b_type, 1);
     bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,106 +928,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
         return 0;
     }
 
-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
-    out->m = dim;
-    out->n = dim;
-    out->k = dim;
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
+        memset(out, 0, sizeof(*out));
+        out->desc = *desc;
+        out->stream = stream;
 
-    size_t desired_workspace = profile_budget_bytes / 8u;
-    if (desired_workspace > 32u * 1024u * 1024u) {
-        desired_workspace = 32u * 1024u * 1024u;
-    }
-    desired_workspace = round_down_size(desired_workspace, 256u);
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
+        out->m = dim;
+        out->n = dim;
+        out->k = dim;
 
-    size_t a_bytes = 0;
-    size_t b_bytes = 0;
-    size_t c_bytes = 0;
-    size_t d_bytes = 0;
-    size_t scale_bytes = 0;
-    while (1) {
-        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
-        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
-        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
-        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
-        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+        size_t desired_workspace = attempt_budget / 8u;
+        if (desired_workspace > 32u * 1024u * 1024u) {
+            desired_workspace = 32u * 1024u * 1024u;
+        }
+        desired_workspace = round_down_size(desired_workspace, 256u);
 
-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
-        if (matrix_bytes <= profile_budget_bytes) {
-            size_t remaining = profile_budget_bytes - matrix_bytes;
-            out->workspace_size = desired_workspace;
-            if (out->workspace_size > remaining) {
-                out->workspace_size = round_down_size(remaining, 256u);
+        size_t a_bytes = 0;
+        size_t b_bytes = 0;
+        size_t c_bytes = 0;
+        size_t d_bytes = 0;
+        size_t scale_bytes = 0;
+        while (1) {
+            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
+            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
+            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
+            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
+            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
+
+            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+            if (matrix_bytes <= attempt_budget) {
+                size_t remaining = attempt_budget - matrix_bytes;
+                out->workspace_size = desired_workspace;
+                if (out->workspace_size > remaining) {
+                    out->workspace_size = round_down_size(remaining, 256u);
+                }
+                break;
             }
-            break;
+
+            if (out->m <= (uint64_t)desc->min_multiple) {
+                break;
+            }
+            out->m -= (uint64_t)desc->min_multiple;
+            out->n = out->m;
+            out->k = out->m;
+        }
+        if (out->m < (uint64_t)desc->min_multiple) {
+            attempt_budget /= 2u;
+            continue;
         }
 
-        if (out->m <= (uint64_t)desc->min_multiple) {
-            return 0;
-        }
-        out->m -= (uint64_t)desc->min_multiple;
-        out->n = out->m;
-        out->k = out->m;
-    }
-
-    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cudaDataType_t scale_type = matmul_scale_type(desc);
-    if (!check_cublas("cublasLtMatmulDescCreate",
-                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    cublasOperation_t transa = CUBLAS_OP_T;
-    cublasOperation_t transb = CUBLAS_OP_N;
-    if (!check_cublas("set TRANSA",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSA,
-                                                             &transa,
-                                                             sizeof(transa))) ||
-        !check_cublas("set TRANSB",
-                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                             CUBLASLT_MATMUL_DESC_TRANSB,
-                                                             &transb,
-                                                             sizeof(transb)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (desc->needs_scalar_scale) {
-        float one = 1.0f;
-        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
-            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
             destroy_profile(cublas, cuda, out);
             return 0;
         }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+
+        cudaDataType_t scale_type = matmul_scale_type(desc);
+        if (!check_cublas("cublasLtMatmulDescCreate",
+                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
             destroy_profile(cublas, cuda, out);
             return 0;
         }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
-        if (!check_cublas("set A scale ptr",
+
+        cublasOperation_t transa = CUBLAS_OP_T;
+        cublasOperation_t transb = CUBLAS_OP_N;
+        if (!check_cublas("set TRANSA",
                           cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                                 &a_scale_ptr,
-                                                                 sizeof(a_scale_ptr))) ||
-            !check_cublas("set B scale ptr",
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
+                                                                 &transa,
+                                                                 sizeof(transa))) ||
+            !check_cublas("set TRANSB",
                           cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                                 &b_scale_ptr,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
+                                                                 &transb,
+                                                                 sizeof(transb)))) {
             destroy_profile(cublas, cuda, out);
             return 0;
         }
-    }
+
+        if (desc->needs_scalar_scale) {
+            float one = 1.0f;
+            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
+                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+            if (!check_cublas("set A scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                     &a_scale_ptr,
+                                                                     sizeof(a_scale_ptr))) ||
+                !check_cublas("set B scale ptr",
+                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
+                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                     &b_scale_ptr,
+                                                                     sizeof(b_scale_ptr)))) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
 
 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
     if (desc->needs_block_scale) {
@@ -1060,62 +1076,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
     }
 #endif
 
-    if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    if (out->workspace_size > 0) {
-        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+        if (!check_cublas("create A layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+            !check_cublas("create B layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+            !check_cublas("create C layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+            !check_cublas("create D layout",
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
             destroy_profile(cublas, cuda, out);
             return 0;
         }
+
+        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        if (out->workspace_size > 0) {
+            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
+                destroy_profile(cublas, cuda, out);
+                return 0;
+            }
+        }
+
+        if (!check_cublas("set workspace",
+                          cublas->cublasLtMatmulPreferenceSetAttribute(
+                              out->preference,
+                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                              &out->workspace_size,
+                              sizeof(out->workspace_size)))) {
+            destroy_profile(cublas, cuda, out);
+            return 0;
+        }
+
+        int found = 0;
+        if (check_cublas("heuristic",
+                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
+                                                                out->op_desc,
+                                                                out->a_layout,
+                                                                out->b_layout,
+                                                                out->c_layout,
+                                                                out->d_layout,
+                                                                out->preference,
+                                                                1,
+                                                                &out->heuristic,
+                                                                &found)) &&
+            found > 0) {
+            out->ready = 1;
+            return 1;
+        }
+
+        destroy_profile(cublas, cuda, out);
+        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
+        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
+            break;
+        }
     }
 
-    if (!check_cublas("set workspace",
-                      cublas->cublasLtMatmulPreferenceSetAttribute(
-                          out->preference,
-                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                          &out->workspace_size,
-                          sizeof(out->workspace_size)))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    int found = 0;
-    if (!check_cublas("heuristic",
-                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
-                                                             out->op_desc,
-                                                             out->a_layout,
-                                                             out->b_layout,
-                                                             out->c_layout,
-                                                             out->d_layout,
-                                                             out->preference,
-                                                             1,
-                                                             &out->heuristic,
-                                                             &found))) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-    if (found <= 0) {
-        destroy_profile(cublas, cuda, out);
-        return 0;
-    }
-
-    out->ready = 1;
-    return 1;
+    return 0;
 }
 
 static int run_cublas_profile(cublasLtHandle_t handle,
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     size_t requested_budget = 0;
     size_t total_budget = 0;
     size_t per_profile_budget = 0;
+    int budget_profiles = 0;
 
     memset(report, 0, sizeof(*report));
     snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     }
 
     /* Count all profiles active on this GPU regardless of filter.
-     * Used as the budget divisor so matrix sizes stay consistent whether
-     * running all precisions together or a single-precision phase. */
+     * Mixed phases still divide budget across the full precision set, while
+     * single-precision benchmark phases dedicate budget only to active
+     * profiles matching precision_filter. */
     int planned_total = 0;
     for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
         if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     if (planned_total < planned) {
         planned_total = planned;
     }
+    budget_profiles = planned_total;
+    if (precision_filter != NULL) {
+        budget_profiles = planned;
+    }
+    if (budget_profiles <= 0) {
+        budget_profiles = planned_total;
+    }
 
     requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
     }
     total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
     }
     if (query_multiprocessor_count(cuda, dev, &mp_count) &&
         cuda->cuStreamCreate &&
         cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
+    }
+    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
+        stream_count = MAX_SINGLE_PRECISION_STREAMS;
     }
     if (stream_count > 1) {
         int created = 0;
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
         }
     }
     report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
     if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
         per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
     }
+    if (precision_filter != NULL) {
+        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
+    }
     report->buffer_mb = (int)(total_budget / (1024u * 1024u));
     append_detail(report->details,
                   sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                   size_mb,
                   report->buffer_mb,
                   report->stream_count,
                   mp_count,
+                  budget_profiles,
                   per_profile_budget / (1024u * 1024u));
 
     for (int i = 0; i < profile_count; i++) {