Tune bee-gpu-burn single-precision benchmark phases

2026-04-16 00:05:47 +03:00
parent 5c1862ce4c
commit fa6d905a10
1 changed files with 182 additions and 147 deletions
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -35,6 +35,8 @@ typedef void *CUstream;
 #define MAX_STRESS_STREAMS 16
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
 #define MAX_SINGLE_PRECISION_STREAMS 4
 #define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
 static const char *ptx_source =
    ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
    return stream_count;
 }
 static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
    if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
        return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
    }
    return profile_budget_bytes;
 }
 static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    if (!api->cuStreamDestroy) {
        return;
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
                           CUstream stream,
                           size_t profile_budget_bytes,
                           struct prepared_profile *out) {
    memset(out, 0, sizeof(*out));
    out->desc = *desc;
    out->stream = stream;
    size_t bytes_per_cell = 0;
    size_t attempt_budget = profile_budget_bytes;
    bytes_per_cell += bytes_for_elements(desc->a_type, 1);
    bytes_per_cell += bytes_for_elements(desc->b_type, 1);
    bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,106 +928,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
        return 0;
    }
-    uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
+    while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
-    out->m = dim;
+        memset(out, 0, sizeof(*out));
-    out->n = dim;
+        out->desc = *desc;
-    out->k = dim;
+        out->stream = stream;
-    size_t desired_workspace = profile_budget_bytes / 8u;
+        uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
-    if (desired_workspace > 32u * 1024u * 1024u) {
+        out->m = dim;
-        desired_workspace = 32u * 1024u * 1024u;
+        out->n = dim;
-    }
+        out->k = dim;
    desired_workspace = round_down_size(desired_workspace, 256u);
-    size_t a_bytes = 0;
+        size_t desired_workspace = attempt_budget / 8u;
-    size_t b_bytes = 0;
+        if (desired_workspace > 32u * 1024u * 1024u) {
-    size_t c_bytes = 0;
+            desired_workspace = 32u * 1024u * 1024u;
-    size_t d_bytes = 0;
+        }
-    size_t scale_bytes = 0;
+        desired_workspace = round_down_size(desired_workspace, 256u);
    while (1) {
        a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
        b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
        c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
        d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
        scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
-        size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
+        size_t a_bytes = 0;
-        if (matrix_bytes <= profile_budget_bytes) {
+        size_t b_bytes = 0;
-            size_t remaining = profile_budget_bytes - matrix_bytes;
+        size_t c_bytes = 0;
-            out->workspace_size = desired_workspace;
+        size_t d_bytes = 0;
-            if (out->workspace_size > remaining) {
+        size_t scale_bytes = 0;
-                out->workspace_size = round_down_size(remaining, 256u);
+        while (1) {
            a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
            b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
            c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
            d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
            scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
            size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
            if (matrix_bytes <= attempt_budget) {
                size_t remaining = attempt_budget - matrix_bytes;
                out->workspace_size = desired_workspace;
                if (out->workspace_size > remaining) {
                    out->workspace_size = round_down_size(remaining, 256u);
                }
                break;
            }
-            break;
+
            if (out->m <= (uint64_t)desc->min_multiple) {
                break;
            }
            out->m -= (uint64_t)desc->min_multiple;
            out->n = out->m;
            out->k = out->m;
        }
        if (out->m < (uint64_t)desc->min_multiple) {
            attempt_budget /= 2u;
            continue;
        }
-        if (out->m <= (uint64_t)desc->min_multiple) {
+        if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
-            return 0;
+            !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
-        }
+            !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
-        out->m -= (uint64_t)desc->min_multiple;
+            !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
        out->n = out->m;
        out->k = out->m;
    }
    if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
        !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
        !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
        !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    cudaDataType_t scale_type = matmul_scale_type(desc);
    if (!check_cublas("cublasLtMatmulDescCreate",
                      cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    cublasOperation_t transa = CUBLAS_OP_T;
    cublasOperation_t transb = CUBLAS_OP_N;
    if (!check_cublas("set TRANSA",
                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
                                                             CUBLASLT_MATMUL_DESC_TRANSA,
                                                             &transa,
                                                             sizeof(transa))) ||
        !check_cublas("set TRANSB",
                      cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
                                                             CUBLASLT_MATMUL_DESC_TRANSB,
                                                             &transb,
                                                             sizeof(transb)))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    if (desc->needs_scalar_scale) {
        float one = 1.0f;
        if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
            !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
+
-            !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
+        cudaDataType_t scale_type = matmul_scale_type(desc);
        if (!check_cublas("cublasLtMatmulDescCreate",
                          cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-        void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
+
-        void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
+        cublasOperation_t transa = CUBLAS_OP_T;
-        if (!check_cublas("set A scale ptr",
+        cublasOperation_t transb = CUBLAS_OP_N;
        if (!check_cublas("set TRANSA",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                                 CUBLASLT_MATMUL_DESC_TRANSA,
-                                                                 &a_scale_ptr,
+                                                                 &transa,
-                                                                 sizeof(a_scale_ptr))) ||
+                                                                 sizeof(transa))) ||
-            !check_cublas("set B scale ptr",
+            !check_cublas("set TRANSB",
                          cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
-                                                                 CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                                 CUBLASLT_MATMUL_DESC_TRANSB,
-                                                                 &b_scale_ptr,
+                                                                 &transb,
-                                                                 sizeof(b_scale_ptr)))) {
+                                                                 sizeof(transb)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
-    }
+
        if (desc->needs_scalar_scale) {
            float one = 1.0f;
            if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
                !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
                destroy_profile(cublas, cuda, out);
                return 0;
            }
            if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
                !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
                destroy_profile(cublas, cuda, out);
                return 0;
            }
            void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
            void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
            if (!check_cublas("set A scale ptr",
                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
                                                                     CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
                                                                     &a_scale_ptr,
                                                                     sizeof(a_scale_ptr))) ||
                !check_cublas("set B scale ptr",
                              cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
                                                                     CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
                                                                     &b_scale_ptr,
                                                                     sizeof(b_scale_ptr)))) {
                destroy_profile(cublas, cuda, out);
                return 0;
            }
        }
 #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
    if (desc->needs_block_scale) {
@@ -1060,62 +1076,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
    }
 #endif
-    if (!check_cublas("create A layout",
+        if (!check_cublas("create A layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
+                          cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
-        !check_cublas("create B layout",
+            !check_cublas("create B layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
+                          cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
-        !check_cublas("create C layout",
+            !check_cublas("create C layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
+                          cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
-        !check_cublas("create D layout",
+            !check_cublas("create D layout",
-                      cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
+                          cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    if (out->workspace_size > 0) {
        if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
        if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
        if (out->workspace_size > 0) {
            if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
                destroy_profile(cublas, cuda, out);
                return 0;
            }
        }
        if (!check_cublas("set workspace",
                          cublas->cublasLtMatmulPreferenceSetAttribute(
                              out->preference,
                              CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                              &out->workspace_size,
                              sizeof(out->workspace_size)))) {
            destroy_profile(cublas, cuda, out);
            return 0;
        }
        int found = 0;
        if (check_cublas("heuristic",
                         cublas->cublasLtMatmulAlgoGetHeuristic(handle,
                                                                out->op_desc,
                                                                out->a_layout,
                                                                out->b_layout,
                                                                out->c_layout,
                                                                out->d_layout,
                                                                out->preference,
                                                                1,
                                                                &out->heuristic,
                                                                &found)) &&
            found > 0) {
            out->ready = 1;
            return 1;
        }
        destroy_profile(cublas, cuda, out);
        attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
        if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
            break;
        }
    }
-    if (!check_cublas("set workspace",
+    return 0;
                      cublas->cublasLtMatmulPreferenceSetAttribute(
                          out->preference,
                          CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                          &out->workspace_size,
                          sizeof(out->workspace_size)))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    int found = 0;
    if (!check_cublas("heuristic",
                      cublas->cublasLtMatmulAlgoGetHeuristic(handle,
                                                             out->op_desc,
                                                             out->a_layout,
                                                             out->b_layout,
                                                             out->c_layout,
                                                             out->d_layout,
                                                             out->preference,
                                                             1,
                                                             &out->heuristic,
                                                             &found))) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    if (found <= 0) {
        destroy_profile(cublas, cuda, out);
        return 0;
    }
    out->ready = 1;
    return 1;
 }
 static int run_cublas_profile(cublasLtHandle_t handle,
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
    int budget_profiles = 0;
    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    }
    /* Count all profiles active on this GPU regardless of filter.
-     * Used as the budget divisor so matrix sizes stay consistent whether
+     * Mixed phases still divide budget across the full precision set, while
-     * running all precisions together or a single-precision phase. */
+     * single-precision benchmark phases dedicate budget only to active
     * profiles matching precision_filter. */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    if (planned_total < planned) {
        planned_total = planned;
    }
    budget_profiles = planned_total;
    if (precision_filter != NULL) {
        budget_profiles = planned;
    }
    if (budget_profiles <= 0) {
        budget_profiles = planned_total;
    }
    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
+    if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+        requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
+    if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
+        total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
    }
    if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
        stream_count = MAX_SINGLE_PRECISION_STREAMS;
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
    if (precision_filter != NULL) {
        per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details,
                  sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
                  size_mb,
                  report->buffer_mb,
                  report->stream_count,
                  mp_count,
                  budget_profiles,
                  per_profile_budget / (1024u * 1024u));
    for (int i = 0; i < profile_count; i++) {