Tune bee-gpu-burn single-precision benchmark phases

This commit is contained in:
2026-04-16 00:05:47 +03:00
parent 5c1862ce4c
commit fa6d905a10

View File

@@ -35,6 +35,8 @@ typedef void *CUstream;
#define MAX_STRESS_STREAMS 16 #define MAX_STRESS_STREAMS 16
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define MAX_SINGLE_PRECISION_STREAMS 4
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
static const char *ptx_source = static const char *ptx_source =
".version 6.0\n" ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
return stream_count; return stream_count;
} }
/* Cap a per-profile memory budget at the single-precision ceiling
 * (MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES, 2 GiB per the #define above).
 * Budgets at or below the ceiling pass through unchanged. */
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
    size_t capped = profile_budget_bytes;
    if (capped > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
        capped = MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
    }
    return capped;
}
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) { static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
if (!api->cuStreamDestroy) { if (!api->cuStreamDestroy) {
return; return;
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
CUstream stream, CUstream stream,
size_t profile_budget_bytes, size_t profile_budget_bytes,
struct prepared_profile *out) { struct prepared_profile *out) {
memset(out, 0, sizeof(*out));
out->desc = *desc;
out->stream = stream;
size_t bytes_per_cell = 0; size_t bytes_per_cell = 0;
size_t attempt_budget = profile_budget_bytes;
bytes_per_cell += bytes_for_elements(desc->a_type, 1); bytes_per_cell += bytes_for_elements(desc->a_type, 1);
bytes_per_cell += bytes_for_elements(desc->b_type, 1); bytes_per_cell += bytes_for_elements(desc->b_type, 1);
bytes_per_cell += bytes_for_elements(desc->c_type, 1); bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,12 +928,17 @@ static int prepare_profile(struct cublaslt_api *cublas,
return 0; return 0;
} }
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple); while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
memset(out, 0, sizeof(*out));
out->desc = *desc;
out->stream = stream;
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
out->m = dim; out->m = dim;
out->n = dim; out->n = dim;
out->k = dim; out->k = dim;
size_t desired_workspace = profile_budget_bytes / 8u; size_t desired_workspace = attempt_budget / 8u;
if (desired_workspace > 32u * 1024u * 1024u) { if (desired_workspace > 32u * 1024u * 1024u) {
desired_workspace = 32u * 1024u * 1024u; desired_workspace = 32u * 1024u * 1024u;
} }
@@ -945,8 +957,8 @@ static int prepare_profile(struct cublaslt_api *cublas,
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
if (matrix_bytes <= profile_budget_bytes) { if (matrix_bytes <= attempt_budget) {
size_t remaining = profile_budget_bytes - matrix_bytes; size_t remaining = attempt_budget - matrix_bytes;
out->workspace_size = desired_workspace; out->workspace_size = desired_workspace;
if (out->workspace_size > remaining) { if (out->workspace_size > remaining) {
out->workspace_size = round_down_size(remaining, 256u); out->workspace_size = round_down_size(remaining, 256u);
@@ -955,12 +967,16 @@ static int prepare_profile(struct cublaslt_api *cublas,
} }
if (out->m <= (uint64_t)desc->min_multiple) { if (out->m <= (uint64_t)desc->min_multiple) {
return 0; break;
} }
out->m -= (uint64_t)desc->min_multiple; out->m -= (uint64_t)desc->min_multiple;
out->n = out->m; out->n = out->m;
out->k = out->m; out->k = out->m;
} }
if (out->m < (uint64_t)desc->min_multiple) {
attempt_budget /= 2u;
continue;
}
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
@@ -1095,7 +1111,7 @@ static int prepare_profile(struct cublaslt_api *cublas,
} }
int found = 0; int found = 0;
if (!check_cublas("heuristic", if (check_cublas("heuristic",
cublas->cublasLtMatmulAlgoGetHeuristic(handle, cublas->cublasLtMatmulAlgoGetHeuristic(handle,
out->op_desc, out->op_desc,
out->a_layout, out->a_layout,
@@ -1105,19 +1121,22 @@ static int prepare_profile(struct cublaslt_api *cublas,
out->preference, out->preference,
1, 1,
&out->heuristic, &out->heuristic,
&found))) { &found)) &&
destroy_profile(cublas, cuda, out); found > 0) {
return 0;
}
if (found <= 0) {
destroy_profile(cublas, cuda, out);
return 0;
}
out->ready = 1; out->ready = 1;
return 1; return 1;
} }
destroy_profile(cublas, cuda, out);
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
break;
}
}
return 0;
}
static int run_cublas_profile(cublasLtHandle_t handle, static int run_cublas_profile(cublasLtHandle_t handle,
struct cublaslt_api *cublas, struct cublaslt_api *cublas,
struct prepared_profile *profile) { struct prepared_profile *profile) {
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
size_t requested_budget = 0; size_t requested_budget = 0;
size_t total_budget = 0; size_t total_budget = 0;
size_t per_profile_budget = 0; size_t per_profile_budget = 0;
int budget_profiles = 0;
memset(report, 0, sizeof(*report)); memset(report, 0, sizeof(*report));
snprintf(report->backend, sizeof(report->backend), "cublasLt"); snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
/* Count all profiles active on this GPU regardless of filter. /* Count all profiles active on this GPU regardless of filter.
* Used as the budget divisor so matrix sizes stay consistent whether * Mixed phases still divide budget across the full precision set, while
* running all precisions together or a single-precision phase. */ * single-precision benchmark phases dedicate budget only to active
* profiles matching precision_filter. */
int planned_total = 0; int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) { if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
if (planned_total < planned) { if (planned_total < planned) {
planned_total = planned; planned_total = planned;
} }
budget_profiles = planned_total;
if (precision_filter != NULL) {
budget_profiles = planned;
}
if (budget_profiles <= 0) {
budget_profiles = planned_total;
}
requested_budget = (size_t)size_mb * 1024u * 1024u; requested_budget = (size_t)size_mb * 1024u * 1024u;
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
} }
total_budget = clamp_budget_to_free_memory(cuda, requested_budget); total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
} }
if (query_multiprocessor_count(cuda, dev, &mp_count) && if (query_multiprocessor_count(cuda, dev, &mp_count) &&
cuda->cuStreamCreate && cuda->cuStreamCreate &&
cuda->cuStreamDestroy) { cuda->cuStreamDestroy) {
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1); stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
}
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
stream_count = MAX_SINGLE_PRECISION_STREAMS;
} }
if (stream_count > 1) { if (stream_count > 1) {
int created = 0; int created = 0;
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
} }
report->stream_count = stream_count; report->stream_count = stream_count;
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count); per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) { if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
per_profile_budget = MIN_PROFILE_BUDGET_BYTES; per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
} }
if (precision_filter != NULL) {
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
}
report->buffer_mb = (int)(total_budget / (1024u * 1024u)); report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details, append_detail(report->details,
sizeof(report->details), sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n", "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
mp_count, mp_count,
budget_profiles,
per_profile_budget / (1024u * 1024u)); per_profile_budget / (1024u * 1024u));
for (int i = 0; i < profile_count; i++) { for (int i = 0; i < profile_count; i++) {