From fa6d905a108ea9c3c0a9c55811915f1b9e79f4a6 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 16 Apr 2026 00:05:47 +0300 Subject: [PATCH] Tune bee-gpu-burn single-precision benchmark phases --- iso/builder/bee-gpu-stress.c | 329 +++++++++++++++++++---------------- 1 file changed, 182 insertions(+), 147 deletions(-) diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index 01cba57..5160034 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -35,6 +35,8 @@ typedef void *CUstream; #define MAX_STRESS_STREAMS 16 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) +#define MAX_SINGLE_PRECISION_STREAMS 4 +#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u) static const char *ptx_source = ".version 6.0\n" @@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_ return stream_count; } +static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) { + if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) { + return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES; + } + return profile_budget_bytes; +} + static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) { if (!api->cuStreamDestroy) { return; @@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas, CUstream stream, size_t profile_budget_bytes, struct prepared_profile *out) { - memset(out, 0, sizeof(*out)); - out->desc = *desc; - out->stream = stream; - size_t bytes_per_cell = 0; + size_t attempt_budget = profile_budget_bytes; + bytes_per_cell += bytes_for_elements(desc->a_type, 1); bytes_per_cell += bytes_for_elements(desc->b_type, 1); bytes_per_cell += bytes_for_elements(desc->c_type, 1); @@ -921,106 +928,115 @@ static int prepare_profile(struct cublaslt_api *cublas, return 0; } - uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple); - out->m = dim; - out->n = dim; - out->k = dim; + while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) { + memset(out, 0, sizeof(*out)); + out->desc = *desc; + out->stream = stream; - size_t desired_workspace = profile_budget_bytes / 8u; - if (desired_workspace > 32u * 1024u * 1024u) { - desired_workspace = 32u * 1024u * 1024u; - } - desired_workspace = round_down_size(desired_workspace, 256u); + uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple); + out->m = dim; + out->n = dim; + out->k = dim; - size_t a_bytes = 0; - size_t b_bytes = 0; - size_t c_bytes = 0; - size_t d_bytes = 0; - size_t scale_bytes = 0; - while (1) { - a_bytes = bytes_for_elements(desc->a_type, out->k * out->m); - b_bytes = bytes_for_elements(desc->b_type, out->k * out->n); - c_bytes = bytes_for_elements(desc->c_type, out->m * out->n); - d_bytes = bytes_for_elements(desc->d_type, out->m * out->n); - scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); + size_t desired_workspace = attempt_budget / 8u; + if (desired_workspace > 32u * 1024u * 1024u) { + desired_workspace = 32u * 1024u * 1024u; + } + desired_workspace = round_down_size(desired_workspace, 256u); - size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; - if (matrix_bytes <= profile_budget_bytes) { - size_t remaining = profile_budget_bytes - matrix_bytes; - out->workspace_size = desired_workspace; - if (out->workspace_size > remaining) { - out->workspace_size = round_down_size(remaining, 256u); + size_t a_bytes = 0; + size_t b_bytes = 0; + size_t c_bytes = 0; + size_t d_bytes = 0; + size_t scale_bytes = 0; + while (1) { + a_bytes = bytes_for_elements(desc->a_type, out->k * out->m); + b_bytes = bytes_for_elements(desc->b_type, out->k * out->n); + c_bytes = bytes_for_elements(desc->c_type, out->m * out->n); + d_bytes = bytes_for_elements(desc->d_type, out->m * out->n); + scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); + + size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; + if (matrix_bytes <= attempt_budget) { + size_t remaining = attempt_budget - matrix_bytes; + out->workspace_size = desired_workspace; + if (out->workspace_size > remaining) { + out->workspace_size = round_down_size(remaining, 256u); + } + break; } - break; + + if (out->m <= (uint64_t)desc->min_multiple) { + break; + } + out->m -= (uint64_t)desc->min_multiple; + out->n = out->m; + out->k = out->m; + } + if (out->m < (uint64_t)desc->min_multiple) { + attempt_budget /= 2u; + continue; } - if (out->m <= (uint64_t)desc->min_multiple) { - return 0; - } - out->m -= (uint64_t)desc->min_multiple; - out->n = out->m; - out->k = out->m; - } - - if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || - !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || - !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) || - !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) { - destroy_profile(cublas, cuda, out); - return 0; - } - - cudaDataType_t scale_type = matmul_scale_type(desc); - if (!check_cublas("cublasLtMatmulDescCreate", - cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) { - destroy_profile(cublas, cuda, out); - return 0; - } - - cublasOperation_t transa = CUBLAS_OP_T; - cublasOperation_t transb = CUBLAS_OP_N; - if (!check_cublas("set TRANSA", - cublas->cublasLtMatmulDescSetAttribute(out->op_desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &transa, - sizeof(transa))) || - !check_cublas("set TRANSB", - cublas->cublasLtMatmulDescSetAttribute(out->op_desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &transb, - sizeof(transb)))) { - destroy_profile(cublas, cuda, out); - return 0; - } - - if (desc->needs_scalar_scale) { - float one = 1.0f; - if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) || - !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) { + if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || + !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || + !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) || + !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) { destroy_profile(cublas, cuda, out); return 0; } - if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) || - !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) { + + cudaDataType_t scale_type = matmul_scale_type(desc); + if (!check_cublas("cublasLtMatmulDescCreate", + cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) { destroy_profile(cublas, cuda, out); return 0; } - void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; - void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; - if (!check_cublas("set A scale ptr", + + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + if (!check_cublas("set TRANSA", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, - CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, - &a_scale_ptr, - sizeof(a_scale_ptr))) || - !check_cublas("set B scale ptr", + CUBLASLT_MATMUL_DESC_TRANSA, + &transa, + sizeof(transa))) || + !check_cublas("set TRANSB", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, - CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, - &b_scale_ptr, - sizeof(b_scale_ptr)))) { + CUBLASLT_MATMUL_DESC_TRANSB, + &transb, + sizeof(transb)))) { destroy_profile(cublas, cuda, out); return 0; } - } + + if (desc->needs_scalar_scale) { + float one = 1.0f; + if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) || + !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) { + destroy_profile(cublas, cuda, out); + return 0; + } + if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) || + !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) { + destroy_profile(cublas, cuda, out); + return 0; + } + void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; + void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; + if (!check_cublas("set A scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, + &a_scale_ptr, + sizeof(a_scale_ptr))) || + !check_cublas("set B scale ptr", + cublas->cublasLtMatmulDescSetAttribute(out->op_desc, + CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, + &b_scale_ptr, + sizeof(b_scale_ptr)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + } #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) if (desc->needs_block_scale) { @@ -1060,62 +1076,65 @@ static int prepare_profile(struct cublaslt_api *cublas, } #endif - if (!check_cublas("create A layout", - cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) || - !check_cublas("create B layout", - cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) || - !check_cublas("create C layout", - cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) || - !check_cublas("create D layout", - cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) { - destroy_profile(cublas, cuda, out); - return 0; - } - - if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) { - destroy_profile(cublas, cuda, out); - return 0; - } - - if (out->workspace_size > 0) { - if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) { + if (!check_cublas("create A layout", + cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) || + !check_cublas("create B layout", + cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) || + !check_cublas("create C layout", + cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) || + !check_cublas("create D layout", + cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) { destroy_profile(cublas, cuda, out); return 0; } + + if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + if (out->workspace_size > 0) { + if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) { + destroy_profile(cublas, cuda, out); + return 0; + } + } + + if (!check_cublas("set workspace", + cublas->cublasLtMatmulPreferenceSetAttribute( + out->preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &out->workspace_size, + sizeof(out->workspace_size)))) { + destroy_profile(cublas, cuda, out); + return 0; + } + + int found = 0; + if (check_cublas("heuristic", + cublas->cublasLtMatmulAlgoGetHeuristic(handle, + out->op_desc, + out->a_layout, + out->b_layout, + out->c_layout, + out->d_layout, + out->preference, + 1, + &out->heuristic, + &found)) && + found > 0) { + out->ready = 1; + return 1; + } + + destroy_profile(cublas, cuda, out); + attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u); + if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) { + break; + } } - if (!check_cublas("set workspace", - cublas->cublasLtMatmulPreferenceSetAttribute( - out->preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &out->workspace_size, - sizeof(out->workspace_size)))) { - destroy_profile(cublas, cuda, out); - return 0; - } - - int found = 0; - if (!check_cublas("heuristic", - cublas->cublasLtMatmulAlgoGetHeuristic(handle, - out->op_desc, - out->a_layout, - out->b_layout, - out->c_layout, - out->d_layout, - out->preference, - 1, - &out->heuristic, - &found))) { - destroy_profile(cublas, cuda, out); - return 0; - } - if (found <= 0) { - destroy_profile(cublas, cuda, out); - return 0; - } - - out->ready = 1; - return 1; + return 0; } static int run_cublas_profile(cublasLtHandle_t handle, @@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda, size_t requested_budget = 0; size_t total_budget = 0; size_t per_profile_budget = 0; + int budget_profiles = 0; memset(report, 0, sizeof(*report)); snprintf(report->backend, sizeof(report->backend), "cublasLt"); @@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda, } /* Count all profiles active on this GPU regardless of filter. - * Used as the budget divisor so matrix sizes stay consistent whether - * running all precisions together or a single-precision phase. */ + * Mixed phases still divide budget across the full precision set, while + * single-precision benchmark phases dedicate budget only to active + * profiles matching precision_filter. */ int planned_total = 0; for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) { @@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda, if (planned_total < planned) { planned_total = planned; } + budget_profiles = planned_total; + if (precision_filter != NULL) { + budget_profiles = planned; + } + if (budget_profiles <= 0) { + budget_profiles = planned_total; + } requested_budget = (size_t)size_mb * 1024u * 1024u; - if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { - requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; + if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) { + requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES; } total_budget = clamp_budget_to_free_memory(cuda, requested_budget); - if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { - total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; + if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) { + total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES; } if (query_multiprocessor_count(cuda, dev, &mp_count) && cuda->cuStreamCreate && cuda->cuStreamDestroy) { - stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1); + stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1); + } + if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) { + stream_count = MAX_SINGLE_PRECISION_STREAMS; } if (stream_count > 1) { int created = 0; @@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda, } } report->stream_count = stream_count; - per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count); + per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count); if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) { per_profile_budget = MIN_PROFILE_BUDGET_BYTES; } + if (precision_filter != NULL) { + per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget); + } report->buffer_mb = (int)(total_budget / (1024u * 1024u)); append_detail(report->details, sizeof(report->details), - "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n", + "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n", size_mb, report->buffer_mb, report->stream_count, mp_count, + budget_profiles, per_profile_budget / (1024u * 1024u)); for (int i = 0; i < profile_count; i++) {