Tune bee-gpu-burn single-precision benchmark phases

This commit is contained in:
2026-04-16 00:05:47 +03:00
parent 5c1862ce4c
commit fa6d905a10

View File

@@ -35,6 +35,8 @@ typedef void *CUstream;
#define MAX_STRESS_STREAMS 16 #define MAX_STRESS_STREAMS 16
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define MAX_SINGLE_PRECISION_STREAMS 4
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
static const char *ptx_source = static const char *ptx_source =
".version 6.0\n" ".version 6.0\n"
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
return stream_count; return stream_count;
} }
/* Cap a per-profile memory budget at the single-precision ceiling
 * (MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES, 2 GiB per the #define above).
 * Budgets at or below the ceiling pass through unchanged. */
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
    size_t capped = profile_budget_bytes;
    if (capped > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
        capped = MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
    }
    return capped;
}
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) { static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
if (!api->cuStreamDestroy) { if (!api->cuStreamDestroy) {
return; return;
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
CUstream stream, CUstream stream,
size_t profile_budget_bytes, size_t profile_budget_bytes,
struct prepared_profile *out) { struct prepared_profile *out) {
memset(out, 0, sizeof(*out));
out->desc = *desc;
out->stream = stream;
size_t bytes_per_cell = 0; size_t bytes_per_cell = 0;
size_t attempt_budget = profile_budget_bytes;
bytes_per_cell += bytes_for_elements(desc->a_type, 1); bytes_per_cell += bytes_for_elements(desc->a_type, 1);
bytes_per_cell += bytes_for_elements(desc->b_type, 1); bytes_per_cell += bytes_for_elements(desc->b_type, 1);
bytes_per_cell += bytes_for_elements(desc->c_type, 1); bytes_per_cell += bytes_for_elements(desc->c_type, 1);
@@ -921,12 +928,17 @@ static int prepare_profile(struct cublaslt_api *cublas,
return 0; return 0;
} }
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple); while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
memset(out, 0, sizeof(*out));
out->desc = *desc;
out->stream = stream;
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
out->m = dim; out->m = dim;
out->n = dim; out->n = dim;
out->k = dim; out->k = dim;
size_t desired_workspace = profile_budget_bytes / 8u; size_t desired_workspace = attempt_budget / 8u;
if (desired_workspace > 32u * 1024u * 1024u) { if (desired_workspace > 32u * 1024u * 1024u) {
desired_workspace = 32u * 1024u * 1024u; desired_workspace = 32u * 1024u * 1024u;
} }
@@ -945,8 +957,8 @@ static int prepare_profile(struct cublaslt_api *cublas,
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
if (matrix_bytes <= profile_budget_bytes) { if (matrix_bytes <= attempt_budget) {
size_t remaining = profile_budget_bytes - matrix_bytes; size_t remaining = attempt_budget - matrix_bytes;
out->workspace_size = desired_workspace; out->workspace_size = desired_workspace;
if (out->workspace_size > remaining) { if (out->workspace_size > remaining) {
out->workspace_size = round_down_size(remaining, 256u); out->workspace_size = round_down_size(remaining, 256u);
@@ -955,12 +967,16 @@ static int prepare_profile(struct cublaslt_api *cublas,
} }
if (out->m <= (uint64_t)desc->min_multiple) { if (out->m <= (uint64_t)desc->min_multiple) {
return 0; break;
} }
out->m -= (uint64_t)desc->min_multiple; out->m -= (uint64_t)desc->min_multiple;
out->n = out->m; out->n = out->m;
out->k = out->m; out->k = out->m;
} }
if (out->m < (uint64_t)desc->min_multiple) {
attempt_budget /= 2u;
continue;
}
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
@@ -1095,7 +1111,7 @@ static int prepare_profile(struct cublaslt_api *cublas,
} }
int found = 0; int found = 0;
if (!check_cublas("heuristic", if (check_cublas("heuristic",
cublas->cublasLtMatmulAlgoGetHeuristic(handle, cublas->cublasLtMatmulAlgoGetHeuristic(handle,
out->op_desc, out->op_desc,
out->a_layout, out->a_layout,
@@ -1105,19 +1121,22 @@ static int prepare_profile(struct cublaslt_api *cublas,
out->preference, out->preference,
1, 1,
&out->heuristic, &out->heuristic,
&found))) { &found)) &&
destroy_profile(cublas, cuda, out); found > 0) {
return 0;
}
if (found <= 0) {
destroy_profile(cublas, cuda, out);
return 0;
}
out->ready = 1; out->ready = 1;
return 1; return 1;
} }
destroy_profile(cublas, cuda, out);
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
break;
}
}
return 0;
}
static int run_cublas_profile(cublasLtHandle_t handle, static int run_cublas_profile(cublasLtHandle_t handle,
struct cublaslt_api *cublas, struct cublaslt_api *cublas,
struct prepared_profile *profile) { struct prepared_profile *profile) {
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
size_t requested_budget = 0; size_t requested_budget = 0;
size_t total_budget = 0; size_t total_budget = 0;
size_t per_profile_budget = 0; size_t per_profile_budget = 0;
int budget_profiles = 0;
memset(report, 0, sizeof(*report)); memset(report, 0, sizeof(*report));
snprintf(report->backend, sizeof(report->backend), "cublasLt"); snprintf(report->backend, sizeof(report->backend), "cublasLt");
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
/* Count all profiles active on this GPU regardless of filter. /* Count all profiles active on this GPU regardless of filter.
* Used as the budget divisor so matrix sizes stay consistent whether * Mixed phases still divide budget across the full precision set, while
* running all precisions together or a single-precision phase. */ * single-precision benchmark phases dedicate budget only to active
* profiles matching precision_filter. */
int planned_total = 0; int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) { if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
if (planned_total < planned) { if (planned_total < planned) {
planned_total = planned; planned_total = planned;
} }
budget_profiles = planned_total;
if (precision_filter != NULL) {
budget_profiles = planned;
}
if (budget_profiles <= 0) {
budget_profiles = planned_total;
}
requested_budget = (size_t)size_mb * 1024u * 1024u; requested_budget = (size_t)size_mb * 1024u * 1024u;
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
} }
total_budget = clamp_budget_to_free_memory(cuda, requested_budget); total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
} }
if (query_multiprocessor_count(cuda, dev, &mp_count) && if (query_multiprocessor_count(cuda, dev, &mp_count) &&
cuda->cuStreamCreate && cuda->cuStreamCreate &&
cuda->cuStreamDestroy) { cuda->cuStreamDestroy) {
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1); stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
}
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
stream_count = MAX_SINGLE_PRECISION_STREAMS;
} }
if (stream_count > 1) { if (stream_count > 1) {
int created = 0; int created = 0;
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
} }
report->stream_count = stream_count; report->stream_count = stream_count;
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count); per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) { if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
per_profile_budget = MIN_PROFILE_BUDGET_BYTES; per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
} }
if (precision_filter != NULL) {
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
}
report->buffer_mb = (int)(total_budget / (1024u * 1024u)); report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details, append_detail(report->details,
sizeof(report->details), sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n", "requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
size_mb, size_mb,
report->buffer_mb, report->buffer_mb,
report->stream_count, report->stream_count,
mp_count, mp_count,
budget_profiles,
per_profile_budget / (1024u * 1024u)); per_profile_budget / (1024u * 1024u));
for (int i = 0; i < profile_count; i++) { for (int i = 0; i < profile_count; i++) {