Tune bee-gpu-burn single-precision benchmark phases
This commit is contained in:
@@ -35,6 +35,8 @@ typedef void *CUstream;
|
|||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||||
|
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -296,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
|||||||
return stream_count;
|
return stream_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||||
|
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||||
|
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
return profile_budget_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
if (!api->cuStreamDestroy) {
|
if (!api->cuStreamDestroy) {
|
||||||
return;
|
return;
|
||||||
@@ -908,11 +917,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
CUstream stream,
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
|
||||||
out->desc = *desc;
|
|
||||||
out->stream = stream;
|
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
|
size_t attempt_budget = profile_budget_bytes;
|
||||||
|
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||||
@@ -921,106 +928,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||||
out->m = dim;
|
memset(out, 0, sizeof(*out));
|
||||||
out->n = dim;
|
out->desc = *desc;
|
||||||
out->k = dim;
|
out->stream = stream;
|
||||||
|
|
||||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
out->m = dim;
|
||||||
desired_workspace = 32u * 1024u * 1024u;
|
out->n = dim;
|
||||||
}
|
out->k = dim;
|
||||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
||||||
|
|
||||||
size_t a_bytes = 0;
|
size_t desired_workspace = attempt_budget / 8u;
|
||||||
size_t b_bytes = 0;
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||||
size_t c_bytes = 0;
|
desired_workspace = 32u * 1024u * 1024u;
|
||||||
size_t d_bytes = 0;
|
}
|
||||||
size_t scale_bytes = 0;
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||||
while (1) {
|
|
||||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
||||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
||||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
||||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
||||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
||||||
|
|
||||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
size_t a_bytes = 0;
|
||||||
if (matrix_bytes <= profile_budget_bytes) {
|
size_t b_bytes = 0;
|
||||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
size_t c_bytes = 0;
|
||||||
out->workspace_size = desired_workspace;
|
size_t d_bytes = 0;
|
||||||
if (out->workspace_size > remaining) {
|
size_t scale_bytes = 0;
|
||||||
out->workspace_size = round_down_size(remaining, 256u);
|
while (1) {
|
||||||
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||||
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||||
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||||
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||||
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||||
|
|
||||||
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||||
|
if (matrix_bytes <= attempt_budget) {
|
||||||
|
size_t remaining = attempt_budget - matrix_bytes;
|
||||||
|
out->workspace_size = desired_workspace;
|
||||||
|
if (out->workspace_size > remaining) {
|
||||||
|
out->workspace_size = round_down_size(remaining, 256u);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out->m -= (uint64_t)desc->min_multiple;
|
||||||
|
out->n = out->m;
|
||||||
|
out->k = out->m;
|
||||||
|
}
|
||||||
|
if (out->m < (uint64_t)desc->min_multiple) {
|
||||||
|
attempt_budget /= 2u;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||||
return 0;
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||||
}
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||||
out->m -= (uint64_t)desc->min_multiple;
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||||
out->n = out->m;
|
|
||||||
out->k = out->m;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
if (!check_cublas("set TRANSA",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
||||||
&transa,
|
|
||||||
sizeof(transa))) ||
|
|
||||||
!check_cublas("set TRANSB",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
||||||
&transb,
|
|
||||||
sizeof(transb)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc->needs_scalar_scale) {
|
|
||||||
float one = 1.0f;
|
|
||||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
||||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
||||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
cublasOperation_t transa = CUBLAS_OP_T;
|
||||||
if (!check_cublas("set A scale ptr",
|
cublasOperation_t transb = CUBLAS_OP_N;
|
||||||
|
if (!check_cublas("set TRANSA",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||||
&a_scale_ptr,
|
&transa,
|
||||||
sizeof(a_scale_ptr))) ||
|
sizeof(transa))) ||
|
||||||
!check_cublas("set B scale ptr",
|
!check_cublas("set TRANSB",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||||
&b_scale_ptr,
|
&transb,
|
||||||
sizeof(b_scale_ptr)))) {
|
sizeof(transb)))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (desc->needs_scalar_scale) {
|
||||||
|
float one = 1.0f;
|
||||||
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||||
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||||
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||||
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||||
|
if (!check_cublas("set A scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||||
|
&a_scale_ptr,
|
||||||
|
sizeof(a_scale_ptr))) ||
|
||||||
|
!check_cublas("set B scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||||
|
&b_scale_ptr,
|
||||||
|
sizeof(b_scale_ptr)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||||
if (desc->needs_block_scale) {
|
if (desc->needs_block_scale) {
|
||||||
@@ -1060,62 +1076,65 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!check_cublas("create A layout",
|
if (!check_cublas("create A layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||||
!check_cublas("create B layout",
|
!check_cublas("create B layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||||
!check_cublas("create C layout",
|
!check_cublas("create C layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||||
!check_cublas("create D layout",
|
!check_cublas("create D layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (out->workspace_size > 0) {
|
|
||||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out->workspace_size > 0) {
|
||||||
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("set workspace",
|
||||||
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||||
|
out->preference,
|
||||||
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&out->workspace_size,
|
||||||
|
sizeof(out->workspace_size)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int found = 0;
|
||||||
|
if (check_cublas("heuristic",
|
||||||
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||||
|
out->op_desc,
|
||||||
|
out->a_layout,
|
||||||
|
out->b_layout,
|
||||||
|
out->c_layout,
|
||||||
|
out->d_layout,
|
||||||
|
out->preference,
|
||||||
|
1,
|
||||||
|
&out->heuristic,
|
||||||
|
&found)) &&
|
||||||
|
found > 0) {
|
||||||
|
out->ready = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||||
|
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_cublas("set workspace",
|
return 0;
|
||||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
out->preference,
|
|
||||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
||||||
&out->workspace_size,
|
|
||||||
sizeof(out->workspace_size)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int found = 0;
|
|
||||||
if (!check_cublas("heuristic",
|
|
||||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
||||||
out->op_desc,
|
|
||||||
out->a_layout,
|
|
||||||
out->b_layout,
|
|
||||||
out->c_layout,
|
|
||||||
out->d_layout,
|
|
||||||
out->preference,
|
|
||||||
1,
|
|
||||||
&out->heuristic,
|
|
||||||
&found))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (found <= 0) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
out->ready = 1;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
@@ -1180,6 +1199,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
|
int budget_profiles = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -1215,8 +1235,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Count all profiles active on this GPU regardless of filter.
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
* Used as the budget divisor so matrix sizes stay consistent whether
|
* Mixed phases still divide budget across the full precision set, while
|
||||||
* running all precisions together or a single-precision phase. */
|
* single-precision benchmark phases dedicate budget only to active
|
||||||
|
* profiles matching precision_filter. */
|
||||||
int planned_total = 0;
|
int planned_total = 0;
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||||
@@ -1226,19 +1247,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
if (planned_total < planned) {
|
if (planned_total < planned) {
|
||||||
planned_total = planned;
|
planned_total = planned;
|
||||||
}
|
}
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
budget_profiles = planned;
|
||||||
|
}
|
||||||
|
if (budget_profiles <= 0) {
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||||
|
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1251,18 +1282,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||||
|
}
|
||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
mp_count,
|
mp_count,
|
||||||
|
budget_profiles,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (int i = 0; i < profile_count; i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
|
|||||||
Reference in New Issue
Block a user