1177 lines
42 KiB
C
1177 lines
42 KiB
C
#define _POSIX_C_SOURCE 200809L
|
|
|
|
#include <dlfcn.h>
|
|
#include <math.h>
|
|
#include <stdarg.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <time.h>
|
|
|
|
#if defined(__has_include)
|
|
#if __has_include(<cublasLt.h>)
|
|
#include <cublasLt.h>
|
|
#define HAVE_CUBLASLT_HEADERS 1
|
|
#else
|
|
#define HAVE_CUBLASLT_HEADERS 0
|
|
#endif
|
|
#else
|
|
#define HAVE_CUBLASLT_HEADERS 0
|
|
#endif
|
|
|
|
typedef int CUdevice;
|
|
typedef uint64_t CUdeviceptr;
|
|
typedef int CUresult;
|
|
typedef void *CUcontext;
|
|
typedef void *CUmodule;
|
|
typedef void *CUfunction;
|
|
typedef void *CUstream;
|
|
|
|
#define CU_SUCCESS 0
|
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
|
|
|
static const char *ptx_source =
|
|
".version 6.0\n"
|
|
".target sm_30\n"
|
|
".address_size 64\n"
|
|
"\n"
|
|
".visible .entry burn(\n"
|
|
" .param .u64 data,\n"
|
|
" .param .u32 words,\n"
|
|
" .param .u32 rounds\n"
|
|
")\n"
|
|
"{\n"
|
|
" .reg .pred %p<2>;\n"
|
|
" .reg .b32 %r<8>;\n"
|
|
" .reg .b64 %rd<5>;\n"
|
|
"\n"
|
|
" ld.param.u64 %rd1, [data];\n"
|
|
" ld.param.u32 %r1, [words];\n"
|
|
" ld.param.u32 %r2, [rounds];\n"
|
|
" mov.u32 %r3, %ctaid.x;\n"
|
|
" mov.u32 %r4, %ntid.x;\n"
|
|
" mov.u32 %r5, %tid.x;\n"
|
|
" mad.lo.s32 %r0, %r3, %r4, %r5;\n"
|
|
" setp.ge.u32 %p0, %r0, %r1;\n"
|
|
" @%p0 bra DONE;\n"
|
|
" mul.wide.u32 %rd2, %r0, 4;\n"
|
|
" add.s64 %rd3, %rd1, %rd2;\n"
|
|
" ld.global.u32 %r6, [%rd3];\n"
|
|
"LOOP:\n"
|
|
" setp.eq.u32 %p1, %r2, 0;\n"
|
|
" @%p1 bra STORE;\n"
|
|
" mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
|
|
" sub.u32 %r2, %r2, 1;\n"
|
|
" bra LOOP;\n"
|
|
"STORE:\n"
|
|
" st.global.u32 [%rd3], %r6;\n"
|
|
"DONE:\n"
|
|
" ret;\n"
|
|
"}\n";
|
|
|
|
typedef CUresult (*cuInit_fn)(unsigned int);
|
|
typedef CUresult (*cuDeviceGetCount_fn)(int *);
|
|
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
|
|
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
|
|
typedef CUresult (*cuDeviceGetAttribute_fn)(int *, int, CUdevice);
|
|
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
|
|
typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
|
|
typedef CUresult (*cuCtxSynchronize_fn)(void);
|
|
typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
|
|
typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
|
|
typedef CUresult (*cuMemsetD8_fn)(CUdeviceptr, unsigned char, size_t);
|
|
typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
|
|
typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
|
|
typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
|
|
typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *);
|
|
typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
CUstream,
|
|
void **,
|
|
void **);
|
|
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
|
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
|
|
|
struct cuda_api {
|
|
void *lib;
|
|
cuInit_fn cuInit;
|
|
cuDeviceGetCount_fn cuDeviceGetCount;
|
|
cuDeviceGet_fn cuDeviceGet;
|
|
cuDeviceGetName_fn cuDeviceGetName;
|
|
cuDeviceGetAttribute_fn cuDeviceGetAttribute;
|
|
cuCtxCreate_fn cuCtxCreate;
|
|
cuCtxDestroy_fn cuCtxDestroy;
|
|
cuCtxSynchronize_fn cuCtxSynchronize;
|
|
cuMemAlloc_fn cuMemAlloc;
|
|
cuMemFree_fn cuMemFree;
|
|
cuMemsetD8_fn cuMemsetD8;
|
|
cuMemcpyHtoD_fn cuMemcpyHtoD;
|
|
cuMemcpyDtoH_fn cuMemcpyDtoH;
|
|
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
|
cuModuleGetFunction_fn cuModuleGetFunction;
|
|
cuLaunchKernel_fn cuLaunchKernel;
|
|
cuGetErrorName_fn cuGetErrorName;
|
|
cuGetErrorString_fn cuGetErrorString;
|
|
};
|
|
|
|
struct stress_report {
|
|
char backend[32];
|
|
char device[128];
|
|
int cc_major;
|
|
int cc_minor;
|
|
int buffer_mb;
|
|
unsigned long iterations;
|
|
uint64_t checksum;
|
|
char details[1024];
|
|
};
|
|
|
|
static int load_symbol(void *lib, const char *name, void **out) {
|
|
*out = dlsym(lib, name);
|
|
return *out != NULL;
|
|
}
|
|
|
|
static int load_cuda(struct cuda_api *api) {
|
|
memset(api, 0, sizeof(*api));
|
|
api->lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
|
|
if (!api->lib) {
|
|
return 0;
|
|
}
|
|
return
|
|
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
|
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
|
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
|
load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) &&
|
|
load_symbol(api->lib, "cuDeviceGetAttribute", (void **)&api->cuDeviceGetAttribute) &&
|
|
load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) &&
|
|
load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) &&
|
|
load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) &&
|
|
load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) &&
|
|
load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) &&
|
|
load_symbol(api->lib, "cuMemsetD8_v2", (void **)&api->cuMemsetD8) &&
|
|
load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) &&
|
|
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
|
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
|
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
|
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
|
}
|
|
|
|
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
|
const char *value = NULL;
|
|
if (api->cuGetErrorName && api->cuGetErrorName(rc, &value) == CU_SUCCESS && value) {
|
|
return value;
|
|
}
|
|
return "CUDA_ERROR";
|
|
}
|
|
|
|
static const char *cu_error_string(struct cuda_api *api, CUresult rc) {
|
|
const char *value = NULL;
|
|
if (api->cuGetErrorString && api->cuGetErrorString(rc, &value) == CU_SUCCESS && value) {
|
|
return value;
|
|
}
|
|
return "unknown";
|
|
}
|
|
|
|
static int check_rc(struct cuda_api *api, const char *step, CUresult rc) {
|
|
if (rc == CU_SUCCESS) {
|
|
return 1;
|
|
}
|
|
fprintf(stderr, "%s failed: %s (%s)\n", step, cu_error_name(api, rc), cu_error_string(api, rc));
|
|
return 0;
|
|
}
|
|
|
|
static double now_seconds(void) {
|
|
struct timespec ts;
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
|
}
|
|
|
|
#if HAVE_CUBLASLT_HEADERS
|
|
static size_t round_down_size(size_t value, size_t multiple) {
|
|
if (multiple == 0 || value < multiple) {
|
|
return value;
|
|
}
|
|
return value - (value % multiple);
|
|
}
|
|
#endif
|
|
|
|
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
|
int cc_major = 0;
|
|
int cc_minor = 0;
|
|
if (!check_rc(api,
|
|
"cuDeviceGetAttribute(major)",
|
|
api->cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev))) {
|
|
return 0;
|
|
}
|
|
if (!check_rc(api,
|
|
"cuDeviceGetAttribute(minor)",
|
|
api->cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev))) {
|
|
return 0;
|
|
}
|
|
*major = cc_major;
|
|
*minor = cc_minor;
|
|
return 1;
|
|
}
|
|
|
|
#if HAVE_CUBLASLT_HEADERS
|
|
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
|
size_t len = strlen(buf);
|
|
if (len >= cap) {
|
|
return;
|
|
}
|
|
va_list ap;
|
|
va_start(ap, fmt);
|
|
vsnprintf(buf + len, cap - len, fmt, ap);
|
|
va_end(ap);
|
|
}
|
|
#endif
|
|
|
|
static int run_ptx_fallback(struct cuda_api *api,
|
|
CUdevice dev,
|
|
const char *device_name,
|
|
int cc_major,
|
|
int cc_minor,
|
|
int seconds,
|
|
int size_mb,
|
|
struct stress_report *report) {
|
|
CUcontext ctx = NULL;
|
|
CUdeviceptr device_mem = 0;
|
|
CUmodule module = NULL;
|
|
CUfunction kernel = NULL;
|
|
uint32_t sample[256];
|
|
uint32_t words = 0;
|
|
unsigned long iterations = 0;
|
|
|
|
memset(report, 0, sizeof(*report));
|
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
|
snprintf(report->device, sizeof(report->device), "%s", device_name);
|
|
report->cc_major = cc_major;
|
|
report->cc_minor = cc_minor;
|
|
report->buffer_mb = size_mb;
|
|
|
|
if (!check_rc(api, "cuCtxCreate", api->cuCtxCreate(&ctx, 0, dev))) {
|
|
return 0;
|
|
}
|
|
|
|
size_t bytes = (size_t)size_mb * 1024u * 1024u;
|
|
if (bytes < 4u * 1024u * 1024u) {
|
|
bytes = 4u * 1024u * 1024u;
|
|
}
|
|
if (bytes > (size_t)1024u * 1024u * 1024u) {
|
|
bytes = (size_t)1024u * 1024u * 1024u;
|
|
}
|
|
words = (uint32_t)(bytes / sizeof(uint32_t));
|
|
|
|
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
if (!check_rc(api,
|
|
"cuModuleLoadDataEx",
|
|
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
unsigned int threads = 256;
|
|
unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
|
|
uint32_t rounds = 1024;
|
|
void *params[] = {&device_mem, &words, &rounds};
|
|
|
|
double start = now_seconds();
|
|
double deadline = start + (double)seconds;
|
|
while (now_seconds() < deadline) {
|
|
if (!check_rc(api,
|
|
"cuLaunchKernel",
|
|
api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
iterations++;
|
|
}
|
|
|
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
|
report->checksum += sample[i];
|
|
}
|
|
report->iterations = iterations;
|
|
snprintf(report->details,
|
|
sizeof(report->details),
|
|
"profile_int32_fallback=OK iterations=%lu\n",
|
|
iterations);
|
|
|
|
api->cuMemFree(device_mem);
|
|
api->cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
|
|
#if HAVE_CUBLASLT_HEADERS
|
|
typedef cublasStatus_t (*cublasLtCreate_fn)(cublasLtHandle_t *);
|
|
typedef cublasStatus_t (*cublasLtDestroy_fn)(cublasLtHandle_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulDescCreate_fn)(cublasLtMatmulDesc_t *,
|
|
cublasComputeType_t,
|
|
cudaDataType_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulDescDestroy_fn)(cublasLtMatmulDesc_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_fn)(cublasLtMatmulDesc_t,
|
|
cublasLtMatmulDescAttributes_t,
|
|
const void *,
|
|
size_t);
|
|
typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_fn)(cublasLtMatrixLayout_t *,
|
|
cudaDataType_t,
|
|
uint64_t,
|
|
uint64_t,
|
|
int64_t);
|
|
typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_fn)(cublasLtMatrixLayout_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulPreferenceCreate_fn)(cublasLtMatmulPreference_t *);
|
|
typedef cublasStatus_t (*cublasLtMatmulPreferenceDestroy_fn)(cublasLtMatmulPreference_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulPreferenceSetAttribute_fn)(cublasLtMatmulPreference_t,
|
|
cublasLtMatmulPreferenceAttributes_t,
|
|
const void *,
|
|
size_t);
|
|
typedef cublasStatus_t (*cublasLtMatmulAlgoGetHeuristic_fn)(
|
|
cublasLtHandle_t,
|
|
cublasLtMatmulDesc_t,
|
|
cublasLtMatrixLayout_t,
|
|
cublasLtMatrixLayout_t,
|
|
cublasLtMatrixLayout_t,
|
|
cublasLtMatrixLayout_t,
|
|
cublasLtMatmulPreference_t,
|
|
int,
|
|
cublasLtMatmulHeuristicResult_t *,
|
|
int *);
|
|
typedef cublasStatus_t (*cublasLtMatmul_fn)(cublasLtHandle_t,
|
|
cublasLtMatmulDesc_t,
|
|
const void *,
|
|
const void *,
|
|
cublasLtMatrixLayout_t,
|
|
const void *,
|
|
cublasLtMatrixLayout_t,
|
|
const void *,
|
|
const void *,
|
|
cublasLtMatrixLayout_t,
|
|
void *,
|
|
cublasLtMatrixLayout_t,
|
|
const cublasLtMatmulAlgo_t *,
|
|
void *,
|
|
size_t,
|
|
cudaStream_t);
|
|
|
|
struct cublaslt_api {
|
|
void *lib;
|
|
cublasLtCreate_fn cublasLtCreate;
|
|
cublasLtDestroy_fn cublasLtDestroy;
|
|
cublasLtMatmulDescCreate_fn cublasLtMatmulDescCreate;
|
|
cublasLtMatmulDescDestroy_fn cublasLtMatmulDescDestroy;
|
|
cublasLtMatmulDescSetAttribute_fn cublasLtMatmulDescSetAttribute;
|
|
cublasLtMatrixLayoutCreate_fn cublasLtMatrixLayoutCreate;
|
|
cublasLtMatrixLayoutDestroy_fn cublasLtMatrixLayoutDestroy;
|
|
cublasLtMatmulPreferenceCreate_fn cublasLtMatmulPreferenceCreate;
|
|
cublasLtMatmulPreferenceDestroy_fn cublasLtMatmulPreferenceDestroy;
|
|
cublasLtMatmulPreferenceSetAttribute_fn cublasLtMatmulPreferenceSetAttribute;
|
|
cublasLtMatmulAlgoGetHeuristic_fn cublasLtMatmulAlgoGetHeuristic;
|
|
cublasLtMatmul_fn cublasLtMatmul;
|
|
};
|
|
|
|
struct profile_desc {
|
|
const char *name;
|
|
const char *block_label;
|
|
int min_cc;
|
|
int enabled;
|
|
int needs_scalar_scale;
|
|
int needs_block_scale;
|
|
int min_multiple;
|
|
cudaDataType_t a_type;
|
|
cudaDataType_t b_type;
|
|
cudaDataType_t c_type;
|
|
cudaDataType_t d_type;
|
|
cublasComputeType_t compute_type;
|
|
};
|
|
|
|
struct prepared_profile {
|
|
struct profile_desc desc;
|
|
cublasLtMatmulDesc_t op_desc;
|
|
cublasLtMatrixLayout_t a_layout;
|
|
cublasLtMatrixLayout_t b_layout;
|
|
cublasLtMatrixLayout_t c_layout;
|
|
cublasLtMatrixLayout_t d_layout;
|
|
cublasLtMatmulPreference_t preference;
|
|
cublasLtMatmulHeuristicResult_t heuristic;
|
|
CUdeviceptr a_dev;
|
|
CUdeviceptr b_dev;
|
|
CUdeviceptr c_dev;
|
|
CUdeviceptr d_dev;
|
|
CUdeviceptr a_scale_dev;
|
|
CUdeviceptr b_scale_dev;
|
|
CUdeviceptr workspace_dev;
|
|
size_t workspace_size;
|
|
uint64_t m;
|
|
uint64_t n;
|
|
uint64_t k;
|
|
unsigned long iterations;
|
|
int ready;
|
|
};
|
|
|
|
static const struct profile_desc k_profiles[] = {
|
|
{
|
|
"fp32_tf32",
|
|
"fp32",
|
|
80,
|
|
1,
|
|
0,
|
|
0,
|
|
128,
|
|
CUDA_R_32F,
|
|
CUDA_R_32F,
|
|
CUDA_R_32F,
|
|
CUDA_R_32F,
|
|
CUBLAS_COMPUTE_32F_FAST_TF32,
|
|
},
|
|
{
|
|
"fp16_tensor",
|
|
"fp16",
|
|
80,
|
|
1,
|
|
0,
|
|
0,
|
|
128,
|
|
CUDA_R_16F,
|
|
CUDA_R_16F,
|
|
CUDA_R_16F,
|
|
CUDA_R_16F,
|
|
CUBLAS_COMPUTE_32F_FAST_16F,
|
|
},
|
|
{
|
|
"fp8_e4m3",
|
|
"fp8",
|
|
89,
|
|
1,
|
|
1,
|
|
0,
|
|
128,
|
|
CUDA_R_8F_E4M3,
|
|
CUDA_R_8F_E4M3,
|
|
CUDA_R_16BF,
|
|
CUDA_R_16BF,
|
|
CUBLAS_COMPUTE_32F,
|
|
},
|
|
{
|
|
"fp8_e5m2",
|
|
"fp8",
|
|
89,
|
|
1,
|
|
1,
|
|
0,
|
|
128,
|
|
CUDA_R_8F_E5M2,
|
|
CUDA_R_8F_E5M2,
|
|
CUDA_R_16BF,
|
|
CUDA_R_16BF,
|
|
CUBLAS_COMPUTE_32F,
|
|
},
|
|
#if defined(CUDA_R_4F_E2M1) && defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
|
{
|
|
"fp4_e2m1",
|
|
"fp4",
|
|
100,
|
|
1,
|
|
0,
|
|
1,
|
|
128,
|
|
CUDA_R_4F_E2M1,
|
|
CUDA_R_4F_E2M1,
|
|
CUDA_R_16BF,
|
|
CUDA_R_16BF,
|
|
CUBLAS_COMPUTE_32F,
|
|
},
|
|
#endif
|
|
};
|
|
|
|
static int load_cublaslt(struct cublaslt_api *api) {
|
|
memset(api, 0, sizeof(*api));
|
|
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
|
if (!api->lib) {
|
|
api->lib = dlopen("libcublasLt.so", RTLD_NOW | RTLD_LOCAL);
|
|
}
|
|
if (!api->lib) {
|
|
return 0;
|
|
}
|
|
return
|
|
load_symbol(api->lib, "cublasLtCreate", (void **)&api->cublasLtCreate) &&
|
|
load_symbol(api->lib, "cublasLtDestroy", (void **)&api->cublasLtDestroy) &&
|
|
load_symbol(api->lib, "cublasLtMatmulDescCreate", (void **)&api->cublasLtMatmulDescCreate) &&
|
|
load_symbol(api->lib, "cublasLtMatmulDescDestroy", (void **)&api->cublasLtMatmulDescDestroy) &&
|
|
load_symbol(api->lib,
|
|
"cublasLtMatmulDescSetAttribute",
|
|
(void **)&api->cublasLtMatmulDescSetAttribute) &&
|
|
load_symbol(api->lib, "cublasLtMatrixLayoutCreate", (void **)&api->cublasLtMatrixLayoutCreate) &&
|
|
load_symbol(api->lib, "cublasLtMatrixLayoutDestroy", (void **)&api->cublasLtMatrixLayoutDestroy) &&
|
|
load_symbol(api->lib,
|
|
"cublasLtMatmulPreferenceCreate",
|
|
(void **)&api->cublasLtMatmulPreferenceCreate) &&
|
|
load_symbol(api->lib,
|
|
"cublasLtMatmulPreferenceDestroy",
|
|
(void **)&api->cublasLtMatmulPreferenceDestroy) &&
|
|
load_symbol(api->lib,
|
|
"cublasLtMatmulPreferenceSetAttribute",
|
|
(void **)&api->cublasLtMatmulPreferenceSetAttribute) &&
|
|
load_symbol(api->lib,
|
|
"cublasLtMatmulAlgoGetHeuristic",
|
|
(void **)&api->cublasLtMatmulAlgoGetHeuristic) &&
|
|
load_symbol(api->lib, "cublasLtMatmul", (void **)&api->cublasLtMatmul);
|
|
}
|
|
|
|
static const char *cublas_status_text(cublasStatus_t status) {
|
|
switch (status) {
|
|
case CUBLAS_STATUS_SUCCESS:
|
|
return "CUBLAS_STATUS_SUCCESS";
|
|
case CUBLAS_STATUS_NOT_INITIALIZED:
|
|
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
|
case CUBLAS_STATUS_ALLOC_FAILED:
|
|
return "CUBLAS_STATUS_ALLOC_FAILED";
|
|
case CUBLAS_STATUS_INVALID_VALUE:
|
|
return "CUBLAS_STATUS_INVALID_VALUE";
|
|
case CUBLAS_STATUS_ARCH_MISMATCH:
|
|
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
|
case CUBLAS_STATUS_MAPPING_ERROR:
|
|
return "CUBLAS_STATUS_MAPPING_ERROR";
|
|
case CUBLAS_STATUS_EXECUTION_FAILED:
|
|
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
|
case CUBLAS_STATUS_INTERNAL_ERROR:
|
|
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
|
case CUBLAS_STATUS_NOT_SUPPORTED:
|
|
return "CUBLAS_STATUS_NOT_SUPPORTED";
|
|
default:
|
|
return "CUBLAS_STATUS_UNKNOWN";
|
|
}
|
|
}
|
|
|
|
static int check_cublas(const char *step, cublasStatus_t status) {
|
|
if (status == CUBLAS_STATUS_SUCCESS) {
|
|
return 1;
|
|
}
|
|
fprintf(stderr, "%s failed: %s (%d)\n", step, cublas_status_text(status), (int)status);
|
|
return 0;
|
|
}
|
|
|
|
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
|
switch (type) {
|
|
case CUDA_R_32F:
|
|
return (size_t)(elements * 4u);
|
|
case CUDA_R_16F:
|
|
case CUDA_R_16BF:
|
|
return (size_t)(elements * 2u);
|
|
case CUDA_R_8F_E4M3:
|
|
case CUDA_R_8F_E5M2:
|
|
return (size_t)(elements);
|
|
#if defined(CUDA_R_4F_E2M1)
|
|
case CUDA_R_4F_E2M1:
|
|
return (size_t)((elements + 1u) / 2u);
|
|
#endif
|
|
default:
|
|
return (size_t)(elements * 4u);
|
|
}
|
|
}
|
|
|
|
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
|
uint64_t row_tiles = (rows + 127u) / 128u;
|
|
uint64_t col_tiles = (cols + 63u) / 64u;
|
|
return (size_t)(row_tiles * col_tiles * 128u);
|
|
}
|
|
|
|
static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, int multiple) {
|
|
double approx = sqrt((double)budget_bytes / (double)bytes_per_cell);
|
|
uint64_t dim = (uint64_t)approx;
|
|
if (dim < (uint64_t)multiple) {
|
|
dim = (uint64_t)multiple;
|
|
}
|
|
dim = (uint64_t)round_down_size((size_t)dim, (size_t)multiple);
|
|
if (dim < (uint64_t)multiple) {
|
|
dim = (uint64_t)multiple;
|
|
}
|
|
if (dim > 8192u) {
|
|
dim = 8192u;
|
|
}
|
|
return dim;
|
|
}
|
|
|
|
static int device_upload(struct cuda_api *cuda, CUdeviceptr dev, const void *src, size_t bytes) {
|
|
return check_rc(cuda, "cuMemcpyHtoD", cuda->cuMemcpyHtoD(dev, src, bytes));
|
|
}
|
|
|
|
static int alloc_filled(struct cuda_api *cuda, CUdeviceptr *ptr, size_t bytes, unsigned char pattern) {
|
|
if (!check_rc(cuda, "cuMemAlloc", cuda->cuMemAlloc(ptr, bytes))) {
|
|
return 0;
|
|
}
|
|
if (!check_rc(cuda, "cuMemsetD8", cuda->cuMemsetD8(*ptr, pattern, bytes))) {
|
|
cuda->cuMemFree(*ptr);
|
|
*ptr = 0;
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static size_t profile_scale_bytes(const struct profile_desc *desc, uint64_t m, uint64_t n, uint64_t k) {
|
|
size_t bytes = 0;
|
|
if (desc->needs_scalar_scale) {
|
|
bytes += 2u * sizeof(float);
|
|
}
|
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
|
if (desc->needs_block_scale) {
|
|
bytes += fp4_scale_bytes(k, m);
|
|
bytes += fp4_scale_bytes(k, n);
|
|
}
|
|
#else
|
|
(void)m;
|
|
(void)n;
|
|
(void)k;
|
|
#endif
|
|
return bytes;
|
|
}
|
|
|
|
static void destroy_profile(struct cublaslt_api *cublas, struct cuda_api *cuda, struct prepared_profile *profile) {
|
|
if (profile->workspace_dev) {
|
|
cuda->cuMemFree(profile->workspace_dev);
|
|
}
|
|
if (profile->a_scale_dev) {
|
|
cuda->cuMemFree(profile->a_scale_dev);
|
|
}
|
|
if (profile->b_scale_dev) {
|
|
cuda->cuMemFree(profile->b_scale_dev);
|
|
}
|
|
if (profile->d_dev) {
|
|
cuda->cuMemFree(profile->d_dev);
|
|
}
|
|
if (profile->c_dev) {
|
|
cuda->cuMemFree(profile->c_dev);
|
|
}
|
|
if (profile->b_dev) {
|
|
cuda->cuMemFree(profile->b_dev);
|
|
}
|
|
if (profile->a_dev) {
|
|
cuda->cuMemFree(profile->a_dev);
|
|
}
|
|
if (profile->preference) {
|
|
cublas->cublasLtMatmulPreferenceDestroy(profile->preference);
|
|
}
|
|
if (profile->d_layout) {
|
|
cublas->cublasLtMatrixLayoutDestroy(profile->d_layout);
|
|
}
|
|
if (profile->c_layout) {
|
|
cublas->cublasLtMatrixLayoutDestroy(profile->c_layout);
|
|
}
|
|
if (profile->b_layout) {
|
|
cublas->cublasLtMatrixLayoutDestroy(profile->b_layout);
|
|
}
|
|
if (profile->a_layout) {
|
|
cublas->cublasLtMatrixLayoutDestroy(profile->a_layout);
|
|
}
|
|
if (profile->op_desc) {
|
|
cublas->cublasLtMatmulDescDestroy(profile->op_desc);
|
|
}
|
|
memset(profile, 0, sizeof(*profile));
|
|
}
|
|
|
|
static int prepare_profile(struct cublaslt_api *cublas,
|
|
cublasLtHandle_t handle,
|
|
struct cuda_api *cuda,
|
|
const struct profile_desc *desc,
|
|
size_t profile_budget_bytes,
|
|
struct prepared_profile *out) {
|
|
memset(out, 0, sizeof(*out));
|
|
out->desc = *desc;
|
|
|
|
size_t bytes_per_cell = 0;
|
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
|
bytes_per_cell += bytes_for_elements(desc->d_type, 1);
|
|
if (bytes_per_cell == 0) {
|
|
return 0;
|
|
}
|
|
|
|
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
|
out->m = dim;
|
|
out->n = dim;
|
|
out->k = dim;
|
|
|
|
size_t desired_workspace = profile_budget_bytes / 8u;
|
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
|
desired_workspace = 32u * 1024u * 1024u;
|
|
}
|
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
|
|
size_t a_bytes = 0;
|
|
size_t b_bytes = 0;
|
|
size_t c_bytes = 0;
|
|
size_t d_bytes = 0;
|
|
size_t scale_bytes = 0;
|
|
while (1) {
|
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
|
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
|
if (matrix_bytes <= profile_budget_bytes) {
|
|
size_t remaining = profile_budget_bytes - matrix_bytes;
|
|
out->workspace_size = desired_workspace;
|
|
if (out->workspace_size > remaining) {
|
|
out->workspace_size = round_down_size(remaining, 256u);
|
|
}
|
|
break;
|
|
}
|
|
|
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
|
return 0;
|
|
}
|
|
out->m -= (uint64_t)desc->min_multiple;
|
|
out->n = out->m;
|
|
out->k = out->m;
|
|
}
|
|
|
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
cublasOperation_t transa = CUBLAS_OP_T;
|
|
cublasOperation_t transb = CUBLAS_OP_N;
|
|
if (!check_cublas("set TRANSA",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
&transa,
|
|
sizeof(transa))) ||
|
|
!check_cublas("set TRANSB",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
&transb,
|
|
sizeof(transb)))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
if (desc->needs_scalar_scale) {
|
|
float one = 1.0f;
|
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
|
if (!check_cublas("set A scale ptr",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
|
&a_scale_ptr,
|
|
sizeof(a_scale_ptr))) ||
|
|
!check_cublas("set B scale ptr",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
|
&b_scale_ptr,
|
|
sizeof(b_scale_ptr)))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
|
if (desc->needs_block_scale) {
|
|
size_t a_scale_bytes = fp4_scale_bytes(out->k, out->m);
|
|
size_t b_scale_bytes = fp4_scale_bytes(out->k, out->n);
|
|
if (!alloc_filled(cuda, &out->a_scale_dev, a_scale_bytes, 0x11) ||
|
|
!alloc_filled(cuda, &out->b_scale_dev, b_scale_bytes, 0x11)) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
cublasLtMatmulMatrixScale_t scale_mode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
|
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
|
if (!check_cublas("set A scale mode",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
|
|
&scale_mode,
|
|
sizeof(scale_mode))) ||
|
|
!check_cublas("set B scale mode",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_B_SCALE_MODE,
|
|
&scale_mode,
|
|
sizeof(scale_mode))) ||
|
|
!check_cublas("set A block scale ptr",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
|
&a_scale_ptr,
|
|
sizeof(a_scale_ptr))) ||
|
|
!check_cublas("set B block scale ptr",
|
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
|
&b_scale_ptr,
|
|
sizeof(b_scale_ptr)))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (!check_cublas("create A layout",
|
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
|
!check_cublas("create B layout",
|
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
|
!check_cublas("create C layout",
|
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
|
!check_cublas("create D layout",
|
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
if (out->workspace_size > 0) {
|
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
if (!check_cublas("set workspace",
|
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
out->preference,
|
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
&out->workspace_size,
|
|
sizeof(out->workspace_size)))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
int found = 0;
|
|
if (!check_cublas("heuristic",
|
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
out->op_desc,
|
|
out->a_layout,
|
|
out->b_layout,
|
|
out->c_layout,
|
|
out->d_layout,
|
|
out->preference,
|
|
1,
|
|
&out->heuristic,
|
|
&found))) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
if (found <= 0) {
|
|
destroy_profile(cublas, cuda, out);
|
|
return 0;
|
|
}
|
|
|
|
out->ready = 1;
|
|
return 1;
|
|
}
|
|
|
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
|
struct cublaslt_api *cublas,
|
|
struct prepared_profile *profile) {
|
|
float alpha = 1.0f;
|
|
float beta = 0.0f;
|
|
return check_cublas(profile->desc.name,
|
|
cublas->cublasLtMatmul(handle,
|
|
profile->op_desc,
|
|
&alpha,
|
|
(const void *)(uintptr_t)profile->a_dev,
|
|
profile->a_layout,
|
|
(const void *)(uintptr_t)profile->b_dev,
|
|
profile->b_layout,
|
|
&beta,
|
|
(const void *)(uintptr_t)profile->c_dev,
|
|
profile->c_layout,
|
|
(void *)(uintptr_t)profile->d_dev,
|
|
profile->d_layout,
|
|
&profile->heuristic.algo,
|
|
(void *)(uintptr_t)profile->workspace_dev,
|
|
profile->workspace_size,
|
|
(cudaStream_t)0));
|
|
}
|
|
|
|
static int run_cublaslt_stress(struct cuda_api *cuda,
|
|
CUdevice dev,
|
|
const char *device_name,
|
|
int cc_major,
|
|
int cc_minor,
|
|
int seconds,
|
|
int size_mb,
|
|
struct stress_report *report) {
|
|
struct cublaslt_api cublas;
|
|
struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
|
|
cublasLtHandle_t handle = NULL;
|
|
CUcontext ctx = NULL;
|
|
uint16_t sample[256];
|
|
int cc = cc_major * 10 + cc_minor;
|
|
int planned = 0;
|
|
int active = 0;
|
|
|
|
memset(report, 0, sizeof(*report));
|
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
|
snprintf(report->device, sizeof(report->device), "%s", device_name);
|
|
report->cc_major = cc_major;
|
|
report->cc_minor = cc_minor;
|
|
report->buffer_mb = size_mb;
|
|
|
|
if (!load_cublaslt(&cublas)) {
|
|
snprintf(report->details, sizeof(report->details), "cublasLt=unavailable\n");
|
|
return 0;
|
|
}
|
|
if (!check_rc(cuda, "cuCtxCreate", cuda->cuCtxCreate(&ctx, 0, dev))) {
|
|
return 0;
|
|
}
|
|
if (!check_cublas("cublasLtCreate", cublas.cublasLtCreate(&handle))) {
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
|
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
|
planned++;
|
|
}
|
|
}
|
|
if (planned <= 0) {
|
|
snprintf(report->details, sizeof(report->details), "cublasLt_profiles=unsupported\n");
|
|
cublas.cublasLtDestroy(handle);
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
size_t total_budget = (size_t)size_mb * 1024u * 1024u;
|
|
if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
|
|
total_budget = (size_t)planned * 4u * 1024u * 1024u;
|
|
}
|
|
size_t per_profile_budget = total_budget / (size_t)planned;
|
|
if (per_profile_budget < 4u * 1024u * 1024u) {
|
|
per_profile_budget = 4u * 1024u * 1024u;
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
|
const struct profile_desc *desc = &k_profiles[i];
|
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
|
append_detail(report->details,
|
|
sizeof(report->details),
|
|
"%s=SKIPPED cc<%d\n",
|
|
desc->name,
|
|
desc->min_cc);
|
|
continue;
|
|
}
|
|
if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
|
|
active++;
|
|
append_detail(report->details,
|
|
sizeof(report->details),
|
|
"%s=READY dim=%llux%llux%llu block=%s\n",
|
|
desc->name,
|
|
(unsigned long long)prepared[i].m,
|
|
(unsigned long long)prepared[i].n,
|
|
(unsigned long long)prepared[i].k,
|
|
desc->block_label);
|
|
} else {
|
|
append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
|
|
}
|
|
}
|
|
|
|
if (active <= 0) {
|
|
cublas.cublasLtDestroy(handle);
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
double deadline = now_seconds() + (double)seconds;
|
|
while (now_seconds() < deadline) {
|
|
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
if (!prepared[i].ready) {
|
|
continue;
|
|
}
|
|
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
|
append_detail(report->details,
|
|
sizeof(report->details),
|
|
"%s=FAILED runtime\n",
|
|
prepared[i].desc.name);
|
|
for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
|
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
|
}
|
|
cublas.cublasLtDestroy(handle);
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
prepared[i].iterations++;
|
|
report->iterations++;
|
|
if (now_seconds() >= deadline) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
|
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
|
}
|
|
cublas.cublasLtDestroy(handle);
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
if (!prepared[i].ready) {
|
|
continue;
|
|
}
|
|
append_detail(report->details,
|
|
sizeof(report->details),
|
|
"%s_iterations=%lu\n",
|
|
prepared[i].desc.name,
|
|
prepared[i].iterations);
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
if (prepared[i].ready) {
|
|
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
|
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
|
report->checksum += sample[j];
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
|
}
|
|
cublas.cublasLtDestroy(handle);
|
|
cuda->cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
#endif
|
|
|
|
int main(int argc, char **argv) {
|
|
int seconds = 5;
|
|
int size_mb = 64;
|
|
for (int i = 1; i < argc; i++) {
|
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
|
seconds = atoi(argv[++i]);
|
|
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
|
size_mb = atoi(argv[++i]);
|
|
} else {
|
|
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
|
return 2;
|
|
}
|
|
}
|
|
if (seconds <= 0) {
|
|
seconds = 5;
|
|
}
|
|
if (size_mb <= 0) {
|
|
size_mb = 64;
|
|
}
|
|
|
|
struct cuda_api cuda;
|
|
if (!load_cuda(&cuda)) {
|
|
fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
|
|
return 1;
|
|
}
|
|
load_symbol(cuda.lib, "cuGetErrorName", (void **)&cuda.cuGetErrorName);
|
|
load_symbol(cuda.lib, "cuGetErrorString", (void **)&cuda.cuGetErrorString);
|
|
|
|
if (!check_rc(&cuda, "cuInit", cuda.cuInit(0))) {
|
|
return 1;
|
|
}
|
|
|
|
int count = 0;
|
|
if (!check_rc(&cuda, "cuDeviceGetCount", cuda.cuDeviceGetCount(&count))) {
|
|
return 1;
|
|
}
|
|
if (count <= 0) {
|
|
fprintf(stderr, "no CUDA devices found\n");
|
|
return 1;
|
|
}
|
|
|
|
CUdevice dev = 0;
|
|
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
|
return 1;
|
|
}
|
|
|
|
char name[128] = {0};
|
|
if (!check_rc(&cuda, "cuDeviceGetName", cuda.cuDeviceGetName(name, (int)sizeof(name), dev))) {
|
|
return 1;
|
|
}
|
|
|
|
int cc_major = 0;
|
|
int cc_minor = 0;
|
|
if (!query_compute_capability(&cuda, dev, &cc_major, &cc_minor)) {
|
|
return 1;
|
|
}
|
|
|
|
struct stress_report report;
|
|
int ok = 0;
|
|
|
|
#if HAVE_CUBLASLT_HEADERS
|
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
|
#endif
|
|
if (!ok) {
|
|
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
printf("device=%s\n", report.device);
|
|
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
|
printf("backend=%s\n", report.backend);
|
|
printf("duration_s=%d\n", seconds);
|
|
printf("buffer_mb=%d\n", report.buffer_mb);
|
|
printf("iterations=%lu\n", report.iterations);
|
|
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
|
if (report.details[0] != '\0') {
|
|
printf("%s", report.details);
|
|
}
|
|
printf("status=OK\n");
|
|
return 0;
|
|
}
|