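/*
 * GPU burn-in utility.
 *
 * Loads the CUDA Driver API from libcuda.so.1 at run time, loads a small PTX
 * kernel, and launches it repeatedly over a device buffer for a fixed number
 * of seconds before printing a short key=value report.
 */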
#define _POSIX_C_SOURCE 200809L

#include <dlfcn.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * Local stand-ins for the CUDA Driver API types this file passes to the
 * entry points it resolves from libcuda.so.1 at run time.
 * NOTE(review): these are assumed to match the driver's own definitions
 * (64-bit device pointers, pointer-sized opaque handles) - confirm against
 * cuda.h when targeting a new platform.
 */
typedef int CUdevice;         /* device handle */
typedef uint64_t CUdeviceptr; /* address of device memory */
typedef int CUresult;         /* status code returned by every driver call */
typedef void *CUcontext;      /* opaque driver handles */
typedef void *CUmodule;
typedef void *CUfunction;
typedef void *CUstream;

/* Status value that check_rc() and the error helpers treat as success. */
#define CU_SUCCESS 0

/*
 * PTX text for the `burn` kernel, handed to cuModuleLoadDataEx() as-is.
 *
 * Parameters: data (u64 base address), words (u32 element count),
 * rounds (u32 iteration count).
 *
 * Each thread computes its global index as ctaid.x * ntid.x + tid.x and
 * returns immediately when that index is >= words. Otherwise it loads the
 * 32-bit word at data + index * 4, applies the linear congruential step
 * x = x * 1664525 + 1013904223 `rounds` times, and stores the result back
 * to the same address.
 *
 * The pointer itself is const: nothing in this file ever re-targets it.
 */
static const char *const ptx_source =
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    "\n"
    ".visible .entry burn(\n"
    " .param .u64 data,\n"
    " .param .u32 words,\n"
    " .param .u32 rounds\n"
    ")\n"
    "{\n"
    " .reg .pred %p<2>;\n"
    " .reg .b32 %r<8>;\n"
    " .reg .b64 %rd<5>;\n"
    "\n"
    " ld.param.u64 %rd1, [data];\n"
    " ld.param.u32 %r1, [words];\n"
    " ld.param.u32 %r2, [rounds];\n"
    " mov.u32 %r3, %ctaid.x;\n"
    " mov.u32 %r4, %ntid.x;\n"
    " mov.u32 %r5, %tid.x;\n"
    " mad.lo.s32 %r0, %r3, %r4, %r5;\n"
    " setp.ge.u32 %p0, %r0, %r1;\n"
    " @%p0 bra DONE;\n"
    " mul.wide.u32 %rd2, %r0, 4;\n"
    " add.s64 %rd3, %rd1, %rd2;\n"
    " ld.global.u32 %r6, [%rd3];\n"
    "LOOP:\n"
    " setp.eq.u32 %p1, %r2, 0;\n"
    " @%p1 bra STORE;\n"
    " mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
    " sub.u32 %r2, %r2, 1;\n"
    " bra LOOP;\n"
    "STORE:\n"
    " st.global.u32 [%rd3], %r6;\n"
    "DONE:\n"
    " ret;\n"
    "}\n";

typedef CUresult (*cuInit_fn)(unsigned int);
|
|
typedef CUresult (*cuDeviceGetCount_fn)(int *);
|
|
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
|
|
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
|
|
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
|
|
typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
|
|
typedef CUresult (*cuCtxSynchronize_fn)(void);
|
|
typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
|
|
typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
|
|
typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
|
|
typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
|
|
typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
|
|
typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *);
|
|
typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
unsigned int,
|
|
CUstream,
|
|
void **,
|
|
void **);
|
|
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
|
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
|
|
|
struct cuda_api {
|
|
void *lib;
|
|
cuInit_fn cuInit;
|
|
cuDeviceGetCount_fn cuDeviceGetCount;
|
|
cuDeviceGet_fn cuDeviceGet;
|
|
cuDeviceGetName_fn cuDeviceGetName;
|
|
cuCtxCreate_fn cuCtxCreate;
|
|
cuCtxDestroy_fn cuCtxDestroy;
|
|
cuCtxSynchronize_fn cuCtxSynchronize;
|
|
cuMemAlloc_fn cuMemAlloc;
|
|
cuMemFree_fn cuMemFree;
|
|
cuMemcpyHtoD_fn cuMemcpyHtoD;
|
|
cuMemcpyDtoH_fn cuMemcpyDtoH;
|
|
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
|
cuModuleGetFunction_fn cuModuleGetFunction;
|
|
cuLaunchKernel_fn cuLaunchKernel;
|
|
cuGetErrorName_fn cuGetErrorName;
|
|
cuGetErrorString_fn cuGetErrorString;
|
|
};
|
|
|
|
/*
 * Resolve `name` in the library `lib` with dlsym() and store the address in
 * *out.
 *
 * Returns 1 when the symbol was found, 0 otherwise. *out is always written
 * and is NULL on failure, so callers may ignore the return value for
 * optional symbols and test the pointer later.
 */
static int load_symbol(void *lib, const char *name, void **out) {
    void *address = dlsym(lib, name);

    *out = address;
    return address != NULL;
}

static int load_cuda(struct cuda_api *api) {
|
|
memset(api, 0, sizeof(*api));
|
|
api->lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
|
|
if (!api->lib) {
|
|
return 0;
|
|
}
|
|
return
|
|
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
|
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
|
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
|
load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) &&
|
|
load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) &&
|
|
load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) &&
|
|
load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) &&
|
|
load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) &&
|
|
load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) &&
|
|
load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) &&
|
|
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
|
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
|
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
|
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
|
}
|
|
|
|
/*
 * Return the driver's symbolic name for status code `rc`.
 *
 * Falls back to the literal "CUDA_ERROR" when cuGetErrorName was not
 * resolved, when the lookup itself fails, or when it yields a NULL string.
 * The returned pointer is never NULL.
 */
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
    const char *text = NULL;

    if (!api->cuGetErrorName) {
        return "CUDA_ERROR";
    }
    if (api->cuGetErrorName(rc, &text) != CU_SUCCESS || !text) {
        return "CUDA_ERROR";
    }
    return text;
}

/*
 * Return the driver's human-readable description for status code `rc`.
 *
 * Falls back to the literal "unknown" when cuGetErrorString was not
 * resolved, when the lookup itself fails, or when it yields a NULL string.
 * The returned pointer is never NULL.
 */
static const char *cu_error_string(struct cuda_api *api, CUresult rc) {
    const char *text = NULL;

    if (!api->cuGetErrorString) {
        return "unknown";
    }
    if (api->cuGetErrorString(rc, &text) != CU_SUCCESS || !text) {
        return "unknown";
    }
    return text;
}

/*
 * Test the result of a driver call.
 *
 * Returns 1 when `rc` is CU_SUCCESS. Otherwise prints
 * "<step> failed: <error name> (<error description>)" to stderr and
 * returns 0. `step` is the label to show for the call that produced `rc`.
 */
static int check_rc(struct cuda_api *api, const char *step, CUresult rc) {
    if (rc != CU_SUCCESS) {
        const char *error_name = cu_error_name(api, rc);
        const char *error_text = cu_error_string(api, rc);

        fprintf(stderr, "%s failed: %s (%s)\n", step, error_name, error_text);
        return 0;
    }
    return 1;
}

static double now_seconds(void) {
|
|
struct timespec ts;
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
int seconds = 5;
|
|
int size_mb = 64;
|
|
for (int i = 1; i < argc; i++) {
|
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
|
seconds = atoi(argv[++i]);
|
|
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
|
size_mb = atoi(argv[++i]);
|
|
} else {
|
|
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
|
return 2;
|
|
}
|
|
}
|
|
if (seconds <= 0) {
|
|
seconds = 5;
|
|
}
|
|
if (size_mb <= 0) {
|
|
size_mb = 64;
|
|
}
|
|
|
|
struct cuda_api api;
|
|
if (!load_cuda(&api)) {
|
|
fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
|
|
return 1;
|
|
}
|
|
load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName);
|
|
load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString);
|
|
|
|
if (!check_rc(&api, "cuInit", api.cuInit(0))) {
|
|
return 1;
|
|
}
|
|
|
|
int count = 0;
|
|
if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) {
|
|
return 1;
|
|
}
|
|
if (count <= 0) {
|
|
fprintf(stderr, "no CUDA devices found\n");
|
|
return 1;
|
|
}
|
|
|
|
CUdevice dev = 0;
|
|
if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) {
|
|
return 1;
|
|
}
|
|
char name[128] = {0};
|
|
if (!check_rc(&api, "cuDeviceGetName", api.cuDeviceGetName(name, (int)sizeof(name), dev))) {
|
|
return 1;
|
|
}
|
|
|
|
CUcontext ctx = NULL;
|
|
if (!check_rc(&api, "cuCtxCreate", api.cuCtxCreate(&ctx, 0, dev))) {
|
|
return 1;
|
|
}
|
|
|
|
size_t bytes = (size_t)size_mb * 1024 * 1024;
|
|
uint32_t words = (uint32_t)(bytes / sizeof(uint32_t));
|
|
if (words < 1024) {
|
|
words = 1024;
|
|
bytes = (size_t)words * sizeof(uint32_t);
|
|
}
|
|
|
|
uint32_t *host = (uint32_t *)malloc(bytes);
|
|
if (!host) {
|
|
fprintf(stderr, "malloc failed\n");
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
for (uint32_t i = 0; i < words; i++) {
|
|
host[i] = i ^ 0x12345678u;
|
|
}
|
|
|
|
CUdeviceptr device_mem = 0;
|
|
if (!check_rc(&api, "cuMemAlloc", api.cuMemAlloc(&device_mem, bytes))) {
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
if (!check_rc(&api, "cuMemcpyHtoD", api.cuMemcpyHtoD(device_mem, host, bytes))) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
|
|
CUmodule module = NULL;
|
|
if (!check_rc(&api, "cuModuleLoadDataEx", api.cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
|
|
CUfunction kernel = NULL;
|
|
if (!check_rc(&api, "cuModuleGetFunction", api.cuModuleGetFunction(&kernel, module, "burn"))) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
|
|
unsigned int threads = 256;
|
|
unsigned int blocks = (words + threads - 1) / threads;
|
|
uint32_t rounds = 256;
|
|
void *params[] = {&device_mem, &words, &rounds};
|
|
|
|
double start = now_seconds();
|
|
double deadline = start + (double)seconds;
|
|
unsigned long iterations = 0;
|
|
while (now_seconds() < deadline) {
|
|
if (!check_rc(&api, "cuLaunchKernel",
|
|
api.cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
iterations++;
|
|
}
|
|
|
|
if (!check_rc(&api, "cuCtxSynchronize", api.cuCtxSynchronize())) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
if (!check_rc(&api, "cuMemcpyDtoH", api.cuMemcpyDtoH(host, device_mem, bytes))) {
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 1;
|
|
}
|
|
|
|
uint64_t checksum = 0;
|
|
for (uint32_t i = 0; i < words; i += words / 256 ? words / 256 : 1) {
|
|
checksum += host[i];
|
|
}
|
|
|
|
double elapsed = now_seconds() - start;
|
|
printf("device=%s\n", name);
|
|
printf("duration_s=%.2f\n", elapsed);
|
|
printf("buffer_mb=%d\n", size_mb);
|
|
printf("iterations=%lu\n", iterations);
|
|
printf("checksum=%llu\n", (unsigned long long)checksum);
|
|
printf("status=OK\n");
|
|
|
|
api.cuMemFree(device_mem);
|
|
free(host);
|
|
api.cuCtxDestroy(ctx);
|
|
return 0;
|
|
}
|