#define _POSIX_C_SOURCE 200809L

#include <dlfcn.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * GPU load generator built on the CUDA Driver API.
 *
 * libcuda.so.1 is opened at run time with dlopen(), so no CUDA headers or
 * link-time dependency are needed. The program uploads a buffer, repeatedly
 * launches a small PTX kernel over it until a wall-clock deadline passes,
 * reads the buffer back and prints a short key=value report on stdout.
 *
 * Usage: prog [--seconds N | -t N] [--size-mb N | -m N]
 */

/* Local stand-ins for the Driver API types used through the dlsym'd table. */
typedef int CUdevice;
typedef uint64_t CUdeviceptr;
typedef int CUresult;
typedef void *CUcontext;
typedef void *CUmodule;
typedef void *CUfunction;
typedef void *CUstream;

#define CU_SUCCESS 0

enum {
    DEFAULT_SECONDS = 5,
    DEFAULT_SIZE_MB = 64,
    /*
     * Largest buffer whose 32-bit word count, plus the round-up term used
     * for the grid size, still fits in uint32_t:
     * 16383 * 262144 + 255 < 2^32, whereas 16384 * 262144 == 2^32.
     */
    MAX_SIZE_MB = 16383,
    BYTES_PER_MB = 1024 * 1024,
    THREADS_PER_BLOCK = 256,
    ROUNDS_PER_LAUNCH = 256,
    CHECKSUM_SAMPLES = 256
};

/*
 * Kernel "burn(data, words, rounds)": every thread whose global index is
 * below `words` loads data[index], applies `rounds` iterations of
 * x = x * 1664525 + 1013904223 and stores the result back.
 */
static const char *ptx_source =
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    "\n"
    ".visible .entry burn(\n"
    " .param .u64 data,\n"
    " .param .u32 words,\n"
    " .param .u32 rounds\n"
    ")\n"
    "{\n"
    " .reg .pred %p<2>;\n"
    " .reg .b32 %r<8>;\n"
    " .reg .b64 %rd<5>;\n"
    "\n"
    " ld.param.u64 %rd1, [data];\n"
    " ld.param.u32 %r1, [words];\n"
    " ld.param.u32 %r2, [rounds];\n"
    " mov.u32 %r3, %ctaid.x;\n"
    " mov.u32 %r4, %ntid.x;\n"
    " mov.u32 %r5, %tid.x;\n"
    " mad.lo.s32 %r0, %r3, %r4, %r5;\n"
    " setp.ge.u32 %p0, %r0, %r1;\n"
    " @%p0 bra DONE;\n"
    " mul.wide.u32 %rd2, %r0, 4;\n"
    " add.s64 %rd3, %rd1, %rd2;\n"
    " ld.global.u32 %r6, [%rd3];\n"
    "LOOP:\n"
    " setp.eq.u32 %p1, %r2, 0;\n"
    " @%p1 bra STORE;\n"
    " mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
    " sub.u32 %r2, %r2, 1;\n"
    " bra LOOP;\n"
    "STORE:\n"
    " st.global.u32 [%rd3], %r6;\n"
    "DONE:\n"
    " ret;\n"
    "}\n";

typedef CUresult (*cuInit_fn)(unsigned int);
typedef CUresult (*cuDeviceGetCount_fn)(int *);
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
typedef CUresult (*cuCtxSynchronize_fn)(void);
typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *);
typedef CUresult (*cuLaunchKernel_fn)(CUfunction, unsigned int, unsigned int, unsigned int,
                                      unsigned int, unsigned int, unsigned int, unsigned int,
                                      CUstream, void **, void **);
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);

/*
 * Table of Driver API entry points resolved from libcuda at run time.
 * cuGetErrorName / cuGetErrorString are optional and may be NULL.
 */
struct cuda_api {
    void *lib;
    cuInit_fn cuInit;
    cuDeviceGetCount_fn cuDeviceGetCount;
    cuDeviceGet_fn cuDeviceGet;
    cuDeviceGetName_fn cuDeviceGetName;
    cuCtxCreate_fn cuCtxCreate;
    cuCtxDestroy_fn cuCtxDestroy;
    cuCtxSynchronize_fn cuCtxSynchronize;
    cuMemAlloc_fn cuMemAlloc;
    cuMemFree_fn cuMemFree;
    cuMemcpyHtoD_fn cuMemcpyHtoD;
    cuMemcpyDtoH_fn cuMemcpyDtoH;
    cuModuleLoadDataEx_fn cuModuleLoadDataEx;
    cuModuleGetFunction_fn cuModuleGetFunction;
    cuLaunchKernel_fn cuLaunchKernel;
    cuGetErrorName_fn cuGetErrorName;
    cuGetErrorString_fn cuGetErrorString;
};

/* Resolve `name` in `lib` into *out. Returns 1 if found, 0 otherwise. */
static int load_symbol(void *lib, const char *name, void **out)
{
    *out = dlsym(lib, name);
    return *out != NULL;
}

/*
 * Open libcuda.so.1 and resolve every required entry point into *api.
 *
 * Returns 1 on success. On failure returns 0 with *api fully zeroed and the
 * library handle closed, so no dangling function pointers are left behind.
 * The optional error-description symbols are not loaded here.
 */
static int load_cuda(struct cuda_api *api)
{
    memset(api, 0, sizeof(*api));

    api->lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
    if (!api->lib) {
        return 0;
    }

    int ok = load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
             load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
             load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
             load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) &&
             load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) &&
             load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) &&
             load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) &&
             load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) &&
             load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) &&
             load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) &&
             load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
             load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
             load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
             load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);

    if (!ok) {
        dlclose(api->lib);
        memset(api, 0, sizeof(*api));
    }
    return ok;
}

/* Symbolic name for `rc`, or "CUDA_ERROR" when the driver cannot supply one. */
static const char *cu_error_name(const struct cuda_api *api, CUresult rc)
{
    const char *value = NULL;

    if (api->cuGetErrorName && api->cuGetErrorName(rc, &value) == CU_SUCCESS && value) {
        return value;
    }
    return "CUDA_ERROR";
}

/* Description of `rc`, or "unknown" when the driver cannot supply one. */
static const char *cu_error_string(const struct cuda_api *api, CUresult rc)
{
    const char *value = NULL;

    if (api->cuGetErrorString && api->cuGetErrorString(rc, &value) == CU_SUCCESS && value) {
        return value;
    }
    return "unknown";
}

/*
 * Returns 1 when `rc` is CU_SUCCESS. Otherwise prints
 * "<step> failed: <name> (<description>)" to stderr and returns 0.
 */
static int check_rc(const struct cuda_api *api, const char *step, CUresult rc)
{
    if (rc == CU_SUCCESS) {
        return 1;
    }
    fprintf(stderr, "%s failed: %s (%s)\n", step, cu_error_name(api, rc),
            cu_error_string(api, rc));
    return 0;
}

/* Current CLOCK_MONOTONIC reading in seconds. */
static double now_seconds(void)
{
    struct timespec ts = {0};

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
}

/* Largest accepted --size-mb: MAX_SIZE_MB, lowered if size_t is narrower. */
static int max_size_mb(void)
{
    uint64_t limit = (uint64_t)SIZE_MAX / (uint64_t)BYTES_PER_MB;

    return limit < (uint64_t)MAX_SIZE_MB ? (int)limit : MAX_SIZE_MB;
}

/*
 * Parse `text` as a decimal integer in [1, max] for option `flag`.
 *
 * Non-numeric, empty, or non-positive input yields `fallback`; values above
 * `max` (including ones that overflow long) yield `max`. Both cases print a
 * warning to stderr so the substitution is visible to the user.
 */
static int parse_positive_int(const char *flag, const char *text, int fallback, int max)
{
    char *end = NULL;

    errno = 0;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0' || value <= 0) {
        fprintf(stderr, "%s: ignoring invalid value '%s', using %d\n", flag, text, fallback);
        return fallback;
    }
    if (errno == ERANGE || value > (long)max) {
        fprintf(stderr, "%s: value '%s' is too large, using %d\n", flag, text, max);
        return max;
    }
    return (int)value;
}

/*
 * Create a context on `dev`, run the burn kernel over a `size_mb` MiB buffer
 * for roughly `seconds` seconds and print the report.
 *
 * `size_mb` must be in [1, max_size_mb()] so that the 32-bit word count and
 * the grid-size computation cannot overflow.
 *
 * Returns the process exit code: 0 on success, 1 on any failure. Everything
 * acquired here is released before returning.
 */
static int run_burn(const struct cuda_api *api, CUdevice dev, const char *device_name,
                    int seconds, int size_mb)
{
    int exit_code = 1;
    CUcontext ctx = NULL;
    CUmodule module = NULL;
    CUfunction kernel = NULL;
    CUdeviceptr device_mem = 0;
    int device_mem_valid = 0;
    uint32_t *host = NULL;

    size_t bytes = (size_t)size_mb * BYTES_PER_MB;
    uint32_t words = (uint32_t)(bytes / sizeof(uint32_t));
    uint32_t rounds = ROUNDS_PER_LAUNCH;
    const unsigned int threads = THREADS_PER_BLOCK;
    unsigned int blocks = (words + threads - 1) / threads;
    /* Argument order must match the .param list of the PTX entry point. */
    void *params[] = {&device_mem, &words, &rounds};
    unsigned long iterations = 0;
    uint64_t checksum = 0;
    uint64_t stride = words / CHECKSUM_SAMPLES ? words / CHECKSUM_SAMPLES : 1;
    double start = 0.0;
    double deadline = 0.0;
    double elapsed = 0.0;

    if (!check_rc(api, "cuCtxCreate", api->cuCtxCreate(&ctx, 0, dev))) {
        return 1;
    }

    host = malloc(bytes);
    if (!host) {
        fprintf(stderr, "malloc failed\n");
        goto cleanup;
    }
    for (uint32_t i = 0; i < words; i++) {
        host[i] = i ^ 0x12345678u;
    }

    if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
        goto cleanup;
    }
    device_mem_valid = 1;

    if (!check_rc(api, "cuMemcpyHtoD", api->cuMemcpyHtoD(device_mem, host, bytes))) {
        goto cleanup;
    }
    if (!check_rc(api, "cuModuleLoadDataEx",
                  api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
        goto cleanup;
    }
    if (!check_rc(api, "cuModuleGetFunction",
                  api->cuModuleGetFunction(&kernel, module, "burn"))) {
        goto cleanup;
    }

    start = now_seconds();
    deadline = start + (double)seconds;
    /*
     * NOTE(review): nothing synchronizes inside this loop; the single
     * cuCtxSynchronize below waits for every launch issued here, and
     * duration_s is measured after that wait.
     */
    while (now_seconds() < deadline) {
        if (!check_rc(api, "cuLaunchKernel",
                      api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL,
                                          params, NULL))) {
            goto cleanup;
        }
        iterations++;
    }

    if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
        goto cleanup;
    }
    if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(host, device_mem, bytes))) {
        goto cleanup;
    }

    /* Sample the buffer at a fixed stride; 64-bit index so the add cannot wrap. */
    for (uint64_t i = 0; i < words; i += stride) {
        checksum += host[i];
    }
    elapsed = now_seconds() - start;

    printf("device=%s\n", device_name);
    printf("duration_s=%.2f\n", elapsed);
    printf("buffer_mb=%d\n", size_mb);
    printf("iterations=%lu\n", iterations);
    printf("checksum=%llu\n", (unsigned long long)checksum);
    printf("status=OK\n");
    exit_code = 0;

cleanup:
    /* Best-effort teardown; failures here are not reported. */
    if (device_mem_valid) {
        api->cuMemFree(device_mem);
    }
    free(host);
    api->cuCtxDestroy(ctx);
    return exit_code;
}

/*
 * Entry point. Exit status: 0 on success, 1 on a runtime/driver failure,
 * 2 on a command-line usage error.
 */
int main(int argc, char **argv)
{
    int seconds = DEFAULT_SECONDS;
    int size_mb = DEFAULT_SIZE_MB;

    for (int i = 1; i < argc; i++) {
        const char *arg = argv[i];
        int has_value = i + 1 < argc;

        if (has_value && (strcmp(arg, "--seconds") == 0 || strcmp(arg, "-t") == 0)) {
            seconds = parse_positive_int(arg, argv[++i], DEFAULT_SECONDS, INT_MAX);
        } else if (has_value && (strcmp(arg, "--size-mb") == 0 || strcmp(arg, "-m") == 0)) {
            size_mb = parse_positive_int(arg, argv[++i], DEFAULT_SIZE_MB, max_size_mb());
        } else {
            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
            return 2;
        }
    }

    struct cuda_api api;
    if (!load_cuda(&api)) {
        fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
        return 1;
    }
    /* Optional: only used to make error messages more descriptive. */
    (void)load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName);
    (void)load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString);

    /* api.lib is deliberately left open until the process exits. */
    if (!check_rc(&api, "cuInit", api.cuInit(0))) {
        return 1;
    }

    int count = 0;
    if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) {
        return 1;
    }
    if (count <= 0) {
        fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }

    CUdevice dev = 0;
    if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) {
        return 1;
    }

    char name[128] = {0};
    if (!check_rc(&api, "cuDeviceGetName",
                  api.cuDeviceGetName(name, (int)sizeof(name), dev))) {
        return 1;
    }

    return run_burn(&api, dev, name, seconds, size_mb);
}