#define _POSIX_C_SOURCE 200809L

/*
 * GPU stress tool, host side: loads the CUDA driver (libcuda) and cuBLASLt
 * dynamically at runtime, so the binary builds and runs without the CUDA
 * toolkit installed.  A hand-written PTX kernel serves as a fallback burn
 * loop when cuBLASLt is unavailable.
 *
 * NOTE(review): the original header names were stripped by a text-mangling
 * step (each "#include" lost its <...> argument).  They are reconstructed
 * here from the symbols the file actually uses: stdio (fprintf/snprintf),
 * stdlib, string (memset/strlen/strcmp), stdint (uint32_t/uint64_t),
 * stdarg (va_list), math (sqrt), time (clock_gettime), dlfcn (dlopen/dlsym).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdarg.h>
#include <math.h>
#include <time.h>
#include <dlfcn.h>

/* cuBLASLt support is compiled in only when its header is present; the
 * __has_include argument was also stripped by the mangling and is restored. */
#if defined(__has_include)
#if __has_include(<cublasLt.h>)
#include <cublasLt.h>
#define HAVE_CUBLASLT_HEADERS 1
#else
#define HAVE_CUBLASLT_HEADERS 0
#endif
#else
#define HAVE_CUBLASLT_HEADERS 0
#endif

/* Minimal local mirror of the CUDA driver API types so we never need
 * cuda.h; the driver is accessed purely through dlsym'd entry points. */
typedef int CUdevice;
typedef uint64_t CUdeviceptr;
typedef int CUresult;
typedef void *CUcontext;
typedef void *CUmodule;
typedef void *CUfunction;
typedef void *CUstream;

#define CU_SUCCESS 0
/* Attribute IDs copied from the driver API headers. */
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76

#define MAX_STRESS_STREAMS 16
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)

/*
 * PTX fallback kernel "burn": each thread owns one 32-bit word of the
 * buffer, applies `rounds` iterations of an LCG (constants 1664525 /
 * 1013904223), and stores the result.  Targets sm_30 so it JITs on any
 * driver-supported GPU.
 */
static const char *ptx_source =
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    "\n"
    ".visible .entry burn(\n"
    "    .param .u64 data,\n"
    "    .param .u32 words,\n"
    "    .param .u32 rounds\n"
    ")\n"
    "{\n"
    "    .reg .pred %p<2>;\n"
    "    .reg .b32 %r<8>;\n"
    "    .reg .b64 %rd<5>;\n"
    "\n"
    "    ld.param.u64 %rd1, [data];\n"
    "    ld.param.u32 %r1, [words];\n"
    "    ld.param.u32 %r2, [rounds];\n"
    "    mov.u32 %r3, %ctaid.x;\n"
    "    mov.u32 %r4, %ntid.x;\n"
    "    mov.u32 %r5, %tid.x;\n"
    "    mad.lo.s32 %r0, %r3, %r4, %r5;\n"
    "    setp.ge.u32 %p0, %r0, %r1;\n"
    "    @%p0 bra DONE;\n"
    "    mul.wide.u32 %rd2, %r0, 4;\n"
    "    add.s64 %rd3, %rd1, %rd2;\n"
    "    ld.global.u32 %r6, [%rd3];\n"
    "LOOP:\n"
    "    setp.eq.u32 %p1, %r2, 0;\n"
    "    @%p1 bra STORE;\n"
    "    mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
    "    sub.u32 %r2, %r2, 1;\n"
    "    bra LOOP;\n"
    "STORE:\n"
    "    st.global.u32 [%rd3], %r6;\n"
    "DONE:\n"
    "    ret;\n"
    "}\n";

/* dlsym'd driver entry-point signatures (first batch; more follow). */
typedef CUresult (*cuInit_fn)(unsigned int);
typedef CUresult (*cuDeviceGetCount_fn)(int *);
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
typedef CUresult (*cuDeviceGetAttribute_fn)(int *, int, CUdevice);
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
typedef CUresult (*cuCtxDestroy_fn)(CUcontext); typedef CUresult (*cuCtxSynchronize_fn)(void); typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t); typedef CUresult (*cuMemFree_fn)(CUdeviceptr); typedef CUresult (*cuMemsetD8_fn)(CUdeviceptr, unsigned char, size_t); typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t); typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t); typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *); typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *); typedef CUresult (*cuLaunchKernel_fn)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **); typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *); typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int); typedef CUresult (*cuStreamDestroy_fn)(CUstream); typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **); typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **); struct cuda_api { void *lib; cuInit_fn cuInit; cuDeviceGetCount_fn cuDeviceGetCount; cuDeviceGet_fn cuDeviceGet; cuDeviceGetName_fn cuDeviceGetName; cuDeviceGetAttribute_fn cuDeviceGetAttribute; cuCtxCreate_fn cuCtxCreate; cuCtxDestroy_fn cuCtxDestroy; cuCtxSynchronize_fn cuCtxSynchronize; cuMemAlloc_fn cuMemAlloc; cuMemFree_fn cuMemFree; cuMemsetD8_fn cuMemsetD8; cuMemcpyHtoD_fn cuMemcpyHtoD; cuMemcpyDtoH_fn cuMemcpyDtoH; cuModuleLoadDataEx_fn cuModuleLoadDataEx; cuModuleGetFunction_fn cuModuleGetFunction; cuLaunchKernel_fn cuLaunchKernel; cuMemGetInfo_fn cuMemGetInfo; cuStreamCreate_fn cuStreamCreate; cuStreamDestroy_fn cuStreamDestroy; cuGetErrorName_fn cuGetErrorName; cuGetErrorString_fn cuGetErrorString; }; struct stress_report { char backend[32]; char device[128]; int cc_major; int cc_minor; int buffer_mb; int stream_count; unsigned long iterations; uint64_t checksum; char details[16384]; }; static int 
load_symbol(void *lib, const char *name, void **out) { *out = dlsym(lib, name); return *out != NULL; } static int load_cuda(struct cuda_api *api) { memset(api, 0, sizeof(*api)); api->lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL); if (!api->lib) { return 0; } if (!( load_symbol(api->lib, "cuInit", (void **)&api->cuInit) && load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) && load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) && load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) && load_symbol(api->lib, "cuDeviceGetAttribute", (void **)&api->cuDeviceGetAttribute) && load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) && load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) && load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) && load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) && load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) && load_symbol(api->lib, "cuMemsetD8_v2", (void **)&api->cuMemsetD8) && load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) && load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) && load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) && load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) && load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) { dlclose(api->lib); memset(api, 0, sizeof(*api)); return 0; } load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo); load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate); if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) { load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy); } return 1; } static const char *cu_error_name(struct cuda_api *api, CUresult rc) { const char *value = NULL; if (api->cuGetErrorName && api->cuGetErrorName(rc, &value) == CU_SUCCESS && 
value) { return value; } return "CUDA_ERROR"; } static const char *cu_error_string(struct cuda_api *api, CUresult rc) { const char *value = NULL; if (api->cuGetErrorString && api->cuGetErrorString(rc, &value) == CU_SUCCESS && value) { return value; } return "unknown"; } static int check_rc(struct cuda_api *api, const char *step, CUresult rc) { if (rc == CU_SUCCESS) { return 1; } fprintf(stderr, "%s failed: %s (%s)\n", step, cu_error_name(api, rc), cu_error_string(api, rc)); return 0; } static double now_seconds(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0); } static size_t round_down_size(size_t value, size_t multiple) { if (multiple == 0 || value < multiple) { return value; } return value - (value % multiple); } static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) { int cc_major = 0; int cc_minor = 0; if (!check_rc(api, "cuDeviceGetAttribute(major)", api->cuDeviceGetAttribute(&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev))) { return 0; } if (!check_rc(api, "cuDeviceGetAttribute(minor)", api->cuDeviceGetAttribute(&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev))) { return 0; } *major = cc_major; *minor = cc_minor; return 1; } static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) { int mp_count = 0; if (!check_rc(api, "cuDeviceGetAttribute(multiprocessors)", api->cuDeviceGetAttribute(&mp_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev))) { return 0; } *count = mp_count; return 1; } static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) { size_t free_bytes = 0; size_t total_bytes = 0; size_t max_bytes = requested_bytes; if (!api->cuMemGetInfo) { return requested_bytes; } if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) { return requested_bytes; } max_bytes = (free_bytes * 9u) / 10u; if (max_bytes < (size_t)4u * 
1024u * 1024u) { max_bytes = (size_t)4u * 1024u * 1024u; } if (requested_bytes > max_bytes) { return max_bytes; } return requested_bytes; } static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) { int stream_count = 1; if (!have_streams || mp_count <= 0 || planned_profiles <= 0) { return 1; } stream_count = mp_count / 8; if (stream_count < 2) { stream_count = 2; } if (stream_count > MAX_STRESS_STREAMS) { stream_count = MAX_STRESS_STREAMS; } while (stream_count > 1) { size_t per_stream_budget = total_budget / ((size_t)planned_profiles * (size_t)stream_count); if (per_stream_budget >= MIN_STREAM_BUDGET_BYTES) { break; } stream_count--; } return stream_count; } static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) { if (!api->cuStreamDestroy) { return; } for (int i = 0; i < count; i++) { if (streams[i]) { api->cuStreamDestroy(streams[i]); streams[i] = NULL; } } } #if HAVE_CUBLASLT_HEADERS static void append_detail(char *buf, size_t cap, const char *fmt, ...) 
{ size_t len = strlen(buf); if (len >= cap) { return; } va_list ap; va_start(ap, fmt); vsnprintf(buf + len, cap - len, fmt, ap); va_end(ap); } #endif static int run_ptx_fallback(struct cuda_api *api, CUdevice dev, const char *device_name, int cc_major, int cc_minor, int seconds, int size_mb, struct stress_report *report) { CUcontext ctx = NULL; CUmodule module = NULL; CUfunction kernel = NULL; uint32_t sample[256]; CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0}; CUstream streams[MAX_STRESS_STREAMS] = {0}; uint32_t words[MAX_STRESS_STREAMS] = {0}; uint32_t rounds[MAX_STRESS_STREAMS] = {0}; void *params[MAX_STRESS_STREAMS][3]; size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0}; unsigned long iterations = 0; int mp_count = 0; int stream_count = 1; memset(report, 0, sizeof(*report)); snprintf(report->backend, sizeof(report->backend), "driver-ptx"); snprintf(report->device, sizeof(report->device), "%s", device_name); report->cc_major = cc_major; report->cc_minor = cc_minor; report->buffer_mb = size_mb; if (!check_rc(api, "cuCtxCreate", api->cuCtxCreate(&ctx, 0, dev))) { return 0; } size_t requested_bytes = (size_t)size_mb * 1024u * 1024u; if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) { requested_bytes = MIN_PROFILE_BUDGET_BYTES; } size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes); if (total_bytes < MIN_PROFILE_BUDGET_BYTES) { total_bytes = MIN_PROFILE_BUDGET_BYTES; } report->buffer_mb = (int)(total_bytes / (1024u * 1024u)); if (query_multiprocessor_count(api, dev, &mp_count) && api->cuStreamCreate && api->cuStreamDestroy) { stream_count = choose_stream_count(mp_count, 1, total_bytes, 1); } if (stream_count > 1) { int created = 0; for (; created < stream_count; created++) { if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) { destroy_streams(api, streams, created); stream_count = 1; break; } } } report->stream_count = stream_count; for (int lane = 0; lane < stream_count; lane++) { size_t slice = total_bytes / 
(size_t)stream_count; if (lane == stream_count - 1) { slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count)); } slice = round_down_size(slice, sizeof(uint32_t)); if (slice < MIN_PROFILE_BUDGET_BYTES) { slice = MIN_PROFILE_BUDGET_BYTES; } bytes_per_stream[lane] = slice; words[lane] = (uint32_t)(slice / sizeof(uint32_t)); if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) { goto fail; } if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) { goto fail; } rounds[lane] = 2048; params[lane][0] = &device_mem[lane]; params[lane][1] = &words[lane]; params[lane][2] = &rounds[lane]; } if (!check_rc(api, "cuModuleLoadDataEx", api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) { goto fail; } if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) { goto fail; } unsigned int threads = 256; double deadline = now_seconds() + (double)seconds; double next_sync = now_seconds() + 1.0; while (now_seconds() < deadline) { int launched = 0; for (int lane = 0; lane < stream_count; lane++) { unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads); if (!check_rc(api, "cuLaunchKernel", api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, streams[lane], params[lane], NULL))) { goto fail; } launched++; iterations++; } if (launched <= 0) { goto fail; } double now = now_seconds(); if (now >= next_sync || now >= deadline) { if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) { goto fail; } next_sync = now + 1.0; } } api->cuCtxSynchronize(); if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) { goto fail; } for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) { report->checksum += sample[i]; } report->iterations = iterations; snprintf(report->details, sizeof(report->details), "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n", size_mb, 
report->buffer_mb, report->stream_count, bytes_per_stream[0] / (1024u * 1024u), iterations); for (int lane = 0; lane < stream_count; lane++) { if (device_mem[lane]) { api->cuMemFree(device_mem[lane]); } } destroy_streams(api, streams, stream_count); api->cuCtxDestroy(ctx); return 1; fail: for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) { if (device_mem[lane]) { api->cuMemFree(device_mem[lane]); } } destroy_streams(api, streams, MAX_STRESS_STREAMS); if (ctx) { api->cuCtxDestroy(ctx); } return 0; } #if HAVE_CUBLASLT_HEADERS typedef cublasStatus_t (*cublasLtCreate_fn)(cublasLtHandle_t *); typedef cublasStatus_t (*cublasLtDestroy_fn)(cublasLtHandle_t); typedef cublasStatus_t (*cublasLtMatmulDescCreate_fn)(cublasLtMatmulDesc_t *, cublasComputeType_t, cudaDataType_t); typedef cublasStatus_t (*cublasLtMatmulDescDestroy_fn)(cublasLtMatmulDesc_t); typedef cublasStatus_t (*cublasLtMatmulDescSetAttribute_fn)(cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, const void *, size_t); typedef cublasStatus_t (*cublasLtMatrixLayoutCreate_fn)(cublasLtMatrixLayout_t *, cudaDataType_t, uint64_t, uint64_t, int64_t); typedef cublasStatus_t (*cublasLtMatrixLayoutDestroy_fn)(cublasLtMatrixLayout_t); typedef cublasStatus_t (*cublasLtMatmulPreferenceCreate_fn)(cublasLtMatmulPreference_t *); typedef cublasStatus_t (*cublasLtMatmulPreferenceDestroy_fn)(cublasLtMatmulPreference_t); typedef cublasStatus_t (*cublasLtMatmulPreferenceSetAttribute_fn)(cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t, const void *, size_t); typedef cublasStatus_t (*cublasLtMatmulAlgoGetHeuristic_fn)( cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatmulPreference_t, int, cublasLtMatmulHeuristicResult_t *, int *); typedef cublasStatus_t (*cublasLtMatmul_fn)(cublasLtHandle_t, cublasLtMatmulDesc_t, const void *, const void *, cublasLtMatrixLayout_t, const void *, cublasLtMatrixLayout_t, const void 
*, const void *, cublasLtMatrixLayout_t, void *, cublasLtMatrixLayout_t, const cublasLtMatmulAlgo_t *, void *, size_t, cudaStream_t); struct cublaslt_api { void *lib; cublasLtCreate_fn cublasLtCreate; cublasLtDestroy_fn cublasLtDestroy; cublasLtMatmulDescCreate_fn cublasLtMatmulDescCreate; cublasLtMatmulDescDestroy_fn cublasLtMatmulDescDestroy; cublasLtMatmulDescSetAttribute_fn cublasLtMatmulDescSetAttribute; cublasLtMatrixLayoutCreate_fn cublasLtMatrixLayoutCreate; cublasLtMatrixLayoutDestroy_fn cublasLtMatrixLayoutDestroy; cublasLtMatmulPreferenceCreate_fn cublasLtMatmulPreferenceCreate; cublasLtMatmulPreferenceDestroy_fn cublasLtMatmulPreferenceDestroy; cublasLtMatmulPreferenceSetAttribute_fn cublasLtMatmulPreferenceSetAttribute; cublasLtMatmulAlgoGetHeuristic_fn cublasLtMatmulAlgoGetHeuristic; cublasLtMatmul_fn cublasLtMatmul; }; struct profile_desc { const char *name; const char *block_label; int min_cc; int enabled; int needs_scalar_scale; int needs_block_scale; int min_multiple; cudaDataType_t a_type; cudaDataType_t b_type; cudaDataType_t c_type; cudaDataType_t d_type; cublasComputeType_t compute_type; }; struct prepared_profile { struct profile_desc desc; CUstream stream; cublasLtMatmulDesc_t op_desc; cublasLtMatrixLayout_t a_layout; cublasLtMatrixLayout_t b_layout; cublasLtMatrixLayout_t c_layout; cublasLtMatrixLayout_t d_layout; cublasLtMatmulPreference_t preference; cublasLtMatmulHeuristicResult_t heuristic; CUdeviceptr a_dev; CUdeviceptr b_dev; CUdeviceptr c_dev; CUdeviceptr d_dev; CUdeviceptr a_scale_dev; CUdeviceptr b_scale_dev; CUdeviceptr workspace_dev; size_t workspace_size; uint64_t m; uint64_t n; uint64_t k; unsigned long iterations; int ready; }; static const struct profile_desc k_profiles[] = { { "fp64", "fp64", 80, 1, 0, 0, 8, CUDA_R_64F, CUDA_R_64F, CUDA_R_64F, CUDA_R_64F, CUBLAS_COMPUTE_64F, }, { "fp32_tf32", "fp32", 80, 1, 0, 0, 128, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUDA_R_32F, CUBLAS_COMPUTE_32F_FAST_TF32, }, { "fp16_tensor", "fp16", 
80, 1, 0, 0, 128, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUBLAS_COMPUTE_32F_FAST_16F, }, { "int8_tensor", "int8", 75, 1, 0, 0, 128, CUDA_R_8I, CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, CUBLAS_COMPUTE_32I, }, { "fp8_e4m3", "fp8", 89, 1, 1, 0, 128, CUDA_R_8F_E4M3, CUDA_R_8F_E4M3, CUDA_R_16BF, CUDA_R_16BF, CUBLAS_COMPUTE_32F, }, { "fp8_e5m2", "fp8", 89, 1, 1, 0, 128, CUDA_R_8F_E5M2, CUDA_R_8F_E5M2, CUDA_R_16BF, CUDA_R_16BF, CUBLAS_COMPUTE_32F, }, #if defined(CUDA_R_4F_E2M1) && defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) { "fp4_e2m1", "fp4", 100, 1, 0, 1, 128, CUDA_R_4F_E2M1, CUDA_R_4F_E2M1, CUDA_R_16BF, CUDA_R_16BF, CUBLAS_COMPUTE_32F, }, #endif }; #define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0]))) static int load_cublaslt(struct cublaslt_api *api) { memset(api, 0, sizeof(*api)); api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL); if (!api->lib) { api->lib = dlopen("libcublasLt.so", RTLD_NOW | RTLD_LOCAL); } if (!api->lib) { return 0; } return load_symbol(api->lib, "cublasLtCreate", (void **)&api->cublasLtCreate) && load_symbol(api->lib, "cublasLtDestroy", (void **)&api->cublasLtDestroy) && load_symbol(api->lib, "cublasLtMatmulDescCreate", (void **)&api->cublasLtMatmulDescCreate) && load_symbol(api->lib, "cublasLtMatmulDescDestroy", (void **)&api->cublasLtMatmulDescDestroy) && load_symbol(api->lib, "cublasLtMatmulDescSetAttribute", (void **)&api->cublasLtMatmulDescSetAttribute) && load_symbol(api->lib, "cublasLtMatrixLayoutCreate", (void **)&api->cublasLtMatrixLayoutCreate) && load_symbol(api->lib, "cublasLtMatrixLayoutDestroy", (void **)&api->cublasLtMatrixLayoutDestroy) && load_symbol(api->lib, "cublasLtMatmulPreferenceCreate", (void **)&api->cublasLtMatmulPreferenceCreate) && load_symbol(api->lib, "cublasLtMatmulPreferenceDestroy", (void **)&api->cublasLtMatmulPreferenceDestroy) && load_symbol(api->lib, "cublasLtMatmulPreferenceSetAttribute", (void **)&api->cublasLtMatmulPreferenceSetAttribute) && load_symbol(api->lib, 
"cublasLtMatmulAlgoGetHeuristic", (void **)&api->cublasLtMatmulAlgoGetHeuristic) && load_symbol(api->lib, "cublasLtMatmul", (void **)&api->cublasLtMatmul); } static const char *cublas_status_text(cublasStatus_t status) { switch (status) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; default: return "CUBLAS_STATUS_UNKNOWN"; } } static int check_cublas(const char *step, cublasStatus_t status) { if (status == CUBLAS_STATUS_SUCCESS) { return 1; } fprintf(stderr, "%s failed: %s (%d)\n", step, cublas_status_text(status), (int)status); return 0; } static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) { switch (type) { case CUDA_R_32F: case CUDA_R_32I: return (size_t)(elements * 4u); case CUDA_R_16F: case CUDA_R_16BF: return (size_t)(elements * 2u); case CUDA_R_8I: case CUDA_R_8F_E4M3: case CUDA_R_8F_E5M2: return (size_t)(elements); #if defined(CUDA_R_4F_E2M1) case CUDA_R_4F_E2M1: return (size_t)((elements + 1u) / 2u); #endif default: return (size_t)(elements * 4u); } } static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) { if (desc->compute_type == CUBLAS_COMPUTE_32I) { return CUDA_R_32I; } if (desc->compute_type == CUBLAS_COMPUTE_64F) { return CUDA_R_64F; } return CUDA_R_32F; } static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) { uint64_t row_tiles = (rows + 127u) / 128u; uint64_t col_tiles = (cols + 63u) / 64u; return 
(size_t)(row_tiles * col_tiles * 128u); } static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, int multiple) { double approx = sqrt((double)budget_bytes / (double)bytes_per_cell); uint64_t dim = (uint64_t)approx; if (dim < (uint64_t)multiple) { dim = (uint64_t)multiple; } dim = (uint64_t)round_down_size((size_t)dim, (size_t)multiple); if (dim < (uint64_t)multiple) { dim = (uint64_t)multiple; } if (dim > 65536u) { dim = 65536u; } return dim; } static int device_upload(struct cuda_api *cuda, CUdeviceptr dev, const void *src, size_t bytes) { return check_rc(cuda, "cuMemcpyHtoD", cuda->cuMemcpyHtoD(dev, src, bytes)); } static int alloc_filled(struct cuda_api *cuda, CUdeviceptr *ptr, size_t bytes, unsigned char pattern) { if (!check_rc(cuda, "cuMemAlloc", cuda->cuMemAlloc(ptr, bytes))) { return 0; } if (!check_rc(cuda, "cuMemsetD8", cuda->cuMemsetD8(*ptr, pattern, bytes))) { cuda->cuMemFree(*ptr); *ptr = 0; return 0; } return 1; } static size_t profile_scale_bytes(const struct profile_desc *desc, uint64_t m, uint64_t n, uint64_t k) { size_t bytes = 0; if (desc->needs_scalar_scale) { bytes += 2u * sizeof(float); } #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) if (desc->needs_block_scale) { bytes += fp4_scale_bytes(k, m); bytes += fp4_scale_bytes(k, n); } #else (void)m; (void)n; (void)k; #endif return bytes; } static void destroy_profile(struct cublaslt_api *cublas, struct cuda_api *cuda, struct prepared_profile *profile) { if (profile->workspace_dev) { cuda->cuMemFree(profile->workspace_dev); } if (profile->a_scale_dev) { cuda->cuMemFree(profile->a_scale_dev); } if (profile->b_scale_dev) { cuda->cuMemFree(profile->b_scale_dev); } if (profile->d_dev) { cuda->cuMemFree(profile->d_dev); } if (profile->c_dev) { cuda->cuMemFree(profile->c_dev); } if (profile->b_dev) { cuda->cuMemFree(profile->b_dev); } if (profile->a_dev) { cuda->cuMemFree(profile->a_dev); } if (profile->preference) { 
cublas->cublasLtMatmulPreferenceDestroy(profile->preference); } if (profile->d_layout) { cublas->cublasLtMatrixLayoutDestroy(profile->d_layout); } if (profile->c_layout) { cublas->cublasLtMatrixLayoutDestroy(profile->c_layout); } if (profile->b_layout) { cublas->cublasLtMatrixLayoutDestroy(profile->b_layout); } if (profile->a_layout) { cublas->cublasLtMatrixLayoutDestroy(profile->a_layout); } if (profile->op_desc) { cublas->cublasLtMatmulDescDestroy(profile->op_desc); } memset(profile, 0, sizeof(*profile)); } static int prepare_profile(struct cublaslt_api *cublas, cublasLtHandle_t handle, struct cuda_api *cuda, const struct profile_desc *desc, CUstream stream, size_t profile_budget_bytes, struct prepared_profile *out) { memset(out, 0, sizeof(*out)); out->desc = *desc; out->stream = stream; size_t bytes_per_cell = 0; bytes_per_cell += bytes_for_elements(desc->a_type, 1); bytes_per_cell += bytes_for_elements(desc->b_type, 1); bytes_per_cell += bytes_for_elements(desc->c_type, 1); bytes_per_cell += bytes_for_elements(desc->d_type, 1); if (bytes_per_cell == 0) { return 0; } uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple); out->m = dim; out->n = dim; out->k = dim; size_t desired_workspace = profile_budget_bytes / 8u; if (desired_workspace > 32u * 1024u * 1024u) { desired_workspace = 32u * 1024u * 1024u; } desired_workspace = round_down_size(desired_workspace, 256u); size_t a_bytes = 0; size_t b_bytes = 0; size_t c_bytes = 0; size_t d_bytes = 0; size_t scale_bytes = 0; while (1) { a_bytes = bytes_for_elements(desc->a_type, out->k * out->m); b_bytes = bytes_for_elements(desc->b_type, out->k * out->n); c_bytes = bytes_for_elements(desc->c_type, out->m * out->n); d_bytes = bytes_for_elements(desc->d_type, out->m * out->n); scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k); size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes; if (matrix_bytes <= profile_budget_bytes) { size_t remaining = 
profile_budget_bytes - matrix_bytes; out->workspace_size = desired_workspace; if (out->workspace_size > remaining) { out->workspace_size = round_down_size(remaining, 256u); } break; } if (out->m <= (uint64_t)desc->min_multiple) { return 0; } out->m -= (uint64_t)desc->min_multiple; out->n = out->m; out->k = out->m; } if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) || !alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) || !alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) || !alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) { destroy_profile(cublas, cuda, out); return 0; } cudaDataType_t scale_type = matmul_scale_type(desc); if (!check_cublas("cublasLtMatmulDescCreate", cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) { destroy_profile(cublas, cuda, out); return 0; } cublasOperation_t transa = CUBLAS_OP_T; cublasOperation_t transb = CUBLAS_OP_N; if (!check_cublas("set TRANSA", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))) || !check_cublas("set TRANSB", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)))) { destroy_profile(cublas, cuda, out); return 0; } if (desc->needs_scalar_scale) { float one = 1.0f; if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) || !alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) { destroy_profile(cublas, cuda, out); return 0; } if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) || !device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) { destroy_profile(cublas, cuda, out); return 0; } void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; if (!check_cublas("set A scale ptr", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_ptr, sizeof(a_scale_ptr))) || !check_cublas("set B scale ptr", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, 
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_ptr, sizeof(b_scale_ptr)))) { destroy_profile(cublas, cuda, out); return 0; } } #if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3) if (desc->needs_block_scale) { size_t a_scale_bytes = fp4_scale_bytes(out->k, out->m); size_t b_scale_bytes = fp4_scale_bytes(out->k, out->n); if (!alloc_filled(cuda, &out->a_scale_dev, a_scale_bytes, 0x11) || !alloc_filled(cuda, &out->b_scale_dev, b_scale_bytes, 0x11)) { destroy_profile(cublas, cuda, out); return 0; } cublasLtMatmulMatrixScale_t scale_mode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3; void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev; void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev; if (!check_cublas("set A scale mode", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &scale_mode, sizeof(scale_mode))) || !check_cublas("set B scale mode", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_MODE, &scale_mode, sizeof(scale_mode))) || !check_cublas("set A block scale ptr", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &a_scale_ptr, sizeof(a_scale_ptr))) || !check_cublas("set B block scale ptr", cublas->cublasLtMatmulDescSetAttribute(out->op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &b_scale_ptr, sizeof(b_scale_ptr)))) { destroy_profile(cublas, cuda, out); return 0; } } #endif if (!check_cublas("create A layout", cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) || !check_cublas("create B layout", cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) || !check_cublas("create C layout", cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) || !check_cublas("create D layout", cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) { destroy_profile(cublas, cuda, out); return 0; } if (!check_cublas("create 
preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) { destroy_profile(cublas, cuda, out); return 0; } if (out->workspace_size > 0) { if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) { destroy_profile(cublas, cuda, out); return 0; } } if (!check_cublas("set workspace", cublas->cublasLtMatmulPreferenceSetAttribute( out->preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &out->workspace_size, sizeof(out->workspace_size)))) { destroy_profile(cublas, cuda, out); return 0; } int found = 0; if (!check_cublas("heuristic", cublas->cublasLtMatmulAlgoGetHeuristic(handle, out->op_desc, out->a_layout, out->b_layout, out->c_layout, out->d_layout, out->preference, 1, &out->heuristic, &found))) { destroy_profile(cublas, cuda, out); return 0; } if (found <= 0) { destroy_profile(cublas, cuda, out); return 0; } out->ready = 1; return 1; } static int run_cublas_profile(cublasLtHandle_t handle, struct cublaslt_api *cublas, struct prepared_profile *profile) { int32_t alpha_i32 = 1; int32_t beta_i32 = 0; double alpha_f64 = 1.0; double beta_f64 = 0.0; float alpha = 1.0f; float beta = 0.0f; const void *alpha_ptr = α const void *beta_ptr = β if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) { alpha_ptr = &alpha_i32; beta_ptr = &beta_i32; } else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) { alpha_ptr = &alpha_f64; beta_ptr = &beta_f64; } return check_cublas(profile->desc.name, cublas->cublasLtMatmul(handle, profile->op_desc, alpha_ptr, (const void *)(uintptr_t)profile->a_dev, profile->a_layout, (const void *)(uintptr_t)profile->b_dev, profile->b_layout, beta_ptr, (const void *)(uintptr_t)profile->c_dev, profile->c_layout, (void *)(uintptr_t)profile->d_dev, profile->d_layout, &profile->heuristic.algo, (void *)(uintptr_t)profile->workspace_dev, profile->workspace_size, profile->stream)); } static int run_cublaslt_stress(struct cuda_api *cuda, CUdevice dev, const char *device_name, int cc_major, int cc_minor, int seconds, int size_mb, 
const char *precision_filter, struct stress_report *report)
{
    struct cublaslt_api cublas;
    /* One prepared GEMM per (profile, stream) lane. */
    struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
    cublasLtHandle_t handle = NULL;
    CUcontext ctx = NULL;
    /* Zero-initialised: slot 0 doubles as the default stream when only one
     * stream is used. */
    CUstream streams[MAX_STRESS_STREAMS] = {0};
    uint16_t sample[256]; /* host staging buffer for the result checksum */
    /* Pack compute capability as a single comparable int, e.g. 8.6 -> 86. */
    int cc = cc_major * 10 + cc_minor;
    int planned = 0;
    int active = 0;
    int mp_count = 0;
    int stream_count = 1;
    int profile_count = PROFILE_COUNT;
    int prepared_count = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
    size_t per_profile_budget = 0;
    /* Start from a clean report so partial failures still print coherently. */
    memset(report, 0, sizeof(*report));
    snprintf(report->backend, sizeof(report->backend), "cublasLt");
    snprintf(report->device, sizeof(report->device), "%s", device_name);
    report->cc_major = cc_major;
    report->cc_minor = cc_minor;
    report->buffer_mb = size_mb;
    if (!load_cublaslt(&cublas)) {
        snprintf(report->details, sizeof(report->details), "cublasLt=unavailable\n");
        return 0;
    }
    if (!check_rc(cuda, "cuCtxCreate", cuda->cuCtxCreate(&ctx, 0, dev))) {
        return 0;
    }
    if (!check_cublas("cublasLtCreate", cublas.cublasLtCreate(&handle))) {
        cuda->cuCtxDestroy(ctx);
        return 0;
    }
    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
            (precision_filter == NULL ||
             strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
            planned++;
        }
    }
    if (planned <= 0) {
        snprintf(report->details, sizeof(report->details),
                 "cublasLt_profiles=unsupported\n");
        cublas.cublasLtDestroy(handle);
        cuda->cuCtxDestroy(ctx);
        return 0;
    }
    /* Count all profiles active on this GPU regardless of filter.
     * Used as the budget divisor so matrix sizes stay consistent whether
     * running all precisions together or a single-precision phase.
     */
    int planned_total = 0;
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
            planned_total++;
        }
    }
    if (planned_total < planned) {
        planned_total = planned;
    }
    /* Memory budget: at least MIN_PROFILE_BUDGET_BYTES per active profile,
     * then clamped against what the device can actually provide
     * (presumably via cuMemGetInfo inside the helper — name suggests). */
    requested_budget = (size_t)size_mb * 1024u * 1024u;
    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
    }
    /* Multiple streams only when the driver exposes stream entry points and
     * the SM count is known; otherwise stay on the single default stream. */
    if (query_multiprocessor_count(cuda, dev, &mp_count) && cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
    }
    if (stream_count > 1) {
        int created = 0;
        for (; created < stream_count; created++) {
            if (!check_rc(cuda, "cuStreamCreate",
                          cuda->cuStreamCreate(&streams[created], 0))) {
                /* Fall back to the default stream if any creation fails. */
                destroy_streams(cuda, streams, created);
                stream_count = 1;
                break;
            }
        }
    }
    report->stream_count = stream_count;
    /* Split the budget evenly over every (profile, stream) worker, with a
     * per-worker floor so matrices never shrink below a useful size. */
    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
    report->buffer_mb = (int)(total_budget / (1024u * 1024u));
    append_detail(report->details, sizeof(report->details),
                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
                  size_mb, report->buffer_mb, report->stream_count, mp_count,
                  per_profile_budget / (1024u * 1024u));
    /* Prepare one GEMM per matching profile per stream lane; record skips
     * (capability, filter, or unsupported shape) in the detail log. */
    for (int i = 0; i < profile_count; i++) {
        const struct profile_desc *desc = &k_profiles[i];
        if (!(desc->enabled && cc >= desc->min_cc)) {
            append_detail(report->details, sizeof(report->details),
                          "%s=SKIPPED cc<%d\n", desc->name, desc->min_cc);
            continue;
        }
        if (precision_filter != NULL &&
            strcmp(desc->block_label, precision_filter) != 0) {
            append_detail(report->details, sizeof(report->details),
                          "%s=SKIPPED precision_filter\n", desc->name);
            continue;
        }
        for (int lane = 0; lane < stream_count; lane++) {
            CUstream stream = streams[lane];
            if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
                break;
            }
            if (prepare_profile(&cublas, handle, cuda, desc, stream,
                                per_profile_budget, &prepared[prepared_count])) {
                active++;
                append_detail(report->details, sizeof(report->details),
                              "%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
                              desc->name, lane,
                              (unsigned long long)prepared[prepared_count].m,
                              (unsigned long long)prepared[prepared_count].n,
                              (unsigned long long)prepared[prepared_count].k,
                              desc->block_label, lane);
                prepared_count++;
            } else {
                append_detail(report->details, sizeof(report->details),
                              "%s[%d]=SKIPPED unsupported\n", desc->name, lane);
            }
        }
    }
    if (active <= 0) {
        cublas.cublasLtDestroy(handle);
        destroy_streams(cuda, streams, stream_count);
        cuda->cuCtxDestroy(ctx);
        return 0;
    }
    /* Keep the GPU queue continuously full by submitting kernels without
     * synchronizing after every wave. A sync barrier after each small batch
     * creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
     * especially when individual kernels are short. Instead we sync at most
     * once per second (for error detection) and once at the very end.
     */
    double deadline = now_seconds() + (double)seconds;
    double next_sync = now_seconds() + 1.0;
    while (now_seconds() < deadline) {
        int launched = 0;
        for (int i = 0; i < prepared_count; i++) {
            if (!prepared[i].ready) {
                continue;
            }
            /* Any submission failure tears the whole phase down: free every
             * prepared profile, the handle, streams and the context. */
            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
                append_detail(report->details, sizeof(report->details),
                              "%s=FAILED runtime\n", prepared[i].desc.name);
                for (int j = 0; j < prepared_count; j++) {
                    destroy_profile(&cublas, cuda, &prepared[j]);
                }
                cublas.cublasLtDestroy(handle);
                destroy_streams(cuda, streams, stream_count);
                cuda->cuCtxDestroy(ctx);
                return 0;
            }
            prepared[i].iterations++;
            report->iterations++;
            launched++;
        }
        if (launched <= 0) {
            break;
        }
        /* Throttled error check: synchronize roughly once a second so an
         * async kernel failure surfaces without stalling the queue. */
        double now = now_seconds();
        if (now >= next_sync || now >= deadline) {
            if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
                for (int i = 0; i < prepared_count; i++) {
                    destroy_profile(&cublas, cuda, &prepared[i]);
                }
                cublas.cublasLtDestroy(handle);
                destroy_streams(cuda, streams, stream_count);
                cuda->cuCtxDestroy(ctx);
                return 0;
            }
            next_sync = now + 1.0;
        }
    }
    /* Final drain — ensure all queued work finishes before we read results.
     */
    /* Return value deliberately unchecked here: runtime errors were already
     * surfaced by the checked per-second syncs above. */
    cuda->cuCtxSynchronize();
    for (int i = 0; i < prepared_count; i++) {
        if (!prepared[i].ready) {
            continue;
        }
        append_detail(report->details, sizeof(report->details),
                      "%s_iterations=%lu\n",
                      prepared[i].desc.name, prepared[i].iterations);
    }
    /* Fold the first 512 bytes of the first ready profile's output matrix
     * into a checksum (defeats dead-code elimination and gives a crude
     * determinism signal). Assumes d_dev is at least sizeof(sample) bytes —
     * TODO confirm against prepare_profile's allocation. */
    for (int i = 0; i < prepared_count; i++) {
        if (prepared[i].ready) {
            if (check_rc(cuda, "cuMemcpyDtoH",
                         cuda->cuMemcpyDtoH(sample, prepared[i].d_dev,
                                            sizeof(sample)))) {
                for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
                    report->checksum += sample[j];
                }
            }
            break;
        }
    }
    for (int i = 0; i < prepared_count; i++) {
        destroy_profile(&cublas, cuda, &prepared[i]);
    }
    cublas.cublasLtDestroy(handle);
    destroy_streams(cuda, streams, stream_count);
    cuda->cuCtxDestroy(ctx);
    return 1;
}
#endif

/* Print the stress report as flat key=value lines on stdout; always ends
 * with "status=OK" (failure paths print their own status elsewhere). */
static void print_stress_report(const struct stress_report *report, int device_index,
                                int seconds)
{
    printf("device=%s\n", report->device);
    printf("device_index=%d\n", device_index);
    printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
    printf("backend=%s\n", report->backend);
    printf("duration_s=%d\n", seconds);
    printf("buffer_mb=%d\n", report->buffer_mb);
    printf("streams=%d\n", report->stream_count);
    printf("iterations=%lu\n", report->iterations);
    printf("checksum=%llu\n", (unsigned long long)report->checksum);
    if (report->details[0] != '\0') {
        printf("%s", report->details);
    }
    printf("status=OK\n");
}

/* Entry point: parse CLI flags, initialise the Driver API via the loaded
 * function table, then run either a multi-phase precision plan, a single
 * cublasLt stress run, or the PTX burn-kernel fallback. Exit codes:
 * 0 success, 1 runtime/setup failure, 2 usage error. */
int main(int argc, char **argv)
{
    int seconds = 5;
    int size_mb = 64;
    int device_index = 0;
    const char *precision_filter = NULL; /* NULL = all; else block_label to match */
    const char *precision_plan = NULL;
    const char *precision_plan_seconds = NULL;
    for (int i = 1; i < argc; i++) {
        if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) &&
            i + 1 < argc) {
            seconds = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) &&
                   i + 1 < argc) {
            size_mb = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) &&
                   i + 1 < argc) {
            device_index = atoi(argv[++i]);
        } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
            precision_filter = argv[++i];
        } else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
            precision_plan = argv[++i];
        } else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
            precision_plan_seconds = argv[++i];
        } else {
            fprintf(stderr,
                    "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
                    argv[0]);
            return 2;
        }
    }
    /* Clamp nonsensical values (atoi returns 0 on garbage) to defaults. */
    if (seconds <= 0) {
        seconds = 5;
    }
    if (size_mb <= 0) {
        size_mb = 64;
    }
    if (device_index < 0) {
        device_index = 0;
    }
    struct cuda_api cuda;
    if (!load_cuda(&cuda)) {
        fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
        return 1;
    }
    /* Optional symbols: error-name lookups improve diagnostics but are not
     * required, so their load result is not checked. */
    load_symbol(cuda.lib, "cuGetErrorName", (void **)&cuda.cuGetErrorName);
    load_symbol(cuda.lib, "cuGetErrorString", (void **)&cuda.cuGetErrorString);
    if (!check_rc(&cuda, "cuInit", cuda.cuInit(0))) {
        return 1;
    }
    int count = 0;
    if (!check_rc(&cuda, "cuDeviceGetCount", cuda.cuDeviceGetCount(&count))) {
        return 1;
    }
    if (count <= 0) {
        fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }
    if (device_index >= count) {
        fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n",
                device_index, count);
        return 1;
    }
    CUdevice dev = 0;
    if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
        return 1;
    }
    char name[128] = {0};
    if (!check_rc(&cuda, "cuDeviceGetName",
                  cuda.cuDeviceGetName(name, (int)sizeof(name), dev))) {
        return 1;
    }
    int cc_major = 0;
    int cc_minor = 0;
    if (!query_compute_capability(&cuda, dev, &cc_major, &cc_minor)) {
        return 1;
    }
    struct stress_report report;
    int ok = 0;
#if HAVE_CUBLASLT_HEADERS
    /* Multi-phase mode: run one stress phase per comma-separated token,
     * e.g. --precision-plan fp16,fp32,mixed. NOTE(review): this path only
     * exists when cublasLt headers were available at build time; otherwise
     * a supplied plan is silently ignored and the PTX fallback runs. */
    if (precision_plan != NULL && precision_plan[0] != '\0') {
        char *plan_copy = strdup(precision_plan);
        char *plan_seconds_copy = NULL;
        int phase_seconds[32] = {0};
        int phase_seconds_count = 0;
        int phase_ok = 0; /* set when at least one phase succeeds */
        if (plan_copy == NULL) {
            fprintf(stderr, "failed to allocate precision plan buffer\n");
            return 1;
        }
        /* Per-phase durations are parsed first so the strtok state is fully
         * consumed before the plan tokens are walked (strtok is stateful). */
        if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
            plan_seconds_copy = strdup(precision_plan_seconds);
            if (plan_seconds_copy == NULL) {
                free(plan_copy);
                fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
                return 1;
            }
            for (char *sec_token = strtok(plan_seconds_copy, ",");
                 sec_token != NULL &&
                 phase_seconds_count <
                     (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
                 sec_token = strtok(NULL, ",")) {
                /* Trim leading blanks; skip empty tokens entirely. */
                while (*sec_token == ' ' || *sec_token == '\t') {
                    sec_token++;
                }
                if (*sec_token == '\0') {
                    continue;
                }
                phase_seconds[phase_seconds_count++] = atoi(sec_token);
            }
        }
        int phase_idx = 0;
        for (char *token = strtok(plan_copy, ","); token != NULL;
             token = strtok(NULL, ","), phase_idx++) {
            while (*token == ' ' || *token == '\t') {
                token++;
            }
            if (*token == '\0') {
                continue;
            }
            const char *phase_name = token;
            const char *phase_filter = token;
            /* "mixed"/"all" run every precision at once (no filter). */
            if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
                phase_filter = NULL;
            }
            int phase_duration = seconds;
            if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
                phase_duration = phase_seconds[phase_idx];
            }
            /* Flush around phase markers so a supervising process can track
             * progress line-by-line even through a pipe. */
            printf("phase_begin=%s\n", phase_name);
            fflush(stdout);
            memset(&report, 0, sizeof(report));
            ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor,
                                     phase_duration, size_mb, phase_filter, &report);
            if (ok) {
                print_stress_report(&report, device_index, phase_duration);
                phase_ok = 1;
            } else {
                printf("phase_error=%s\n", phase_name);
                if (report.details[0] != '\0') {
                    printf("%s", report.details);
                    if (report.details[strlen(report.details) - 1] != '\n') {
                        printf("\n");
                    }
                }
                printf("status=FAILED\n");
            }
            printf("phase_end=%s\n", phase_name);
            fflush(stdout);
        }
        free(plan_seconds_copy); /* free(NULL) is a no-op */
        free(plan_copy);
        return phase_ok ? 0 : 1;
    }
    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb,
                             precision_filter, &report);
#endif
    if (!ok) {
        /* An explicit precision request must not silently degrade to the
         * PTX fallback — fail loudly instead. */
        if (precision_filter != NULL) {
            fprintf(stderr,
                    "requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
                    precision_filter, name, cc_major, cc_minor);
            return 1;
        }
        int ptx_mb = size_mb;
        /* Fallback burn kernel (JIT-compiled from the embedded PTX);
         * fills `report` on success — presumably mirrors the cublasLt
         * reporting fields. */
        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds,
                              ptx_mb, &report)) {
            return 1;
        }
    }
    print_stress_report(&report, device_index, seconds);
    return 0;
}