diff --git a/.gitignore b/.gitignore index e8398b5..a2e4fee 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .DS_Store dist/ iso/out/ +build-cache/ diff --git a/iso/builder/VERSIONS b/iso/builder/VERSIONS index aac0fc1..c63678d 100644 --- a/iso/builder/VERSIONS +++ b/iso/builder/VERSIONS @@ -6,7 +6,7 @@ NCCL_CUDA_VERSION=13.0 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 NCCL_TESTS_VERSION=2.13.10 NVCC_VERSION=12.8 -CUBLAS_VERSION=13.0.2.14-1 +CUBLAS_VERSION=13.1.1.3-1 CUDA_USERSPACE_VERSION=13.0.96-1 DCGM_VERSION=4.5.3-1 JOHN_JUMBO_COMMIT=67fcf9fe5a diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index 5c4c7f2..65f0674 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -1160,6 +1160,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda, return 0; } + /* Count profiles matching the filter (for deciding what to run). */ for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc && (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) { @@ -1173,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda, return 0; } + /* Count all profiles active on this GPU regardless of filter. + * Used as the budget divisor so matrix sizes stay consistent whether + * running all precisions together or a single-precision phase. */ + int planned_total = 0; + for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { + if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) { + planned_total++; + } + } + if (planned_total < planned) { + planned_total = planned; + } + requested_budget = (size_t)size_mb * 1024u * 1024u; - if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) { - requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES; + if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { + requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; } total_budget = clamp_budget_to_free_memory(cuda, requested_budget); - if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) { - total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES; + if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) { + total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES; } if (query_multiprocessor_count(cuda, dev, &mp_count) && cuda->cuStreamCreate && cuda->cuStreamDestroy) { - stream_count = choose_stream_count(mp_count, planned, total_budget, 1); + stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1); } if (stream_count > 1) { int created = 0; @@ -1197,7 +1211,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda, } } report->stream_count = stream_count; - per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count); + per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count); if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) { per_profile_budget = MIN_PROFILE_BUDGET_BYTES; } @@ -1425,7 +1439,17 @@ int main(int argc, char **argv) { ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report); #endif if (!ok) { - if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) { + if (precision_filter != NULL) { + fprintf(stderr, + "requested precision path unavailable: precision=%s device=%s cc=%d.%d\n", + precision_filter, + name, + cc_major, + cc_minor); + return 1; + } + int ptx_mb = size_mb; + if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) { return 1; } }