Disable PTX fallback for precision-filtered runs and pin cuBLAS 13.1

2026-04-14 10:17:44 +03:00
parent 81e7c921f8
commit 82fe1f6d26
3 changed files with 33 additions and 8 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .DS_Store
 dist/
 iso/out/
+build-cache/
--- a/iso/builder/VERSIONS
+++ b/iso/builder/VERSIONS
@@ -6,7 +6,7 @@ NCCL_CUDA_VERSION=13.0
 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
 NCCL_TESTS_VERSION=2.13.10
 NVCC_VERSION=12.8
-CUBLAS_VERSION=13.0.2.14-1
+CUBLAS_VERSION=13.1.1.3-1
 CUDA_USERSPACE_VERSION=13.0.96-1
 DCGM_VERSION=4.5.3-1
 JOHN_JUMBO_COMMIT=67fcf9fe5a
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -1160,6 +1160,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Count profiles matching the filter (for deciding what to run). */
    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
@@ -1173,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        return 0;
    }

+    /* Count every enabled profile whose min_cc this GPU meets, ignoring the filter.
+     * It sets the budget floor, the stream count and the per-profile divisor, so
+     * matrix sizes match between an all-precision run and a single-precision phase. */
+    int planned_total = 0;
+    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
+        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+            planned_total++;
+        }
+    }
+    if (planned_total < planned) {
+        planned_total = planned;
+    }
+
    requested_budget = (size_t)size_mb * 1024u * 1024u;
-    if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
-        requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
+    if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
+        requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
    }
    total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
-    if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
-        total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
+    if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
+        total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
    }
    if (query_multiprocessor_count(cuda, dev, &mp_count) &&
        cuda->cuStreamCreate &&
        cuda->cuStreamDestroy) {
-        stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
+        stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
    }
    if (stream_count > 1) {
        int created = 0;
@@ -1197,7 +1211,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
        }
    }
    report->stream_count = stream_count;
-    per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
+    per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
    if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
        per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
    }
@@ -1425,7 +1439,17 @@ int main(int argc, char **argv) {
    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
 #endif
    if (!ok) {
-        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
+        if (precision_filter != NULL) {
+            fprintf(stderr,
+                    "requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
+                    precision_filter,
+                    name,
+                    cc_major,
+                    cc_minor);
+            return 1;
+        }
+        int ptx_mb = size_mb;
+        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
            return 1;
        }
    }