Add per-precision benchmark phases, weighted TOPS scoring, and ECC tracking

- Split steady window into 6 equal slots: fp8/fp16/fp32/fp64/fp4 + combined
- Each precision phase runs bee-gpu-burn with --precision filter so PowerCVPct reflects single-kernel stability (not round-robin artifact)
- Add fp4 support in bee-gpu-stress.c for Blackwell (cc>=100) via existing CUDA_R_4F_E2M1 guard
- Weighted TOPS: fp64×2.0, fp32×1.0, fp16×0.5, fp8×0.25, fp4×0.125
- SyntheticScore = sum of weighted TOPS from per-precision phases
- MixedScore = sum from combined phase; MixedEfficiency = Mixed/Synthetic
- ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3)
- ECC volatile counters sampled before/after each phase and overall
- DegradationReasons: ecc_uncorrected_errors, ecc_corrected_errors
- Report: per-precision stability table with ECC columns, methodology section
- Ramp-up history table redesign: GPU indices as columns, runs as rows

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 10:49:49 +03:00
parent 02e44b1172
commit bf6ecab4f0
9 changed files with 390 additions and 144 deletions
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -1121,6 +1121,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                               int cc_minor,
                               int seconds,
                               int size_mb,
+                               const char *precision_filter,
                               struct stress_report *report) {
    struct cublaslt_api cublas;
    struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
@@ -1159,7 +1160,8 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    }

    for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
-        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
+        if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
+            (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
            planned++;
        }
    }
@@ -1218,6 +1220,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                          desc->min_cc);
            continue;
        }
+        if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
+            append_detail(report->details,
+                          sizeof(report->details),
+                          "%s=SKIPPED precision_filter\n",
+                          desc->name);
+            continue;
+        }
        for (int lane = 0; lane < stream_count; lane++) {
            CUstream stream = streams[lane];
            if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
@@ -1339,6 +1348,7 @@ int main(int argc, char **argv) {
    int seconds = 5;
    int size_mb = 64;
    int device_index = 0;
+    const char *precision_filter = NULL; /* NULL = all; else block_label to match */
    for (int i = 1; i < argc; i++) {
        if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
            seconds = atoi(argv[++i]);
@@ -1346,8 +1356,12 @@ int main(int argc, char **argv) {
            size_mb = atoi(argv[++i]);
        } else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
            device_index = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
+            precision_filter = argv[++i];
        } else {
-            fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
+            fprintf(stderr,
+                    "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
+                    argv[0]);
            return 2;
        }
    }
@@ -1407,7 +1421,7 @@ int main(int argc, char **argv) {
    int ok = 0;

 #if HAVE_CUBLASLT_HEADERS
-    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
+    ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
 #endif
    if (!ok) {
        if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {