Fix combined gpu burn profile capacity for fp4

Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes
2026-04-14 00:00:40 +03:00 · 2026-04-13 23:43:12 +03:00
3 changed files with 23 additions and 13 deletions
--- a/audit/internal/platform/benchmark_report.go
+++ b/audit/internal/platform/benchmark_report.go
@@ -81,8 +81,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		b.WriteString("\n")
 	}

-	// ── Scoring methodology ───────────────────────────────────────────────────
-	b.WriteString("## Scoring Methodology\n\n")
+	// ── Methodology ───────────────────────────────────────────────────────────
+	b.WriteString("## Methodology\n\n")
+	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
+	b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
+	b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
+	b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
 	b.WriteString("**Compute score** is derived from two phases:\n\n")
 	b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
 	b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
@@ -286,13 +290,6 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
 		}
 	}

-	// ── Methodology ───────────────────────────────────────────────────────────
-	b.WriteString("## Methodology\n\n")
-	fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
-	b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
-	b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
-	b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
-
 	// ── Raw files ─────────────────────────────────────────────────────────────
 	b.WriteString("## Raw Files\n\n")
 	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
--- a/iso/builder/bee-gpu-stress.c
+++ b/iso/builder/bee-gpu-stress.c
@@ -33,7 +33,6 @@ typedef void *CUstream;
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
 #define MAX_STRESS_STREAMS 16
-#define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)

@@ -689,6 +688,8 @@ static const struct profile_desc k_profiles[] = {
 #endif
 };

+#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
+
 static int load_cublaslt(struct cublaslt_api *api) {
    memset(api, 0, sizeof(*api));
    api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1124,7 +1125,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
                               const char *precision_filter,
                               struct stress_report *report) {
    struct cublaslt_api cublas;
-    struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
+    struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
    cublasLtHandle_t handle = NULL;
    CUcontext ctx = NULL;
    CUstream streams[MAX_STRESS_STREAMS] = {0};
@@ -1134,7 +1135,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
    int active = 0;
    int mp_count = 0;
    int stream_count = 1;
-    int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
+    int profile_count = PROFILE_COUNT;
    int prepared_count = 0;
    size_t requested_budget = 0;
    size_t total_budget = 0;
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -874,8 +874,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
    CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"

    GPU_STRESS_NEED_BUILD=1
-    if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
+    if [ -f "$GPU_BURN_WORKER_BIN" ]; then
        GPU_STRESS_NEED_BUILD=0
+        for dep in \
+            "${BUILDER_DIR}/bee-gpu-stress.c" \
+            "${BUILDER_DIR}/VERSIONS"; do
+            if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
+                GPU_STRESS_NEED_BUILD=1
+                break
+            fi
+        done
+        if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
+            find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
+            GPU_STRESS_NEED_BUILD=1
+        fi
    fi

    if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
Author	SHA1	Message	Date
Michael Chus	0fb8f2777f	Fix combined gpu burn profile capacity for fp4	2026-04-14 00:00:40 +03:00
Michael Chus	bf182daa89	Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes	2026-04-13 23:43:12 +03:00