Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 88b5e0edf2 | | |
| | 82fe1f6d26 | | |
| | 81e7c921f8 | | |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
|
||||
@@ -1601,7 +1601,10 @@ func maxInt(a, b int) int {
|
||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||
func queryIPMIServerPowerW() (float64, error) {
|
||||
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||
}
|
||||
@@ -1620,6 +1623,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
|
||||
}
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
var samples []float64
|
||||
loop:
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
@@ -1629,7 +1633,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break
|
||||
break loop
|
||||
case <-time.After(2 * time.Second):
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
|
||||
@@ -1160,6 +1160,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count profiles matching the filter (for deciding what to run). */
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
|
||||
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
|
||||
@@ -1173,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count all profiles active on this GPU regardless of filter.
|
||||
* Used as the budget divisor so matrix sizes stay consistent whether
|
||||
* running all precisions together or a single-precision phase. */
|
||||
int planned_total = 0;
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||
planned_total++;
|
||||
}
|
||||
}
|
||||
if (planned_total < planned) {
|
||||
planned_total = planned;
|
||||
}
|
||||
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
@@ -1197,7 +1211,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
@@ -1425,7 +1439,17 @@ int main(int argc, char **argv) {
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||
#endif
|
||||
if (!ok) {
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
||||
if (precision_filter != NULL) {
|
||||
fprintf(stderr,
|
||||
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||
precision_filter,
|
||||
name,
|
||||
cc_major,
|
||||
cc_minor);
|
||||
return 1;
|
||||
}
|
||||
int ptx_mb = size_mb;
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -873,6 +873,22 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
if [ -n "$fp4_type_match" ]; then
|
||||
echo "fp4_header_symbol=present"
|
||||
echo "$fp4_type_match"
|
||||
else
|
||||
echo "fp4_header_symbol=missing"
|
||||
fi
|
||||
if [ -n "$fp4_scale_match" ]; then
|
||||
echo "fp4_scale_mode_symbol=present"
|
||||
echo "$fp4_scale_match"
|
||||
else
|
||||
echo "fp4_scale_mode_symbol=missing"
|
||||
fi
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
@@ -901,6 +917,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
else
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||
echo "fp4_profile_string=present"
|
||||
else
|
||||
echo "fp4_profile_string=missing"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
|
||||
Reference in New Issue
Block a user