Move power diag tests to validate/stress; fix GPU burn power saturation

- bee-gpu-stress.c: remove per-wave cuCtxSynchronize barrier in both
  cuBLASLt and PTX hot loops; sync at most once/sec so the GPU queue
  stays continuously full — eliminates the CPU↔GPU ping-pong that
  prevented reaching full TDP
- sat_fan_stress.go: default SizeMB 0 (auto = 95% VRAM) instead of
  hardcoded 64 MB; tiny matrices caused <0.1 ms kernels where CPU
  re-queue overhead dominated
- pages.go: move nvidia-targeted-power and nvidia-pulse from Burn →
  Validate stress section alongside nvidia-targeted-stress; these are
  DCGM pass/fail diagnostics, not sustained burn loads; remove the
  Power Delivery / Power Budget card from Burn entirely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 00:13:52 +03:00
parent 0a4bb596f6
commit d1a22d782d
3 changed files with 196 additions and 150 deletions

View File

@@ -36,7 +36,6 @@ typedef void *CUstream;
#define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
#define STRESS_LAUNCH_DEPTH 8
static const char *ptx_source =
".version 6.0\n"
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
unsigned long iterations = 0;
int mp_count = 0;
int stream_count = 1;
int launches_per_wave = 0;
memset(report, 0, sizeof(*report));
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,
unsigned int threads = 256;
double start = now_seconds();
double deadline = start + (double)seconds;
double deadline = now_seconds() + (double)seconds;
double next_sync = now_seconds() + 1.0;
while (now_seconds() < deadline) {
launches_per_wave = 0;
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
int launched_this_batch = 0;
for (int lane = 0; lane < stream_count; lane++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
if (!check_rc(api,
"cuLaunchKernel",
api->cuLaunchKernel(kernel,
blocks,
1,
1,
threads,
1,
1,
0,
streams[lane],
params[lane],
NULL))) {
goto fail;
}
launches_per_wave++;
launched_this_batch++;
}
if (launched_this_batch <= 0) {
break;
int launched = 0;
for (int lane = 0; lane < stream_count; lane++) {
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
if (!check_rc(api,
"cuLaunchKernel",
api->cuLaunchKernel(kernel,
blocks,
1,
1,
threads,
1,
1,
0,
streams[lane],
params[lane],
NULL))) {
goto fail;
}
launched++;
iterations++;
}
if (launches_per_wave <= 0) {
if (launched <= 0) {
goto fail;
}
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
goto fail;
double now = now_seconds();
if (now >= next_sync || now >= deadline) {
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
goto fail;
}
next_sync = now + 1.0;
}
iterations += (unsigned long)launches_per_wave;
}
api->cuCtxSynchronize();
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
goto fail;
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
report->iterations = iterations;
snprintf(report->details,
sizeof(report->details),
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
size_mb,
report->buffer_mb,
report->stream_count,
STRESS_LAUNCH_DEPTH,
bytes_per_stream[0] / (1024u * 1024u),
iterations);
@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
int stream_count = 1;
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
int prepared_count = 0;
int wave_launches = 0;
size_t requested_budget = 0;
size_t total_budget = 0;
size_t per_profile_budget = 0;
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
append_detail(report->details,
sizeof(report->details),
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
size_mb,
report->buffer_mb,
report->stream_count,
STRESS_LAUNCH_DEPTH,
mp_count,
per_profile_budget / (1024u * 1024u));
@@ -1260,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
return 0;
}
/* Keep the GPU queue continuously full by submitting kernels without
* synchronizing after every wave. A sync barrier after each small batch
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
* especially when individual kernels are short. Instead we sync at most
* once per second (for error detection) and once at the very end. */
double deadline = now_seconds() + (double)seconds;
double next_sync = now_seconds() + 1.0;
while (now_seconds() < deadline) {
wave_launches = 0;
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
int launched_this_batch = 0;
for (int i = 0; i < prepared_count; i++) {
if (!prepared[i].ready) {
continue;
}
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
prepared[i].iterations++;
report->iterations++;
wave_launches++;
launched_this_batch++;
int launched = 0;
for (int i = 0; i < prepared_count; i++) {
if (!prepared[i].ready) {
continue;
}
if (launched_this_batch <= 0) {
break;
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
append_detail(report->details,
sizeof(report->details),
"%s=FAILED runtime\n",
prepared[i].desc.name);
for (int j = 0; j < prepared_count; j++) {
destroy_profile(&cublas, cuda, &prepared[j]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
prepared[i].iterations++;
report->iterations++;
launched++;
}
if (wave_launches <= 0) {
if (launched <= 0) {
break;
}
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
for (int i = 0; i < prepared_count; i++) {
destroy_profile(&cublas, cuda, &prepared[i]);
double now = now_seconds();
if (now >= next_sync || now >= deadline) {
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
for (int i = 0; i < prepared_count; i++) {
destroy_profile(&cublas, cuda, &prepared[i]);
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
}
cublas.cublasLtDestroy(handle);
destroy_streams(cuda, streams, stream_count);
cuda->cuCtxDestroy(ctx);
return 0;
next_sync = now + 1.0;
}
}
/* Final drain — ensure all queued work finishes before we read results. */
cuda->cuCtxSynchronize();
for (int i = 0; i < prepared_count; i++) {
if (!prepared[i].ready) {