fix(stress): keep platform burn responsive under load
@@ -10,9 +10,11 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
+	"syscall"
 	"time"
 )
@@ -374,10 +376,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 		return nil, fmt.Errorf("stressapptest not found: %w", err)
 	}
 	// Use a very long duration; the context timeout will kill it at the right time.
-	cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
+	cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
+	if threads := platformStressCPUThreads(); threads > 0 {
+		cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
+	}
+	if mb := platformStressMemoryMB(); mb > 0 {
+		cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
+	}
+	cmd := exec.CommandContext(ctx, path, cmdArgs...)
 	cmd.Stdout = nil
 	cmd.Stderr = nil
-	if err := cmd.Start(); err != nil {
+	if err := startLowPriorityCmd(cmd, 15); err != nil {
 		return nil, fmt.Errorf("stressapptest start: %w", err)
 	}
 	return cmd, nil
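Note: the new sizing calls rely on an envInt helper that already exists elsewhere in this package and is not part of the diff. For reference only, a minimal sketch of what such a helper commonly looks like (the real signature and parsing rules may differ):

package platform

import (
	"os"
	"strconv"
)

// envInt (hypothetical sketch, not the project's actual helper): read an
// integer override from the environment, falling back to def when the
// variable is unset or unparsable.
func envInt(key string, def int) int {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		return def
	}
	return n
}

With behaviour like that, an unset or malformed BEE_PLATFORM_STRESS_THREADS / BEE_PLATFORM_STRESS_MB override simply falls through to the CPU and memory heuristics added further down.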
@@ -418,7 +427,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
 	cmd.Stdout = nil
 	cmd.Stderr = nil
-	_ = cmd.Start()
+	_ = startLowPriorityCmd(cmd, 10)
 	return cmd
 }
@@ -433,10 +442,50 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
 	cmd.Stdout = nil
 	cmd.Stderr = nil
-	_ = cmd.Start()
+	_ = startLowPriorityCmd(cmd, 10)
 	return cmd
 }
 
+func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+	if cmd.Process != nil {
+		_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
+	}
+	return nil
+}
+
+func platformStressCPUThreads() int {
+	if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
+		return n
+	}
+	cpus := runtime.NumCPU()
+	switch {
+	case cpus <= 2:
+		return 1
+	case cpus <= 8:
+		return cpus - 1
+	default:
+		return cpus - 2
+	}
+}
+
+func platformStressMemoryMB() int {
+	if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
+		return mb
+	}
+	free := freeMemBytes()
+	if free <= 0 {
+		return 0
+	}
+	mb := int((free * 60) / 100 / (1024 * 1024))
+	if mb < 1024 {
+		return 1024
+	}
+	return mb
+}
+
 func packPlatformDir(dir, dest string) error {
 	f, err := os.Create(dest)
 	if err != nil {
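Note: freeMemBytes is likewise an existing helper that this diff only calls. A plausible Linux-only sketch consistent with how it is used above (a non-positive return means "unknown", so the -M flag is omitted); the actual implementation and return type may differ:

package platform

import (
	"bufio"
	"os"
	"strconv"
	"strings"
)

// freeMemBytes (hypothetical sketch): report available memory in bytes by
// reading MemAvailable from /proc/meminfo, returning 0 when it cannot be
// determined.
func freeMemBytes() int64 {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		// Line format: "MemAvailable:   12345678 kB"
		fields := strings.Fields(sc.Text())
		if len(fields) >= 2 && fields[0] == "MemAvailable:" {
			kb, err := strconv.ParseInt(fields[1], 10, 64)
			if err != nil {
				return 0
			}
			return kb * 1024
		}
	}
	return 0
}

Since startLowPriorityCmd renices the child right after Start, worker threads the child spawns afterwards should inherit the lower priority; it is a best-effort nudge rather than a guarantee.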
audit/internal/platform/platform_stress_test.go (new file, 34 lines)
@@ -0,0 +1,34 @@
+package platform
+
+import (
+	"runtime"
+	"testing"
+)
+
+func TestPlatformStressCPUThreadsOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
+	if got := platformStressCPUThreads(); got != 7 {
+		t.Fatalf("platformStressCPUThreads=%d want 7", got)
+	}
+}
+
+func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
+	got := platformStressCPUThreads()
+	if got < 1 {
+		t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
+	}
+	if got > runtime.NumCPU() {
+		t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
+	}
+	if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
+		t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
+	}
+}
+
+func TestPlatformStressMemoryMBOverride(t *testing.T) {
+	t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
+	if got := platformStressMemoryMB(); got != 8192 {
+		t.Fatalf("platformStressMemoryMB=%d want 8192", got)
+	}
+}
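Note: the headroom test relies on envInt treating an empty override as unset. If the helper also ignores unparsable values (an assumption, not shown in this diff), a companion test along these lines would pin that behaviour:

package platform

import (
	"runtime"
	"testing"
)

// Hypothetical companion test, not part of this commit: a garbage override
// should fall back to the CPU-count heuristic rather than break sizing.
func TestPlatformStressCPUThreadsIgnoresBadOverride(t *testing.T) {
	t.Setenv("BEE_PLATFORM_STRESS_THREADS", "not-a-number")
	got := platformStressCPUThreads()
	if got < 1 || got > runtime.NumCPU() {
		t.Fatalf("platformStressCPUThreads=%d want within [1, %d]", got, runtime.NumCPU())
	}
}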
@@ -36,6 +36,7 @@ typedef void *CUstream;
 #define MAX_CUBLAS_PROFILES 5
 #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
 #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
+#define STRESS_LAUNCH_DEPTH 8
 
 static const char *ptx_source =
     ".version 6.0\n"
@@ -422,24 +423,31 @@ static int run_ptx_fallback(struct cuda_api *api,
     double deadline = start + (double)seconds;
     while (now_seconds() < deadline) {
         launches_per_wave = 0;
-        for (int lane = 0; lane < stream_count; lane++) {
-            unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
-            if (!check_rc(api,
-                          "cuLaunchKernel",
-                          api->cuLaunchKernel(kernel,
-                                              blocks,
-                                              1,
-                                              1,
-                                              threads,
-                                              1,
-                                              1,
-                                              0,
-                                              streams[lane],
-                                              params[lane],
-                                              NULL))) {
-                goto fail;
-            }
-            launches_per_wave++;
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
+            int launched_this_batch = 0;
+            for (int lane = 0; lane < stream_count; lane++) {
+                unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
+                if (!check_rc(api,
+                              "cuLaunchKernel",
+                              api->cuLaunchKernel(kernel,
+                                                  blocks,
+                                                  1,
+                                                  1,
+                                                  threads,
+                                                  1,
+                                                  1,
+                                                  0,
+                                                  streams[lane],
+                                                  params[lane],
+                                                  NULL))) {
+                    goto fail;
+                }
+                launches_per_wave++;
+                launched_this_batch++;
+            }
+            if (launched_this_batch <= 0) {
+                break;
+            }
         }
         if (launches_per_wave <= 0) {
             goto fail;
@@ -460,10 +468,11 @@ static int run_ptx_fallback(struct cuda_api *api,
     report->iterations = iterations;
     snprintf(report->details,
              sizeof(report->details),
-             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
+             "fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
              size_mb,
              report->buffer_mb,
              report->stream_count,
+             STRESS_LAUNCH_DEPTH,
              bytes_per_stream[0] / (1024u * 1024u),
              iterations);
@@ -1184,10 +1193,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     report->buffer_mb = (int)(total_budget / (1024u * 1024u));
     append_detail(report->details,
                   sizeof(report->details),
-                  "requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
+                  "requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
                   size_mb,
                   report->buffer_mb,
                   report->stream_count,
+                  STRESS_LAUNCH_DEPTH,
                   mp_count,
                   per_profile_budget / (1024u * 1024u));
@@ -1239,26 +1249,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
     double deadline = now_seconds() + (double)seconds;
     while (now_seconds() < deadline) {
         wave_launches = 0;
-        for (int i = 0; i < prepared_count; i++) {
-            if (!prepared[i].ready) {
-                continue;
-            }
-            if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
-                append_detail(report->details,
-                              sizeof(report->details),
-                              "%s=FAILED runtime\n",
-                              prepared[i].desc.name);
-                for (int j = 0; j < prepared_count; j++) {
-                    destroy_profile(&cublas, cuda, &prepared[j]);
-                }
-                cublas.cublasLtDestroy(handle);
-                destroy_streams(cuda, streams, stream_count);
-                cuda->cuCtxDestroy(ctx);
-                return 0;
-            }
-            prepared[i].iterations++;
-            report->iterations++;
-            wave_launches++;
+        for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
+            int launched_this_batch = 0;
+            for (int i = 0; i < prepared_count; i++) {
+                if (!prepared[i].ready) {
+                    continue;
+                }
+                if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
+                    append_detail(report->details,
+                                  sizeof(report->details),
+                                  "%s=FAILED runtime\n",
+                                  prepared[i].desc.name);
+                    for (int j = 0; j < prepared_count; j++) {
+                        destroy_profile(&cublas, cuda, &prepared[j]);
+                    }
+                    cublas.cublasLtDestroy(handle);
+                    destroy_streams(cuda, streams, stream_count);
+                    cuda->cuCtxDestroy(ctx);
+                    return 0;
+                }
+                prepared[i].iterations++;
+                report->iterations++;
+                wave_launches++;
+                launched_this_batch++;
+            }
+            if (launched_this_batch <= 0) {
+                break;
+            }
         }
         if (wave_launches <= 0) {
             break;
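Note: both GPU paths now share the same control-flow shape: an outer deadline loop, an inner loop capped at STRESS_LAUNCH_DEPTH batches that re-checks the clock between batches, and an early break when a batch launches nothing. A minimal Go sketch of that pattern, with a hypothetical submit callback standing in for the per-stream kernel or GEMM launches:

package main

import (
	"fmt"
	"time"
)

// runWaves sketches the batching pattern this diff applies: each wave issues
// at most depth batches, the deadline is re-checked between batches, and an
// empty batch ends the wave early. submit is a stand-in for queuing real
// work; it reports how many units it launched.
func runWaves(deadline time.Time, depth int, submit func() int) (waves, launches int) {
	for time.Now().Before(deadline) {
		waveLaunches := 0
		for d := 0; d < depth && time.Now().Before(deadline); d++ {
			batch := submit()
			if batch <= 0 {
				break // nothing queued this batch; stop filling the wave
			}
			waveLaunches += batch
			launches += batch
		}
		if waveLaunches <= 0 {
			break // nothing launched at all; give up on the burn
		}
		waves++
	}
	return waves, launches
}

func main() {
	deadline := time.Now().Add(50 * time.Millisecond)
	waves, launches := runWaves(deadline, 8, func() int {
		time.Sleep(time.Millisecond) // stand-in for submitting work to 4 streams
		return 4
	})
	fmt.Printf("waves=%d launches=%d\n", waves, launches)
}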