From 19b4803ec7f1bc3b4358d8d2674133d5d593dc62 Mon Sep 17 00:00:00 2001
From: Michael Chus
Date: Sun, 5 Apr 2026 20:22:43 +0300
Subject: [PATCH] Pass exact cycle duration to GPU stress instead of 86400s sentinel

bee-gpu-burn now receives --seconds set to the cycle's LoadSec, so it exits
naturally when the cycle ends, rather than relying solely on context
cancellation to kill it. Process group kill (Setpgid+Cancel) is kept as a
safety net for early cancellation (user stop, context timeout).

Same fix for AMD RVS, whose config `duration` field (milliseconds) is now
LoadSec * 1000.

Co-Authored-By: Claude Sonnet 4.6
---
 audit/internal/platform/platform_stress.go | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/audit/internal/platform/platform_stress.go b/audit/internal/platform/platform_stress.go
index 239cfad..9068712 100644
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+			gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
 			if gpuCmd == nil {
 				return
 			}
@@ -409,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 
 // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
 // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
-func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
 	switch strings.ToLower(vendor) {
 	case "amd":
-		return buildAMDGPUStressCmd(ctx)
+		return buildAMDGPUStressCmd(ctx, durSec)
 	case "nvidia":
-		return buildNvidiaGPUStressCmd(ctx)
+		return buildNvidiaGPUStressCmd(ctx, durSec)
 	}
 	return nil
 }
 
-func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	rvsArgs, err := resolveRVSCommand()
 	if err != nil {
 		return nil
 	}
 	rvsPath := rvsArgs[0]
-	cfg := `actions:
+	cfg := fmt.Sprintf(`actions:
 - name: gst_platform
   device: all
   module: gst
   parallel: true
-  duration: 86400000
+  duration: %d`, durSec*1000) + `
   copy_matrix: false
   target_stress: 90
   matrix_size_a: 8640
@@ -453,7 +453,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	return cmd
 }
 
-func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
@@ -461,10 +461,10 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
-	// bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children.
-	// Put the whole tree in its own process group so context cancellation kills
-	// all workers, not just the shell parent.
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
+	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
+	// where the context is cancelled early (user stop, parent timeout).
+	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	cmd.Cancel = func() error {
 		if cmd.Process != nil {