# Review notes for this patch (leading text before the "diff --git" header is
# not part of any hunk).
#
# Purpose: pass the per-cycle load duration (cycle.LoadSec) down to the GPU
# stress command builders instead of the previous fixed 24h values.
#   - buildGPUStressCmd gains a durSec parameter and forwards it to the AMD
#     and NVIDIA builders.
#   - AMD: the gst action "duration" field becomes durSec*1000 (was 86400000);
#     presumably milliseconds, given the *1000 -- confirm against the RVS docs.
#   - NVIDIA: "--seconds" receives strconv.Itoa(durSec) (was "86400").
#   - Setpgid + cmd.Cancel are left in place on the NVIDIA command.
#
# NOTE(review): the new lines call fmt.Sprintf and strconv.Itoa; the import
#   block is outside these hunks -- confirm both packages are imported.
# NOTE(review): durSec <= 0 is passed through unchanged (duration 0 /
#   "--seconds 0"); how the tools treat that is not visible here -- confirm
#   the caller always supplies a positive LoadSec.
# NOTE(review): line breaks and indentation below were restored from a
#   whitespace-collapsed copy; nesting depth in the first hunk and the YAML
#   indent are best-effort. Apply with whitespace tolerance or regenerate
#   the patch from the repository.
diff --git a/audit/internal/platform/platform_stress.go b/audit/internal/platform/platform_stress.go
index 239cfad..9068712 100644
--- a/audit/internal/platform/platform_stress.go
+++ b/audit/internal/platform/platform_stress.go
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
 		wg.Add(1)
 		go func() {
 			defer wg.Done()
-			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
+			gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
 			if gpuCmd == nil {
 				return
 			}
@@ -409,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
 // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
 // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
-func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
+func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
 	switch strings.ToLower(vendor) {
 	case "amd":
-		return buildAMDGPUStressCmd(ctx)
+		return buildAMDGPUStressCmd(ctx, durSec)
 	case "nvidia":
-		return buildNvidiaGPUStressCmd(ctx)
+		return buildNvidiaGPUStressCmd(ctx, durSec)
 	}
 	return nil
 }
 
-func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	rvsArgs, err := resolveRVSCommand()
 	if err != nil {
 		return nil
 	}
 	rvsPath := rvsArgs[0]
-	cfg := `actions:
+	cfg := fmt.Sprintf(`actions:
 - name: gst_platform
   device: all
   module: gst
   parallel: true
-  duration: 86400000
+  duration: %d`, durSec*1000) + `
   copy_matrix: false
   target_stress: 90
   matrix_size_a: 8640
@@ -453,7 +453,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
 	return cmd
 }
 
-func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
+func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
 	path, err := satLookPath("bee-gpu-burn")
 	if err != nil {
 		path, err = satLookPath("bee-gpu-stress")
@@ -461,10 +461,10 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
 	if err != nil {
 		return nil
 	}
-	cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
-	// bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children.
-	// Put the whole tree in its own process group so context cancellation kills
-	// all workers, not just the shell parent.
+	// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
+	// Process group kill via Setpgid+Cancel is kept as a safety net for cases
+	// where the context is cancelled early (user stop, parent timeout).
+	cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	cmd.Cancel = func() error {
 		if cmd.Process != nil {