Pass exact cycle duration to GPU stress instead of 86400s sentinel

bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally
when the cycle ends, rather than relying solely on context cancellation
to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net
for early cancellation (user stop, context timeout). The same fix applies
to AMD RVS, whose gst config now gets duration = LoadSec * 1000 (the value
is in milliseconds).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 20:22:43 +03:00
parent 1bdfb1e9ca
commit 19b4803ec7

View File

@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
gpuCmd := buildGPUStressCmd(loadCtx, vendor) gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
if gpuCmd == nil { if gpuCmd == nil {
return return
} }
@@ -409,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor. // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful). // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd { func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
switch strings.ToLower(vendor) { switch strings.ToLower(vendor) {
case "amd": case "amd":
return buildAMDGPUStressCmd(ctx) return buildAMDGPUStressCmd(ctx, durSec)
case "nvidia": case "nvidia":
return buildNvidiaGPUStressCmd(ctx) return buildNvidiaGPUStressCmd(ctx, durSec)
} }
return nil return nil
} }
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd { func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
rvsArgs, err := resolveRVSCommand() rvsArgs, err := resolveRVSCommand()
if err != nil { if err != nil {
return nil return nil
} }
rvsPath := rvsArgs[0] rvsPath := rvsArgs[0]
cfg := `actions: cfg := fmt.Sprintf(`actions:
- name: gst_platform - name: gst_platform
device: all device: all
module: gst module: gst
parallel: true parallel: true
duration: 86400000 duration: %d`, durSec*1000) + `
copy_matrix: false copy_matrix: false
target_stress: 90 target_stress: 90
matrix_size_a: 8640 matrix_size_a: 8640
@@ -453,7 +453,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
return cmd return cmd
} }
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd { func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
path, err := satLookPath("bee-gpu-burn") path, err := satLookPath("bee-gpu-burn")
if err != nil { if err != nil {
path, err = satLookPath("bee-gpu-stress") path, err = satLookPath("bee-gpu-stress")
@@ -461,10 +461,10 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
if err != nil { if err != nil {
return nil return nil
} }
cmd := exec.CommandContext(ctx, path, "--seconds", "86400") // Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
// bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children. // Process group kill via Setpgid+Cancel is kept as a safety net for cases
// Put the whole tree in its own process group so context cancellation kills // where the context is cancelled early (user stop, parent timeout).
// all workers, not just the shell parent. cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error { cmd.Cancel = func() error {
if cmd.Process != nil { if cmd.Process != nil {