Pass exact cycle duration to GPU stress instead of 86400s sentinel

bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally
when the cycle ends, rather than relying solely on context cancellation
to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net
for early cancellation (user stop, context timeout). Same fix for AMD
RVS, whose gst config now gets duration = LoadSec * 1000 (milliseconds).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-05 20:22:43 +03:00
parent 1bdfb1e9ca
commit 19b4803ec7

View File

@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
wg.Add(1)
go func() {
defer wg.Done()
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
if gpuCmd == nil {
return
}
@@ -409,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
// The vendor string is matched case-insensitively against "amd" and "nvidia".
// durSec is handed unchanged to the vendor-specific builder, which uses it as
// the load duration in seconds.
// Returns nil if the vendor is not recognized or no GPU stress tool is
// available (CPU-only cycling still useful).
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
switch strings.ToLower(vendor) {
case "amd":
return buildAMDGPUStressCmd(ctx)
return buildAMDGPUStressCmd(ctx, durSec)
case "nvidia":
return buildNvidiaGPUStressCmd(ctx)
return buildNvidiaGPUStressCmd(ctx, durSec)
}
return nil
}
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
rvsArgs, err := resolveRVSCommand()
if err != nil {
return nil
}
rvsPath := rvsArgs[0]
cfg := `actions:
cfg := fmt.Sprintf(`actions:
- name: gst_platform
device: all
module: gst
parallel: true
duration: 86400000
duration: %d`, durSec*1000) + `
copy_matrix: false
target_stress: 90
matrix_size_a: 8640
@@ -453,7 +453,7 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
return cmd
}
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
path, err := satLookPath("bee-gpu-burn")
if err != nil {
path, err = satLookPath("bee-gpu-stress")
@@ -461,10 +461,10 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
if err != nil {
return nil
}
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
// bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children.
// Put the whole tree in its own process group so context cancellation kills
// all workers, not just the shell parent.
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
// where the context is cancelled early (user stop, parent timeout).
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {