diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index d3cf98b..7aacf3f 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -21,10 +21,11 @@ import ( ) var ( - satExecCommand = exec.Command - satLookPath = exec.LookPath - satGlob = filepath.Glob - satStat = os.Stat + satExecCommand = exec.Command + satLookPath = exec.LookPath + satGlob = filepath.Glob + satStat = os.Stat + satFreeMemBytes = freeMemBytes rocmSMIExecutableGlobs = []string{ "/opt/rocm/bin/rocm-smi", @@ -407,6 +408,25 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { return all, nil } +func memoryStressSizeArg() string { + if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 { + return fmt.Sprintf("%dM", mb) + } + availBytes := satFreeMemBytes() + if availBytes <= 0 { + return "80%" + } + availMB := availBytes / (1024 * 1024) + targetMB := (availMB * 2) / 3 + if targetMB >= 256 { + targetMB = (targetMB / 256) * 256 + } + if targetMB <= 0 { + return "80%" + } + return fmt.Sprintf("%dM", targetMB) +} + func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) passes := envInt("BEE_MEMTESTER_PASSES", 1) @@ -422,11 +442,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati if seconds <= 0 { seconds = envInt("BEE_VM_STRESS_SECONDS", 300) } - // Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB. - sizeArg := "80%" - if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 { - sizeArg = fmt.Sprintf("%dM", mb) - } + // Base the default on current MemAvailable and keep headroom for the OS and + // concurrent stressors so mixed burn runs do not trip the OOM killer. + sizeArg := memoryStressSizeArg() return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "02-stress-ng-vm.log", cmd: []string{ diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go index d48328b..f46f311 100644 --- a/audit/internal/platform/sat_test.go +++ b/audit/internal/platform/sat_test.go @@ -276,6 +276,37 @@ func TestEnvIntFallback(t *testing.T) { } } +func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) { + oldFreeMemBytes := satFreeMemBytes + satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 } + t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) + + if got := memoryStressSizeArg(); got != "65536M" { + t.Fatalf("sizeArg=%q want 65536M", got) + } +} + +func TestMemoryStressSizeArgRespectsOverride(t *testing.T) { + oldFreeMemBytes := satFreeMemBytes + satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 } + t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) + t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096") + + if got := memoryStressSizeArg(); got != "4096M" { + t.Fatalf("sizeArg=%q want 4096M", got) + } +} + +func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) { + oldFreeMemBytes := satFreeMemBytes + satFreeMemBytes = func() int64 { return 0 } + t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes }) + + if got := memoryStressSizeArg(); got != "80%" { + t.Fatalf("sizeArg=%q want 80%%", got) + } +} + func TestClassifySATResult(t *testing.T) { tests := []struct { name string