diff --git a/audit/internal/platform/nvidia_stress.go b/audit/internal/platform/nvidia_stress.go index 62bc0c5..6ef735e 100644 --- a/audit/internal/platform/nvidia_stress.go +++ b/audit/internal/platform/nvidia_stress.go @@ -95,9 +95,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) { if opts.DurationSec <= 0 { opts.DurationSec = 300 } - if opts.SizeMB <= 0 { - opts.SizeMB = 64 - } + // SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime. switch strings.TrimSpace(strings.ToLower(opts.Loader)) { case "", NvidiaStressLoaderBuiltin: opts.Loader = NvidiaStressLoaderBuiltin diff --git a/audit/internal/platform/platform_stress.go b/audit/internal/platform/platform_stress.go index 0c03088..d7e909f 100644 --- a/audit/internal/platform/platform_stress.go +++ b/audit/internal/platform/platform_stress.go @@ -26,7 +26,8 @@ type PlatformStressCycle struct { // PlatformStressOptions controls the thermal cycling test. type PlatformStressOptions struct { - Cycles []PlatformStressCycle + Cycles []PlatformStressCycle + Components []string // if empty: run all; values: "cpu", "gpu" } // platformStressRow is one second of telemetry. @@ -68,8 +69,11 @@ func (s *System) RunPlatformStress( return "", fmt.Errorf("mkdir run dir: %w", err) } + hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu") + hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu") + vendor := s.DetectGPUVendor() - logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor)) + logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU)) var rows []platformStressRow start := time.Now() @@ -88,27 +92,31 @@ func (s *System) RunPlatformStress( var wg sync.WaitGroup // CPU stress - wg.Add(1) - go func() { - defer wg.Done() - cpuCmd, err := buildCPUStressCmd(loadCtx) - if err != nil { - logFunc("CPU stress: " + err.Error()) - return - } - _ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL) - }() + if hasCPU { + wg.Add(1) + go func() { + defer wg.Done() + cpuCmd, err := buildCPUStressCmd(loadCtx) + if err != nil { + logFunc("CPU stress: " + err.Error()) + return + } + _ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL) + }() + } // GPU stress - wg.Add(1) - go func() { - defer wg.Done() - gpuCmd := buildGPUStressCmd(loadCtx, vendor) - if gpuCmd == nil { - return - } - _ = gpuCmd.Wait() - }() + if hasGPU { + wg.Add(1) + go func() { + defer wg.Done() + gpuCmd := buildGPUStressCmd(loadCtx, vendor) + if gpuCmd == nil { + return + } + _ = gpuCmd.Wait() + }() + } // Monitoring goroutine for load phase loadRows := collectPhase(loadCtx, cycleNum, "load", start) @@ -439,7 +447,7 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd { if err != nil { return nil } - cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64") + cmd := exec.CommandContext(ctx, path, "--seconds", "86400") cmd.Stdout = nil cmd.Stderr = nil _ = startLowPriorityCmd(cmd, 10) @@ -486,6 +494,15 @@ func platformStressMemoryMB() int { return mb } +func containsComponent(components []string, name string) bool { + for _, c := range components { + if c == name { + return true + } + } + return false +} + func packPlatformDir(dir, dest string) error { f, err := os.Create(dest) if err != nil { diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index b0000b0..6010443 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -181,13 +181,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } var body struct { - Duration int `json:"duration"` - DiagLevel int `json:"diag_level"` - GPUIndices []int `json:"gpu_indices"` - ExcludeGPUIndices []int `json:"exclude_gpu_indices"` - Loader string `json:"loader"` - Profile string `json:"profile"` - DisplayName string `json:"display_name"` + Duration int `json:"duration"` + DiagLevel int `json:"diag_level"` + GPUIndices []int `json:"gpu_indices"` + ExcludeGPUIndices []int `json:"exclude_gpu_indices"` + Loader string `json:"loader"` + Profile string `json:"profile"` + DisplayName string `json:"display_name"` + PlatformComponents []string `json:"platform_components"` } if r.Body != nil { if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) { @@ -204,13 +205,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { Status: TaskPending, CreatedAt: time.Now(), params: taskParams{ - Duration: body.Duration, - DiagLevel: body.DiagLevel, - GPUIndices: body.GPUIndices, - ExcludeGPUIndices: body.ExcludeGPUIndices, - Loader: body.Loader, - BurnProfile: body.Profile, - DisplayName: body.DisplayName, + Duration: body.Duration, + DiagLevel: body.DiagLevel, + GPUIndices: body.GPUIndices, + ExcludeGPUIndices: body.ExcludeGPUIndices, + Loader: body.Loader, + BurnProfile: body.Profile, + DisplayName: body.DisplayName, + PlatformComponents: body.PlatformComponents, }, } if strings.TrimSpace(body.DisplayName) != "" { @@ -512,6 +514,26 @@ func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) { }) } +// ── GPU tools ───────────────────────────────────────────────────────────────── + +func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) { + type toolEntry struct { + ID string `json:"id"` + Available bool `json:"available"` + Vendor string `json:"vendor"` // "nvidia" | "amd" + } + _, nvidiaErr := os.Stat("/dev/nvidia0") + _, amdErr := os.Stat("/dev/kfd") + nvidiaUp := nvidiaErr == nil + amdUp := amdErr == nil + writeJSON(w, []toolEntry{ + {ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"}, + {ID: "john", Available: nvidiaUp, Vendor: "nvidia"}, + {ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"}, + {ID: "rvs", Available: amdUp, Vendor: "amd"}, + }) +} + // ── System ──────────────────────────────────────────────────────────────────── func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 4d0c220..32678d6 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -657,96 +657,210 @@ func renderSATCard(id, label, extra string) string { func renderBurn() string { return `
Tasks continue in the background — view progress in Tasks.
-Applied to all tests on this page. NVIDIA SAT on the Validate page still uses DCGM. NVIDIA GPU Stress on this page uses the selected stress loader for the preset duration.
-bee-gpu-burn runs on all detected NVIDIA GPUs by default. NCCL all_reduce_perf is useful for multi-GPU / interconnect load. Use exclusions only when one or more cards must be skipped.
Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate rocm-bandwidth-test snapshot. Missing tools reported as UNSUPPORTED.
stress-ng --vm writes and verifies memory patterns across all of RAM. Env: BEE_VM_STRESS_SECONDS (default 300), BEE_VM_STRESS_SIZE_MB (default 80%).
Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: BEE_SAT_STRESS_SECONDS (default 300), BEE_SAT_STRESS_MB (default auto).
Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.
- -Tests run on all GPUs in the system. Availability determined by driver status.
+Select which subsystems to stress. Each checked item runs as a separate task.
+ + + + +Repeated load+idle cycles. Detects cooling recovery failures and GPU throttle. Smoke: 2×90s. Acceptance: 4×300s.
+Load components:
+ + + + +