From 7c2a0135d2e12e3b043d857cc1aad748db7295c8 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 29 Mar 2026 21:57:33 +0300 Subject: [PATCH] feat(audit): add platform thermal cycling stress test Runs CPU (stressapptest) + GPU stress simultaneously across multiple load/idle cycles with varying idle durations (120s/60s/30s) to detect cooling systems that fail to recover under repeated load. Presets: smoke (~5 min), acceptance (~25 min), overnight (~100 min). Outputs metrics.csv + summary.txt with per-cycle throttle and fan spindown analysis, packed as tar.gz. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/app/app.go | 8 + audit/internal/app/app_test.go | 4 + audit/internal/platform/platform_stress.go | 476 +++++++++++++++++++++ audit/internal/webui/pages.go | 4 + audit/internal/webui/server.go | 1 + audit/internal/webui/tasks.go | 35 +- 6 files changed, 527 insertions(+), 1 deletion(-) create mode 100644 audit/internal/platform/platform_stress.go diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 8d46181..52da447 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -120,6 +120,7 @@ type satRunner interface { RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) + RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) } @@ -627,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor return a.sat.RunFanStressTest(ctx, baseDir, opts) } +func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc 
func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) +} + func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil) body := "Results: " + path diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 93d1b7f..38901b9 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -203,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr return "", nil } +func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) { + return "", nil +} + func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) { return "", nil } diff --git a/audit/internal/platform/platform_stress.go b/audit/internal/platform/platform_stress.go new file mode 100644 index 0000000..06cd3b6 --- /dev/null +++ b/audit/internal/platform/platform_stress.go @@ -0,0 +1,476 @@ +package platform + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "context" + "encoding/csv" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "time" +) + +// PlatformStressCycle defines one load+idle cycle. +type PlatformStressCycle struct { + LoadSec int // seconds of simultaneous CPU+GPU stress + IdleSec int // seconds of idle monitoring after load cut +} + +// PlatformStressOptions controls the thermal cycling test. +type PlatformStressOptions struct { + Cycles []PlatformStressCycle +} + +// platformStressRow is one second of telemetry. 
+type platformStressRow struct { + ElapsedSec float64 + Cycle int + Phase string // "load" | "idle" + CPULoadPct float64 + MaxCPUTempC float64 + MaxGPUTempC float64 + SysPowerW float64 + FanMinRPM float64 + FanMaxRPM float64 + GPUThrottled bool +} + +// RunPlatformStress runs repeated load+idle thermal cycling. +// Each cycle starts CPU (stressapptest) and GPU stress simultaneously, +// runs for LoadSec, then cuts load abruptly and monitors for IdleSec. +func (s *System) RunPlatformStress( + ctx context.Context, + baseDir string, + opts PlatformStressOptions, + logFunc func(string), +) (string, error) { + if logFunc == nil { + logFunc = func(string) {} + } + if len(opts.Cycles) == 0 { + return "", fmt.Errorf("no cycles defined") + } + if err := os.MkdirAll(baseDir, 0755); err != nil { + return "", fmt.Errorf("mkdir %s: %w", baseDir, err) + } + + stamp := time.Now().UTC().Format("20060102-150405") + runDir := filepath.Join(baseDir, "platform-stress-"+stamp) + if err := os.MkdirAll(runDir, 0755); err != nil { + return "", fmt.Errorf("mkdir run dir: %w", err) + } + + vendor := s.DetectGPUVendor() + logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor)) + + var rows []platformStressRow + start := time.Now() + + var analyses []cycleAnalysis + + for i, cycle := range opts.Cycles { + if ctx.Err() != nil { + break + } + cycleNum := i + 1 + logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)) + + // ── LOAD PHASE ─────────────────────────────────────────────────────── + loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second) + var wg sync.WaitGroup + + // CPU stress + wg.Add(1) + go func() { + defer wg.Done() + cpuCmd, err := buildCPUStressCmd(loadCtx) + if err != nil { + logFunc("CPU stress: " + err.Error()) + return + } + _ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL) + }() + + // GPU stress + wg.Add(1) + go 
func() { + defer wg.Done() + gpuCmd := buildGPUStressCmd(loadCtx, vendor) + if gpuCmd == nil { + return + } + _ = gpuCmd.Wait() + }() + + // Monitoring goroutine for load phase + loadRows := collectPhase(loadCtx, cycleNum, "load", start) + for _, r := range loadRows { + logFunc(formatPlatformRow(r)) + } + rows = append(rows, loadRows...) + loadCancel() + wg.Wait() + + if len(loadRows) > 0 { + logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec)) + } + + // ── IDLE PHASE ─────────────────────────────────────────────────────── + idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second) + idleRows := collectPhase(idleCtx, cycleNum, "idle", start) + for _, r := range idleRows { + logFunc(formatPlatformRow(r)) + } + rows = append(rows, idleRows...) + idleCancel() + + // Per-cycle analysis + an := analyzePlatformCycle(loadRows, idleRows) + analyses = append(analyses, an) + logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%", + cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct)) + } + + // Write CSV + csvData := writePlatformCSV(rows) + _ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644) + + // Write summary + summary := writePlatformSummary(opts, analyses) + logFunc("--- Summary ---") + for _, line := range strings.Split(summary, "\n") { + if line != "" { + logFunc(line) + } + } + _ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644) + + // Pack tar.gz + archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz") + if err := packPlatformDir(runDir, archivePath); err != nil { + return "", fmt.Errorf("pack archive: %w", err) + } + _ = os.RemoveAll(runDir) + return archivePath, nil +} + +// collectPhase samples live metrics every second until ctx is done. 
+func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow { + var rows []platformStressRow + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return rows + case <-ticker.C: + sample := SampleLiveMetrics() + rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart)) + } + } +} + +func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow { + r := platformStressRow{ + ElapsedSec: time.Since(testStart).Seconds(), + Cycle: cycle, + Phase: phase, + CPULoadPct: s.CPULoadPct, + SysPowerW: s.PowerW, + } + for _, t := range s.Temps { + switch t.Group { + case "cpu": + if t.Celsius > r.MaxCPUTempC { + r.MaxCPUTempC = t.Celsius + } + case "gpu": + if t.Celsius > r.MaxGPUTempC { + r.MaxGPUTempC = t.Celsius + } + } + } + for _, g := range s.GPUs { + if g.TempC > r.MaxGPUTempC { + r.MaxGPUTempC = g.TempC + } + } + if len(s.Fans) > 0 { + r.FanMinRPM = s.Fans[0].RPM + r.FanMaxRPM = s.Fans[0].RPM + for _, f := range s.Fans[1:] { + if f.RPM < r.FanMinRPM { + r.FanMinRPM = f.RPM + } + if f.RPM > r.FanMaxRPM { + r.FanMaxRPM = f.RPM + } + } + } + return r +} + +func formatPlatformRow(r platformStressRow) string { + throttle := "" + if r.GPUThrottled { + throttle = " THROTTLE" + } + fans := "" + if r.FanMinRPM > 0 { + fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM) + } + return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s", + r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle) +} + +func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis { + var an cycleAnalysis + for _, r := range loadRows { + if r.MaxCPUTempC > an.maxCPUTemp { + an.maxCPUTemp = r.MaxCPUTempC + } + if r.MaxGPUTempC > an.maxGPUTemp { + an.maxGPUTemp = r.MaxGPUTempC + } + if r.SysPowerW > an.maxPower { + an.maxPower = 
r.SysPowerW + } + if r.GPUThrottled { + an.throttled = true + } + } + // Fan RPM at cut = avg of last 5 load rows + if n := len(loadRows); n > 0 { + window := loadRows + if n > 5 { + window = loadRows[n-5:] + } + var sum float64 + var cnt int + for _, r := range window { + if r.FanMinRPM > 0 { + sum += (r.FanMinRPM + r.FanMaxRPM) / 2 + cnt++ + } + } + if cnt > 0 { + an.fanAtCutAvg = sum / float64(cnt) + } + } + // Fan RPM min in first 15s of idle + an.fanMin15s = an.fanAtCutAvg + var cutElapsed float64 + if len(loadRows) > 0 { + cutElapsed = loadRows[len(loadRows)-1].ElapsedSec + } + for _, r := range idleRows { + if r.ElapsedSec > cutElapsed+15 { + break + } + avg := (r.FanMinRPM + r.FanMaxRPM) / 2 + if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) { + an.fanMin15s = avg + } + } + if an.fanAtCutAvg > 0 { + an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100 + } + return an +} + +type cycleAnalysis struct { + maxCPUTemp float64 + maxGPUTemp float64 + maxPower float64 + throttled bool + fanAtCutAvg float64 + fanMin15s float64 + fanDropPct float64 +} + +func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string { + var b strings.Builder + fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles)) + fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48)) + + totalThrottle := 0 + totalFanWarn := 0 + for i, an := range analyses { + cycle := opts.Cycles[i] + fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec) + fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp) + fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp) + fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower) + if an.throttled { + fmt.Fprintf(&b, " Throttle: DETECTED\n") + totalThrottle++ + } else { + fmt.Fprintf(&b, " Throttle: none\n") + } + if an.fanAtCutAvg > 0 { + fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg) + fmt.Fprintf(&b, " Fan min (first 15s 
idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct) + if an.fanDropPct > 20 { + fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n") + totalFanWarn++ + } else { + fmt.Fprintf(&b, " Fan response: OK\n") + } + } + b.WriteString("\n") + } + + fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48)) + if totalThrottle > 0 { + fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses)) + } else if totalFanWarn > 0 { + fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses)) + } else { + fmt.Fprintf(&b, "Overall: PASS\n") + } + return b.String() +} + +func writePlatformCSV(rows []platformStressRow) []byte { + var buf bytes.Buffer + w := csv.NewWriter(&buf) + _ = w.Write([]string{ + "elapsed_sec", "cycle", "phase", + "cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c", + "sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled", + }) + for _, r := range rows { + throttled := "0" + if r.GPUThrottled { + throttled = "1" + } + _ = w.Write([]string{ + strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64), + strconv.Itoa(r.Cycle), + r.Phase, + strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64), + strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64), + strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64), + strconv.FormatFloat(r.SysPowerW, 'f', 1, 64), + strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64), + strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64), + throttled, + }) + } + w.Flush() + return buf.Bytes() +} + +// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled. +func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) { + path, err := satLookPath("stressapptest") + if err != nil { + return nil, fmt.Errorf("stressapptest not found: %w", err) + } + // Use a very long duration; the context timeout will kill it at the right time. 
+ cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test") + cmd.Stdout = nil + cmd.Stderr = nil + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("stressapptest start: %w", err) + } + return cmd, nil +} + +// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor. +// Returns nil if no GPU stress tool is available (CPU-only cycling still useful). +func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd { + switch strings.ToLower(vendor) { + case "amd": + return buildAMDGPUStressCmd(ctx) + case "nvidia": + return buildNvidiaGPUStressCmd(ctx) + } + return nil +} + +func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd { + rvsArgs, err := resolveRVSCommand() + if err != nil { + return nil + } + rvsPath := rvsArgs[0] + cfg := `actions: +- name: gst_platform + device: all + module: gst + parallel: true + duration: 86400000 + copy_matrix: false + target_stress: 90 + matrix_size_a: 8640 + matrix_size_b: 8640 + matrix_size_c: 8640 +` + cfgFile := "/tmp/bee-platform-gst.conf" + _ = os.WriteFile(cfgFile, []byte(cfg), 0644) + cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile) + cmd.Stdout = nil + cmd.Stderr = nil + _ = cmd.Start() + return cmd +} + +func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd { + path, err := satLookPath("bee-gpu-stress") + if err != nil { + return nil + } + cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64") + cmd.Stdout = nil + cmd.Stderr = nil + _ = cmd.Start() + return cmd +} + +func packPlatformDir(dir, dest string) error { + f, err := os.Create(dest) + if err != nil { + return err + } + defer f.Close() + gz := gzip.NewWriter(f) + defer gz.Close() + tw := tar.NewWriter(gz) + defer tw.Close() + + entries, err := os.ReadDir(dir) + if err != nil { + return err + } + base := filepath.Base(dir) + for _, e := range entries { + if e.IsDir() { + continue + } + fpath := filepath.Join(dir, e.Name()) + data, err := os.ReadFile(fpath) + if err != 
nil { + continue + } + hdr := &tar.Header{ + Name: filepath.Join(base, e.Name()), + Size: int64(len(data)), + Mode: 0644, + ModTime: time.Now(), + } + if err := tw.WriteHeader(hdr); err != nil { + return err + } + if _, err := tw.Write(data); err != nil { + return err + } + } + return nil +} diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 00378e8..924035e 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -615,6 +615,10 @@ func renderBurn() string {

Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: BEE_SAT_STRESS_SECONDS (default 300), BEE_SAT_STRESS_MB (default auto).

+
Platform Thermal Cycling
+

Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying idle durations. Detects cooling systems that fail to recover under repeated load. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min. Overnight: ~100 min.

+ +