Files
bee/audit/internal/platform/platform_stress.go
Michael Chus 7c2a0135d2 feat(audit): add platform thermal cycling stress test
Runs CPU (stressapptest) + GPU stress simultaneously across multiple
load/idle cycles with varying idle durations (120s/60s/30s) to detect
cooling systems that fail to recover under repeated load.

Presets: smoke (~5 min), acceptance (~25 min), overnight (~100 min).
Outputs metrics.csv + summary.txt with per-cycle throttle and fan
spindown analysis, packed as tar.gz.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 21:57:33 +03:00

477 lines
12 KiB
Go

package platform
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"encoding/csv"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
)
// PlatformStressCycle defines one load+idle cycle of the thermal cycling test.
type PlatformStressCycle struct {
	LoadSec int // seconds of simultaneous CPU+GPU stress before the load is cut
	IdleSec int // seconds of idle monitoring after load cut (fan/temp recovery window)
}
// PlatformStressOptions controls the thermal cycling test.
type PlatformStressOptions struct {
	// Cycles is executed in order, one load+idle cycle per entry.
	// It must be non-empty; RunPlatformStress rejects an empty slice.
	Cycles []PlatformStressCycle
}
// platformStressRow is one second of telemetry.
type platformStressRow struct {
	ElapsedSec   float64 // seconds since the overall test start (not cycle start)
	Cycle        int     // 1-based cycle number
	Phase        string  // "load" | "idle"
	CPULoadPct   float64
	MaxCPUTempC  float64 // hottest "cpu"-group sensor this second
	MaxGPUTempC  float64 // max of "gpu"-group sensors and per-GPU probes
	SysPowerW    float64
	FanMinRPM    float64 // 0 means no fan telemetry was available
	FanMaxRPM    float64
	GPUThrottled bool // NOTE(review): never set by sampleToPlatformRow in this file — confirm a writer exists
}
// RunPlatformStress runs repeated load+idle thermal cycling.
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
// It writes metrics.csv and summary.txt into a per-run directory, packs
// them into a tar.gz under baseDir, removes the directory, and returns
// the archive path. Cancelling ctx stops after the current phase; data
// collected so far is still written and archived.
func (s *System) RunPlatformStress(
	ctx context.Context,
	baseDir string,
	opts PlatformStressOptions,
	logFunc func(string),
) (string, error) {
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if len(opts.Cycles) == 0 {
		return "", fmt.Errorf("no cycles defined")
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	stamp := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir run dir: %w", err)
	}
	vendor := s.DetectGPUVendor()
	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
	var rows []platformStressRow
	start := time.Now()
	var analyses []cycleAnalysis
	for i, cycle := range opts.Cycles {
		if ctx.Err() != nil {
			break // caller cancelled; keep whatever data we have
		}
		cycleNum := i + 1
		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
		// ── LOAD PHASE ───────────────────────────────────────────────────────
		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
		var wg sync.WaitGroup
		// CPU stress: stressapptest runs until loadCtx expires (SIGKILL).
		wg.Add(1)
		go func() {
			defer wg.Done()
			cpuCmd, err := buildCPUStressCmd(loadCtx)
			if err != nil {
				// NOTE(review): logFunc is called from this goroutine while the
				// main goroutine also logs samples — assumes logFunc is safe for
				// concurrent use; confirm with callers.
				logFunc("CPU stress: " + err.Error())
				return
			}
			_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
		}()
		// GPU stress: nil when no vendor tool is available (CPU-only cycle).
		wg.Add(1)
		go func() {
			defer wg.Done()
			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
			if gpuCmd == nil {
				return
			}
			_ = gpuCmd.Wait()
		}()
		// Sample telemetry once per second on this goroutine; collectPhase
		// blocks until loadCtx expires, so it doubles as the phase timer.
		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
		for _, r := range loadRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, loadRows...)
		loadCancel()
		wg.Wait()
		if len(loadRows) > 0 {
			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
		}
		// ── IDLE PHASE ───────────────────────────────────────────────────────
		// Load is already cut; just watch temps/fans recover for IdleSec.
		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
		for _, r := range idleRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, idleRows...)
		idleCancel()
		// Per-cycle analysis (peak temps/power, throttle, fan spindown).
		an := analyzePlatformCycle(loadRows, idleRows)
		analyses = append(analyses, an)
		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
	}
	// Persist the per-second metrics; a failed write invalidates the run,
	// so propagate it instead of silently returning success.
	if err := os.WriteFile(filepath.Join(runDir, "metrics.csv"), writePlatformCSV(rows), 0644); err != nil {
		return "", fmt.Errorf("write metrics.csv: %w", err)
	}
	// Build and persist the human-readable summary, echoing it to the log.
	summary := writePlatformSummary(opts, analyses)
	logFunc("--- Summary ---")
	for _, line := range strings.Split(summary, "\n") {
		if line != "" {
			logFunc(line)
		}
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	// Pack tar.gz and drop the loose directory (cleanup is best-effort —
	// the archive is already safely on disk).
	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
	if err := packPlatformDir(runDir, archivePath); err != nil {
		return "", fmt.Errorf("pack archive: %w", err)
	}
	_ = os.RemoveAll(runDir)
	return archivePath, nil
}
// collectPhase samples live metrics once per second and returns the
// accumulated rows when ctx expires. It blocks for the whole phase, so
// the caller can use it as the phase timer.
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
	tick := time.NewTicker(time.Second)
	defer tick.Stop()
	var out []platformStressRow
	for {
		select {
		case <-tick.C:
			out = append(out, sampleToPlatformRow(SampleLiveMetrics(), cycle, phase, testStart))
		case <-ctx.Done():
			return out
		}
	}
}
// sampleToPlatformRow converts one live metrics sample into a telemetry row,
// folding multi-sensor readings into per-domain maxima and fan min/max.
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
	row := platformStressRow{
		ElapsedSec: time.Since(testStart).Seconds(),
		Cycle:      cycle,
		Phase:      phase,
		CPULoadPct: s.CPULoadPct,
		SysPowerW:  s.PowerW,
	}
	// Hottest sensor per group ("cpu" / "gpu"); other groups are ignored.
	for _, probe := range s.Temps {
		if probe.Group == "cpu" && probe.Celsius > row.MaxCPUTempC {
			row.MaxCPUTempC = probe.Celsius
		}
		if probe.Group == "gpu" && probe.Celsius > row.MaxGPUTempC {
			row.MaxGPUTempC = probe.Celsius
		}
	}
	// Per-GPU telemetry may read hotter than the sensor groups; take the max.
	for _, gpu := range s.GPUs {
		if gpu.TempC > row.MaxGPUTempC {
			row.MaxGPUTempC = gpu.TempC
		}
	}
	// Min/max RPM across all fans; both stay 0 when there is no fan telemetry.
	for idx, fan := range s.Fans {
		if idx == 0 || fan.RPM < row.FanMinRPM {
			row.FanMinRPM = fan.RPM
		}
		if idx == 0 || fan.RPM > row.FanMaxRPM {
			row.FanMaxRPM = fan.RPM
		}
	}
	// NOTE(review): GPUThrottled is never set here, so downstream throttle
	// detection always sees false — confirm whether LiveMetricSample carries
	// a throttle flag that should be mapped.
	return row
}
// formatPlatformRow renders one telemetry row as a single log line,
// appending fan range and THROTTLE markers only when present.
func formatPlatformRow(r platformStressRow) string {
	var extra strings.Builder
	if r.FanMinRPM > 0 {
		fmt.Fprintf(&extra, " fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
	}
	if r.GPUThrottled {
		extra.WriteString(" THROTTLE")
	}
	return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s",
		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, extra.String())
}
// analyzePlatformCycle summarizes one cycle's load and idle telemetry:
// peak temps/power and throttle flags over the load phase, plus a fan
// spindown analysis comparing the RPM at load cut against the minimum
// RPM seen in the first 15 seconds of idle. Rows are assumed to be in
// chronological order (they are appended one per second by collectPhase).
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
	var an cycleAnalysis
	// Peaks and throttle detection over the load phase only.
	for _, r := range loadRows {
		if r.MaxCPUTempC > an.maxCPUTemp {
			an.maxCPUTemp = r.MaxCPUTempC
		}
		if r.MaxGPUTempC > an.maxGPUTemp {
			an.maxGPUTemp = r.MaxGPUTempC
		}
		if r.SysPowerW > an.maxPower {
			an.maxPower = r.SysPowerW
		}
		if r.GPUThrottled {
			an.throttled = true
		}
	}
	// Fan RPM at cut = avg of last 5 load rows
	// (rows without fan telemetry, FanMinRPM == 0, are excluded from the avg).
	if n := len(loadRows); n > 0 {
		window := loadRows
		if n > 5 {
			window = loadRows[n-5:]
		}
		var sum float64
		var cnt int
		for _, r := range window {
			if r.FanMinRPM > 0 {
				sum += (r.FanMinRPM + r.FanMaxRPM) / 2
				cnt++
			}
		}
		if cnt > 0 {
			an.fanAtCutAvg = sum / float64(cnt)
		}
	}
	// Fan RPM min in first 15s of idle
	// Seed with the at-cut average so a cycle with no idle fan data reports
	// zero drop rather than a 100% drop.
	an.fanMin15s = an.fanAtCutAvg
	var cutElapsed float64
	if len(loadRows) > 0 {
		cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
	}
	for _, r := range idleRows {
		// ElapsedSec is measured from test start, so the 15s window is
		// anchored at the last load sample's timestamp.
		if r.ElapsedSec > cutElapsed+15 {
			break
		}
		avg := (r.FanMinRPM + r.FanMaxRPM) / 2
		// avg > 0 skips rows without fan telemetry; the == 0 clause lets a
		// real reading replace a zero seed (no fan data during load).
		if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
			an.fanMin15s = avg
		}
	}
	// Percentage drop from the at-cut average; negative if fans sped up.
	if an.fanAtCutAvg > 0 {
		an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
	}
	return an
}
// cycleAnalysis is the per-cycle result computed by analyzePlatformCycle
// and rendered by writePlatformSummary.
type cycleAnalysis struct {
	maxCPUTemp  float64 // peak CPU temp during the load phase (°C)
	maxGPUTemp  float64 // peak GPU temp during the load phase (°C)
	maxPower    float64 // peak system power during the load phase (W)
	throttled   bool    // true if any load-phase row had GPUThrottled set
	fanAtCutAvg float64 // avg fan RPM over the last 5 load samples (0 = no fan data)
	fanMin15s   float64 // lowest avg fan RPM within 15s after load cut
	fanDropPct  float64 // percent drop from fanAtCutAvg to fanMin15s
}
// writePlatformSummary renders the per-cycle report plus an overall
// PASS/WARN/FAIL verdict. analyses may be shorter than opts.Cycles when
// the run was cancelled early; only completed cycles are reported.
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
	var sb strings.Builder
	rule := strings.Repeat("=", 48)
	fmt.Fprintf(&sb, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
	fmt.Fprintf(&sb, "%s\n\n", rule)
	throttledCycles, fanWarnCycles := 0, 0
	for i := range analyses {
		an := analyses[i]
		cyc := opts.Cycles[i]
		fmt.Fprintf(&sb, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cyc.LoadSec, cyc.IdleSec)
		fmt.Fprintf(&sb, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
		fmt.Fprintf(&sb, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
		fmt.Fprintf(&sb, " Max sys power: %.0f W\n", an.maxPower)
		if an.throttled {
			sb.WriteString(" Throttle: DETECTED\n")
			throttledCycles++
		} else {
			sb.WriteString(" Throttle: none\n")
		}
		// Fan lines only appear when fan telemetry existed for the cycle.
		if an.fanAtCutAvg > 0 {
			fmt.Fprintf(&sb, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
			fmt.Fprintf(&sb, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
			if an.fanDropPct > 20 {
				sb.WriteString(" Fan response: WARN — fast spindown (>20% drop in 15s)\n")
				fanWarnCycles++
			} else {
				sb.WriteString(" Fan response: OK\n")
			}
		}
		sb.WriteString("\n")
	}
	fmt.Fprintf(&sb, "%s\n", rule)
	// Throttle anywhere is a hard FAIL; fast fan spindown alone is a WARN.
	switch {
	case throttledCycles > 0:
		fmt.Fprintf(&sb, "Overall: FAIL — throttle detected in %d/%d cycles\n", throttledCycles, len(analyses))
	case fanWarnCycles > 0:
		fmt.Fprintf(&sb, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", fanWarnCycles, len(analyses))
	default:
		sb.WriteString("Overall: PASS\n")
	}
	return sb.String()
}
// writePlatformCSV serializes all telemetry rows to CSV bytes, header first.
func writePlatformCSV(rows []platformStressRow) []byte {
	var out bytes.Buffer
	w := csv.NewWriter(&out)
	// Writes into a bytes.Buffer cannot fail, so csv write errors are ignored.
	_ = w.Write([]string{
		"elapsed_sec", "cycle", "phase",
		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
	})
	ff := func(v float64, prec int) string { return strconv.FormatFloat(v, 'f', prec, 64) }
	for i := range rows {
		r := &rows[i]
		flag := "0"
		if r.GPUThrottled {
			flag = "1"
		}
		_ = w.Write([]string{
			ff(r.ElapsedSec, 1),
			strconv.Itoa(r.Cycle),
			r.Phase,
			ff(r.CPULoadPct, 1),
			ff(r.MaxCPUTempC, 1),
			ff(r.MaxGPUTempC, 1),
			ff(r.SysPowerW, 1),
			ff(r.FanMinRPM, 0),
			ff(r.FanMaxRPM, 0),
			flag,
		})
	}
	w.Flush()
	return out.Bytes()
}
// buildCPUStressCmd locates stressapptest and starts it; the returned
// command runs until ctx is cancelled (CommandContext kills the process).
// Output is discarded (Stdout/Stderr left at their nil defaults).
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
	bin, err := satLookPath("stressapptest")
	if err != nil {
		return nil, fmt.Errorf("stressapptest not found: %w", err)
	}
	// Ask for a very long run (24h); ctx cancellation ends it at the
	// right time instead.
	cmd := exec.CommandContext(ctx, bin, "-s", "86400", "-W", "--cc_test")
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("stressapptest start: %w", err)
	}
	return cmd, nil
}
// buildGPUStressCmd starts a GPU stress command matching the detected
// vendor (case-insensitive). It returns nil when no GPU stress tool is
// available, in which case the cycle runs CPU-only — still useful.
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
	switch strings.ToLower(vendor) {
	case "amd":
		return buildAMDGPUStressCmd(ctx)
	case "nvidia":
		return buildNvidiaGPUStressCmd(ctx)
	default:
		return nil
	}
}
// buildAMDGPUStressCmd starts an rvs GST (GPU stress test) run configured
// for indefinite duration; ctx cancellation kills it. Returns nil when rvs
// is unavailable or the process cannot be launched, signalling the caller
// to proceed with CPU-only cycling.
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
	rvsArgs, err := resolveRVSCommand()
	if err != nil {
		return nil
	}
	rvsPath := rvsArgs[0]
	cfg := `actions:
- name: gst_platform
  device: all
  module: gst
  parallel: true
  duration: 86400000
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
  matrix_size_b: 8640
  matrix_size_c: 8640
`
	// Unique temp file instead of a fixed /tmp path: avoids clobbering
	// between concurrent runs and symlink attacks on a predictable name.
	tmp, err := os.CreateTemp("", "bee-platform-gst-*.conf")
	if err != nil {
		return nil
	}
	if _, err := tmp.WriteString(cfg); err != nil {
		_ = tmp.Close()
		_ = os.Remove(tmp.Name())
		return nil
	}
	if err := tmp.Close(); err != nil {
		_ = os.Remove(tmp.Name())
		return nil
	}
	cmd := exec.CommandContext(ctx, rvsPath, "-c", tmp.Name())
	if err := cmd.Start(); err != nil {
		// Failed launch == no GPU stress; clean up and let the caller fall
		// back to CPU-only (previously a non-started cmd was returned).
		_ = os.Remove(tmp.Name())
		return nil
	}
	return cmd
}
// buildNvidiaGPUStressCmd starts the bee-gpu-stress helper for an
// effectively unlimited duration; ctx cancellation kills it. Returns nil
// when the tool is missing or fails to start, so the caller falls back to
// CPU-only cycling (previously a non-started *exec.Cmd was returned on a
// Start failure, which made the caller's Wait error out immediately).
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
	path, err := satLookPath("bee-gpu-stress")
	if err != nil {
		return nil
	}
	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
	if err := cmd.Start(); err != nil {
		return nil
	}
	return cmd
}
func packPlatformDir(dir, dest string) error {
f, err := os.Create(dest)
if err != nil {
return err
}
defer f.Close()
gz := gzip.NewWriter(f)
defer gz.Close()
tw := tar.NewWriter(gz)
defer tw.Close()
entries, err := os.ReadDir(dir)
if err != nil {
return err
}
base := filepath.Base(dir)
for _, e := range entries {
if e.IsDir() {
continue
}
fpath := filepath.Join(dir, e.Name())
data, err := os.ReadFile(fpath)
if err != nil {
continue
}
hdr := &tar.Header{
Name: filepath.Join(base, e.Name()),
Size: int64(len(data)),
Mode: 0644,
ModTime: time.Now(),
}
if err := tw.WriteHeader(hdr); err != nil {
return err
}
if _, err := tw.Write(data); err != nil {
return err
}
}
return nil
}