Runs CPU (stressapptest) + GPU stress simultaneously across multiple load/idle cycles with varying idle durations (120s/60s/30s) to detect cooling systems that fail to recover under repeated load. Presets: smoke (~5 min), acceptance (~25 min), overnight (~100 min). Outputs metrics.csv + summary.txt with per-cycle throttle and fan spindown analysis, packed as tar.gz. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
477 lines
12 KiB
Go
package platform
|
|
|
|
import (
|
|
"archive/tar"
|
|
"bytes"
|
|
"compress/gzip"
|
|
"context"
|
|
"encoding/csv"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// PlatformStressCycle defines one load+idle cycle: a period of simultaneous
// CPU+GPU stress followed by an idle monitoring window used to observe
// cooling recovery (fan spindown, temperature decay).
type PlatformStressCycle struct {
	LoadSec int // seconds of simultaneous CPU+GPU stress
	IdleSec int // seconds of idle monitoring after load cut
}
|
|
|
|
// PlatformStressOptions controls the thermal cycling test.
// Cycles are executed in order; at least one cycle is required.
type PlatformStressOptions struct {
	Cycles []PlatformStressCycle // ordered load+idle cycles to run
}
|
|
|
|
// platformStressRow is one second of telemetry.
type platformStressRow struct {
	ElapsedSec float64 // seconds since overall test start (spans all cycles)
	Cycle      int     // 1-based cycle number
	Phase      string  // "load" | "idle"
	CPULoadPct float64 // overall CPU utilization, percent
	MaxCPUTempC float64 // hottest CPU sensor this second, °C
	MaxGPUTempC float64 // hottest GPU sensor this second, °C
	SysPowerW  float64 // system power draw, watts
	FanMinRPM  float64 // slowest fan this second (0 when no fan data)
	FanMaxRPM  float64 // fastest fan this second
	// NOTE(review): GPUThrottled is written to the CSV and drives the
	// PASS/FAIL verdict, but sampleToPlatformRow never sets it — confirm
	// where throttle state is meant to come from.
	GPUThrottled bool
}
|
|
|
|
// RunPlatformStress runs repeated load+idle thermal cycling.
|
|
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
|
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
|
func (s *System) RunPlatformStress(
|
|
ctx context.Context,
|
|
baseDir string,
|
|
opts PlatformStressOptions,
|
|
logFunc func(string),
|
|
) (string, error) {
|
|
if logFunc == nil {
|
|
logFunc = func(string) {}
|
|
}
|
|
if len(opts.Cycles) == 0 {
|
|
return "", fmt.Errorf("no cycles defined")
|
|
}
|
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
|
}
|
|
|
|
stamp := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
|
}
|
|
|
|
vendor := s.DetectGPUVendor()
|
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
|
|
|
var rows []platformStressRow
|
|
start := time.Now()
|
|
|
|
var analyses []cycleAnalysis
|
|
|
|
for i, cycle := range opts.Cycles {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
cycleNum := i + 1
|
|
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
|
|
|
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
|
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
|
var wg sync.WaitGroup
|
|
|
|
// CPU stress
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
|
if err != nil {
|
|
logFunc("CPU stress: " + err.Error())
|
|
return
|
|
}
|
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
|
}()
|
|
|
|
// GPU stress
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
|
if gpuCmd == nil {
|
|
return
|
|
}
|
|
_ = gpuCmd.Wait()
|
|
}()
|
|
|
|
// Monitoring goroutine for load phase
|
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
|
for _, r := range loadRows {
|
|
logFunc(formatPlatformRow(r))
|
|
}
|
|
rows = append(rows, loadRows...)
|
|
loadCancel()
|
|
wg.Wait()
|
|
|
|
if len(loadRows) > 0 {
|
|
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
|
}
|
|
|
|
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
|
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
|
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
|
for _, r := range idleRows {
|
|
logFunc(formatPlatformRow(r))
|
|
}
|
|
rows = append(rows, idleRows...)
|
|
idleCancel()
|
|
|
|
// Per-cycle analysis
|
|
an := analyzePlatformCycle(loadRows, idleRows)
|
|
analyses = append(analyses, an)
|
|
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
|
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
|
}
|
|
|
|
// Write CSV
|
|
csvData := writePlatformCSV(rows)
|
|
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
|
|
|
// Write summary
|
|
summary := writePlatformSummary(opts, analyses)
|
|
logFunc("--- Summary ---")
|
|
for _, line := range strings.Split(summary, "\n") {
|
|
if line != "" {
|
|
logFunc(line)
|
|
}
|
|
}
|
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
|
|
|
// Pack tar.gz
|
|
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
|
if err := packPlatformDir(runDir, archivePath); err != nil {
|
|
return "", fmt.Errorf("pack archive: %w", err)
|
|
}
|
|
_ = os.RemoveAll(runDir)
|
|
return archivePath, nil
|
|
}
|
|
|
|
// collectPhase samples live metrics every second until ctx is done.
|
|
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
|
var rows []platformStressRow
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return rows
|
|
case <-ticker.C:
|
|
sample := SampleLiveMetrics()
|
|
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
|
}
|
|
}
|
|
}
|
|
|
|
// sampleToPlatformRow converts one LiveMetricSample into a telemetry row,
// reducing per-sensor readings to per-second maxima (temps) and min/max (fans).
// NOTE(review): the row's GPUThrottled field is never set here although the
// CSV and summary consume it — confirm whether LiveMetricSample exposes
// throttle state that should be copied in.
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
	r := platformStressRow{
		ElapsedSec: time.Since(testStart).Seconds(),
		Cycle:      cycle,
		Phase:      phase,
		CPULoadPct: s.CPULoadPct,
		SysPowerW:  s.PowerW,
	}
	// Hottest CPU/GPU reading across all temperature sensors, grouped by
	// the sensor's "cpu"/"gpu" tag. Other groups are ignored.
	for _, t := range s.Temps {
		switch t.Group {
		case "cpu":
			if t.Celsius > r.MaxCPUTempC {
				r.MaxCPUTempC = t.Celsius
			}
		case "gpu":
			if t.Celsius > r.MaxGPUTempC {
				r.MaxGPUTempC = t.Celsius
			}
		}
	}
	// Dedicated GPU telemetry may report hotter than the generic sensors;
	// take the overall maximum.
	for _, g := range s.GPUs {
		if g.TempC > r.MaxGPUTempC {
			r.MaxGPUTempC = g.TempC
		}
	}
	// Min/max fan RPM across all fans; both stay 0 when no fans reported,
	// which downstream code treats as "no fan data".
	if len(s.Fans) > 0 {
		r.FanMinRPM = s.Fans[0].RPM
		r.FanMaxRPM = s.Fans[0].RPM
		for _, f := range s.Fans[1:] {
			if f.RPM < r.FanMinRPM {
				r.FanMinRPM = f.RPM
			}
			if f.RPM > r.FanMaxRPM {
				r.FanMaxRPM = f.RPM
			}
		}
	}
	return r
}
|
|
|
|
func formatPlatformRow(r platformStressRow) string {
|
|
throttle := ""
|
|
if r.GPUThrottled {
|
|
throttle = " THROTTLE"
|
|
}
|
|
fans := ""
|
|
if r.FanMinRPM > 0 {
|
|
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
|
}
|
|
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
|
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
|
}
|
|
|
|
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
|
var an cycleAnalysis
|
|
for _, r := range loadRows {
|
|
if r.MaxCPUTempC > an.maxCPUTemp {
|
|
an.maxCPUTemp = r.MaxCPUTempC
|
|
}
|
|
if r.MaxGPUTempC > an.maxGPUTemp {
|
|
an.maxGPUTemp = r.MaxGPUTempC
|
|
}
|
|
if r.SysPowerW > an.maxPower {
|
|
an.maxPower = r.SysPowerW
|
|
}
|
|
if r.GPUThrottled {
|
|
an.throttled = true
|
|
}
|
|
}
|
|
// Fan RPM at cut = avg of last 5 load rows
|
|
if n := len(loadRows); n > 0 {
|
|
window := loadRows
|
|
if n > 5 {
|
|
window = loadRows[n-5:]
|
|
}
|
|
var sum float64
|
|
var cnt int
|
|
for _, r := range window {
|
|
if r.FanMinRPM > 0 {
|
|
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
|
cnt++
|
|
}
|
|
}
|
|
if cnt > 0 {
|
|
an.fanAtCutAvg = sum / float64(cnt)
|
|
}
|
|
}
|
|
// Fan RPM min in first 15s of idle
|
|
an.fanMin15s = an.fanAtCutAvg
|
|
var cutElapsed float64
|
|
if len(loadRows) > 0 {
|
|
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
|
}
|
|
for _, r := range idleRows {
|
|
if r.ElapsedSec > cutElapsed+15 {
|
|
break
|
|
}
|
|
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
|
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
|
an.fanMin15s = avg
|
|
}
|
|
}
|
|
if an.fanAtCutAvg > 0 {
|
|
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
|
}
|
|
return an
|
|
}
|
|
|
|
// cycleAnalysis holds the condensed result of one load+idle cycle, produced
// by analyzePlatformCycle and consumed by writePlatformSummary.
type cycleAnalysis struct {
	maxCPUTemp  float64 // hottest CPU reading during the load phase, °C
	maxGPUTemp  float64 // hottest GPU reading during the load phase, °C
	maxPower    float64 // peak system power during the load phase, W
	throttled   bool    // true if any load-phase sample reported GPU throttling
	fanAtCutAvg float64 // avg fan RPM over the last 5 load samples (0 = no fan data)
	fanMin15s   float64 // lowest avg fan RPM within 15s after load cut
	fanDropPct  float64 // percent drop from fanAtCutAvg to fanMin15s
}
|
|
|
|
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
|
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
|
|
|
totalThrottle := 0
|
|
totalFanWarn := 0
|
|
for i, an := range analyses {
|
|
cycle := opts.Cycles[i]
|
|
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
|
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
|
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
|
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
|
if an.throttled {
|
|
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
|
totalThrottle++
|
|
} else {
|
|
fmt.Fprintf(&b, " Throttle: none\n")
|
|
}
|
|
if an.fanAtCutAvg > 0 {
|
|
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
|
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
|
if an.fanDropPct > 20 {
|
|
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
|
totalFanWarn++
|
|
} else {
|
|
fmt.Fprintf(&b, " Fan response: OK\n")
|
|
}
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
|
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
|
if totalThrottle > 0 {
|
|
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
|
} else if totalFanWarn > 0 {
|
|
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
|
} else {
|
|
fmt.Fprintf(&b, "Overall: PASS\n")
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func writePlatformCSV(rows []platformStressRow) []byte {
|
|
var buf bytes.Buffer
|
|
w := csv.NewWriter(&buf)
|
|
_ = w.Write([]string{
|
|
"elapsed_sec", "cycle", "phase",
|
|
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
|
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
|
})
|
|
for _, r := range rows {
|
|
throttled := "0"
|
|
if r.GPUThrottled {
|
|
throttled = "1"
|
|
}
|
|
_ = w.Write([]string{
|
|
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
|
strconv.Itoa(r.Cycle),
|
|
r.Phase,
|
|
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
|
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
|
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
|
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
|
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
|
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
|
throttled,
|
|
})
|
|
}
|
|
w.Flush()
|
|
return buf.Bytes()
|
|
}
|
|
|
|
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
|
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|
path, err := satLookPath("stressapptest")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
|
}
|
|
// Use a very long duration; the context timeout will kill it at the right time.
|
|
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
|
cmd.Stdout = nil
|
|
cmd.Stderr = nil
|
|
if err := cmd.Start(); err != nil {
|
|
return nil, fmt.Errorf("stressapptest start: %w", err)
|
|
}
|
|
return cmd, nil
|
|
}
|
|
|
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
|
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
|
switch strings.ToLower(vendor) {
|
|
case "amd":
|
|
return buildAMDGPUStressCmd(ctx)
|
|
case "nvidia":
|
|
return buildNvidiaGPUStressCmd(ctx)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|
rvsArgs, err := resolveRVSCommand()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
rvsPath := rvsArgs[0]
|
|
cfg := `actions:
|
|
- name: gst_platform
|
|
device: all
|
|
module: gst
|
|
parallel: true
|
|
duration: 86400000
|
|
copy_matrix: false
|
|
target_stress: 90
|
|
matrix_size_a: 8640
|
|
matrix_size_b: 8640
|
|
matrix_size_c: 8640
|
|
`
|
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
|
cmd.Stdout = nil
|
|
cmd.Stderr = nil
|
|
_ = cmd.Start()
|
|
return cmd
|
|
}
|
|
|
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|
path, err := satLookPath("bee-gpu-stress")
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
|
cmd.Stdout = nil
|
|
cmd.Stderr = nil
|
|
_ = cmd.Start()
|
|
return cmd
|
|
}
|
|
|
|
func packPlatformDir(dir, dest string) error {
|
|
f, err := os.Create(dest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
gz := gzip.NewWriter(f)
|
|
defer gz.Close()
|
|
tw := tar.NewWriter(gz)
|
|
defer tw.Close()
|
|
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
base := filepath.Base(dir)
|
|
for _, e := range entries {
|
|
if e.IsDir() {
|
|
continue
|
|
}
|
|
fpath := filepath.Join(dir, e.Name())
|
|
data, err := os.ReadFile(fpath)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
hdr := &tar.Header{
|
|
Name: filepath.Join(base, e.Name()),
|
|
Size: int64(len(data)),
|
|
Mode: 0644,
|
|
ModTime: time.Now(),
|
|
}
|
|
if err := tw.WriteHeader(hdr); err != nil {
|
|
return err
|
|
}
|
|
if _, err := tw.Write(data); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|