package platform

import (
	"archive/tar"
	"bytes"
	"compress/gzip"
	"context"
	"encoding/csv"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"
)

// PlatformStressCycle defines one load+idle cycle.
type PlatformStressCycle struct {
	LoadSec int // seconds of simultaneous CPU+GPU stress
	IdleSec int // seconds of idle monitoring after load cut
}

// PlatformStressOptions controls the thermal cycling test.
type PlatformStressOptions struct {
	Cycles []PlatformStressCycle
}

// platformStressRow is one telemetry sample (collected once per second).
type platformStressRow struct {
	ElapsedSec   float64 // seconds since the start of the whole test
	Cycle        int     // 1-based cycle number
	Phase        string  // "load" | "idle"
	CPULoadPct   float64
	MaxCPUTempC  float64
	MaxGPUTempC  float64
	SysPowerW    float64
	FanMinRPM    float64
	FanMaxRPM    float64
	GPUThrottled bool
}

// cycleAnalysis holds the per-cycle figures derived by analyzePlatformCycle.
type cycleAnalysis struct {
	maxCPUTemp  float64 // highest CPU temperature seen during load
	maxGPUTemp  float64 // highest GPU temperature seen during load
	maxPower    float64 // highest system power seen during load
	throttled   bool    // true if any load row had GPUThrottled set
	fanAtCutAvg float64 // average fan RPM over the last (up to) 5 load rows
	fanMin15s   float64 // lowest average fan RPM within 15s after the load cut
	fanDropPct  float64 // relative drop from fanAtCutAvg to fanMin15s, in percent
}

// RunPlatformStress runs repeated load+idle thermal cycling.
//
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously, runs
// for LoadSec, then cuts load abruptly and monitors for IdleSec. Telemetry is
// sampled once per second during both phases.
//
// Results (metrics.csv and summary.txt) are written to a timestamped run
// directory under baseDir, packed into a .tar.gz next to it, and the run
// directory is then removed. The returned string is the archive path.
//
// logFunc receives progress lines; nil disables logging. If ctx is cancelled
// the remaining cycles are skipped and whatever was collected is still
// written out. An error is returned when no cycles are defined, when a
// directory or output file cannot be written, or when packing fails; in the
// latter two cases the run directory is left in place.
func (s *System) RunPlatformStress(
	ctx context.Context,
	baseDir string,
	opts PlatformStressOptions,
	logFunc func(string),
) (string, error) {
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if len(opts.Cycles) == 0 {
		return "", fmt.Errorf("no cycles defined")
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}

	stamp := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir run dir: %w", err)
	}

	vendor := s.DetectGPUVendor()
	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))

	var (
		rows     []platformStressRow
		analyses []cycleAnalysis
	)
	start := time.Now()

	for i, cycle := range opts.Cycles {
		if ctx.Err() != nil {
			break
		}
		cycleNum := i + 1
		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))

		// ── LOAD PHASE ──
		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
		var wg sync.WaitGroup

		// CPU stress
		wg.Add(1)
		go func() {
			defer wg.Done()
			cpuCmd, err := buildCPUStressCmd(loadCtx)
			if err != nil {
				logFunc("CPU stress: " + err.Error())
				return
			}
			// The process is expected to be killed when loadCtx ends, so a
			// non-nil Wait error is the normal outcome and is ignored.
			_ = cpuCmd.Wait()
		}()

		// GPU stress
		wg.Add(1)
		go func() {
			defer wg.Done()
			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
			if gpuCmd == nil {
				return
			}
			_ = gpuCmd.Wait() // same as above: killed by loadCtx
		}()

		// Sample on this goroutine until the load window ends.
		loadRows := collectPhase(loadCtx, cycleNum, "load", start)

		// Join the stress goroutines before logging anything else so that
		// logFunc is never invoked from two goroutines at once.
		loadCancel()
		wg.Wait()

		for _, r := range loadRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, loadRows...)
		if len(loadRows) > 0 {
			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
		}

		// ── IDLE PHASE ──
		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
		idleCancel()
		for _, r := range idleRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, idleRows...)

		// Per-cycle analysis
		an := analyzePlatformCycle(loadRows, idleRows)
		analyses = append(analyses, an)
		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
	}

	// Log the summary before touching the disk so the operator still sees
	// the verdict if writing the result files fails.
	summary := writePlatformSummary(opts, analyses)
	logFunc("--- Summary ---")
	for _, line := range strings.Split(summary, "\n") {
		if line != "" {
			logFunc(line)
		}
	}

	outputs := []struct {
		name string
		data []byte
	}{
		{"metrics.csv", writePlatformCSV(rows)},
		{"summary.txt", []byte(summary)},
	}
	for _, out := range outputs {
		if err := os.WriteFile(filepath.Join(runDir, out.name), out.data, 0644); err != nil {
			return "", fmt.Errorf("write %s: %w", out.name, err)
		}
	}

	// Pack tar.gz
	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
	if err := packPlatformDir(runDir, archivePath); err != nil {
		return "", fmt.Errorf("pack archive: %w", err)
	}
	// Best-effort cleanup: the archive already holds everything in runDir.
	_ = os.RemoveAll(runDir)
	return archivePath, nil
}

// collectPhase samples live metrics every second until ctx is done and
// returns the collected rows tagged with the given cycle and phase.
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
	var rows []platformStressRow
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return rows
		case <-ticker.C:
			sample := SampleLiveMetrics()
			rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
		}
	}
}

// sampleToPlatformRow reduces one live metric sample to a telemetry row:
// the hottest CPU and GPU readings and the slowest/fastest fan.
//
// NOTE(review): GPUThrottled is never populated here, so throttle detection
// downstream can never trigger from rows built by this function — confirm
// whether LiveMetricSample exposes a throttle indicator that should be mapped.
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
	r := platformStressRow{
		ElapsedSec: time.Since(testStart).Seconds(),
		Cycle:      cycle,
		Phase:      phase,
		CPULoadPct: s.CPULoadPct,
		SysPowerW:  s.PowerW,
	}
	for _, t := range s.Temps {
		switch t.Group {
		case "cpu":
			if t.Celsius > r.MaxCPUTempC {
				r.MaxCPUTempC = t.Celsius
			}
		case "gpu":
			if t.Celsius > r.MaxGPUTempC {
				r.MaxGPUTempC = t.Celsius
			}
		}
	}
	// Per-GPU readings also feed the GPU maximum.
	for _, g := range s.GPUs {
		if g.TempC > r.MaxGPUTempC {
			r.MaxGPUTempC = g.TempC
		}
	}
	if len(s.Fans) > 0 {
		r.FanMinRPM = s.Fans[0].RPM
		r.FanMaxRPM = s.Fans[0].RPM
		for _, f := range s.Fans[1:] {
			if f.RPM < r.FanMinRPM {
				r.FanMinRPM = f.RPM
			}
			if f.RPM > r.FanMaxRPM {
				r.FanMaxRPM = f.RPM
			}
		}
	}
	return r
}

// formatPlatformRow renders a telemetry row as a single log line. The fan
// range is omitted when the minimum fan RPM is not positive.
func formatPlatformRow(r platformStressRow) string {
	throttle := ""
	if r.GPUThrottled {
		throttle = " THROTTLE"
	}
	fans := ""
	if r.FanMinRPM > 0 {
		fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
	}
	return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
}

// analyzePlatformCycle derives the per-cycle maxima from the load rows and
// measures how far the fans spin down in the first 15 seconds after the load
// is cut.
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
	var an cycleAnalysis
	for _, r := range loadRows {
		if r.MaxCPUTempC > an.maxCPUTemp {
			an.maxCPUTemp = r.MaxCPUTempC
		}
		if r.MaxGPUTempC > an.maxGPUTemp {
			an.maxGPUTemp = r.MaxGPUTempC
		}
		if r.SysPowerW > an.maxPower {
			an.maxPower = r.SysPowerW
		}
		if r.GPUThrottled {
			an.throttled = true
		}
	}

	// Fan RPM at cut = average over the last 5 load rows (fewer if the load
	// phase was shorter); rows without a positive fan reading are skipped.
	if n := len(loadRows); n > 0 {
		window := loadRows
		if n > 5 {
			window = loadRows[n-5:]
		}
		var sum float64
		var cnt int
		for _, r := range window {
			if r.FanMinRPM > 0 {
				sum += (r.FanMinRPM + r.FanMaxRPM) / 2
				cnt++
			}
		}
		if cnt > 0 {
			an.fanAtCutAvg = sum / float64(cnt)
		}
	}

	// Fan RPM minimum in the first 15s of idle, measured from the timestamp
	// of the last load row.
	an.fanMin15s = an.fanAtCutAvg
	var cutElapsed float64
	if len(loadRows) > 0 {
		cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
	}
	for _, r := range idleRows {
		if r.ElapsedSec > cutElapsed+15 {
			break
		}
		avg := (r.FanMinRPM + r.FanMaxRPM) / 2
		if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
			an.fanMin15s = avg
		}
	}
	if an.fanAtCutAvg > 0 {
		an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
	}
	return an
}

// writePlatformSummary renders the human-readable report: one section per
// analysed cycle followed by an overall verdict (FAIL on any throttling,
// WARN on a fan drop above 20%, PASS otherwise). analyses[i] is reported
// against opts.Cycles[i], so analyses must not be longer than opts.Cycles.
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
	var b strings.Builder
	fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
	fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))

	totalThrottle := 0
	totalFanWarn := 0
	for i, an := range analyses {
		cycle := opts.Cycles[i]
		fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
		fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
		fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
		fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
		if an.throttled {
			fmt.Fprintf(&b, " Throttle: DETECTED\n")
			totalThrottle++
		} else {
			fmt.Fprintf(&b, " Throttle: none\n")
		}
		// Fan figures are only reported when a fan reading existed at the cut.
		if an.fanAtCutAvg > 0 {
			fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
			fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
			if an.fanDropPct > 20 {
				fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
				totalFanWarn++
			} else {
				fmt.Fprintf(&b, " Fan response: OK\n")
			}
		}
		b.WriteString("\n")
	}

	fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
	switch {
	case totalThrottle > 0:
		fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
	case totalFanWarn > 0:
		fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
	default:
		fmt.Fprintf(&b, "Overall: PASS\n")
	}
	return b.String()
}

// writePlatformCSV encodes the telemetry rows as CSV with a header line.
func writePlatformCSV(rows []platformStressRow) []byte {
	var buf bytes.Buffer
	w := csv.NewWriter(&buf)
	// Write errors are ignored: the destination is an in-memory buffer.
	_ = w.Write([]string{
		"elapsed_sec", "cycle", "phase",
		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
	})
	for _, r := range rows {
		throttled := "0"
		if r.GPUThrottled {
			throttled = "1"
		}
		_ = w.Write([]string{
			strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
			strconv.Itoa(r.Cycle),
			r.Phase,
			strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
			strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
			strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
			strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
			strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
			strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
			throttled,
		})
	}
	w.Flush()
	return buf.Bytes()
}

// buildCPUStressCmd starts a stressapptest process bound to ctx and returns
// the running command. The caller must Wait on it.
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
	path, err := satLookPath("stressapptest")
	if err != nil {
		return nil, fmt.Errorf("stressapptest not found: %w", err)
	}
	// Use a very long duration; the context timeout will kill it at the right time.
	cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
	cmd.Stdout = nil
	cmd.Stderr = nil
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("stressapptest start: %w", err)
	}
	return cmd, nil
}

// buildGPUStressCmd starts a GPU stress command appropriate for the detected
// vendor. It returns nil if no GPU stress tool is available or it could not
// be started (CPU-only cycling is still useful).
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
	switch strings.ToLower(vendor) {
	case "amd":
		return buildAMDGPUStressCmd(ctx)
	case "nvidia":
		return buildNvidiaGPUStressCmd(ctx)
	}
	return nil
}

// buildAMDGPUStressCmd writes a GST configuration for RVS and starts RVS with
// it, bound to ctx. It returns nil when RVS cannot be resolved, the
// configuration cannot be written, or the process fails to start.
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
	rvsArgs, err := resolveRVSCommand()
	if err != nil || len(rvsArgs) == 0 {
		return nil
	}
	rvsPath := rvsArgs[0]

	// The duration is deliberately huge; ctx ends the run.
	// NOTE(review): duration is presumably in milliseconds (~24h) — confirm
	// against the RVS GST module documentation.
	const cfg = `actions:
- name: gst_platform
  device: all
  module: gst
  parallel: true
  duration: 86400000
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
  matrix_size_b: 8640
  matrix_size_c: 8640
`
	// NOTE(review): fixed, predictable path in /tmp — consider a private
	// directory if this runs with elevated privileges.
	const cfgFile = "/tmp/bee-platform-gst.conf"
	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
		return nil
	}

	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
	cmd.Stdout = nil
	cmd.Stderr = nil
	if err := cmd.Start(); err != nil {
		// Never hand back a command that is not running.
		return nil
	}
	return cmd
}

// buildNvidiaGPUStressCmd starts bee-gpu-stress bound to ctx. It returns nil
// when the tool is not found or fails to start.
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
	path, err := satLookPath("bee-gpu-stress")
	if err != nil {
		return nil
	}
	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
	cmd.Stdout = nil
	cmd.Stderr = nil
	if err := cmd.Start(); err != nil {
		// Never hand back a command that is not running.
		return nil
	}
	return cmd
}

// packPlatformDir writes the regular files directly inside dir (no
// recursion) into a gzip-compressed tar archive at dest. Entries are stored
// under the base name of dir.
//
// Any read, write or close failure is returned and the partially written
// archive is removed, so a nil error means dest is complete.
func packPlatformDir(dir, dest string) (err error) {
	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)
	defer func() {
		// Close innermost first; each Close flushes buffered data, so its
		// error matters. Keep the first error encountered.
		if cerr := tw.Close(); err == nil {
			err = cerr
		}
		if cerr := gz.Close(); err == nil {
			err = cerr
		}
		if cerr := f.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			_ = os.Remove(dest) // do not leave a truncated archive behind
		}
	}()

	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}
	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		fpath := filepath.Join(dir, e.Name())
		data, err := os.ReadFile(fpath)
		if err != nil {
			return fmt.Errorf("read %s: %w", fpath, err)
		}
		hdr := &tar.Header{
			// Tar entry names use forward slashes regardless of host OS.
			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(data); err != nil {
			return err
		}
	}
	return nil
}