package platform

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"
)

// FanStressOptions configures the fan-stress / thermal cycling test.
// Non-positive numeric fields are replaced by the defaults noted on each field.
type FanStressOptions struct {
	BaselineSec  int   // idle monitoring before and after load (default 30)
	Phase1DurSec int   // first load phase duration in seconds (default 300)
	PauseSec     int   // pause between the two load phases (default 60)
	Phase2DurSec int   // second load phase duration in seconds (default 300)
	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
	GPUIndices   []int // which GPU indices to stress (empty = all detected)
}

// FanReading holds one fan sensor reading.
type FanReading struct {
	Name string  // sensor name as printed in the first ipmitool column
	RPM  float64 // fan speed in revolutions per minute
}

// GPUStressMetric holds per-GPU metrics during the stress test.
// Values come from a single nvidia-smi query row.
type GPUStressMetric struct {
	Index     int     // GPU index as reported by nvidia-smi
	TempC     float64 // temperature.gpu
	UsagePct  float64 // utilization.gpu
	PowerW    float64 // power.draw
	ClockMHz  float64 // clocks.current.graphics
	Throttled bool    // true if any throttle reason is active
}

// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
type FanStressRow struct {
	TimestampUTC string  // sample time, RFC 3339, UTC
	ElapsedSec   float64 // seconds since the test started
	Phase        string  // "baseline", "load1", "pause", "load2", "cooldown"
	GPUs         []GPUStressMetric
	Fans         []FanReading
	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
	SysPowerW    float64 // DCMI system power reading
}

// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } applyFanStressDefaults(&opts) ts := time.Now().UTC().Format("20060102-150405") runDir := filepath.Join(baseDir, "fan-stress-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", err } verboseLog := filepath.Join(runDir, "verbose.log") // Phase name shared between sampler goroutine and main goroutine. var phaseMu sync.Mutex currentPhase := "init" setPhase := func(name string) { phaseMu.Lock() currentPhase = name phaseMu.Unlock() } getPhase := func() string { phaseMu.Lock() defer phaseMu.Unlock() return currentPhase } start := time.Now() var rowsMu sync.Mutex var allRows []FanStressRow // Start background sampler (every second). stopCh := make(chan struct{}) doneCh := make(chan struct{}) go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds()) rowsMu.Lock() allRows = append(allRows, row) rowsMu.Unlock() } } }() var summary strings.Builder fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) stats := satStats{} // idlePhase sleeps for durSec while the sampler stamps phaseName on each row. idlePhase := func(phaseName, stepName string, durSec int) { if ctx.Err() != nil { return } setPhase(phaseName) appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec), ) select { case <-ctx.Done(): case <-time.After(time.Duration(durSec) * time.Second): } appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName), ) fmt.Fprintf(&summary, "%s_status=OK\n", stepName) stats.OK++ } // loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row. 
loadPhase := func(phaseName, stepName string, durSec int) { if ctx.Err() != nil { return } setPhase(phaseName) var env []string if len(opts.GPUIndices) > 0 { ids := make([]string, len(opts.GPUIndices)) for i, idx := range opts.GPUIndices { ids[i] = strconv.Itoa(idx) } env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")} } cmd := []string{ "bee-gpu-stress", "--seconds", strconv.Itoa(durSec), "--size-mb", strconv.Itoa(opts.SizeMB), } out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env) _ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644) if err != nil && err != context.Canceled && err.Error() != "signal: killed" { fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName) stats.Failed++ } else { fmt.Fprintf(&summary, "%s_status=OK\n", stepName) stats.OK++ } } // Execute test phases. idlePhase("baseline", "01-baseline", opts.BaselineSec) loadPhase("load1", "02-load1", opts.Phase1DurSec) idlePhase("pause", "03-pause", opts.PauseSec) loadPhase("load2", "04-load2", opts.Phase2DurSec) idlePhase("cooldown", "05-cooldown", opts.BaselineSec) // Stop sampler and collect rows. close(stopCh) <-doneCh rowsMu.Lock() rows := allRows rowsMu.Unlock() // Analysis. throttled := analyzeThrottling(rows) maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 { var m float64 for _, g := range r.GPUs { if g.TempC > m { m = g.TempC } } return m }) maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 { return r.CPUMaxTempC }) fanResponseSec := analyzeFanResponse(rows) fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled) fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp) fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp) if fanResponseSec >= 0 { fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec) } else { fmt.Fprintf(&summary, "fan_response_sec=N/A\n") } // Throttling failure counts against overall result. if throttled { stats.Failed++ } writeSATStats(&summary, stats) // Write CSV outputs. 
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil { return "", err } _ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows) if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { return "", err } archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz") if err := createTarGz(archive, runDir); err != nil { return "", err } return archive, nil } func applyFanStressDefaults(opts *FanStressOptions) { if opts.BaselineSec <= 0 { opts.BaselineSec = 30 } if opts.Phase1DurSec <= 0 { opts.Phase1DurSec = 300 } if opts.PauseSec <= 0 { opts.PauseSec = 60 } if opts.Phase2DurSec <= 0 { opts.Phase2DurSec = 300 } if opts.SizeMB <= 0 { opts.SizeMB = 64 } } // sampleFanStressRow collects all metrics for one telemetry sample. func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow { row := FanStressRow{ TimestampUTC: time.Now().UTC().Format(time.RFC3339), ElapsedSec: elapsed, Phase: phase, } row.GPUs = sampleGPUStressMetrics(gpuIndices) row.Fans, _ = sampleFanSpeeds() row.CPUMaxTempC = sampleCPUMaxTemp() row.SysPowerW = sampleSystemPower() return row } // sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power, // clock frequency, and active throttle reasons for each GPU. func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric { args := []string{ "--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { ids := make([]string, len(gpuIndices)) for i, idx := range gpuIndices { ids[i] = strconv.Itoa(idx) } args = append([]string{"--id=" + strings.Join(ids, ",")}, args...) 
} out, err := exec.Command("nvidia-smi", args...).Output() if err != nil { return nil } var metrics []GPUStressMetric for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue } parts := strings.Split(line, ", ") if len(parts) < 6 { continue } idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) throttleVal := strings.TrimSpace(parts[5]) // Throttled if active reasons bitmask is non-zero. throttled := throttleVal != "0x0000000000000000" && throttleVal != "0x0" && throttleVal != "0" && throttleVal != "" && throttleVal != "N/A" metrics = append(metrics, GPUStressMetric{ Index: idx, TempC: parseGPUFloat(parts[1]), UsagePct: parseGPUFloat(parts[2]), PowerW: parseGPUFloat(parts[3]), ClockMHz: parseGPUFloat(parts[4]), Throttled: throttled, }) } return metrics } // sampleFanSpeeds reads fan RPM values from ipmitool sdr. func sampleFanSpeeds() ([]FanReading, error) { out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output() if err != nil { return nil, err } return parseFanSpeeds(string(out)), nil } // parseFanSpeeds parses "ipmitool sdr type Fan" output. // Line format: "FAN1 | 2400.000 | RPM | ok" func parseFanSpeeds(raw string) []FanReading { var fans []FanReading for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { parts := strings.Split(line, "|") if len(parts) < 3 { continue } unit := strings.TrimSpace(parts[2]) if !strings.EqualFold(unit, "RPM") { continue } valStr := strings.TrimSpace(parts[1]) if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" { continue } val, err := strconv.ParseFloat(valStr, 64) if err != nil { continue } fans = append(fans, FanReading{ Name: strings.TrimSpace(parts[0]), RPM: val, }) } return fans } // sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors. 
func sampleCPUMaxTemp() float64 { out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output() if err != nil { return sampleCPUTempViaSensors() } return parseIPMIMaxTemp(string(out)) } // parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type Temperature". func parseIPMIMaxTemp(raw string) float64 { var max float64 for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { parts := strings.Split(line, "|") if len(parts) < 3 { continue } unit := strings.TrimSpace(parts[2]) if !strings.Contains(strings.ToLower(unit), "degrees") { continue } valStr := strings.TrimSpace(parts[1]) if strings.EqualFold(valStr, "na") || valStr == "" { continue } val, err := strconv.ParseFloat(valStr, 64) if err != nil { continue } if val > max { max = val } } return max } // sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable. func sampleCPUTempViaSensors() float64 { out, err := exec.Command("sensors", "-u").Output() if err != nil { return 0 } var max float64 for _, line := range strings.Split(string(out), "\n") { line = strings.TrimSpace(line) fields := strings.Fields(line) if len(fields) < 2 { continue } if !strings.HasSuffix(fields[0], "_input:") { continue } val, err := strconv.ParseFloat(fields[1], 64) if err != nil { continue } if val > 0 && val < 150 && val > max { max = val } } return max } // sampleSystemPower reads system power draw via DCMI. func sampleSystemPower() float64 { out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output() if err != nil { return 0 } return parseDCMIPowerReading(string(out)) } // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output. 
// Sample: " Instantaneous power reading: 500 Watts" func parseDCMIPowerReading(raw string) float64 { for _, line := range strings.Split(raw, "\n") { if !strings.Contains(strings.ToLower(line), "instantaneous") { continue } parts := strings.Fields(line) for i, p := range parts { if strings.EqualFold(p, "Watts") && i > 0 { val, err := strconv.ParseFloat(parts[i-1], 64) if err == nil { return val } } } } return 0 } // analyzeThrottling returns true if any GPU reported an active throttle reason // during either load phase. func analyzeThrottling(rows []FanStressRow) bool { for _, row := range rows { if row.Phase != "load1" && row.Phase != "load2" { continue } for _, gpu := range row.GPUs { if gpu.Throttled { return true } } } return false } // analyzeMaxTemp returns the maximum value of the given extractor across all rows. func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 { var max float64 for _, row := range rows { if v := extract(row); v > max { max = v } } return max } // analyzeFanResponse returns the seconds from load1 start until fan RPM first // increased by more than 5% above the baseline average. Returns -1 if undetermined. func analyzeFanResponse(rows []FanStressRow) float64 { // Compute baseline average fan RPM. var baseTotal, baseCount float64 for _, row := range rows { if row.Phase != "baseline" { continue } for _, f := range row.Fans { baseTotal += f.RPM baseCount++ } } if baseCount == 0 || baseTotal == 0 { return -1 } baseAvg := baseTotal / baseCount threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up // Find elapsed time when load1 started. var load1Start float64 = -1 for _, row := range rows { if row.Phase == "load1" { load1Start = row.ElapsedSec break } } if load1Start < 0 { return -1 } // Find first load1 row where average RPM crosses the threshold. 
for _, row := range rows { if row.Phase != "load1" { continue } var total, count float64 for _, f := range row.Fans { total += f.RPM count++ } if count > 0 && total/count >= threshold { return row.ElapsedSec - load1Start } } return -1 } // WriteFanStressCSV writes the wide-format metrics CSV with one row per second. // GPU columns are generated per index in gpuIndices order. func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error { if len(rows) == 0 { return os.WriteFile(path, []byte("no data\n"), 0644) } var b strings.Builder // Header: fixed system columns + per-GPU columns. b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w") for _, idx := range gpuIndices { fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled", idx, idx, idx, idx, idx) } b.WriteRune('\n') for _, row := range rows { favg, fmin, fmax := fanRPMStats(row.Fans) fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f", row.TimestampUTC, row.ElapsedSec, row.Phase, favg, fmin, fmax, row.CPUMaxTempC, row.SysPowerW, ) gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs)) for _, g := range row.GPUs { gpuByIdx[g.Index] = g } for _, idx := range gpuIndices { g := gpuByIdx[idx] throttled := 0 if g.Throttled { throttled = 1 } fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d", g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled) } b.WriteRune('\n') } return os.WriteFile(path, []byte(b.String()), 0644) } // WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format. 
func WriteFanSensorsCSV(path string, rows []FanStressRow) error { var b strings.Builder b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n") for _, row := range rows { for _, f := range row.Fans { fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n", row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM) } } return os.WriteFile(path, []byte(b.String()), 0644) } // fanRPMStats computes average, min, max RPM across all fans in a sample row. func fanRPMStats(fans []FanReading) (avg, min, max float64) { if len(fans) == 0 { return 0, 0, 0 } min = fans[0].RPM max = fans[0].RPM var total float64 for _, f := range fans { total += f.RPM if f.RPM < min { min = f.RPM } if f.RPM > max { max = f.RPM } } return total / float64(len(fans)), min, max }