Two-phase GPU thermal cycling test with per-second telemetry:

- Phases: baseline → load1 → pause (no cooldown) → load2 → cooldown
- Monitors: fan RPM (ipmitool sdr), CPU/server temps (ipmitool/sensors), system power (ipmitool dcmi), GPU temp/power/usage/clock/throttle (nvidia-smi)
- Detects throttling via the clocks_throttle_reasons.active bitmask
- Measures fan response lag from load start (validates case-04 ~2s lag)
- Exports metrics.csv (wide format, one row/sec) and fan-sensors.csv (long format)
- TUI: adds [F] Fan Stress Test to Health Check screen with Quick/Standard/Express modes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
588 lines
16 KiB
Go
588 lines
16 KiB
Go
package platform
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// FanStressOptions holds the tunables of the fan-stress / thermal cycling run.
// Durations and sizes that are zero or negative are replaced with their
// defaults by applyFanStressDefaults.
type FanStressOptions struct {
	// BaselineSec is the idle monitoring time in seconds, used for both the
	// initial baseline and the final cooldown (default 30).
	BaselineSec int
	// Phase1DurSec is the length of the first load phase in seconds (default 300).
	Phase1DurSec int
	// PauseSec is the idle gap between the two load phases in seconds (default 60).
	PauseSec int
	// Phase2DurSec is the length of the second load phase in seconds (default 300).
	Phase2DurSec int
	// SizeMB is the GPU memory allocated per GPU while stressing (default 64).
	SizeMB int
	// GPUIndices selects the GPUs to stress; empty means all detected GPUs.
	GPUIndices []int
}
|
|
|
|
// FanReading is a single fan sensor sample: the sensor's name and its speed.
type FanReading struct {
	// Name is the sensor label as reported by the fan query.
	Name string
	// RPM is the fan speed in revolutions per minute.
	RPM float64
}
|
|
|
|
// GPUStressMetric is one GPU's telemetry at a single sampling instant of the
// stress test.
type GPUStressMetric struct {
	// Index is the GPU index the values belong to.
	Index int
	// TempC is the GPU temperature in degrees Celsius.
	TempC float64
	// UsagePct is the GPU utilization in percent.
	UsagePct float64
	// PowerW is the GPU power draw in watts.
	PowerW float64
	// ClockMHz is the current graphics clock in MHz.
	ClockMHz float64
	// Throttled is true if any throttle reason is active.
	Throttled bool
}
|
|
|
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
|
type FanStressRow struct {
|
|
TimestampUTC string
|
|
ElapsedSec float64
|
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
|
GPUs []GPUStressMetric
|
|
Fans []FanReading
|
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
|
SysPowerW float64 // DCMI system power reading
|
|
}
|
|
|
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
|
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
|
if baseDir == "" {
|
|
baseDir = "/var/log/bee-sat"
|
|
}
|
|
applyFanStressDefaults(&opts)
|
|
|
|
ts := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", err
|
|
}
|
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
|
|
// Phase name shared between sampler goroutine and main goroutine.
|
|
var phaseMu sync.Mutex
|
|
currentPhase := "init"
|
|
setPhase := func(name string) {
|
|
phaseMu.Lock()
|
|
currentPhase = name
|
|
phaseMu.Unlock()
|
|
}
|
|
getPhase := func() string {
|
|
phaseMu.Lock()
|
|
defer phaseMu.Unlock()
|
|
return currentPhase
|
|
}
|
|
|
|
start := time.Now()
|
|
var rowsMu sync.Mutex
|
|
var allRows []FanStressRow
|
|
|
|
// Start background sampler (every second).
|
|
stopCh := make(chan struct{})
|
|
doneCh := make(chan struct{})
|
|
go func() {
|
|
defer close(doneCh)
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-stopCh:
|
|
return
|
|
case <-ticker.C:
|
|
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
|
rowsMu.Lock()
|
|
allRows = append(allRows, row)
|
|
rowsMu.Unlock()
|
|
}
|
|
}
|
|
}()
|
|
|
|
var summary strings.Builder
|
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
|
|
stats := satStats{}
|
|
|
|
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
|
idlePhase := func(phaseName, stepName string, durSec int) {
|
|
if ctx.Err() != nil {
|
|
return
|
|
}
|
|
setPhase(phaseName)
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
|
)
|
|
select {
|
|
case <-ctx.Done():
|
|
case <-time.After(time.Duration(durSec) * time.Second):
|
|
}
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
|
)
|
|
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
|
stats.OK++
|
|
}
|
|
|
|
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
|
loadPhase := func(phaseName, stepName string, durSec int) {
|
|
if ctx.Err() != nil {
|
|
return
|
|
}
|
|
setPhase(phaseName)
|
|
var env []string
|
|
if len(opts.GPUIndices) > 0 {
|
|
ids := make([]string, len(opts.GPUIndices))
|
|
for i, idx := range opts.GPUIndices {
|
|
ids[i] = strconv.Itoa(idx)
|
|
}
|
|
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
|
}
|
|
cmd := []string{
|
|
"bee-gpu-stress",
|
|
"--seconds", strconv.Itoa(durSec),
|
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
|
}
|
|
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
|
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
|
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
|
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
|
stats.Failed++
|
|
} else {
|
|
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
|
stats.OK++
|
|
}
|
|
}
|
|
|
|
// Execute test phases.
|
|
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
|
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
|
idlePhase("pause", "03-pause", opts.PauseSec)
|
|
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
|
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
|
|
|
// Stop sampler and collect rows.
|
|
close(stopCh)
|
|
<-doneCh
|
|
|
|
rowsMu.Lock()
|
|
rows := allRows
|
|
rowsMu.Unlock()
|
|
|
|
// Analysis.
|
|
throttled := analyzeThrottling(rows)
|
|
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
|
var m float64
|
|
for _, g := range r.GPUs {
|
|
if g.TempC > m {
|
|
m = g.TempC
|
|
}
|
|
}
|
|
return m
|
|
})
|
|
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
|
return r.CPUMaxTempC
|
|
})
|
|
fanResponseSec := analyzeFanResponse(rows)
|
|
|
|
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
|
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
|
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
|
if fanResponseSec >= 0 {
|
|
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
|
} else {
|
|
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
|
}
|
|
|
|
// Throttling failure counts against overall result.
|
|
if throttled {
|
|
stats.Failed++
|
|
}
|
|
writeSATStats(&summary, stats)
|
|
|
|
// Write CSV outputs.
|
|
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
|
return "", err
|
|
}
|
|
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
|
|
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
|
if err := createTarGz(archive, runDir); err != nil {
|
|
return "", err
|
|
}
|
|
return archive, nil
|
|
}
|
|
|
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
|
if opts.BaselineSec <= 0 {
|
|
opts.BaselineSec = 30
|
|
}
|
|
if opts.Phase1DurSec <= 0 {
|
|
opts.Phase1DurSec = 300
|
|
}
|
|
if opts.PauseSec <= 0 {
|
|
opts.PauseSec = 60
|
|
}
|
|
if opts.Phase2DurSec <= 0 {
|
|
opts.Phase2DurSec = 300
|
|
}
|
|
if opts.SizeMB <= 0 {
|
|
opts.SizeMB = 64
|
|
}
|
|
}
|
|
|
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
|
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
|
row := FanStressRow{
|
|
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
|
ElapsedSec: elapsed,
|
|
Phase: phase,
|
|
}
|
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
|
row.Fans, _ = sampleFanSpeeds()
|
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
|
row.SysPowerW = sampleSystemPower()
|
|
return row
|
|
}
|
|
|
|
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
|
// clock frequency, and active throttle reasons for each GPU.
|
|
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
|
args := []string{
|
|
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
|
"--format=csv,noheader,nounits",
|
|
}
|
|
if len(gpuIndices) > 0 {
|
|
ids := make([]string, len(gpuIndices))
|
|
for i, idx := range gpuIndices {
|
|
ids[i] = strconv.Itoa(idx)
|
|
}
|
|
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
|
}
|
|
out, err := exec.Command("nvidia-smi", args...).Output()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var metrics []GPUStressMetric
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
parts := strings.Split(line, ", ")
|
|
if len(parts) < 6 {
|
|
continue
|
|
}
|
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
throttleVal := strings.TrimSpace(parts[5])
|
|
// Throttled if active reasons bitmask is non-zero.
|
|
throttled := throttleVal != "0x0000000000000000" &&
|
|
throttleVal != "0x0" &&
|
|
throttleVal != "0" &&
|
|
throttleVal != "" &&
|
|
throttleVal != "N/A"
|
|
metrics = append(metrics, GPUStressMetric{
|
|
Index: idx,
|
|
TempC: parseGPUFloat(parts[1]),
|
|
UsagePct: parseGPUFloat(parts[2]),
|
|
PowerW: parseGPUFloat(parts[3]),
|
|
ClockMHz: parseGPUFloat(parts[4]),
|
|
Throttled: throttled,
|
|
})
|
|
}
|
|
return metrics
|
|
}
|
|
|
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
|
func sampleFanSpeeds() ([]FanReading, error) {
|
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return parseFanSpeeds(string(out)), nil
|
|
}
|
|
|
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
|
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
|
func parseFanSpeeds(raw string) []FanReading {
|
|
var fans []FanReading
|
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
|
parts := strings.Split(line, "|")
|
|
if len(parts) < 3 {
|
|
continue
|
|
}
|
|
unit := strings.TrimSpace(parts[2])
|
|
if !strings.EqualFold(unit, "RPM") {
|
|
continue
|
|
}
|
|
valStr := strings.TrimSpace(parts[1])
|
|
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
|
continue
|
|
}
|
|
val, err := strconv.ParseFloat(valStr, 64)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
fans = append(fans, FanReading{
|
|
Name: strings.TrimSpace(parts[0]),
|
|
RPM: val,
|
|
})
|
|
}
|
|
return fans
|
|
}
|
|
|
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
|
func sampleCPUMaxTemp() float64 {
|
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
|
if err != nil {
|
|
return sampleCPUTempViaSensors()
|
|
}
|
|
return parseIPMIMaxTemp(string(out))
|
|
}
|
|
|
|
// parseIPMIMaxTemp returns the highest temperature found in pipe-separated
// ipmitool output, or 0 when no reading could be parsed. Two line layouts
// are understood:
//
//	CPU1 Temp | 0Eh | ok | 3.1 | 58 degrees C    ("ipmitool sdr type Temperature", sdr list/elist)
//	CPU1 Temp | 58.000 | degrees C | ok          ("ipmitool sensor")
//
// Every temperature sensor on a parsable line takes part in the maximum.
// Lines with fewer than three columns or without a numeric value ("na",
// "No Reading", ...) are skipped.
func parseIPMIMaxTemp(raw string) float64 {
	var highest float64
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Split(line, "|")
		if len(cols) < 3 {
			continue
		}
		for i := range cols {
			cols[i] = strings.TrimSpace(cols[i])
		}
		if temp, ok := ipmiTempFromColumns(cols); ok && temp > highest {
			highest = temp
		}
	}
	return highest
}

// ipmiTempFromColumns finds the temperature value in the trimmed columns of
// one line. It reports false when the line carries no numeric reading in
// degrees.
func ipmiTempFromColumns(cols []string) (float64, bool) {
	// "ipmitool sensor" layout: value in column 1, unit in column 2.
	if len(cols) >= 3 && strings.Contains(strings.ToLower(cols[2]), "degrees") {
		temp, err := strconv.ParseFloat(cols[1], 64)
		return temp, err == nil
	}
	// sdr layouts: value and unit share one column, e.g. "58 degrees C".
	for _, col := range cols[1:] {
		tokens := strings.Fields(col)
		if len(tokens) < 2 || !strings.EqualFold(tokens[1], "degrees") {
			continue
		}
		if temp, err := strconv.ParseFloat(tokens[0], 64); err == nil {
			return temp, true
		}
	}
	return 0, false
}
|
|
|
|
// sampleCPUTempViaSensors is the lm-sensors fallback for the CPU temperature:
// it runs "sensors -u" and returns the highest plausible temperature reading,
// or 0 when the command fails or reports no temperature.
func sampleCPUTempViaSensors() float64 {
	out, err := exec.Command("sensors", "-u").Output()
	if err != nil {
		return 0
	}
	return parseSensorsMaxTemp(string(out))
}

// parseSensorsMaxTemp returns the highest tempN_input value found in raw
// "sensors -u" output. Only keys of the form "temp*_input:" are considered:
// the same output also carries fan, voltage, power and current "*_input"
// lines, whose values must not be mistaken for temperatures. Readings outside
// the open interval (0, 150) are ignored as implausible.
func parseSensorsMaxTemp(raw string) float64 {
	var highest float64
	for _, line := range strings.Split(raw, "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		key := fields[0]
		if !strings.HasPrefix(key, "temp") || !strings.HasSuffix(key, "_input:") {
			continue
		}
		temp, err := strconv.ParseFloat(fields[1], 64)
		if err != nil {
			continue
		}
		if temp > 0 && temp < 150 && temp > highest {
			highest = temp
		}
	}
	return highest
}
|
|
|
|
// sampleSystemPower reads system power draw via DCMI.
|
|
func sampleSystemPower() float64 {
|
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return parseDCMIPowerReading(string(out))
|
|
}
|
|
|
|
// parseDCMIPowerReading pulls the instantaneous power value out of
// "ipmitool dcmi power reading" output.
// Sample line: " Instantaneous power reading: 500 Watts"
//
// Only lines containing "instantaneous" (any case) are inspected; the number
// directly preceding a "Watts" token is returned. The result is 0 when no
// such number is found.
func parseDCMIPowerReading(raw string) float64 {
	for _, line := range strings.Split(raw, "\n") {
		if strings.Contains(strings.ToLower(line), "instantaneous") {
			tokens := strings.Fields(line)
			// Start at 1 so there is always a token before "Watts".
			for i := 1; i < len(tokens); i++ {
				if !strings.EqualFold(tokens[i], "Watts") {
					continue
				}
				if watts, err := strconv.ParseFloat(tokens[i-1], 64); err == nil {
					return watts
				}
			}
		}
	}
	return 0
}
|
|
|
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
|
// during either load phase.
|
|
func analyzeThrottling(rows []FanStressRow) bool {
|
|
for _, row := range rows {
|
|
if row.Phase != "load1" && row.Phase != "load2" {
|
|
continue
|
|
}
|
|
for _, gpu := range row.GPUs {
|
|
if gpu.Throttled {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
|
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
|
var max float64
|
|
for _, row := range rows {
|
|
if v := extract(row); v > max {
|
|
max = v
|
|
}
|
|
}
|
|
return max
|
|
}
|
|
|
|
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
|
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
|
func analyzeFanResponse(rows []FanStressRow) float64 {
|
|
// Compute baseline average fan RPM.
|
|
var baseTotal, baseCount float64
|
|
for _, row := range rows {
|
|
if row.Phase != "baseline" {
|
|
continue
|
|
}
|
|
for _, f := range row.Fans {
|
|
baseTotal += f.RPM
|
|
baseCount++
|
|
}
|
|
}
|
|
if baseCount == 0 || baseTotal == 0 {
|
|
return -1
|
|
}
|
|
baseAvg := baseTotal / baseCount
|
|
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
|
|
|
// Find elapsed time when load1 started.
|
|
var load1Start float64 = -1
|
|
for _, row := range rows {
|
|
if row.Phase == "load1" {
|
|
load1Start = row.ElapsedSec
|
|
break
|
|
}
|
|
}
|
|
if load1Start < 0 {
|
|
return -1
|
|
}
|
|
|
|
// Find first load1 row where average RPM crosses the threshold.
|
|
for _, row := range rows {
|
|
if row.Phase != "load1" {
|
|
continue
|
|
}
|
|
var total, count float64
|
|
for _, f := range row.Fans {
|
|
total += f.RPM
|
|
count++
|
|
}
|
|
if count > 0 && total/count >= threshold {
|
|
return row.ElapsedSec - load1Start
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
|
|
// GPU columns are generated per index in gpuIndices order.
|
|
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
|
|
if len(rows) == 0 {
|
|
return os.WriteFile(path, []byte("no data\n"), 0644)
|
|
}
|
|
|
|
var b strings.Builder
|
|
|
|
// Header: fixed system columns + per-GPU columns.
|
|
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
|
|
for _, idx := range gpuIndices {
|
|
fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
|
|
idx, idx, idx, idx, idx)
|
|
}
|
|
b.WriteRune('\n')
|
|
|
|
for _, row := range rows {
|
|
favg, fmin, fmax := fanRPMStats(row.Fans)
|
|
fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
|
|
row.TimestampUTC,
|
|
row.ElapsedSec,
|
|
row.Phase,
|
|
favg, fmin, fmax,
|
|
row.CPUMaxTempC,
|
|
row.SysPowerW,
|
|
)
|
|
gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
|
|
for _, g := range row.GPUs {
|
|
gpuByIdx[g.Index] = g
|
|
}
|
|
for _, idx := range gpuIndices {
|
|
g := gpuByIdx[idx]
|
|
throttled := 0
|
|
if g.Throttled {
|
|
throttled = 1
|
|
}
|
|
fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
|
|
g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
|
|
}
|
|
b.WriteRune('\n')
|
|
}
|
|
|
|
return os.WriteFile(path, []byte(b.String()), 0644)
|
|
}
|
|
|
|
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
|
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
|
var b strings.Builder
|
|
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
|
for _, row := range rows {
|
|
for _, f := range row.Fans {
|
|
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
|
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
|
}
|
|
}
|
|
return os.WriteFile(path, []byte(b.String()), 0644)
|
|
}
|
|
|
|
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
|
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
|
if len(fans) == 0 {
|
|
return 0, 0, 0
|
|
}
|
|
min = fans[0].RPM
|
|
max = fans[0].RPM
|
|
var total float64
|
|
for _, f := range fans {
|
|
total += f.RPM
|
|
if f.RPM < min {
|
|
min = f.RPM
|
|
}
|
|
if f.RPM > max {
|
|
max = f.RPM
|
|
}
|
|
}
|
|
return total / float64(len(fans)), min, max
|
|
}
|