diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go
index 914e5f6..ca15f2c 100644
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -80,6 +80,7 @@ type satRunner interface {
 	DetectGPUVendor() string
 	ListAMDGPUs() ([]platform.AMDGPUInfo, error)
 	RunAMDAcceptancePack(baseDir string) (string, error)
+	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 }
 
 type runtimeChecker interface {
@@ -491,6 +492,67 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
 	return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
 }
 
+// RunFanStressTest forwards the request to the SAT runner, substituting
+// DefaultSATBaseDir when baseDir is empty or whitespace-only. The string and
+// error are returned exactly as the runner produced them.
+func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = DefaultSATBaseDir
+	}
+	return a.sat.RunFanStressTest(ctx, baseDir, opts)
+}
+
+// RunFanStressTestResult runs the fan stress test under the default base
+// directory and wraps the outcome in an ActionResult titled "Fan Stress Test".
+// The runner's error is returned unchanged alongside the result.
+func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
+	path, err := a.RunFanStressTest(ctx, "", opts)
+	body := formatFanStressResult(path)
+	// Cancellation is not shown as an error line in the body; any other error is appended.
+	if err != nil && err != context.Canceled {
+		body += "\nERROR: " + err.Error()
+	}
+	return ActionResult{Title: "Fan Stress Test", Body: body}, err
+}
+
+// formatFanStressResult formats the summary.txt from a fan-stress run, including
+// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
+//
+// archivePath is expected to end in ".tar.gz"; the run directory is derived by
+// stripping that suffix. An empty path yields "No output produced.", and an
+// unreadable summary.txt yields a plain "Archive written to ..." message.
+func formatFanStressResult(archivePath string) string {
+	if archivePath == "" {
+		return "No output produced."
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return "Archive written to " + archivePath
+	}
+	content := strings.TrimSpace(string(raw))
+	kv := parseKeyValueSummary(content)
+
+	var b strings.Builder
+	b.WriteString(formatSATDetail(content))
+
+	// Append analysis section. Each entry is emitted only when its key is
+	// present in the summary and carries a meaningful value.
+	var analysis []string
+	if v, ok := kv["throttling_detected"]; ok {
+		label := "NO"
+		if v == "true" {
+			label = "YES ← throttling detected during load"
+		}
+		analysis = append(analysis, "Throttling: "+label)
+	}
+	// "0.0" is skipped for both temperatures: it is treated here as "no reading".
+	if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
+		analysis = append(analysis, "Max GPU temp: "+v+"°C")
+	}
+	if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
+		analysis = append(analysis, "Max CPU temp: "+v+"°C")
+	}
+	// "N/A" and "-1.0" both denote an undetermined fan response time.
+	if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
+		analysis = append(analysis, "Fan response: "+v+"s")
+	}
+
+	if len(analysis) > 0 {
+		b.WriteString("\n\n=== Analysis ===\n")
+		for _, line := range analysis {
+			b.WriteString(line + "\n")
+		}
+	}
+	return strings.TrimSpace(b.String())
+}
+
 // satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
 // and returns a formatted human-readable result. Falls back to a plain message if unreadable.
 func satResultBody(archivePath string) string {
diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go
index 291d8bf..b606005 100644
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -170,6 +170,10 @@ func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
 	return "", nil
 }
 
+// RunFanStressTest is a stub that ignores its arguments and reports an empty
+// path with no error.
+func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
+	return "", nil
+}
+
 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
 
diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go
new file mode 100644
index 0000000..c64920d
--- /dev/null
+++ b/audit/internal/platform/sat_fan_stress.go
@@ -0,0 +1,587 @@
+package platform
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+)
+
+// FanStressOptions configures the fan-stress / thermal cycling test.
+type FanStressOptions struct {
+	BaselineSec  int   // idle monitoring before and after load, in seconds (default 30)
+	Phase1DurSec int   // first load phase duration in seconds (default 300)
+	PauseSec     int   // pause between the two load phases, in seconds (default 60)
+	Phase2DurSec int   // second load phase duration in seconds (default 300)
+	SizeMB       int   // GPU memory to allocate per GPU during stress (default 64)
+	GPUIndices   []int // which GPU indices to stress (empty = all detected)
+}
+
+// FanReading holds one fan sensor reading.
+type FanReading struct {
+	Name string  // sensor name as reported by the fan source
+	RPM  float64 // fan speed in revolutions per minute
+}
+
+// GPUStressMetric holds per-GPU metrics during the stress test.
+type GPUStressMetric struct {
+	Index     int
+	TempC     float64
+	UsagePct  float64
+	PowerW    float64
+	ClockMHz  float64
+	Throttled bool // true if any throttle reason is active
+}
+
+// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
+type FanStressRow struct {
+	TimestampUTC string  // RFC 3339 timestamp taken when the sample was collected
+	ElapsedSec   float64 // seconds since the test run started
+	Phase        string  // "baseline", "load1", "pause", "load2", "cooldown"
+	GPUs         []GPUStressMetric
+	Fans         []FanReading
+	CPUMaxTempC  float64 // highest CPU temperature from ipmitool / sensors
+	SysPowerW    float64 // DCMI system power reading
+}
+
+// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
+// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
+// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
+//
+// The run directory is <baseDir>/fan-stress-<UTC timestamp>; baseDir defaults to
+// /var/log/bee-sat when empty. On success the path of the .tar.gz archive built
+// from the run directory is returned. Errors come from creating the run
+// directory, writing metrics.csv or summary.txt, or creating the archive.
+//
+// A cancelled ctx does not produce an error: remaining phases are skipped and
+// whatever was sampled so far is still analysed, written and archived.
+func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
+	if baseDir == "" {
+		baseDir = "/var/log/bee-sat"
+	}
+	applyFanStressDefaults(&opts)
+
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "fan-stress-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", err
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+
+	// Phase name shared between sampler goroutine and main goroutine.
+	// phaseMu guards currentPhase; setPhase/getPhase are the only accessors.
+	var phaseMu sync.Mutex
+	currentPhase := "init"
+	setPhase := func(name string) {
+		phaseMu.Lock()
+		currentPhase = name
+		phaseMu.Unlock()
+	}
+	getPhase := func() string {
+		phaseMu.Lock()
+		defer phaseMu.Unlock()
+		return currentPhase
+	}
+
+	start := time.Now()
+	var rowsMu sync.Mutex
+	var allRows []FanStressRow
+
+	// Start background sampler (every second).
+	// stopCh tells the sampler to exit; doneCh is closed once it has exited.
+	stopCh := make(chan struct{})
+	doneCh := make(chan struct{})
+	go func() {
+		defer close(doneCh)
+		ticker := time.NewTicker(time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				return
+			case <-ticker.C:
+				// Sampling happens outside rowsMu; only the append is locked.
+				row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
+				rowsMu.Lock()
+				allRows = append(allRows, row)
+				rowsMu.Unlock()
+			}
+		}
+	}()
+
+	var summary strings.Builder
+	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
+
+	stats := satStats{}
+
+	// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
+	// It is skipped entirely when ctx is already done. If ctx is cancelled during
+	// the wait, the step is still recorded as OK.
+	idlePhase := func(phaseName, stepName string, durSec int) {
+		if ctx.Err() != nil {
+			return
+		}
+		setPhase(phaseName)
+		appendSATVerboseLog(verboseLog,
+			fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
+		)
+		select {
+		case <-ctx.Done():
+		case <-time.After(time.Duration(durSec) * time.Second):
+		}
+		appendSATVerboseLog(verboseLog,
+			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
+		)
+		fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
+		stats.OK++
+	}
+
+	// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
+	// When specific GPU indices were requested they are passed to the command
+	// through CUDA_VISIBLE_DEVICES. The command output is saved to <stepName>.log.
+	loadPhase := func(phaseName, stepName string, durSec int) {
+		if ctx.Err() != nil {
+			return
+		}
+		setPhase(phaseName)
+		var env []string
+		if len(opts.GPUIndices) > 0 {
+			ids := make([]string, len(opts.GPUIndices))
+			for i, idx := range opts.GPUIndices {
+				ids[i] = strconv.Itoa(idx)
+			}
+			env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
+		}
+		cmd := []string{
+			"bee-gpu-stress",
+			"--seconds", strconv.Itoa(durSec),
+			"--size-mb", strconv.Itoa(opts.SizeMB),
+		}
+		out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
+		// Best effort: a failure to save the step log does not fail the step.
+		_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
+		// context.Canceled and an error whose text is exactly "signal: killed"
+		// are not counted as failures; the step is recorded as OK in those cases.
+		if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
+			fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
+			stats.Failed++
+		} else {
+			fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
+			stats.OK++
+		}
+	}
+
+	// Execute test phases.
+	idlePhase("baseline", "01-baseline", opts.BaselineSec)
+	loadPhase("load1", "02-load1", opts.Phase1DurSec)
+	idlePhase("pause", "03-pause", opts.PauseSec)
+	loadPhase("load2", "04-load2", opts.Phase2DurSec)
+	idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
+
+	// Stop sampler and collect rows.
+	// Waiting on doneCh ensures the sampler has exited before rows are read.
+	close(stopCh)
+	<-doneCh
+
+	rowsMu.Lock()
+	rows := allRows
+	rowsMu.Unlock()
+
+	// Analysis.
+	throttled := analyzeThrottling(rows)
+	// Per row, take the hottest GPU; analyzeMaxTemp then takes the max over all rows.
+	maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
+		var m float64
+		for _, g := range r.GPUs {
+			if g.TempC > m {
+				m = g.TempC
+			}
+		}
+		return m
+	})
+	maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
+		return r.CPUMaxTempC
+	})
+	fanResponseSec := analyzeFanResponse(rows)
+
+	fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
+	fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
+	fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
+	// A negative fan response means "undetermined" and is written as N/A.
+	if fanResponseSec >= 0 {
+		fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
+	} else {
+		fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
+	}
+
+	// Throttling failure counts against overall result.
+	if throttled {
+		stats.Failed++
+	}
+	writeSATStats(&summary, stats)
+
+	// Write CSV outputs.
+	// metrics.csv is mandatory; fan-sensors.csv is best effort and its error is ignored.
+	if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
+		return "", err
+	}
+	_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
+
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
+		return "", err
+	}
+
+	archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
+	if err := createTarGz(archive, runDir); err != nil {
+		return "", err
+	}
+	return archive, nil
+}
+
+// applyFanStressDefaults replaces every non-positive duration or size in opts
+// with its default (30 s baseline, 300 s per load phase, 60 s pause, 64 MB).
+// GPUIndices is left untouched.
+func applyFanStressDefaults(opts *FanStressOptions) {
+	if opts.BaselineSec <= 0 {
+		opts.BaselineSec = 30
+	}
+	if opts.Phase1DurSec <= 0 {
+		opts.Phase1DurSec = 300
+	}
+	if opts.PauseSec <= 0 {
+		opts.PauseSec = 60
+	}
+	if opts.Phase2DurSec <= 0 {
+		opts.Phase2DurSec = 300
+	}
+	if opts.SizeMB <= 0 {
+		opts.SizeMB = 64
+	}
+}
+
+// sampleFanStressRow collects all metrics for one telemetry sample.
+func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow { + row := FanStressRow{ + TimestampUTC: time.Now().UTC().Format(time.RFC3339), + ElapsedSec: elapsed, + Phase: phase, + } + row.GPUs = sampleGPUStressMetrics(gpuIndices) + row.Fans, _ = sampleFanSpeeds() + row.CPUMaxTempC = sampleCPUMaxTemp() + row.SysPowerW = sampleSystemPower() + return row +} + +// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power, +// clock frequency, and active throttle reasons for each GPU. +func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric { + args := []string{ + "--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active", + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + ids := make([]string, len(gpuIndices)) + for i, idx := range gpuIndices { + ids[i] = strconv.Itoa(idx) + } + args = append([]string{"--id=" + strings.Join(ids, ",")}, args...) + } + out, err := exec.Command("nvidia-smi", args...).Output() + if err != nil { + return nil + } + var metrics []GPUStressMetric + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.Split(line, ", ") + if len(parts) < 6 { + continue + } + idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) + throttleVal := strings.TrimSpace(parts[5]) + // Throttled if active reasons bitmask is non-zero. + throttled := throttleVal != "0x0000000000000000" && + throttleVal != "0x0" && + throttleVal != "0" && + throttleVal != "" && + throttleVal != "N/A" + metrics = append(metrics, GPUStressMetric{ + Index: idx, + TempC: parseGPUFloat(parts[1]), + UsagePct: parseGPUFloat(parts[2]), + PowerW: parseGPUFloat(parts[3]), + ClockMHz: parseGPUFloat(parts[4]), + Throttled: throttled, + }) + } + return metrics +} + +// sampleFanSpeeds reads fan RPM values from ipmitool sdr. 
+func sampleFanSpeeds() ([]FanReading, error) { + out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output() + if err != nil { + return nil, err + } + return parseFanSpeeds(string(out)), nil +} + +// parseFanSpeeds parses "ipmitool sdr type Fan" output. +// Line format: "FAN1 | 2400.000 | RPM | ok" +func parseFanSpeeds(raw string) []FanReading { + var fans []FanReading + for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { + parts := strings.Split(line, "|") + if len(parts) < 3 { + continue + } + unit := strings.TrimSpace(parts[2]) + if !strings.EqualFold(unit, "RPM") { + continue + } + valStr := strings.TrimSpace(parts[1]) + if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" { + continue + } + val, err := strconv.ParseFloat(valStr, 64) + if err != nil { + continue + } + fans = append(fans, FanReading{ + Name: strings.TrimSpace(parts[0]), + RPM: val, + }) + } + return fans +} + +// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors. +func sampleCPUMaxTemp() float64 { + out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output() + if err != nil { + return sampleCPUTempViaSensors() + } + return parseIPMIMaxTemp(string(out)) +} + +// parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type Temperature". +func parseIPMIMaxTemp(raw string) float64 { + var max float64 + for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { + parts := strings.Split(line, "|") + if len(parts) < 3 { + continue + } + unit := strings.TrimSpace(parts[2]) + if !strings.Contains(strings.ToLower(unit), "degrees") { + continue + } + valStr := strings.TrimSpace(parts[1]) + if strings.EqualFold(valStr, "na") || valStr == "" { + continue + } + val, err := strconv.ParseFloat(valStr, 64) + if err != nil { + continue + } + if val > max { + max = val + } + } + return max +} + +// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable. 
+func sampleCPUTempViaSensors() float64 { + out, err := exec.Command("sensors", "-u").Output() + if err != nil { + return 0 + } + var max float64 + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + fields := strings.Fields(line) + if len(fields) < 2 { + continue + } + if !strings.HasSuffix(fields[0], "_input:") { + continue + } + val, err := strconv.ParseFloat(fields[1], 64) + if err != nil { + continue + } + if val > 0 && val < 150 && val > max { + max = val + } + } + return max +} + +// sampleSystemPower reads system power draw via DCMI. +func sampleSystemPower() float64 { + out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output() + if err != nil { + return 0 + } + return parseDCMIPowerReading(string(out)) +} + +// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output. +// Sample: " Instantaneous power reading: 500 Watts" +func parseDCMIPowerReading(raw string) float64 { + for _, line := range strings.Split(raw, "\n") { + if !strings.Contains(strings.ToLower(line), "instantaneous") { + continue + } + parts := strings.Fields(line) + for i, p := range parts { + if strings.EqualFold(p, "Watts") && i > 0 { + val, err := strconv.ParseFloat(parts[i-1], 64) + if err == nil { + return val + } + } + } + } + return 0 +} + +// analyzeThrottling returns true if any GPU reported an active throttle reason +// during either load phase. +func analyzeThrottling(rows []FanStressRow) bool { + for _, row := range rows { + if row.Phase != "load1" && row.Phase != "load2" { + continue + } + for _, gpu := range row.GPUs { + if gpu.Throttled { + return true + } + } + } + return false +} + +// analyzeMaxTemp returns the maximum value of the given extractor across all rows. 
+func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 { + var max float64 + for _, row := range rows { + if v := extract(row); v > max { + max = v + } + } + return max +} + +// analyzeFanResponse returns the seconds from load1 start until fan RPM first +// increased by more than 5% above the baseline average. Returns -1 if undetermined. +func analyzeFanResponse(rows []FanStressRow) float64 { + // Compute baseline average fan RPM. + var baseTotal, baseCount float64 + for _, row := range rows { + if row.Phase != "baseline" { + continue + } + for _, f := range row.Fans { + baseTotal += f.RPM + baseCount++ + } + } + if baseCount == 0 || baseTotal == 0 { + return -1 + } + baseAvg := baseTotal / baseCount + threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up + + // Find elapsed time when load1 started. + var load1Start float64 = -1 + for _, row := range rows { + if row.Phase == "load1" { + load1Start = row.ElapsedSec + break + } + } + if load1Start < 0 { + return -1 + } + + // Find first load1 row where average RPM crosses the threshold. + for _, row := range rows { + if row.Phase != "load1" { + continue + } + var total, count float64 + for _, f := range row.Fans { + total += f.RPM + count++ + } + if count > 0 && total/count >= threshold { + return row.ElapsedSec - load1Start + } + } + return -1 +} + +// WriteFanStressCSV writes the wide-format metrics CSV with one row per second. +// GPU columns are generated per index in gpuIndices order. +func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error { + if len(rows) == 0 { + return os.WriteFile(path, []byte("no data\n"), 0644) + } + + var b strings.Builder + + // Header: fixed system columns + per-GPU columns. 
+ b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w") + for _, idx := range gpuIndices { + fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled", + idx, idx, idx, idx, idx) + } + b.WriteRune('\n') + + for _, row := range rows { + favg, fmin, fmax := fanRPMStats(row.Fans) + fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f", + row.TimestampUTC, + row.ElapsedSec, + row.Phase, + favg, fmin, fmax, + row.CPUMaxTempC, + row.SysPowerW, + ) + gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs)) + for _, g := range row.GPUs { + gpuByIdx[g.Index] = g + } + for _, idx := range gpuIndices { + g := gpuByIdx[idx] + throttled := 0 + if g.Throttled { + throttled = 1 + } + fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d", + g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled) + } + b.WriteRune('\n') + } + + return os.WriteFile(path, []byte(b.String()), 0644) +} + +// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format. +func WriteFanSensorsCSV(path string, rows []FanStressRow) error { + var b strings.Builder + b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n") + for _, row := range rows { + for _, f := range row.Fans { + fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n", + row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM) + } + } + return os.WriteFile(path, []byte(b.String()), 0644) +} + +// fanRPMStats computes average, min, max RPM across all fans in a sample row. 
+func fanRPMStats(fans []FanReading) (avg, min, max float64) {
+	// No fans: report all zeros rather than dividing by zero.
+	if len(fans) == 0 {
+		return 0, 0, 0
+	}
+	// Seed min and max from the first reading so that any RPM value is handled.
+	min = fans[0].RPM
+	max = fans[0].RPM
+	var total float64
+	for _, f := range fans {
+		total += f.RPM
+		if f.RPM < min {
+			min = f.RPM
+		}
+		if f.RPM > max {
+			max = f.RPM
+		}
+	}
+	return total / float64(len(fans)), min, max
+}
diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go
index 950aa21..0c505bb 100644
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -1,8 +1,10 @@
 package tui
 
 import (
+	"context"
 	"time"
 
+	"bee/audit/internal/platform"
 	tea "github.com/charmbracelet/bubbletea"
 )
 
@@ -137,6 +139,21 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			},
 			pollSATProgress("gpu-amd", since),
 		)
+	case actionRunFanStress:
+		// Run the fan stress test as a background command while progress
+		// lines with the "fan-stress" prefix are polled in parallel.
+		m.busyTitle = "Fan Stress Test"
+		m.progressPrefix = "fan-stress"
+		m.progressSince = time.Now()
+		m.progressLines = nil
+		since := m.progressSince
+		opts := hcFanStressOpts(m.hcMode, m.app)
+		return m, tea.Batch(
+			func() tea.Msg {
+				// The test runs on a background context with no cancel function.
+				ctx := context.Background()
+				result, err := m.app.RunFanStressTestResult(ctx, opts)
+				return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
+			},
+			pollSATProgress("fan-stress", since),
+		)
 	}
 case "ctrl+c":
 	return m, tea.Quit
@@ -148,9 +165,53 @@ func (m model) confirmCancelTarget() screen {
 	switch m.pendingAction {
 	case actionExportBundle:
 		return screenExportTargets
-	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
+	case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
 		return screenHealthCheck
 	default:
 		return screenMain
 	}
 }
+
+// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
+func hcFanStressOpts(hcMode int, application interface { + ListNvidiaGPUs() ([]platform.NvidiaGPU, error) +}) platform.FanStressOptions { + // Phase durations per mode: [baseline, load1, pause, load2] + type durations struct{ baseline, load1, pause, load2 int } + modes := [3]durations{ + {30, 120, 30, 120}, // Quick: ~5 min total + {60, 300, 60, 300}, // Standard: ~12 min total + {60, 600, 120, 600}, // Express: ~24 min total + } + if hcMode < 0 || hcMode >= len(modes) { + hcMode = 0 + } + d := modes[hcMode] + + // Use all detected NVIDIA GPUs. + var indices []int + if gpus, err := application.ListNvidiaGPUs(); err == nil { + for _, g := range gpus { + indices = append(indices, g.Index) + } + } + + // Use minimum GPU memory size to fit all GPUs. + sizeMB := 64 + if gpus, err := application.ListNvidiaGPUs(); err == nil { + for _, g := range gpus { + if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) { + sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU + } + } + } + + return platform.FanStressOptions{ + BaselineSec: d.baseline, + Phase1DurSec: d.load1, + PauseSec: d.pause, + Phase2DurSec: d.load2, + SizeMB: sizeMB, + GPUIndices: indices, + } +} diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index 28c8ed3..c1318a7 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -18,16 +18,17 @@ const ( // Cursor positions in Health Check screen. const ( - hcCurGPU = 0 - hcCurMemory = 1 - hcCurStorage = 2 - hcCurCPU = 3 - hcCurSelectAll = 4 - hcCurModeQuick = 5 - hcCurModeStd = 6 - hcCurModeExpr = 7 - hcCurRunAll = 8 - hcCurTotal = 9 + hcCurGPU = 0 + hcCurMemory = 1 + hcCurStorage = 2 + hcCurCPU = 3 + hcCurSelectAll = 4 + hcCurModeQuick = 5 + hcCurModeStd = 6 + hcCurModeExpr = 7 + hcCurRunAll = 8 + hcCurFanStress = 9 + hcCurTotal = 10 ) // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. 
@@ -82,6 +83,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 			m.hcMode = m.hcCursor - hcCurModeQuick
 		case hcCurRunAll:
 			return m.hcRunAll()
+		case hcCurFanStress:
+			return m.hcRunFanStress()
 		}
 	case "g", "G":
 		return m.hcRunSingle(hcGPU)
@@ -93,6 +96,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 		return m.hcRunSingle(hcCPU)
 	case "r", "R":
 		return m.hcRunAll()
+	case "f", "F":
+		return m.hcRunFanStress()
 	case "a", "A":
 		allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
 		for i := range m.hcSel {
@@ -143,6 +148,13 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
 	return m, nil
 }
 
+// hcRunFanStress queues the fan stress test as the pending action and switches
+// to the confirmation screen with the cursor reset. It starts no work itself.
+func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
+	m.pendingAction = actionRunFanStress
+	m.screen = screenConfirm
+	m.cursor = 0
+	return m, nil
+}
+
 func (m model) hcRunAll() (tea.Model, tea.Cmd) {
 	for _, sel := range m.hcSel {
 		if sel {
@@ -300,8 +312,16 @@ func renderHealthCheck(m model) string {
 		fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
 	}
 
+	// Fan stress entry: the prefix changes to "> " when the cursor is on it.
+	{
+		pfx := " "
+		if m.hcCursor == hcCurFanStress {
+			pfx = "> "
+		}
+		fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
+	}
+
 	fmt.Fprintln(&b)
 	fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
-	fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
+	fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
 	return b.String()
 }
diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go
index 323db00..7382853 100644
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -40,7 +40,8 @@ const (
 	actionRunMemorySAT actionKind = "run_memory_sat"
 	actionRunStorageSAT actionKind = "run_storage_sat"
 	actionRunCPUSAT actionKind = "run_cpu_sat"
-	actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
+	actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
+	actionRunFanStress actionKind = "run_fan_stress"
 )
 
 type model struct {
@@ -188,6 +189,11 @@ func (m model) confirmBody() (string, string) {
 		return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
 	case actionRunAMDGPUSAT:
 		return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
+	case actionRunFanStress:
+		// Labels are indexed by m.hcMode, in the same order as the health-check modes.
+		modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
+		return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
+			"Monitors fans, temps, power — detects throttling.\n" +
+			"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
 	default:
 		return "Confirm", "Proceed?"
 	}