feat(audit): fan-stress SAT for MSI case-04 fan lag & thermal throttle detection

Two-phase GPU thermal cycling test with per-second telemetry:
- Phases: baseline → load1 → pause (no cooldown) → load2 → cooldown
- Monitors: fan RPM (ipmitool sdr), CPU/server temps (ipmitool/sensors),
  system power (ipmitool dcmi), GPU temp/power/usage/clock/throttle (nvidia-smi)
- Detects throttling via clocks_throttle_reasons.active bitmask
- Measures fan response lag from load start (validates case-04 ~2s lag)
- Exports metrics.csv (wide format, one row/sec) and fan-sensors.csv (long format)
- TUI: adds [F] Fan Stress Test to Health Check screen with Quick/Standard/Express modes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-26 09:51:03 +03:00
parent cfe255f6e4
commit 4cd7c9ab4e
6 changed files with 753 additions and 13 deletions

View File

@@ -1,8 +1,10 @@
package tui
import (
"context"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -137,6 +139,21 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
},
pollSATProgress("gpu-amd", since),
)
case actionRunFanStress:
m.busyTitle = "Fan Stress Test"
m.progressPrefix = "fan-stress"
m.progressSince = time.Now()
m.progressLines = nil
since := m.progressSince
opts := hcFanStressOpts(m.hcMode, m.app)
return m, tea.Batch(
func() tea.Msg {
ctx := context.Background()
result, err := m.app.RunFanStressTestResult(ctx, opts)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
},
pollSATProgress("fan-stress", since),
)
}
case "ctrl+c":
return m, tea.Quit
@@ -148,9 +165,53 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction {
case actionExportBundle:
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
return screenHealthCheck
default:
return screenMain
}
}
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
func hcFanStressOpts(hcMode int, application interface {
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
}) platform.FanStressOptions {
// Phase durations per mode: [baseline, load1, pause, load2]
type durations struct{ baseline, load1, pause, load2 int }
modes := [3]durations{
{30, 120, 30, 120}, // Quick: ~5 min total
{60, 300, 60, 300}, // Standard: ~12 min total
{60, 600, 120, 600}, // Express: ~24 min total
}
if hcMode < 0 || hcMode >= len(modes) {
hcMode = 0
}
d := modes[hcMode]
// Use all detected NVIDIA GPUs.
var indices []int
if gpus, err := application.ListNvidiaGPUs(); err == nil {
for _, g := range gpus {
indices = append(indices, g.Index)
}
}
// Use minimum GPU memory size to fit all GPUs.
sizeMB := 64
if gpus, err := application.ListNvidiaGPUs(); err == nil {
for _, g := range gpus {
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
}
}
}
return platform.FanStressOptions{
BaselineSec: d.baseline,
Phase1DurSec: d.load1,
PauseSec: d.pause,
Phase2DurSec: d.load2,
SizeMB: sizeMB,
GPUIndices: indices,
}
}

View File

@@ -18,16 +18,17 @@ const (
// Cursor positions in Health Check screen.
const (
hcCurGPU = 0
hcCurMemory = 1
hcCurStorage = 2
hcCurCPU = 3
hcCurSelectAll = 4
hcCurModeQuick = 5
hcCurModeStd = 6
hcCurModeExpr = 7
hcCurRunAll = 8
hcCurTotal = 9
hcCurGPU = 0
hcCurMemory = 1
hcCurStorage = 2
hcCurCPU = 3
hcCurSelectAll = 4
hcCurModeQuick = 5
hcCurModeStd = 6
hcCurModeExpr = 7
hcCurRunAll = 8
hcCurFanStress = 9
hcCurTotal = 10
)
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
@@ -82,6 +83,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.hcMode = m.hcCursor - hcCurModeQuick
case hcCurRunAll:
return m.hcRunAll()
case hcCurFanStress:
return m.hcRunFanStress()
}
case "g", "G":
return m.hcRunSingle(hcGPU)
@@ -93,6 +96,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m.hcRunSingle(hcCPU)
case "r", "R":
return m.hcRunAll()
case "f", "F":
return m.hcRunFanStress()
case "a", "A":
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
for i := range m.hcSel {
@@ -143,6 +148,13 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
return m, nil
}
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
m.pendingAction = actionRunFanStress
m.screen = screenConfirm
m.cursor = 0
return m, nil
}
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
for _, sel := range m.hcSel {
if sel {
@@ -300,8 +312,16 @@ func renderHealthCheck(m model) string {
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
}
{
pfx := " "
if m.hcCursor == hcCurFanStress {
pfx = "> "
}
fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
}
fmt.Fprintln(&b)
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
return b.String()
}

View File

@@ -40,7 +40,8 @@ const (
actionRunMemorySAT actionKind = "run_memory_sat"
actionRunStorageSAT actionKind = "run_storage_sat"
actionRunCPUSAT actionKind = "run_cpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
actionRunFanStress actionKind = "run_fan_stress"
)
type model struct {
@@ -188,6 +189,11 @@ func (m model) confirmBody() (string, string) {
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
case actionRunAMDGPUSAT:
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
case actionRunFanStress:
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
"Monitors fans, temps, power — detects throttling.\n" +
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
default:
return "Confirm", "Proceed?"
}