feat(audit): fan-stress SAT for MSI case-04 fan lag & thermal throttle detection

Two-phase GPU thermal cycling test with per-second telemetry:
- Phases: baseline → load1 → pause (no cooldown) → load2 → cooldown
- Monitors: fan RPM (ipmitool sdr), CPU/server temps (ipmitool/sensors),
  system power (ipmitool dcmi), GPU temp/power/usage/clock/throttle (nvidia-smi)
- Detects throttling via clocks_throttle_reasons.active bitmask
- Measures fan response lag from load start (validates case-04 ~2s lag)
- Exports metrics.csv (wide format, one row/sec) and fan-sensors.csv (long format)
- TUI: adds [F] Fan Stress Test to Health Check screen with Quick/Standard/Express modes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-26 09:51:03 +03:00
parent cfe255f6e4
commit 4cd7c9ab4e
6 changed files with 753 additions and 13 deletions

View File

@@ -1,8 +1,10 @@
package tui
import (
"context"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -137,6 +139,21 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
},
pollSATProgress("gpu-amd", since),
)
case actionRunFanStress:
m.busyTitle = "Fan Stress Test"
m.progressPrefix = "fan-stress"
m.progressSince = time.Now()
m.progressLines = nil
since := m.progressSince
opts := hcFanStressOpts(m.hcMode, m.app)
return m, tea.Batch(
func() tea.Msg {
ctx := context.Background()
result, err := m.app.RunFanStressTestResult(ctx, opts)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
},
pollSATProgress("fan-stress", since),
)
}
case "ctrl+c":
return m, tea.Quit
@@ -148,9 +165,53 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction {
case actionExportBundle:
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
return screenHealthCheck
default:
return screenMain
}
}
// hcFanStressOpts builds the platform.FanStressOptions for the selected Health
// Check mode, targeting every NVIDIA GPU reported by application.
//
// hcMode indexes the duration table below (0 = Quick, 1 = Standard,
// 2 = Express); any out-of-range value falls back to Quick.
//
// GPU detection is best-effort: if ListNvidiaGPUs returns an error, the
// options carry no GPU indices and the default allocation size. Otherwise
// SizeMB is 1/16 of the smallest positive MemoryMB among the detected GPUs, so
// the same allocation fits on every card.
func hcFanStressOpts(hcMode int, application interface {
	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
}) platform.FanStressOptions {
	const (
		// Allocation used when no GPU reports a usable memory size.
		defaultSizeMB = 64
		// Fraction of VRAM allocated per GPU is 1/vramDivisor.
		vramDivisor = 16
	)

	// Phase durations in seconds per mode: {baseline, load1, pause, load2}.
	type durations struct{ baseline, load1, pause, load2 int }
	modes := [3]durations{
		{30, 120, 30, 120},  // Quick: 300 s of configured phases (~5 min)
		{60, 300, 60, 300},  // Standard: 720 s of configured phases (~12 min)
		{60, 600, 120, 600}, // Express: 1380 s of configured phases (~23 min)
	}
	if hcMode < 0 || hcMode >= len(modes) {
		hcMode = 0
	}
	d := modes[hcMode]

	// Query the GPU list once so the indices and the memory figure describe the
	// same set of devices.
	var indices []int
	minMemMB := 0 // 0 means "no GPU with a positive MemoryMB seen yet"
	if gpus, err := application.ListNvidiaGPUs(); err == nil {
		for _, g := range gpus {
			indices = append(indices, g.Index)
			// Compare raw VRAM sizes; scaling happens once after the loop so
			// the minimum is never compared against an already-divided value.
			if g.MemoryMB > 0 && (minMemMB == 0 || g.MemoryMB < minMemMB) {
				minMemMB = g.MemoryMB
			}
		}
	}

	// Keep the default when no size is known or when integer division would
	// truncate the share to zero.
	sizeMB := defaultSizeMB
	if perGPU := minMemMB / vramDivisor; perGPU > 0 {
		sizeMB = perGPU
	}

	return platform.FanStressOptions{
		BaselineSec:  d.baseline,
		Phase1DurSec: d.load1,
		PauseSec:     d.pause,
		Phase2DurSec: d.load2,
		SizeMB:       sizeMB,
		GPUIndices:   indices,
	}
}