feat(audit): fan-stress SAT for MSI case-04 fan lag & thermal throttle detection
Two-phase GPU thermal cycling test with per-second telemetry: - Phases: baseline → load1 → pause (no cooldown) → load2 → cooldown - Monitors: fan RPM (ipmitool sdr), CPU/server temps (ipmitool/sensors), system power (ipmitool dcmi), GPU temp/power/usage/clock/throttle (nvidia-smi) - Detects throttling via clocks_throttle_reasons.active bitmask - Measures fan response lag from load start (validates case-04 ~2s lag) - Exports metrics.csv (wide format, one row/sec) and fan-sensors.csv (long format) - TUI: adds [F] Fan Stress Test to Health Check screen with Quick/Standard/Express modes Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
@@ -137,6 +139,21 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
},
|
||||
pollSATProgress("gpu-amd", since),
|
||||
)
|
||||
case actionRunFanStress:
|
||||
m.busyTitle = "Fan Stress Test"
|
||||
m.progressPrefix = "fan-stress"
|
||||
m.progressSince = time.Now()
|
||||
m.progressLines = nil
|
||||
since := m.progressSince
|
||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||
return m, tea.Batch(
|
||||
func() tea.Msg {
|
||||
ctx := context.Background()
|
||||
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||
},
|
||||
pollSATProgress("fan-stress", since),
|
||||
)
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
@@ -148,9 +165,53 @@ func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportBundle:
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
|
||||
return screenHealthCheck
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
|
||||
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
|
||||
func hcFanStressOpts(hcMode int, application interface {
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
}) platform.FanStressOptions {
|
||||
// Phase durations per mode: [baseline, load1, pause, load2]
|
||||
type durations struct{ baseline, load1, pause, load2 int }
|
||||
modes := [3]durations{
|
||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||
{60, 600, 120, 600}, // Express: ~24 min total
|
||||
}
|
||||
if hcMode < 0 || hcMode >= len(modes) {
|
||||
hcMode = 0
|
||||
}
|
||||
d := modes[hcMode]
|
||||
|
||||
// Use all detected NVIDIA GPUs.
|
||||
var indices []int
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
indices = append(indices, g.Index)
|
||||
}
|
||||
}
|
||||
|
||||
// Use minimum GPU memory size to fit all GPUs.
|
||||
sizeMB := 64
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return platform.FanStressOptions{
|
||||
BaselineSec: d.baseline,
|
||||
Phase1DurSec: d.load1,
|
||||
PauseSec: d.pause,
|
||||
Phase2DurSec: d.load2,
|
||||
SizeMB: sizeMB,
|
||||
GPUIndices: indices,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,16 +18,17 @@ const (
|
||||
|
||||
// Cursor positions in Health Check screen.
|
||||
const (
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurTotal = 9
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurFanStress = 9
|
||||
hcCurTotal = 10
|
||||
)
|
||||
|
||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||
@@ -82,6 +83,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||
case hcCurRunAll:
|
||||
return m.hcRunAll()
|
||||
case hcCurFanStress:
|
||||
return m.hcRunFanStress()
|
||||
}
|
||||
case "g", "G":
|
||||
return m.hcRunSingle(hcGPU)
|
||||
@@ -93,6 +96,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.hcRunSingle(hcCPU)
|
||||
case "r", "R":
|
||||
return m.hcRunAll()
|
||||
case "f", "F":
|
||||
return m.hcRunFanStress()
|
||||
case "a", "A":
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
@@ -143,6 +148,13 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||
m.pendingAction = actionRunFanStress
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
for _, sel := range m.hcSel {
|
||||
if sel {
|
||||
@@ -300,8 +312,16 @@ func renderHealthCheck(m model) string {
|
||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
||||
}
|
||||
|
||||
{
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurFanStress {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -40,7 +40,8 @@ const (
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||
actionRunFanStress actionKind = "run_fan_stress"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
@@ -188,6 +189,11 @@ func (m model) confirmBody() (string, string) {
|
||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||
case actionRunAMDGPUSAT:
|
||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||
case actionRunFanStress:
|
||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||
return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||
"Monitors fans, temps, power — detects throttling.\n" +
|
||||
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user