feat(audit): fan-stress SAT for MSI case-04 fan lag & thermal throttle detection
Two-phase GPU thermal cycling test with per-second telemetry: - Phases: baseline → load1 → pause (no cooldown) → load2 → cooldown - Monitors: fan RPM (ipmitool sdr), CPU/server temps (ipmitool/sensors), system power (ipmitool dcmi), GPU temp/power/usage/clock/throttle (nvidia-smi) - Detects throttling via clocks_throttle_reasons.active bitmask - Measures fan response lag from load start (validates case-04 ~2s lag) - Exports metrics.csv (wide format, one row/sec) and fan-sensors.csv (long format) - TUI: adds [F] Fan Stress Test to Health Check screen with Quick/Standard/Express modes Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -80,6 +80,7 @@ type satRunner interface {
|
|||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||||
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -491,6 +492,67 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|||||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
|
body := formatFanStressResult(path)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Fan Stress Test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||||
|
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||||
|
func formatFanStressResult(archivePath string) string {
|
||||||
|
if archivePath == "" {
|
||||||
|
return "No output produced."
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return "Archive written to " + archivePath
|
||||||
|
}
|
||||||
|
content := strings.TrimSpace(string(raw))
|
||||||
|
kv := parseKeyValueSummary(content)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(formatSATDetail(content))
|
||||||
|
|
||||||
|
// Append analysis section.
|
||||||
|
var analysis []string
|
||||||
|
if v, ok := kv["throttling_detected"]; ok {
|
||||||
|
label := "NO"
|
||||||
|
if v == "true" {
|
||||||
|
label = "YES ← throttling detected during load"
|
||||||
|
}
|
||||||
|
analysis = append(analysis, "Throttling: "+label)
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||||
|
analysis = append(analysis, "Fan response: "+v+"s")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(analysis) > 0 {
|
||||||
|
b.WriteString("\n\n=== Analysis ===\n")
|
||||||
|
for _, line := range analysis {
|
||||||
|
b.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||||
func satResultBody(archivePath string) string {
|
func satResultBody(archivePath string) string {
|
||||||
|
|||||||
@@ -170,6 +170,10 @@ func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
587
audit/internal/platform/sat_fan_stress.go
Normal file
587
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,587 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FanStressOptions configures the fan-stress / thermal cycling test.
// Non-positive numeric fields are replaced by the documented defaults before
// the run starts.
type FanStressOptions struct {
	// BaselineSec is the idle monitoring time, in seconds, used both before
	// the first load phase and for the final cooldown (default 30).
	BaselineSec int
	// Phase1DurSec is the duration of the first load phase in seconds (default 300).
	Phase1DurSec int
	// PauseSec is the idle pause between the two load phases in seconds (default 60).
	PauseSec int
	// Phase2DurSec is the duration of the second load phase in seconds (default 300).
	Phase2DurSec int
	// SizeMB is the GPU memory to allocate per GPU during stress (default 64).
	SizeMB int
	// GPUIndices lists the GPU indices to stress; empty means all detected GPUs.
	GPUIndices []int
}
|
||||||
|
|
||||||
|
// FanReading is a single fan sensor sample.
type FanReading struct {
	// Name is the sensor name as reported by the data source.
	Name string
	// RPM is the reported fan speed in revolutions per minute.
	RPM float64
}
|
||||||
|
|
||||||
|
// GPUStressMetric is one GPU's telemetry sample taken during the stress test.
type GPUStressMetric struct {
	// Index is the GPU index the sample belongs to.
	Index int
	// TempC is the GPU temperature in degrees Celsius.
	TempC float64
	// UsagePct is the GPU utilization in percent.
	UsagePct float64
	// PowerW is the GPU power draw in watts.
	PowerW float64
	// ClockMHz is the current graphics clock in MHz.
	ClockMHz float64
	// Throttled is true if any throttle reason is active.
	Throttled bool
}
|
||||||
|
|
||||||
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
|
type FanStressRow struct {
|
||||||
|
TimestampUTC string
|
||||||
|
ElapsedSec float64
|
||||||
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
|
GPUs []GPUStressMetric
|
||||||
|
Fans []FanReading
|
||||||
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
|
SysPowerW float64 // DCMI system power reading
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
|
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||||
|
if baseDir == "" {
|
||||||
|
baseDir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
applyFanStressDefaults(&opts)
|
||||||
|
|
||||||
|
ts := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
|
|
||||||
|
// Phase name shared between sampler goroutine and main goroutine.
|
||||||
|
var phaseMu sync.Mutex
|
||||||
|
currentPhase := "init"
|
||||||
|
setPhase := func(name string) {
|
||||||
|
phaseMu.Lock()
|
||||||
|
currentPhase = name
|
||||||
|
phaseMu.Unlock()
|
||||||
|
}
|
||||||
|
getPhase := func() string {
|
||||||
|
phaseMu.Lock()
|
||||||
|
defer phaseMu.Unlock()
|
||||||
|
return currentPhase
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
var rowsMu sync.Mutex
|
||||||
|
var allRows []FanStressRow
|
||||||
|
|
||||||
|
// Start background sampler (every second).
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
defer close(doneCh)
|
||||||
|
ticker := time.NewTicker(time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||||
|
rowsMu.Lock()
|
||||||
|
allRows = append(allRows, row)
|
||||||
|
rowsMu.Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
var summary strings.Builder
|
||||||
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
|
|
||||||
|
stats := satStats{}
|
||||||
|
|
||||||
|
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||||
|
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
setPhase(phaseName)
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||||
|
)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durSec) * time.Second):
|
||||||
|
}
|
||||||
|
appendSATVerboseLog(verboseLog,
|
||||||
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||||
|
)
|
||||||
|
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||||
|
stats.OK++
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
||||||
|
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
setPhase(phaseName)
|
||||||
|
var env []string
|
||||||
|
if len(opts.GPUIndices) > 0 {
|
||||||
|
ids := make([]string, len(opts.GPUIndices))
|
||||||
|
for i, idx := range opts.GPUIndices {
|
||||||
|
ids[i] = strconv.Itoa(idx)
|
||||||
|
}
|
||||||
|
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||||
|
}
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(durSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
}
|
||||||
|
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||||
|
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||||
|
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||||
|
stats.Failed++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||||
|
stats.OK++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute test phases.
|
||||||
|
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||||
|
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||||
|
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||||
|
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||||
|
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||||
|
|
||||||
|
// Stop sampler and collect rows.
|
||||||
|
close(stopCh)
|
||||||
|
<-doneCh
|
||||||
|
|
||||||
|
rowsMu.Lock()
|
||||||
|
rows := allRows
|
||||||
|
rowsMu.Unlock()
|
||||||
|
|
||||||
|
// Analysis.
|
||||||
|
throttled := analyzeThrottling(rows)
|
||||||
|
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||||
|
var m float64
|
||||||
|
for _, g := range r.GPUs {
|
||||||
|
if g.TempC > m {
|
||||||
|
m = g.TempC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
})
|
||||||
|
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||||
|
return r.CPUMaxTempC
|
||||||
|
})
|
||||||
|
fanResponseSec := analyzeFanResponse(rows)
|
||||||
|
|
||||||
|
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||||
|
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||||
|
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||||
|
if fanResponseSec >= 0 {
|
||||||
|
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Throttling failure counts against overall result.
|
||||||
|
if throttled {
|
||||||
|
stats.Failed++
|
||||||
|
}
|
||||||
|
writeSATStats(&summary, stats)
|
||||||
|
|
||||||
|
// Write CSV outputs.
|
||||||
|
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||||
|
if err := createTarGz(archive, runDir); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return archive, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||||
|
if opts.BaselineSec <= 0 {
|
||||||
|
opts.BaselineSec = 30
|
||||||
|
}
|
||||||
|
if opts.Phase1DurSec <= 0 {
|
||||||
|
opts.Phase1DurSec = 300
|
||||||
|
}
|
||||||
|
if opts.PauseSec <= 0 {
|
||||||
|
opts.PauseSec = 60
|
||||||
|
}
|
||||||
|
if opts.Phase2DurSec <= 0 {
|
||||||
|
opts.Phase2DurSec = 300
|
||||||
|
}
|
||||||
|
if opts.SizeMB <= 0 {
|
||||||
|
opts.SizeMB = 64
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
|
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||||
|
row := FanStressRow{
|
||||||
|
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||||
|
ElapsedSec: elapsed,
|
||||||
|
Phase: phase,
|
||||||
|
}
|
||||||
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
|
row.SysPowerW = sampleSystemPower()
|
||||||
|
return row
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||||
|
// clock frequency, and active throttle reasons for each GPU.
|
||||||
|
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||||
|
args := []string{
|
||||||
|
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
ids := make([]string, len(gpuIndices))
|
||||||
|
for i, idx := range gpuIndices {
|
||||||
|
ids[i] = strconv.Itoa(idx)
|
||||||
|
}
|
||||||
|
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||||
|
}
|
||||||
|
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var metrics []GPUStressMetric
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ", ")
|
||||||
|
if len(parts) < 6 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
throttleVal := strings.TrimSpace(parts[5])
|
||||||
|
// Throttled if active reasons bitmask is non-zero.
|
||||||
|
throttled := throttleVal != "0x0000000000000000" &&
|
||||||
|
throttleVal != "0x0" &&
|
||||||
|
throttleVal != "0" &&
|
||||||
|
throttleVal != "" &&
|
||||||
|
throttleVal != "N/A"
|
||||||
|
metrics = append(metrics, GPUStressMetric{
|
||||||
|
Index: idx,
|
||||||
|
TempC: parseGPUFloat(parts[1]),
|
||||||
|
UsagePct: parseGPUFloat(parts[2]),
|
||||||
|
PowerW: parseGPUFloat(parts[3]),
|
||||||
|
ClockMHz: parseGPUFloat(parts[4]),
|
||||||
|
Throttled: throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return metrics
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||||
|
func sampleFanSpeeds() ([]FanReading, error) {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return parseFanSpeeds(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
|
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
||||||
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
|
var fans []FanReading
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := strings.TrimSpace(parts[2])
|
||||||
|
if !strings.EqualFold(unit, "RPM") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
valStr := strings.TrimSpace(parts[1])
|
||||||
|
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
val, err := strconv.ParseFloat(valStr, 64)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fans = append(fans, FanReading{
|
||||||
|
Name: strings.TrimSpace(parts[0]),
|
||||||
|
RPM: val,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return fans
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||||
|
func sampleCPUMaxTemp() float64 {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
if err != nil {
|
||||||
|
return sampleCPUTempViaSensors()
|
||||||
|
}
|
||||||
|
return parseIPMIMaxTemp(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseIPMIMaxTemp extracts the maximum temperature from ipmitool output.
//
// Two pipe-separated layouts are accepted:
//
//	sdr layout    ("ipmitool sdr type Temperature"): "CPU1 Temp | 01h | ok | 3.1 | 45 degrees C"
//	sensor layout ("ipmitool sensor"):               "CPU1 Temp | 45.000 | degrees C | ok"
//
// Lines without a numeric reading ("No Reading", "na") are skipped. The result
// is 0 when no positive temperature is found.
func parseIPMIMaxTemp(raw string) float64 {
	var highest float64
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Split(line, "|")
		if len(cols) < 3 {
			continue
		}
		valStr := ipmiTempValue(cols)
		if valStr == "" || strings.EqualFold(valStr, "na") {
			continue
		}
		val, err := strconv.ParseFloat(valStr, 64)
		if err != nil {
			continue
		}
		if val > highest {
			highest = val
		}
	}
	return highest
}

// ipmiTempValue returns the numeric text of a temperature reading from the
// pipe-split columns of one ipmitool line, or "" if the line carries none.
// cols must have at least three elements.
func ipmiTempValue(cols []string) string {
	// Sensor layout: the unit is in the third column, the value in the second.
	if strings.Contains(strings.ToLower(cols[2]), "degrees") {
		return strings.TrimSpace(cols[1])
	}
	// sdr layout: the last column holds "<value> degrees C".
	reading := strings.Fields(cols[len(cols)-1])
	if len(reading) >= 2 && strings.EqualFold(reading[1], "degrees") {
		return reading[0]
	}
	return ""
}
|
||||||
|
|
||||||
|
// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable.
// It runs "sensors -u" and returns the highest plausible temperature reading,
// or 0 when the command fails or reports no temperature.
func sampleCPUTempViaSensors() float64 {
	out, err := exec.Command("sensors", "-u").Output()
	if err != nil {
		return 0
	}
	return parseSensorsMaxTemp(string(out))
}

// parseSensorsMaxTemp returns the highest "temp*_input" value found in
// "sensors -u" output. Only keys starting with "temp" are considered, so fan
// speed, voltage, and power inputs are never mistaken for a temperature.
// Readings outside the open interval (0, 150) are ignored as implausible.
func parseSensorsMaxTemp(raw string) float64 {
	var highest float64
	for _, line := range strings.Split(raw, "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		key := fields[0]
		if !strings.HasPrefix(key, "temp") || !strings.HasSuffix(key, "_input:") {
			continue
		}
		val, err := strconv.ParseFloat(fields[1], 64)
		if err != nil {
			continue
		}
		if val > 0 && val < 150 && val > highest {
			highest = val
		}
	}
	return highest
}
|
||||||
|
|
||||||
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
|
func sampleSystemPower() float64 {
|
||||||
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseDCMIPowerReading(string(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseDCMIPowerReading pulls the instantaneous power value (in watts) out of
// "ipmitool dcmi power reading" output.
// Sample line: "    Instantaneous power reading:                   500 Watts"
//
// Only lines containing "instantaneous" (any case) are inspected; the number
// immediately preceding a "Watts" token is returned. Returns 0 if no such
// number is found.
func parseDCMIPowerReading(raw string) float64 {
	for _, ln := range strings.Split(raw, "\n") {
		if strings.Contains(strings.ToLower(ln), "instantaneous") {
			words := strings.Fields(ln)
			// Start at 1: a "Watts" token needs a preceding word to parse.
			for pos := 1; pos < len(words); pos++ {
				if !strings.EqualFold(words[pos], "Watts") {
					continue
				}
				if watts, err := strconv.ParseFloat(words[pos-1], 64); err == nil {
					return watts
				}
			}
		}
	}
	return 0
}
|
||||||
|
|
||||||
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
|
// during either load phase.
|
||||||
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "load1" && row.Phase != "load2" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, gpu := range row.GPUs {
|
||||||
|
if gpu.Throttled {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||||
|
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||||
|
var max float64
|
||||||
|
for _, row := range rows {
|
||||||
|
if v := extract(row); v > max {
|
||||||
|
max = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max
|
||||||
|
}
|
||||||
|
|
||||||
|
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||||
|
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||||
|
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||||
|
// Compute baseline average fan RPM.
|
||||||
|
var baseTotal, baseCount float64
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "baseline" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
baseTotal += f.RPM
|
||||||
|
baseCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if baseCount == 0 || baseTotal == 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
baseAvg := baseTotal / baseCount
|
||||||
|
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||||
|
|
||||||
|
// Find elapsed time when load1 started.
|
||||||
|
var load1Start float64 = -1
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase == "load1" {
|
||||||
|
load1Start = row.ElapsedSec
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if load1Start < 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find first load1 row where average RPM crosses the threshold.
|
||||||
|
for _, row := range rows {
|
||||||
|
if row.Phase != "load1" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var total, count float64
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
total += f.RPM
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
if count > 0 && total/count >= threshold {
|
||||||
|
return row.ElapsedSec - load1Start
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
|
||||||
|
// GPU columns are generated per index in gpuIndices order.
|
||||||
|
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
|
||||||
|
if len(rows) == 0 {
|
||||||
|
return os.WriteFile(path, []byte("no data\n"), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
|
||||||
|
// Header: fixed system columns + per-GPU columns.
|
||||||
|
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
|
||||||
|
idx, idx, idx, idx, idx)
|
||||||
|
}
|
||||||
|
b.WriteRune('\n')
|
||||||
|
|
||||||
|
for _, row := range rows {
|
||||||
|
favg, fmin, fmax := fanRPMStats(row.Fans)
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
|
||||||
|
row.TimestampUTC,
|
||||||
|
row.ElapsedSec,
|
||||||
|
row.Phase,
|
||||||
|
favg, fmin, fmax,
|
||||||
|
row.CPUMaxTempC,
|
||||||
|
row.SysPowerW,
|
||||||
|
)
|
||||||
|
gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
|
||||||
|
for _, g := range row.GPUs {
|
||||||
|
gpuByIdx[g.Index] = g
|
||||||
|
}
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
g := gpuByIdx[idx]
|
||||||
|
throttled := 0
|
||||||
|
if g.Throttled {
|
||||||
|
throttled = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
|
||||||
|
g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
|
||||||
|
}
|
||||||
|
b.WriteRune('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||||
|
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||||
|
for _, row := range rows {
|
||||||
|
for _, f := range row.Fans {
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||||
|
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||||
|
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, 0, 0
|
||||||
|
}
|
||||||
|
min = fans[0].RPM
|
||||||
|
max = fans[0].RPM
|
||||||
|
var total float64
|
||||||
|
for _, f := range fans {
|
||||||
|
total += f.RPM
|
||||||
|
if f.RPM < min {
|
||||||
|
min = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > max {
|
||||||
|
max = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total / float64(len(fans)), min, max
|
||||||
|
}
|
||||||
@@ -1,8 +1,10 @@
|
|||||||
package tui
|
package tui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
tea "github.com/charmbracelet/bubbletea"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -137,6 +139,21 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
},
|
},
|
||||||
pollSATProgress("gpu-amd", since),
|
pollSATProgress("gpu-amd", since),
|
||||||
)
|
)
|
||||||
|
case actionRunFanStress:
|
||||||
|
m.busyTitle = "Fan Stress Test"
|
||||||
|
m.progressPrefix = "fan-stress"
|
||||||
|
m.progressSince = time.Now()
|
||||||
|
m.progressLines = nil
|
||||||
|
since := m.progressSince
|
||||||
|
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||||
|
return m, tea.Batch(
|
||||||
|
func() tea.Msg {
|
||||||
|
ctx := context.Background()
|
||||||
|
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
||||||
|
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||||
|
},
|
||||||
|
pollSATProgress("fan-stress", since),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
case "ctrl+c":
|
case "ctrl+c":
|
||||||
return m, tea.Quit
|
return m, tea.Quit
|
||||||
@@ -148,9 +165,53 @@ func (m model) confirmCancelTarget() screen {
|
|||||||
switch m.pendingAction {
|
switch m.pendingAction {
|
||||||
case actionExportBundle:
|
case actionExportBundle:
|
||||||
return screenExportTargets
|
return screenExportTargets
|
||||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
|
||||||
return screenHealthCheck
|
return screenHealthCheck
|
||||||
default:
|
default:
|
||||||
return screenMain
|
return screenMain
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
|
||||||
|
func hcFanStressOpts(hcMode int, application interface {
|
||||||
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
|
}) platform.FanStressOptions {
|
||||||
|
// Phase durations per mode: [baseline, load1, pause, load2]
|
||||||
|
type durations struct{ baseline, load1, pause, load2 int }
|
||||||
|
modes := [3]durations{
|
||||||
|
{30, 120, 30, 120}, // Quick: ~5 min total
|
||||||
|
{60, 300, 60, 300}, // Standard: ~12 min total
|
||||||
|
{60, 600, 120, 600}, // Express: ~24 min total
|
||||||
|
}
|
||||||
|
if hcMode < 0 || hcMode >= len(modes) {
|
||||||
|
hcMode = 0
|
||||||
|
}
|
||||||
|
d := modes[hcMode]
|
||||||
|
|
||||||
|
// Use all detected NVIDIA GPUs.
|
||||||
|
var indices []int
|
||||||
|
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||||
|
for _, g := range gpus {
|
||||||
|
indices = append(indices, g.Index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use minimum GPU memory size to fit all GPUs.
|
||||||
|
sizeMB := 64
|
||||||
|
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||||
|
for _, g := range gpus {
|
||||||
|
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
||||||
|
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return platform.FanStressOptions{
|
||||||
|
BaselineSec: d.baseline,
|
||||||
|
Phase1DurSec: d.load1,
|
||||||
|
PauseSec: d.pause,
|
||||||
|
Phase2DurSec: d.load2,
|
||||||
|
SizeMB: sizeMB,
|
||||||
|
GPUIndices: indices,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -18,16 +18,17 @@ const (
|
|||||||
|
|
||||||
// Cursor positions in Health Check screen.
|
// Cursor positions in Health Check screen.
|
||||||
const (
|
const (
|
||||||
hcCurGPU = 0
|
hcCurGPU = 0
|
||||||
hcCurMemory = 1
|
hcCurMemory = 1
|
||||||
hcCurStorage = 2
|
hcCurStorage = 2
|
||||||
hcCurCPU = 3
|
hcCurCPU = 3
|
||||||
hcCurSelectAll = 4
|
hcCurSelectAll = 4
|
||||||
hcCurModeQuick = 5
|
hcCurModeQuick = 5
|
||||||
hcCurModeStd = 6
|
hcCurModeStd = 6
|
||||||
hcCurModeExpr = 7
|
hcCurModeExpr = 7
|
||||||
hcCurRunAll = 8
|
hcCurRunAll = 8
|
||||||
hcCurTotal = 9
|
hcCurFanStress = 9
|
||||||
|
hcCurTotal = 10
|
||||||
)
|
)
|
||||||
|
|
||||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||||
@@ -82,6 +83,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||||
case hcCurRunAll:
|
case hcCurRunAll:
|
||||||
return m.hcRunAll()
|
return m.hcRunAll()
|
||||||
|
case hcCurFanStress:
|
||||||
|
return m.hcRunFanStress()
|
||||||
}
|
}
|
||||||
case "g", "G":
|
case "g", "G":
|
||||||
return m.hcRunSingle(hcGPU)
|
return m.hcRunSingle(hcGPU)
|
||||||
@@ -93,6 +96,8 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
return m.hcRunSingle(hcCPU)
|
return m.hcRunSingle(hcCPU)
|
||||||
case "r", "R":
|
case "r", "R":
|
||||||
return m.hcRunAll()
|
return m.hcRunAll()
|
||||||
|
case "f", "F":
|
||||||
|
return m.hcRunFanStress()
|
||||||
case "a", "A":
|
case "a", "A":
|
||||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||||
for i := range m.hcSel {
|
for i := range m.hcSel {
|
||||||
@@ -143,6 +148,13 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||||
|
m.pendingAction = actionRunFanStress
|
||||||
|
m.screen = screenConfirm
|
||||||
|
m.cursor = 0
|
||||||
|
return m, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||||
for _, sel := range m.hcSel {
|
for _, sel := range m.hcSel {
|
||||||
if sel {
|
if sel {
|
||||||
@@ -300,8 +312,16 @@ func renderHealthCheck(m model) string {
|
|||||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
pfx := " "
|
||||||
|
if m.hcCursor == hcCurFanStress {
|
||||||
|
pfx = "> "
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
|
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,8 @@ const (
|
|||||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||||
actionRunCPUSAT actionKind = "run_cpu_sat"
|
actionRunCPUSAT actionKind = "run_cpu_sat"
|
||||||
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat"
|
||||||
|
actionRunFanStress actionKind = "run_fan_stress"
|
||||||
)
|
)
|
||||||
|
|
||||||
type model struct {
|
type model struct {
|
||||||
@@ -188,6 +189,11 @@ func (m model) confirmBody() (string, string) {
|
|||||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
||||||
case actionRunAMDGPUSAT:
|
case actionRunAMDGPUSAT:
|
||||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
||||||
|
case actionRunFanStress:
|
||||||
|
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
||||||
|
return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
||||||
|
"Monitors fans, temps, power — detects throttling.\n" +
|
||||||
|
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
||||||
default:
|
default:
|
||||||
return "Confirm", "Proceed?"
|
return "Confirm", "Proceed?"
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user