- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop. nvidia-smi is polled every second; up to 60 data points per GPU kept. All three metrics (Usage %, Temp °C, Power W) drawn on a single plot, each normalised to its own range and rendered in a different colour. - Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
396 lines
9.5 KiB
Go
396 lines
9.5 KiB
Go
package tui
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"bee/audit/internal/platform"
|
|
|
|
tea "github.com/charmbracelet/bubbletea"
|
|
)
|
|
|
|
// Component indices. Each value is the position of one diagnostic in the
// model's hcSel selection array.
const (
	hcGPU = iota
	hcMemory
	hcStorage
	hcCPU
)
|
|
|
|
// Cursor positions in Health Check screen, listed top to bottom. The first
// four match the component indices, so the cursor value can index hcSel
// directly while it is on a component row.
const (
	hcCurGPU = iota
	hcCurMemory
	hcCurStorage
	hcCurCPU
	hcCurSelectAll
	hcCurModeQuick
	hcCurModeStd
	hcCurModeExpr
	hcCurRunAll
	hcCurFanStress
	hcCurTotal // number of cursor positions; used as the upper bound when moving down
)
|
|
|
|
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
var hcModeDurations = [...]int{
	10 * 60,     // Quick: 10 minutes
	60 * 60,     // Standard: 1 hour
	8 * 60 * 60, // Express: 8 hours
}
|
|
|
|
// hcCPUDurations maps mode index to CPU stress-ng seconds.
var hcCPUDurations = [...]int{
	60,      // mode 0: 1 minute
	5 * 60,  // mode 1: 5 minutes
	15 * 60, // mode 2: 15 minutes
}
|
|
|
|
func (m model) enterHealthCheck() (tea.Model, tea.Cmd) {
|
|
m.screen = screenHealthCheck
|
|
if !m.hcInitialized {
|
|
m.hcSel = [4]bool{true, true, true, true}
|
|
m.hcMode = 0
|
|
m.hcCursor = 0
|
|
m.hcInitialized = true
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
switch msg.String() {
|
|
case "up", "k":
|
|
if m.hcCursor > 0 {
|
|
m.hcCursor--
|
|
}
|
|
case "down", "j":
|
|
if m.hcCursor < hcCurTotal-1 {
|
|
m.hcCursor++
|
|
}
|
|
case " ":
|
|
switch m.hcCursor {
|
|
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
|
m.hcSel[m.hcCursor] = !m.hcSel[m.hcCursor]
|
|
case hcCurSelectAll:
|
|
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
for i := range m.hcSel {
|
|
m.hcSel[i] = !allOn
|
|
}
|
|
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
|
m.hcMode = m.hcCursor - hcCurModeQuick
|
|
}
|
|
case "enter":
|
|
switch m.hcCursor {
|
|
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
|
return m.hcRunSingle(m.hcCursor)
|
|
case hcCurSelectAll:
|
|
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
for i := range m.hcSel {
|
|
m.hcSel[i] = !allOn
|
|
}
|
|
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
|
m.hcMode = m.hcCursor - hcCurModeQuick
|
|
case hcCurRunAll:
|
|
return m.hcRunAll()
|
|
case hcCurFanStress:
|
|
return m.hcRunFanStress()
|
|
}
|
|
case "g", "G":
|
|
return m.hcRunSingle(hcGPU)
|
|
case "m", "M":
|
|
return m.hcRunSingle(hcMemory)
|
|
case "s", "S":
|
|
return m.hcRunSingle(hcStorage)
|
|
case "c", "C":
|
|
return m.hcRunSingle(hcCPU)
|
|
case "r", "R":
|
|
return m.hcRunAll()
|
|
case "f", "F":
|
|
return m.hcRunFanStress()
|
|
case "a", "A":
|
|
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
for i := range m.hcSel {
|
|
m.hcSel[i] = !allOn
|
|
}
|
|
case "1":
|
|
m.hcMode = 0
|
|
case "2":
|
|
m.hcMode = 1
|
|
case "3":
|
|
m.hcMode = 2
|
|
case "esc":
|
|
m.screen = screenMain
|
|
m.cursor = 0
|
|
case "q", "ctrl+c":
|
|
return m, tea.Quit
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
|
switch idx {
|
|
case hcGPU:
|
|
if m.app.DetectGPUVendor() == "amd" {
|
|
m.pendingAction = actionRunAMDGPUSAT
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
}
|
|
m.nvidiaDurIdx = m.hcMode
|
|
return m.enterNvidiaSATSetup()
|
|
case hcMemory:
|
|
m.pendingAction = actionRunMemorySAT
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
case hcStorage:
|
|
m.pendingAction = actionRunStorageSAT
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
case hcCPU:
|
|
m.pendingAction = actionRunCPUSAT
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
|
m.pendingAction = actionRunFanStress
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
}
|
|
|
|
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
|
|
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
|
opts := hcFanStressOpts(m.hcMode, m.app)
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
m.gpuStressCancel = cancel
|
|
m.gpuStressAborted = false
|
|
m.gpuLiveRows = nil
|
|
m.gpuLiveIndices = opts.GPUIndices
|
|
m.gpuLiveStart = time.Now()
|
|
m.screen = screenGPUStressRunning
|
|
m.nvidiaSATCursor = 0
|
|
|
|
stressCmd := func() tea.Msg {
|
|
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
|
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
|
}
|
|
|
|
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
|
|
}
|
|
|
|
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
|
|
// The update handler reschedules it to achieve continuous 1s polling.
|
|
func pollGPULive(indices []int) tea.Cmd {
|
|
return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
|
|
rows, _ := platform.SampleGPUMetrics(indices)
|
|
return gpuLiveTickMsg{rows: rows, indices: indices}
|
|
})
|
|
}
|
|
|
|
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
|
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
switch msg.String() {
|
|
case "a", "A":
|
|
if m.gpuStressCancel != nil {
|
|
m.gpuStressCancel()
|
|
m.gpuStressCancel = nil
|
|
}
|
|
m.gpuStressAborted = true
|
|
m.screen = screenHealthCheck
|
|
m.cursor = 0
|
|
case "ctrl+c":
|
|
return m, tea.Quit
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func renderGPUStressRunning(m model) string {
|
|
var b strings.Builder
|
|
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
|
|
fmt.Fprintln(&b)
|
|
if len(m.gpuLiveRows) == 0 {
|
|
fmt.Fprintln(&b, "Collecting metrics...")
|
|
} else {
|
|
chartWidth := m.width - 8
|
|
if chartWidth < 40 {
|
|
chartWidth = 70
|
|
}
|
|
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
|
|
}
|
|
fmt.Fprintln(&b)
|
|
b.WriteString("[a] Abort test [ctrl+c] quit")
|
|
return b.String()
|
|
}
|
|
|
|
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
|
for _, sel := range m.hcSel {
|
|
if sel {
|
|
m.pendingAction = actionRunAll
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
}
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
|
durationSec := hcModeDurations[m.hcMode]
|
|
durationIdx := m.hcMode
|
|
sel := m.hcSel
|
|
app := m.app
|
|
m.busy = true
|
|
m.busyTitle = "Health Check"
|
|
return m, func() tea.Msg {
|
|
var parts []string
|
|
if sel[hcGPU] {
|
|
vendor := app.DetectGPUVendor()
|
|
if vendor == "amd" {
|
|
r, err := app.RunAMDAcceptancePackResult("")
|
|
body := r.Body
|
|
if err != nil {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
|
} else {
|
|
gpus, err := app.ListNvidiaGPUs()
|
|
if err != nil || len(gpus) == 0 {
|
|
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
|
|
} else {
|
|
var indices []int
|
|
sizeMB := 0
|
|
for _, g := range gpus {
|
|
indices = append(indices, g.Index)
|
|
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
|
sizeMB = g.MemoryMB
|
|
}
|
|
}
|
|
if sizeMB == 0 {
|
|
sizeMB = 64
|
|
}
|
|
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
|
|
body := r.Body
|
|
if err != nil {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
parts = append(parts, "=== GPU ===\n"+body)
|
|
}
|
|
}
|
|
}
|
|
if sel[hcMemory] {
|
|
r, err := app.RunMemoryAcceptancePackResult("")
|
|
body := r.Body
|
|
if err != nil {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
parts = append(parts, "=== MEMORY ===\n"+body)
|
|
}
|
|
if sel[hcStorage] {
|
|
r, err := app.RunStorageAcceptancePackResult("")
|
|
body := r.Body
|
|
if err != nil {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
parts = append(parts, "=== STORAGE ===\n"+body)
|
|
}
|
|
if sel[hcCPU] {
|
|
cpuDur := hcCPUDurations[durationIdx]
|
|
r, err := app.RunCPUAcceptancePackResult("", cpuDur)
|
|
body := r.Body
|
|
if err != nil {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
parts = append(parts, "=== CPU ===\n"+body)
|
|
}
|
|
combined := strings.Join(parts, "\n\n")
|
|
if combined == "" {
|
|
combined = "No components selected."
|
|
}
|
|
return resultMsg{title: "Health Check", body: combined, back: screenHealthCheck}
|
|
}
|
|
}
|
|
|
|
func renderHealthCheck(m model) string {
|
|
var b strings.Builder
|
|
|
|
fmt.Fprintln(&b, "HEALTH CHECK")
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, " Diagnostics:")
|
|
fmt.Fprintln(&b)
|
|
|
|
type comp struct{ name, desc, key string }
|
|
comps := []comp{
|
|
{"GPU", "nvidia/amd auto-detect", "G"},
|
|
{"MEMORY", "memtester", "M"},
|
|
{"STORAGE", "smartctl + NVMe self-test", "S"},
|
|
{"CPU", "audit diagnostics", "C"},
|
|
}
|
|
for i, c := range comps {
|
|
pfx := " "
|
|
if m.hcCursor == i {
|
|
pfx = "> "
|
|
}
|
|
ch := "[ ]"
|
|
if m.hcSel[i] {
|
|
ch = "[x]"
|
|
}
|
|
fmt.Fprintf(&b, "%s%s %-8s %-28s [%s]\n", pfx, ch, c.name, c.desc, c.key)
|
|
}
|
|
|
|
fmt.Fprintln(&b, " ─────────────────────────────────────────────────")
|
|
{
|
|
pfx := " "
|
|
if m.hcCursor == hcCurSelectAll {
|
|
pfx = "> "
|
|
}
|
|
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
ch := "[ ]"
|
|
if allOn {
|
|
ch = "[x]"
|
|
}
|
|
fmt.Fprintf(&b, "%s%s Select / Deselect All [A]\n", pfx, ch)
|
|
}
|
|
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, " Mode:")
|
|
modes := []struct{ label, key string }{
|
|
{"Quick", "1"},
|
|
{"Standard", "2"},
|
|
{"Express", "3"},
|
|
}
|
|
for i, mode := range modes {
|
|
pfx := " "
|
|
if m.hcCursor == hcCurModeQuick+i {
|
|
pfx = "> "
|
|
}
|
|
radio := "( )"
|
|
if m.hcMode == i {
|
|
radio = "(*)"
|
|
}
|
|
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
|
|
}
|
|
|
|
fmt.Fprintln(&b)
|
|
{
|
|
pfx := " "
|
|
if m.hcCursor == hcCurRunAll {
|
|
pfx = "> "
|
|
}
|
|
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
|
}
|
|
|
|
{
|
|
pfx := " "
|
|
if m.hcCursor == hcCurFanStress {
|
|
pfx = "> "
|
|
}
|
|
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
|
}
|
|
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
|
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] gpu stress [Esc] back")
|
|
return b.String()
|
|
}
|