feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU kept.
  All three metrics (Usage %, Temp °C, Power W) drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions

View File

@@ -3,8 +3,10 @@ package tui
import (
"context"
"fmt"
"os/exec"
"strings"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
return m, nil
}
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app)
ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel
m.gpuStressAborted = false
m.gpuLiveRows = nil
m.gpuLiveIndices = opts.GPUIndices
m.gpuLiveStart = time.Now()
m.screen = screenGPUStressRunning
m.nvidiaSATCursor = 0
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
}
nvtopPath, lookErr := exec.LookPath("nvtop")
if lookErr != nil {
return m, stressCmd
}
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
}
return m, tea.Batch(
stressCmd,
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
}),
)
// pollGPULive returns a command that waits one second, samples nvidia-smi
// metrics for the given GPU indices, and delivers them as a gpuLiveTickMsg.
// Continuous 1-second polling is achieved by the update handler rescheduling
// this command each time the message arrives.
func pollGPULive(indices []int) tea.Cmd {
	sample := func(_ time.Time) tea.Msg {
		// A sampling failure is deliberately ignored: the chart simply
		// shows no new point for this tick.
		metrics, _ := platform.SampleGPUMetrics(indices)
		return gpuLiveTickMsg{rows: metrics, indices: indices}
	}
	return tea.Tick(time.Second, sample)
}
// updateGPUStressRunning handles keys on the GPU stress running screen.
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "o", "O":
nvtopPath, err := exec.LookPath("nvtop")
if err != nil {
return m, nil
}
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
})
case "a", "A":
if m.gpuStressCancel != nil {
m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
func renderGPUStressRunning() string {
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
// renderGPUStressRunning draws the GPU stress test screen: a title, the live
// per-GPU metrics chart (once at least one sample has been collected), and
// the key hints. The chart width is derived from the current terminal width.
func renderGPUStressRunning(m model) string {
	var b strings.Builder
	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
	fmt.Fprintln(&b)
	if len(m.gpuLiveRows) == 0 {
		fmt.Fprintln(&b, "Collecting metrics...")
	} else {
		// Leave an 8-column margin around the chart.
		chartWidth := m.width - 8
		if m.width == 0 {
			// Terminal size not yet known (no WindowSizeMsg received):
			// fall back to a sane default.
			chartWidth = 70
		} else if chartWidth < 40 {
			// Narrow terminal: clamp to the minimum usable width instead of
			// jumping to 70 columns, which would overflow the terminal.
			chartWidth = 40
		}
		b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
	}
	fmt.Fprintln(&b)
	b.WriteString("[a] Abort test [ctrl+c] quit")
	return b.String()
}
func (m model) hcRunAll() (tea.Model, tea.Cmd) {