feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU kept.
  All three metrics (Usage %, Temp °C, Power W) drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions

View File

@@ -3,8 +3,10 @@ package tui
import (
"context"
"fmt"
"os/exec"
"strings"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
return m, nil
}
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app)
ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel
m.gpuStressAborted = false
m.gpuLiveRows = nil
m.gpuLiveIndices = opts.GPUIndices
m.gpuLiveStart = time.Now()
m.screen = screenGPUStressRunning
m.nvidiaSATCursor = 0
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
}
nvtopPath, lookErr := exec.LookPath("nvtop")
if lookErr != nil {
return m, stressCmd
}
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
}
return m, tea.Batch(
stressCmd,
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
}),
)
// pollGPULive returns a command that waits one second, samples nvidia-smi
// metrics for the given GPU indices, and delivers them as a gpuLiveTickMsg.
// Continuous 1-second polling is achieved by the update handler rescheduling
// this command each time the message arrives.
func pollGPULive(indices []int) tea.Cmd {
	sample := func(_ time.Time) tea.Msg {
		// A sampling failure is deliberately ignored: the chart simply
		// shows no new point for this tick.
		metrics, _ := platform.SampleGPUMetrics(indices)
		return gpuLiveTickMsg{rows: metrics, indices: indices}
	}
	return tea.Tick(time.Second, sample)
}
// updateGPUStressRunning handles keys on the GPU stress running screen.
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "o", "O":
nvtopPath, err := exec.LookPath("nvtop")
if err != nil {
return m, nil
}
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
})
case "a", "A":
if m.gpuStressCancel != nil {
m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
func renderGPUStressRunning() string {
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
// renderGPUStressRunning draws the GPU stress test screen: a title, the live
// per-GPU metrics chart (once at least one sample has been collected), and
// the key hints. The chart width is derived from the current terminal width.
func renderGPUStressRunning(m model) string {
	var b strings.Builder
	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
	fmt.Fprintln(&b)
	if len(m.gpuLiveRows) == 0 {
		fmt.Fprintln(&b, "Collecting metrics...")
	} else {
		// Leave an 8-column margin around the chart.
		chartWidth := m.width - 8
		if m.width == 0 {
			// Terminal size not yet known (no WindowSizeMsg received):
			// fall back to a sane default.
			chartWidth = 70
		} else if chartWidth < 40 {
			// Narrow terminal: clamp to the minimum usable width instead of
			// jumping to 70 columns, which would overflow the terminal.
			chartWidth = 40
		}
		b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
	}
	fmt.Fprintln(&b)
	b.WriteString("[a] Abort test [ctrl+c] quit")
	return b.String()
}
func (m model) hcRunAll() (tea.Model, tea.Cmd) {