feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second, and up to 60 data points are kept per GPU.
  All three metrics (Usage %, Temp °C, Power W) are drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM minus
  512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions
@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
 		}
 	}

-	// Use minimum GPU memory size to fit all GPUs.
+	// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
 	sizeMB := 64
 	if gpus, err := application.ListNvidiaGPUs(); err == nil {
 		for _, g := range gpus {
-			if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
-				sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
+			free := g.MemoryMB - 512
+			if free > 0 && (sizeMB == 64 || free < sizeMB) {
+				sizeMB = free
 			}
 		}
 	}
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
 	body  string
 	err   error
 }
+
+type gpuLiveTickMsg struct { // result of one pollGPULive sample, delivered to Update
+	rows    []platform.GPUMetricRow // rows returned by platform.SampleGPUMetrics; Update ignores an empty slice
+	indices []int                   // GPU indices that were sampled; Update passes them to the next pollGPULive
+}
@@ -3,8 +3,10 @@ package tui
 import (
 	"context"
 	"fmt"
-	"os/exec"
 	"strings"
+	"time"
+
+	"bee/audit/internal/platform"

 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
 	return m, nil
 }

-// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
-// nvtop occupies the full terminal as a live chart; the stress test runs in background.
+// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
 func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 	opts := hcFanStressOpts(m.hcMode, m.app)

 	ctx, cancel := context.WithCancel(context.Background())
 	m.gpuStressCancel = cancel
 	m.gpuStressAborted = false
+	m.gpuLiveRows = nil
+	m.gpuLiveIndices = opts.GPUIndices
+	m.gpuLiveStart = time.Now()
 	m.screen = screenGPUStressRunning
 	m.nvidiaSATCursor = 0

@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 		return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
 	}

-	nvtopPath, lookErr := exec.LookPath("nvtop")
-	if lookErr != nil {
-		return m, stressCmd
-	}
+	return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
+}

-	return m, tea.Batch(
-		stressCmd,
-		tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		}),
-	)
+// pollGPULive returns a one-shot command: it waits one second, takes a single
+// nvidia-smi sample for the given GPU indices via platform.SampleGPUMetrics,
+// and delivers the result as a gpuLiveTickMsg. It does not repeat by itself;
+// the update handler issues it again on every tick to get continuous 1s polling.
+func pollGPULive(indices []int) tea.Cmd {
+	sample := func(time.Time) tea.Msg {
+		// The sampling error is discarded; only the returned rows are forwarded.
+		rows, _ := platform.SampleGPUMetrics(indices)
+		return gpuLiveTickMsg{rows: rows, indices: indices}
+	}
+	return tea.Tick(time.Second, sample)
 }

 // updateGPUStressRunning handles keys on the GPU stress running screen.
 func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	switch msg.String() {
-	case "o", "O":
-		nvtopPath, err := exec.LookPath("nvtop")
-		if err != nil {
-			return m, nil
-		}
-		return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		})
 	case "a", "A":
 		if m.gpuStressCancel != nil {
 			m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	return m, nil
 }

-func renderGPUStressRunning() string {
-	return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop  [a] Abort test  [ctrl+c] quit\n"
+// renderGPUStressRunning builds the body of the GPU Platform Stress Test
+// running screen: the title, then either a "Collecting metrics..." placeholder
+// (while m.gpuLiveRows is empty) or the live chart rendered from m.gpuLiveRows,
+// and finally the key hints.
+func renderGPUStressRunning(m model) string {
+	const (
+		chartMargin   = 8  // columns subtracted from m.width for the chart
+		minChartWidth = 40 // narrowest chart width requested
+		defChartWidth = 70 // width used while m.width is not known
+	)
+
+	var b strings.Builder
+	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
+	fmt.Fprintln(&b)
+	if len(m.gpuLiveRows) == 0 {
+		fmt.Fprintln(&b, "Collecting metrics...")
+	} else {
+		// NOTE(review): assumes m.width is the terminal width in columns and is
+		// 0 until the first window-size message arrives - confirm.
+		// Use the default only when the width is unknown. A known but narrow
+		// width is clamped to the minimum; previously any width below 48
+		// jumped to 70 columns, i.e. wider than on a 48-column terminal.
+		chartWidth := defChartWidth
+		if m.width > 0 {
+			chartWidth = max(minChartWidth, m.width-chartMargin)
+		}
+		b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
+	}
+	fmt.Fprintln(&b)
+	b.WriteString("[a] Abort test  [ctrl+c] quit")
+	return b.String()
+}

 func (m model) hcRunAll() (tea.Model, tea.Cmd) {
@@ -97,6 +97,9 @@ type model struct {
 	// GPU Platform Stress Test running
 	gpuStressCancel  func()
 	gpuStressAborted bool
+	gpuLiveRows    []platform.GPUMetricRow
+	gpuLiveIndices []int
+	gpuLiveStart   time.Time

 	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
 	progressLines  []string
@@ -3,6 +3,7 @@ package tui
 import (
 	"fmt"
 	"strings"
+	"time"

 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			m.body = msg.body
 		}
 		return m, m.refreshSnapshotCmd()
+	case gpuLiveTickMsg:
+		if m.screen == screenGPUStressRunning {
+			if len(msg.rows) > 0 {
+				elapsed := time.Since(m.gpuLiveStart).Seconds()
+				for i := range msg.rows {
+					msg.rows[i].ElapsedSec = elapsed
+				}
+				m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
+				n := max(1, len(msg.indices))
+				if len(m.gpuLiveRows) > 60*n {
+					m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
+				}
+			}
+			return m, pollGPULive(msg.indices)
+		}
+		return m, nil
 	case nvidiaSATDoneMsg:
 		if m.nvidiaSATAborted {
 			return m, nil
@@ -79,7 +79,7 @@ func (m model) View() string {
 		case screenNvidiaSATRunning:
 			body = renderNvidiaSATRunning()
 		case screenGPUStressRunning:
-			body = renderGPUStressRunning()
+			body = renderGPUStressRunning(m)
 		case screenOutput:
 			body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back  [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
 		default: