feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points are kept per GPU.
  All three metrics (Usage %, Temp °C, Power W) are drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
 	return v
 }

+// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
+func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
+	return sampleGPUMetrics(gpuIndices) // exported pass-through: gpuIndices and both results are forwarded unchanged
+}
+
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 	return strings.TrimRight(b.String(), "\n")
 }

+// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
+//
+// rows may interleave samples from several GPUs; they are grouped by GPUIndex
+// and one chart is emitted per GPU, in order of first appearance. Usage, Temp
+// and Power are each normalised to their own min–max over the supplied rows
+// and drawn in a different colour on a shared grid, so the Y axis is labelled
+// 0–100% of each series' range rather than in absolute units. Series are
+// plotted in the order Usage, Temp, Power; where they cross, the later series
+// overwrites the earlier cell.
+//
+// chartWidth controls the width of the plot area (the Y-axis label and axis
+// line use 5 extra columns); values below 20 fall back to 70. The returned
+// string has no trailing newline and is empty when rows is empty.
+func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
+	if chartWidth < 20 {
+		chartWidth = 70
+	}
+	const chartHeight = 14
+	// Blank gutter as wide as the "%4s" Y-axis label. The unlabelled rows and
+	// the bottom axis share it so every axis glyph lands in the same column.
+	const axisGutter = "    "
+
+	// Group rows per GPU while remembering first-seen order, so the output
+	// order is stable and does not depend on map iteration.
+	seen := make(map[int]bool)
+	var order []int
+	gpuMap := make(map[int][]GPUMetricRow)
+	for _, r := range rows {
+		if !seen[r.GPUIndex] {
+			seen[r.GPUIndex] = true
+			order = append(order, r.GPUIndex)
+		}
+		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
+	}
+
+	// seriesDef describes one plotted metric: legend label, ANSI colour,
+	// unit suffix and the accessor that extracts the value from a row.
+	type seriesDef struct {
+		label string
+		color string
+		unit  string
+		fn    func(GPUMetricRow) float64
+	}
+	defs := []seriesDef{
+		{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
+		{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
+		{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
+	}
+
+	var b strings.Builder
+	for _, gpuIdx := range order {
+		gr := gpuMap[gpuIdx]
+		if len(gr) == 0 {
+			continue
+		}
+		// The header shows the elapsed time of the most recent sample.
+		elapsed := gr[len(gr)-1].ElapsedSec
+
+		// Build value slices for each series.
+		type seriesData struct {
+			seriesDef
+			vals []float64
+			mn   float64
+			mx   float64
+		}
+		var series []seriesData
+		for _, d := range defs {
+			vals := extractGPUField(gr, d.fn)
+			mn, mx := gpuMinMax(vals)
+			if mn == mx {
+				// Flat series: widen the range so the normalisation below
+				// does not divide by zero (the line is drawn on the bottom row).
+				mx = mn + 1
+			}
+			series = append(series, seriesData{d, vals, mn, mx})
+		}
+
+		// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
+		type cell struct {
+			ch    rune
+			color string
+		}
+		grid := make([][]cell, chartHeight+1)
+		for r := range grid {
+			grid[r] = make([]cell, chartWidth)
+			for c := range grid[r] {
+				grid[r][c] = cell{' ', ""}
+			}
+		}
+
+		// Plot each series onto the shared grid.
+		for _, s := range series {
+			// Never stretch: with fewer samples than columns, one column per sample.
+			w := chartWidth
+			if len(s.vals) < w {
+				w = len(s.vals)
+			}
+			data := gpuDownsample(s.vals, w)
+			prevRow := -1
+			for x, v := range data {
+				// Map the value to a grid row (inverted: larger value = smaller row index).
+				row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
+				if row < 0 {
+					row = 0
+				}
+				if row > chartHeight {
+					row = chartHeight
+				}
+				if prevRow < 0 || prevRow == row {
+					grid[row][x] = cell{'─', s.color}
+				} else {
+					// Level change: draw the vertical run in this column,
+					// capped by corner glyphs at the previous and new rows.
+					lo, hi := prevRow, row
+					if lo > hi {
+						lo, hi = hi, lo
+					}
+					for y := lo + 1; y < hi; y++ {
+						grid[y][x] = cell{'│', s.color}
+					}
+					if prevRow < row {
+						// Value dropped (row index grew).
+						grid[prevRow][x] = cell{'╮', s.color}
+						grid[row][x] = cell{'╰', s.color}
+					} else {
+						// Value rose (row index shrank).
+						grid[prevRow][x] = cell{'╯', s.color}
+						grid[row][x] = cell{'╭', s.color}
+					}
+				}
+				prevRow = row
+			}
+		}
+
+		// Render: Y axis + data rows.
+		fmt.Fprintf(&b, "GPU %d  (%.0fs)  each series normalised to its range\n", gpuIdx, elapsed)
+		for r := 0; r <= chartHeight; r++ {
+			// Y axis label: 100% at top, 50% in middle, 0% at bottom.
+			switch r {
+			case 0:
+				fmt.Fprintf(&b, "%4s┤", "100%")
+			case chartHeight / 2:
+				fmt.Fprintf(&b, "%4s┤", "50%")
+			case chartHeight:
+				fmt.Fprintf(&b, "%4s┤", "0%")
+			default:
+				b.WriteString(axisGutter + "│")
+			}
+			for c := 0; c < chartWidth; c++ {
+				cl := grid[r][c]
+				if cl.color != "" {
+					b.WriteString(cl.color)
+					b.WriteRune(cl.ch)
+					b.WriteString(ansiReset)
+				} else {
+					b.WriteRune(' ')
+				}
+			}
+			b.WriteRune('\n')
+		}
+		// Bottom axis. The corner must sit in the same column as the ┤/│ glyphs
+		// above, i.e. right after the 4-column label gutter; indenting it by 5
+		// would shift the whole rule one column to the right of the plot.
+		b.WriteString(axisGutter + "└")
+		b.WriteString(strings.Repeat("─", chartWidth))
+		b.WriteRune('\n')
+
+		// Legend with current (last) values, indented to the start of the plot area.
+		b.WriteString(axisGutter + " ")
+		for i, s := range series {
+			last := s.vals[len(s.vals)-1]
+			b.WriteString(s.color)
+			fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
+			b.WriteString(ansiReset)
+			if i < len(series)-1 {
+				b.WriteString("   ")
+			}
+		}
+		b.WriteRune('\n')
+	}
+
+	return strings.TrimRight(b.String(), "\n")
+}
+
 // renderLineChart draws a single time-series line chart using box-drawing characters.
 // Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
 func renderLineChart(vals []float64, color, caption string, height, width int) string {
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
 		}
 	}

-	// Use minimum GPU memory size to fit all GPUs.
+	// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
 	sizeMB := 64
 	if gpus, err := application.ListNvidiaGPUs(); err == nil {
 		for _, g := range gpus {
-			if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
-				sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
+			free := g.MemoryMB - 512
+			if free > 0 && (sizeMB == 64 || free < sizeMB) {
+				sizeMB = free
 			}
 		}
 	}
--- a/audit/internal/tui/messages.go
+++ b/audit/internal/tui/messages.go
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
 	body  string
 	err   error
 }
+
+type gpuLiveTickMsg struct { // result of one pollGPULive sample; handled in model.Update
+	rows    []platform.GPUMetricRow // metrics from platform.SampleGPUMetrics; presumably empty when sampling failed (the error is not carried)
+	indices []int                   // GPU indices that were polled; Update passes them back to pollGPULive for the next tick
+}
--- a/audit/internal/tui/screen_health_check.go
+++ b/audit/internal/tui/screen_health_check.go
@@ -3,8 +3,10 @@ package tui
 import (
 	"context"
 	"fmt"
-	"os/exec"
 	"strings"
+	"time"
+
+	"bee/audit/internal/platform"

 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
 	return m, nil
 }

-// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
-// nvtop occupies the full terminal as a live chart; the stress test runs in background.
+// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
 func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 	opts := hcFanStressOpts(m.hcMode, m.app)

 	ctx, cancel := context.WithCancel(context.Background())
 	m.gpuStressCancel = cancel
 	m.gpuStressAborted = false
+	m.gpuLiveRows = nil
+	m.gpuLiveIndices = opts.GPUIndices
+	m.gpuLiveStart = time.Now()
 	m.screen = screenGPUStressRunning
 	m.nvidiaSATCursor = 0

@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 		return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
 	}

-	nvtopPath, lookErr := exec.LookPath("nvtop")
-	if lookErr != nil {
-		return m, stressCmd
-	}
+	return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
+}

-	return m, tea.Batch(
-		stressCmd,
-		tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		}),
-	)
+// pollGPULive schedules a single GPU metrics sample one second from now.
+// When the tick fires, platform.SampleGPUMetrics is called for indices and the
+// outcome is delivered as a gpuLiveTickMsg. The command is one-shot: the
+// update handler issues it again on every tick to get continuous 1s polling.
+func pollGPULive(indices []int) tea.Cmd {
+	sample := func(time.Time) tea.Msg {
+		// Best-effort: the sampling error is deliberately dropped. The update
+		// handler skips ticks that carry no rows and still reschedules, so a
+		// failed poll only leaves a gap in the chart.
+		rows, _ := platform.SampleGPUMetrics(indices)
+		return gpuLiveTickMsg{indices: indices, rows: rows}
+	}
+	return tea.Tick(time.Second, sample)
 }

 // updateGPUStressRunning handles keys on the GPU stress running screen.
 func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	switch msg.String() {
-	case "o", "O":
-		nvtopPath, err := exec.LookPath("nvtop")
-		if err != nil {
-			return m, nil
-		}
-		return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
-			return nvtopClosedMsg{}
-		})
 	case "a", "A":
 		if m.gpuStressCancel != nil {
 			m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	return m, nil
 }

-func renderGPUStressRunning() string {
-	return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop  [a] Abort test  [ctrl+c] quit\n"
+// renderGPUStressRunning builds the body of the GPU stress running screen:
+// a title, then either a "Collecting metrics..." placeholder (no samples yet)
+// or the live chart for m.gpuLiveRows, then a blank line and the key hints.
+//
+// The chart is drawn m.width-8 columns wide; when that is below 40 (which
+// includes m.width being zero) a default plot width of 70 is used instead.
+func renderGPUStressRunning(m model) string {
+	var b strings.Builder
+	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
+	fmt.Fprintln(&b)
+	if len(m.gpuLiveRows) == 0 {
+		fmt.Fprintln(&b, "Collecting metrics...")
+	} else {
+		chartWidth := m.width - 8
+		if chartWidth < 40 {
+			chartWidth = 70
+		}
+		// RenderGPULiveChart trims its trailing newline, so terminate the last
+		// chart line here. Otherwise the separator below is consumed as that
+		// line ending and the key hints sit directly under the legend, unlike
+		// in the placeholder branch.
+		fmt.Fprintln(&b, platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
+	}
+	fmt.Fprintln(&b)
+	b.WriteString("[a] Abort test  [ctrl+c] quit")
+	return b.String()
 }

 func (m model) hcRunAll() (tea.Model, tea.Cmd) {
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -97,6 +97,9 @@ type model struct {
 	// GPU Platform Stress Test running
 	gpuStressCancel  func()
 	gpuStressAborted bool
+	gpuLiveRows    []platform.GPUMetricRow
+	gpuLiveIndices []int
+	gpuLiveStart   time.Time

 	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
 	progressLines  []string
--- a/audit/internal/tui/update.go
+++ b/audit/internal/tui/update.go
@@ -3,6 +3,7 @@ package tui
 import (
 	"fmt"
 	"strings"
+	"time"

 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			m.body = msg.body
 		}
 		return m, m.refreshSnapshotCmd()
+	case gpuLiveTickMsg:
+		if m.screen == screenGPUStressRunning {
+			if len(msg.rows) > 0 {
+				elapsed := time.Since(m.gpuLiveStart).Seconds()
+				for i := range msg.rows {
+					msg.rows[i].ElapsedSec = elapsed
+				}
+				m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
+				n := max(1, len(msg.indices))
+				if len(m.gpuLiveRows) > 60*n {
+					m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
+				}
+			}
+			return m, pollGPULive(msg.indices)
+		}
+		return m, nil
 	case nvidiaSATDoneMsg:
 		if m.nvidiaSATAborted {
 			return m, nil
--- a/audit/internal/tui/view.go
+++ b/audit/internal/tui/view.go
@@ -79,7 +79,7 @@ func (m model) View() string {
 		case screenNvidiaSATRunning:
 			body = renderNvidiaSATRunning()
 		case screenGPUStressRunning:
-			body = renderGPUStressRunning()
+			body = renderGPUStressRunning(m)
 		case screenOutput:
 			body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back  [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
 		default: