feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop. nvidia-smi is polled every second; up to 60 data points per GPU are kept. All three metrics (Usage %, Temp °C, Power W) are drawn on a single plot, each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions
--- a/audit/internal/platform/gpu_metrics.go
+++ b/audit/internal/platform/gpu_metrics.go
@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
 	return v
 }
 // SampleGPUMetrics is the exported entry point for taking one metrics sample.
 // It forwards gpuIndices unchanged to sampleGPUMetrics and hands back that
 // call's rows and error as-is (presumably a single nvidia-smi run — see
 // sampleGPUMetrics for the actual sampling behaviour).
 func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
 	rows, err := sampleGPUMetrics(gpuIndices)
 	return rows, err
 }
 // WriteGPUMetricsCSV writes collected rows as a CSV file.
 func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
 	var b bytes.Buffer
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
 	return strings.TrimRight(b.String(), "\n")
 }
 // RenderGPULiveChart renders one combined chart per GPU, stacking the charts
 // vertically in the order each GPUIndex first appears in rows.
 //
 // For every GPU the Usage, Temp and Power series are drawn onto a single
 // shared character grid. Each series is normalised to its own min–max, so the
 // 0%/50%/100% axis labels refer to a position inside that series' range, not
 // to an absolute value. Series are plotted in that order and a later series
 // overwrites cells already used by an earlier one.
 //
 // chartWidth is the width of the plot area in columns; values below 20 fall
 // back to 70. The Y axis (4-character label plus axis glyph) adds 5 columns.
 //
 // The result has no trailing newline; it is the empty string when rows is empty.
 func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
 	if chartWidth < 20 {
 		chartWidth = 70
 	}
 	const chartHeight = 14
 	// seriesDef describes one metric: legend label, ANSI colour, unit suffix
 	// and the accessor that pulls the value out of a row.
 	type seriesDef struct {
 		label string
 		color string
 		unit  string
 		fn    func(GPUMetricRow) float64
 	}
 	// seriesData is a seriesDef together with its extracted values and range.
 	type seriesData struct {
 		seriesDef
 		vals []float64
 		mn   float64
 		mx   float64
 	}
 	// cell is one character of the plot; an empty color marks an unused cell.
 	type cell struct {
 		ch    rune
 		color string
 	}
 	defs := []seriesDef{
 		{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
 		{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
 		{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
 	}
 	// Group rows per GPU, remembering first-seen order so output is stable
 	// (map iteration order is random).
 	var order []int
 	gpuMap := make(map[int][]GPUMetricRow)
 	for _, r := range rows {
 		if _, known := gpuMap[r.GPUIndex]; !known {
 			order = append(order, r.GPUIndex)
 		}
 		gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
 	}
 	var b strings.Builder
 	for _, gpuIdx := range order {
 		gr := gpuMap[gpuIdx]
 		if len(gr) == 0 {
 			continue
 		}
 		elapsed := gr[len(gr)-1].ElapsedSec
 		// Build value slices and ranges for each series.
 		series := make([]seriesData, 0, len(defs))
 		for _, d := range defs {
 			vals := extractGPUField(gr, d.fn)
 			mn, mx := gpuMinMax(vals)
 			if mn == mx {
 				// Flat series: widen the range so the normalisation below
 				// never divides by zero (the line lands on the bottom row).
 				mx = mn + 1
 			}
 			series = append(series, seriesData{seriesDef: d, vals: vals, mn: mn, mx: mx})
 		}
 		// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
 		grid := make([][]cell, chartHeight+1)
 		for r := range grid {
 			grid[r] = make([]cell, chartWidth)
 			for c := range grid[r] {
 				grid[r][c] = cell{' ', ""}
 			}
 		}
 		// Plot each series onto the shared grid.
 		for _, s := range series {
 			w := chartWidth
 			if len(s.vals) < w {
 				w = len(s.vals)
 			}
 			// NOTE(review): relies on gpuDownsample returning at most w points,
 			// otherwise x would index past the grid — confirm against helper.
 			data := gpuDownsample(s.vals, w)
 			prevRow := -1
 			for x, v := range data {
 				row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
 				if row < 0 {
 					row = 0
 				}
 				if row > chartHeight {
 					row = chartHeight
 				}
 				if prevRow < 0 || prevRow == row {
 					grid[row][x] = cell{'─', s.color}
 				} else {
 					// Level changed: draw the vertical connector in this
 					// column, with corners on the old and new rows.
 					lo, hi := prevRow, row
 					if lo > hi {
 						lo, hi = hi, lo
 					}
 					for y := lo + 1; y < hi; y++ {
 						grid[y][x] = cell{'│', s.color}
 					}
 					if prevRow < row {
 						grid[prevRow][x] = cell{'╮', s.color}
 						grid[row][x] = cell{'╰', s.color}
 					} else {
 						grid[prevRow][x] = cell{'╯', s.color}
 						grid[row][x] = cell{'╭', s.color}
 					}
 				}
 				prevRow = row
 			}
 		}
 		// Render: header, then Y axis + data rows.
 		fmt.Fprintf(&b, "GPU %d  (%.0fs)  each series normalised to its range\n", gpuIdx, elapsed)
 		for r := 0; r <= chartHeight; r++ {
 			// Y axis label: 100% at top, 50% in middle, 0% at bottom.
 			label, axis := "", "│"
 			switch r {
 			case 0:
 				label, axis = "100%", "┤"
 			case chartHeight / 2:
 				label, axis = "50%", "┤"
 			case chartHeight:
 				label, axis = "0%", "┤"
 			}
 			fmt.Fprintf(&b, "%4s%s", label, axis)
 			for c := 0; c < chartWidth; c++ {
 				cl := grid[r][c]
 				if cl.color != "" {
 					b.WriteString(cl.color)
 					b.WriteRune(cl.ch)
 					b.WriteString(ansiReset)
 				} else {
 					b.WriteRune(' ')
 				}
 			}
 			b.WriteRune('\n')
 		}
 		// Bottom axis. The corner sits in the same column as the Y-axis glyph
 		// (4 label columns precede it), so the baseline spans exactly the
 		// chartWidth plot columns.
 		b.WriteString("    └")
 		b.WriteString(strings.Repeat("─", chartWidth))
 		b.WriteRune('\n')
 		// Legend with current (last) values, indented to the plot area.
 		b.WriteString("     ")
 		for i, s := range series {
 			last := s.vals[len(s.vals)-1]
 			b.WriteString(s.color)
 			fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
 			b.WriteString(ansiReset)
 			if i < len(series)-1 {
 				b.WriteString("   ")
 			}
 		}
 		b.WriteRune('\n')
 	}
 	return strings.TrimRight(b.String(), "\n")
 }
 // renderLineChart draws a single time-series line chart using box-drawing characters.
 // Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
 func renderLineChart(vals []float64, color, caption string, height, width int) string {
--- a/audit/internal/tui/forms.go
+++ b/audit/internal/tui/forms.go
@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
 		}
 	}
-	// Use minimum GPU memory size to fit all GPUs.
+	// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
 	sizeMB := 64
 	if gpus, err := application.ListNvidiaGPUs(); err == nil {
 		for _, g := range gpus {
-			if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
+			free := g.MemoryMB - 512
-				sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
+			if free > 0 && (sizeMB == 64 || free < sizeMB) {
 				sizeMB = free
 			}
 		}
 	}
--- a/audit/internal/tui/messages.go
+++ b/audit/internal/tui/messages.go
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
 	body  string
 	err   error
 }
 // gpuLiveTickMsg is the message produced by one pollGPULive tick while the
 // GPU stress test screen is showing its live chart.
 type gpuLiveTickMsg struct {
 	// rows is whatever platform.SampleGPUMetrics returned for this tick; the
 	// sampling error is discarded by the producer, so rows may be empty.
 	rows    []platform.GPUMetricRow
 	// indices are the GPU indices that were polled; the Update handler passes
 	// them back to pollGPULive to schedule the next tick.
 	indices []int
 }
--- a/audit/internal/tui/screen_health_check.go
+++ b/audit/internal/tui/screen_health_check.go
@@ -3,8 +3,10 @@ package tui
 import (
 	"context"
 	"fmt"
 	"os/exec"
 	"strings"
 	"time"
 	"bee/audit/internal/platform"
 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
 	return m, nil
 }
-// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
+// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
 // nvidia-smi is polled once per second to feed the chart; the stress test runs in background.
 func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 	opts := hcFanStressOpts(m.hcMode, m.app)
 	ctx, cancel := context.WithCancel(context.Background())
 	m.gpuStressCancel = cancel
 	m.gpuStressAborted = false
 	m.gpuLiveRows = nil
 	m.gpuLiveIndices = opts.GPUIndices
 	m.gpuLiveStart = time.Now()
 	m.screen = screenGPUStressRunning
 	m.nvidiaSATCursor = 0
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
 		return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
 	}
-	nvtopPath, lookErr := exec.LookPath("nvtop")
+	return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
-	if lookErr != nil {
+}
 		return m, stressCmd
 	}
-	return m, tea.Batch(
+// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
-		stressCmd,
+// The update handler reschedules it to achieve continuous 1s polling.
-		tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
+func pollGPULive(indices []int) tea.Cmd {
-			return nvtopClosedMsg{}
+	return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
-		}),
+		rows, _ := platform.SampleGPUMetrics(indices)
-	)
+		return gpuLiveTickMsg{rows: rows, indices: indices}
 	})
 }
 // updateGPUStressRunning handles keys on the GPU stress running screen.
 func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	switch msg.String() {
 	case "o", "O":
 		nvtopPath, err := exec.LookPath("nvtop")
 		if err != nil {
 			return m, nil
 		}
 		return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
 			return nvtopClosedMsg{}
 		})
 	case "a", "A":
 		if m.gpuStressCancel != nil {
 			m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
 	return m, nil
 }
-func renderGPUStressRunning() string {
+func renderGPUStressRunning(m model) string {
-	return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop  [a] Abort test  [ctrl+c] quit\n"
+	var b strings.Builder
 	fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
 	fmt.Fprintln(&b)
 	if len(m.gpuLiveRows) == 0 {
 		fmt.Fprintln(&b, "Collecting metrics...")
 	} else {
 		chartWidth := m.width - 8
 		if chartWidth < 40 {
 			chartWidth = 70
 		}
 		b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
 	}
 	fmt.Fprintln(&b)
 	b.WriteString("[a] Abort test  [ctrl+c] quit")
 	return b.String()
 }
 func (m model) hcRunAll() (tea.Model, tea.Cmd) {
--- a/audit/internal/tui/types.go
+++ b/audit/internal/tui/types.go
@@ -97,6 +97,9 @@ type model struct {
 	// GPU Platform Stress Test running
 	gpuStressCancel  func()
 	gpuStressAborted bool
 	gpuLiveRows    []platform.GPUMetricRow
 	gpuLiveIndices []int
 	gpuLiveStart   time.Time
 	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
 	progressLines  []string
--- a/audit/internal/tui/update.go
+++ b/audit/internal/tui/update.go
@@ -3,6 +3,7 @@ package tui
 import (
 	"fmt"
 	"strings"
 	"time"
 	tea "github.com/charmbracelet/bubbletea"
 )
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
 			m.body = msg.body
 		}
 		return m, m.refreshSnapshotCmd()
 	case gpuLiveTickMsg:
 		if m.screen == screenGPUStressRunning {
 			if len(msg.rows) > 0 {
 				elapsed := time.Since(m.gpuLiveStart).Seconds()
 				for i := range msg.rows {
 					msg.rows[i].ElapsedSec = elapsed
 				}
 				m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
 				n := max(1, len(msg.indices))
 				if len(m.gpuLiveRows) > 60*n {
 					m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
 				}
 			}
 			return m, pollGPULive(msg.indices)
 		}
 		return m, nil
 	case nvidiaSATDoneMsg:
 		if m.nvidiaSATAborted {
 			return m, nil
--- a/audit/internal/tui/view.go
+++ b/audit/internal/tui/view.go
@@ -79,7 +79,7 @@ func (m model) View() string {
 		case screenNvidiaSATRunning:
 			body = renderNvidiaSATRunning()
 		case screenGPUStressRunning:
-			body = renderGPUStressRunning()
+			body = renderGPUStressRunning(m)
 		case screenOutput:
 			body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back  [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
 		default: