feat(tui): live GPU chart during stress test, full VRAM allocation

- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU kept.
  All three metrics (Usage %, Temp °C, Power W) drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-26 17:37:20 +03:00
parent 0a52a4f3ba
commit 8b4bfdf5ad
7 changed files with 223 additions and 27 deletions

View File

@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
return v
}
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
//
// It is the exported entry point for one-shot sampling and simply forwards
// gpuIndices to the package-private sampleGPUMetrics, returning its rows and
// error unchanged.
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
return sampleGPUMetrics(gpuIndices)
}
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
return strings.TrimRight(b.String(), "\n")
}
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
// Each series is normalised to its own min/max and drawn in a different colour.
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
//
// rows may contain samples for several GPUs in any interleaving; they are
// grouped by GPUIndex and one chart is emitted per GPU, in the order each
// index first appears. A chartWidth below 20 is replaced by the default of 70.
// The returned string has its trailing newlines trimmed; it is empty when
// rows is empty.
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
if chartWidth < 20 {
chartWidth = 70
}
const chartHeight = 14
// Group rows per GPU while remembering first-seen order, because map
// iteration order would otherwise make the output non-deterministic.
seen := make(map[int]bool)
var order []int
gpuMap := make(map[int][]GPUMetricRow)
for _, r := range rows {
if !seen[r.GPUIndex] {
seen[r.GPUIndex] = true
order = append(order, r.GPUIndex)
}
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
// seriesDef describes one plotted metric: legend label, ANSI colour,
// unit suffix for the legend, and the accessor that extracts the value.
type seriesDef struct {
label string
color string
unit string
fn func(GPUMetricRow) float64
}
// Declaration order is also draw order: a later series overwrites grid
// cells already written by an earlier one.
defs := []seriesDef{
{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
}
var b strings.Builder
for _, gpuIdx := range order {
gr := gpuMap[gpuIdx]
if len(gr) == 0 {
continue
}
// The header shows the elapsed time of the most recent sample.
elapsed := gr[len(gr)-1].ElapsedSec
// Build value slices for each series, together with the range used
// to normalise them.
type seriesData struct {
seriesDef
vals []float64
mn float64
mx float64
}
var series []seriesData
for _, d := range defs {
vals := extractGPUField(gr, d.fn)
mn, mx := gpuMinMax(vals)
// A flat series would make the normalisation below divide by zero;
// widening the range by one puts every point on the bottom row.
if mn == mx {
mx = mn + 1
}
series = append(series, seriesData{d, vals, mn, mx})
}
// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
// It therefore has chartHeight+1 rows, each chartWidth cells wide.
type cell struct {
ch rune
color string
}
grid := make([][]cell, chartHeight+1)
for r := range grid {
grid[r] = make([]cell, chartWidth)
for c := range grid[r] {
grid[r][c] = cell{' ', ""}
}
}
// Plot each series onto the shared grid.
for _, s := range series {
// Never ask for more columns than there are samples or than the grid has.
w := chartWidth
if len(s.vals) < w {
w = len(s.vals)
}
// NOTE(review): presumably gpuDownsample returns at most w points;
// the column index x below relies on that to stay inside the grid.
data := gpuDownsample(s.vals, w)
prevRow := -1
for x, v := range data {
// Map the value linearly into [0, chartHeight] and flip it, since
// row 0 is the top of the chart; then clamp to the grid.
row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
if row < 0 {
row = 0
}
if row > chartHeight {
row = chartHeight
}
if prevRow < 0 || prevRow == row {
// First point, or no vertical change: plain horizontal segment.
grid[row][x] = cell{'─', s.color}
} else {
// The level changed: draw the whole step in column x, with a
// vertical run strictly between the previous and the new row.
lo, hi := prevRow, row
if lo > hi {
lo, hi = hi, lo
}
for y := lo + 1; y < hi; y++ {
grid[y][x] = cell{'│', s.color}
}
// Rows grow downwards, so prevRow < row means the value dropped.
if prevRow < row {
grid[prevRow][x] = cell{'╮', s.color}
grid[row][x] = cell{'╰', s.color}
} else {
grid[prevRow][x] = cell{'╯', s.color}
grid[row][x] = cell{'╭', s.color}
}
}
prevRow = row
}
}
// Render: header line, then Y axis + data rows.
fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
for r := 0; r <= chartHeight; r++ {
// Y axis label: 100% at top, 50% in middle, 0% at bottom.
// The percentages are positions within each series' own range,
// not absolute metric values.
switch r {
case 0:
fmt.Fprintf(&b, "%4s┤", "100%")
case chartHeight / 2:
fmt.Fprintf(&b, "%4s┤", "50%")
case chartHeight:
fmt.Fprintf(&b, "%4s┤", "0%")
default:
fmt.Fprintf(&b, "%4s│", "")
}
for c := 0; c < chartWidth; c++ {
cl := grid[r][c]
// Only cells that were plotted carry a colour; wrap each of them
// in its own colour/reset pair and emit a bare space otherwise.
if cl.color != "" {
b.WriteString(cl.color)
b.WriteRune(cl.ch)
b.WriteString(ansiReset)
} else {
b.WriteRune(' ')
}
}
b.WriteRune('\n')
}
// Bottom axis: corner glyph followed by a rule as wide as the plot area.
b.WriteString(" └")
b.WriteString(strings.Repeat("─", chartWidth))
b.WriteRune('\n')
// Legend with current (last) values, one coloured entry per series.
b.WriteString(" ")
for i, s := range series {
last := s.vals[len(s.vals)-1]
b.WriteString(s.color)
fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
b.WriteString(ansiReset)
if i < len(series)-1 {
b.WriteString(" ")
}
}
b.WriteRune('\n')
}
return strings.TrimRight(b.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
func renderLineChart(vals []float64, color, caption string, height, width int) string {

View File

@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
}
}
// Use minimum GPU memory size to fit all GPUs.
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
sizeMB := 64
if gpus, err := application.ListNvidiaGPUs(); err == nil {
for _, g := range gpus {
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
free := g.MemoryMB - 512
if free > 0 && (sizeMB == 64 || free < sizeMB) {
sizeMB = free
}
}
}

View File

@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
body string
err error
}
// gpuLiveTickMsg is the message produced by pollGPULive for each polling tick
// of the live GPU chart.
type gpuLiveTickMsg struct {
// rows holds the metrics returned by platform.SampleGPUMetrics for this
// tick; it may be empty when sampling failed.
rows []platform.GPUMetricRow
// indices is the GPU index list the sample was taken for; the Update
// handler passes it back to pollGPULive to schedule the next tick.
indices []int
}

View File

@@ -3,8 +3,10 @@ package tui
import (
"context"
"fmt"
"os/exec"
"strings"
"time"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
return m, nil
}
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
opts := hcFanStressOpts(m.hcMode, m.app)
ctx, cancel := context.WithCancel(context.Background())
m.gpuStressCancel = cancel
m.gpuStressAborted = false
m.gpuLiveRows = nil
m.gpuLiveIndices = opts.GPUIndices
m.gpuLiveStart = time.Now()
m.screen = screenGPUStressRunning
m.nvidiaSATCursor = 0
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
}
nvtopPath, lookErr := exec.LookPath("nvtop")
if lookErr != nil {
return m, stressCmd
}
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
}
return m, tea.Batch(
stressCmd,
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
}),
)
// pollGPULive returns a command that waits one second, takes a single GPU
// metrics sample for indices and delivers it as a gpuLiveTickMsg.
// It fires only once; continuous 1s polling is achieved by the update handler
// issuing a fresh pollGPULive each time a tick message arrives.
func pollGPULive(indices []int) tea.Cmd {
	sample := func(time.Time) tea.Msg {
		// Sampling is best-effort: on error the message carries no rows.
		rows, _ := platform.SampleGPUMetrics(indices)
		return gpuLiveTickMsg{
			indices: indices,
			rows:    rows,
		}
	}
	return tea.Tick(time.Second, sample)
}
// updateGPUStressRunning handles keys on the GPU stress running screen.
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
switch msg.String() {
case "o", "O":
nvtopPath, err := exec.LookPath("nvtop")
if err != nil {
return m, nil
}
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
return nvtopClosedMsg{}
})
case "a", "A":
if m.gpuStressCancel != nil {
m.gpuStressCancel()
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
func renderGPUStressRunning() string {
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
// renderGPUStressRunning builds the body of the GPU stress "running" screen:
// a title, then either a placeholder (no samples collected yet) or the live
// chart of m.gpuLiveRows, then the key-binding footer.
//
// The chart is given the terminal width minus 8 columns; any result below 40
// (which includes an unset, zero width) falls back to a width of 70.
func renderGPUStressRunning(m model) string {
	var out strings.Builder
	fmt.Fprintln(&out, "GPU PLATFORM STRESS TEST")
	out.WriteByte('\n')

	switch {
	case len(m.gpuLiveRows) == 0:
		fmt.Fprintln(&out, "Collecting metrics...")
	default:
		plotWidth := m.width - 8
		if plotWidth < 40 {
			plotWidth = 70
		}
		chart := platform.RenderGPULiveChart(m.gpuLiveRows, plotWidth)
		out.WriteString(chart)
	}

	// Terminates the chart's last line, or adds a blank line after the placeholder.
	out.WriteByte('\n')
	out.WriteString("[a] Abort test [ctrl+c] quit")
	return out.String()
}
func (m model) hcRunAll() (tea.Model, tea.Cmd) {

View File

@@ -97,6 +97,9 @@ type model struct {
// GPU Platform Stress Test running
gpuStressCancel func()
gpuStressAborted bool
gpuLiveRows []platform.GPUMetricRow
gpuLiveIndices []int
gpuLiveStart time.Time
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
progressLines []string

View File

@@ -3,6 +3,7 @@ package tui
import (
"fmt"
"strings"
"time"
tea "github.com/charmbracelet/bubbletea"
)
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.body = msg.body
}
return m, m.refreshSnapshotCmd()
case gpuLiveTickMsg:
if m.screen == screenGPUStressRunning {
if len(msg.rows) > 0 {
elapsed := time.Since(m.gpuLiveStart).Seconds()
for i := range msg.rows {
msg.rows[i].ElapsedSec = elapsed
}
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
n := max(1, len(msg.indices))
if len(m.gpuLiveRows) > 60*n {
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
}
}
return m, pollGPULive(msg.indices)
}
return m, nil
case nvidiaSATDoneMsg:
if m.nvidiaSATAborted {
return m, nil

View File

@@ -79,7 +79,7 @@ func (m model) View() string {
case screenNvidiaSATRunning:
body = renderNvidiaSATRunning()
case screenGPUStressRunning:
body = renderGPUStressRunning()
body = renderGPUStressRunning(m)
case screenOutput:
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
default: