From 8b4bfdf5ad15b9d7a8013837681950ef86a9a190 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Thu, 26 Mar 2026 17:37:20 +0300 Subject: [PATCH] feat(tui): live GPU chart during stress test, full VRAM allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop. nvidia-smi is polled every second; up to 60 data points per GPU kept. All three metrics (Usage %, Temp °C, Power W) drawn on a single plot, each normalised to its own range and rendered in a different colour. - Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/gpu_metrics.go | 161 ++++++++++++++++++++++ audit/internal/tui/forms.go | 7 +- audit/internal/tui/messages.go | 5 + audit/internal/tui/screen_health_check.go | 55 ++++---- audit/internal/tui/types.go | 3 + audit/internal/tui/update.go | 17 +++ audit/internal/tui/view.go | 2 +- 7 files changed, 223 insertions(+), 27 deletions(-) diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index 721d167..b828b29 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 { return v } +// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. +func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { + return sampleGPUMetrics(gpuIndices) +} + // WriteGPUMetricsCSV writes collected rows as a CSV file. func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer @@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string { return strings.TrimRight(b.String(), "\n") } +// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU. +// Each series is normalised to its own min–max and drawn in a different colour. +// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars). +func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string { + if chartWidth < 20 { + chartWidth = 70 + } + const chartHeight = 14 + + seen := make(map[int]bool) + var order []int + gpuMap := make(map[int][]GPUMetricRow) + for _, r := range rows { + if !seen[r.GPUIndex] { + seen[r.GPUIndex] = true + order = append(order, r.GPUIndex) + } + gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) + } + + type seriesDef struct { + label string + color string + unit string + fn func(GPUMetricRow) float64 + } + defs := []seriesDef{ + {"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }}, + {"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }}, + {"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }}, + } + + var b strings.Builder + for _, gpuIdx := range order { + gr := gpuMap[gpuIdx] + if len(gr) == 0 { + continue + } + elapsed := gr[len(gr)-1].ElapsedSec + + // Build value slices for each series. + type seriesData struct { + seriesDef + vals []float64 + mn float64 + mx float64 + } + var series []seriesData + for _, d := range defs { + vals := extractGPUField(gr, d.fn) + mn, mx := gpuMinMax(vals) + if mn == mx { + mx = mn + 1 + } + series = append(series, seriesData{d, vals, mn, mx}) + } + + // Shared character grid: row 0 = top (max), row chartHeight = bottom (min). + type cell struct { + ch rune + color string + } + grid := make([][]cell, chartHeight+1) + for r := range grid { + grid[r] = make([]cell, chartWidth) + for c := range grid[r] { + grid[r][c] = cell{' ', ""} + } + } + + // Plot each series onto the shared grid. + for _, s := range series { + w := chartWidth + if len(s.vals) < w { + w = len(s.vals) + } + data := gpuDownsample(s.vals, w) + prevRow := -1 + for x, v := range data { + row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight))) + if row < 0 { + row = 0 + } + if row > chartHeight { + row = chartHeight + } + if prevRow < 0 || prevRow == row { + grid[row][x] = cell{'─', s.color} + } else { + lo, hi := prevRow, row + if lo > hi { + lo, hi = hi, lo + } + for y := lo + 1; y < hi; y++ { + grid[y][x] = cell{'│', s.color} + } + if prevRow < row { + grid[prevRow][x] = cell{'╮', s.color} + grid[row][x] = cell{'╰', s.color} + } else { + grid[prevRow][x] = cell{'╯', s.color} + grid[row][x] = cell{'╭', s.color} + } + } + prevRow = row + } + } + + // Render: Y axis + data rows. + fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed) + for r := 0; r <= chartHeight; r++ { + // Y axis label: 100% at top, 50% in middle, 0% at bottom. + switch r { + case 0: + fmt.Fprintf(&b, "%4s┤", "100%") + case chartHeight / 2: + fmt.Fprintf(&b, "%4s┤", "50%") + case chartHeight: + fmt.Fprintf(&b, "%4s┤", "0%") + default: + fmt.Fprintf(&b, "%4s│", "") + } + for c := 0; c < chartWidth; c++ { + cl := grid[r][c] + if cl.color != "" { + b.WriteString(cl.color) + b.WriteRune(cl.ch) + b.WriteString(ansiReset) + } else { + b.WriteRune(' ') + } + } + b.WriteRune('\n') + } + // Bottom axis. + b.WriteString(" └") + b.WriteString(strings.Repeat("─", chartWidth)) + b.WriteRune('\n') + + // Legend with current (last) values. + b.WriteString(" ") + for i, s := range series { + last := s.vals[len(s.vals)-1] + b.WriteString(s.color) + fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit) + b.WriteString(ansiReset) + if i < len(series)-1 { + b.WriteString(" ") + } + } + b.WriteRune('\n') + } + + return strings.TrimRight(b.String(), "\n") +} + // renderLineChart draws a single time-series line chart using box-drawing characters. // Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption. func renderLineChart(vals []float64, color, caption string, height, width int) string { diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go index ee6f856..6836668 100644 --- a/audit/internal/tui/forms.go +++ b/audit/internal/tui/forms.go @@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface { } } - // Use minimum GPU memory size to fit all GPUs. + // Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead). sizeMB := 64 if gpus, err := application.ListNvidiaGPUs(); err == nil { for _, g := range gpus { - if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) { - sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU + free := g.MemoryMB - 512 + if free > 0 && (sizeMB == 64 || free < sizeMB) { + sizeMB = free } } } diff --git a/audit/internal/tui/messages.go b/audit/internal/tui/messages.go index 3ee8612..31b5b73 100644 --- a/audit/internal/tui/messages.go +++ b/audit/internal/tui/messages.go @@ -50,3 +50,8 @@ type gpuStressDoneMsg struct { body string err error } + +type gpuLiveTickMsg struct { + rows []platform.GPUMetricRow + indices []int +} diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index aff0ba7..d2a09be 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -3,8 +3,10 @@ package tui import ( "context" "fmt" - "os/exec" "strings" + "time" + + "bee/audit/internal/platform" tea "github.com/charmbracelet/bubbletea" ) @@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) { return m, nil } -// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently. -// nvtop occupies the full terminal as a live chart; the stress test runs in background. +// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart. func (m model) startGPUStressTest() (tea.Model, tea.Cmd) { opts := hcFanStressOpts(m.hcMode, m.app) ctx, cancel := context.WithCancel(context.Background()) m.gpuStressCancel = cancel m.gpuStressAborted = false + m.gpuLiveRows = nil + m.gpuLiveIndices = opts.GPUIndices + m.gpuLiveStart = time.Now() m.screen = screenGPUStressRunning m.nvidiaSATCursor = 0 @@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) { return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err} } - nvtopPath, lookErr := exec.LookPath("nvtop") - if lookErr != nil { - return m, stressCmd - } + return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices)) +} - return m, tea.Batch( - stressCmd, - tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg { - return nvtopClosedMsg{} - }), - ) +// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg. +// The update handler reschedules it to achieve continuous 1s polling. +func pollGPULive(indices []int) tea.Cmd { + return tea.Tick(time.Second, func(_ time.Time) tea.Msg { + rows, _ := platform.SampleGPUMetrics(indices) + return gpuLiveTickMsg{rows: rows, indices: indices} + }) } // updateGPUStressRunning handles keys on the GPU stress running screen. func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) { switch msg.String() { - case "o", "O": - nvtopPath, err := exec.LookPath("nvtop") - if err != nil { - return m, nil - } - return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg { - return nvtopClosedMsg{} - }) case "a", "A": if m.gpuStressCancel != nil { m.gpuStressCancel() @@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m, nil } -func renderGPUStressRunning() string { - return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n" +func renderGPUStressRunning(m model) string { + var b strings.Builder + fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST") + fmt.Fprintln(&b) + if len(m.gpuLiveRows) == 0 { + fmt.Fprintln(&b, "Collecting metrics...") + } else { + chartWidth := m.width - 8 + if chartWidth < 40 { + chartWidth = 70 + } + b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth)) + } + fmt.Fprintln(&b) + b.WriteString("[a] Abort test [ctrl+c] quit") + return b.String() } func (m model) hcRunAll() (tea.Model, tea.Cmd) { diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index ecdfc25..9c10778 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -97,6 +97,9 @@ type model struct { // GPU Platform Stress Test running gpuStressCancel func() gpuStressAborted bool + gpuLiveRows []platform.GPUMetricRow + gpuLiveIndices []int + gpuLiveStart time.Time // SAT verbose progress (CPU / Memory / Storage / AMD GPU) progressLines []string diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go index 99b2f3d..ecea4af 100644 --- a/audit/internal/tui/update.go +++ b/audit/internal/tui/update.go @@ -3,6 +3,7 @@ package tui import ( "fmt" "strings" + "time" tea "github.com/charmbracelet/bubbletea" ) @@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.body = msg.body } return m, m.refreshSnapshotCmd() + case gpuLiveTickMsg: + if m.screen == screenGPUStressRunning { + if len(msg.rows) > 0 { + elapsed := time.Since(m.gpuLiveStart).Seconds() + for i := range msg.rows { + msg.rows[i].ElapsedSec = elapsed + } + m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...) + n := max(1, len(msg.indices)) + if len(m.gpuLiveRows) > 60*n { + m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:] + } + } + return m, pollGPULive(msg.indices) + } + return m, nil case nvidiaSATDoneMsg: if m.nvidiaSATAborted { return m, nil diff --git a/audit/internal/tui/view.go b/audit/internal/tui/view.go index f8ab293..62b0b67 100644 --- a/audit/internal/tui/view.go +++ b/audit/internal/tui/view.go @@ -79,7 +79,7 @@ func (m model) View() string { case screenNvidiaSATRunning: body = renderNvidiaSATRunning() case screenGPUStressRunning: - body = renderGPUStressRunning() + body = renderGPUStressRunning(m) case screenOutput: body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body)) default: