feat(tui): live GPU chart during stress test, full VRAM allocation
- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop. nvidia-smi is polled every second; up to 60 data points per GPU are kept. All three metrics (Usage %, Temp °C, Power W) are drawn on a single plot, each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -69,6 +69,11 @@ func parseGPUFloat(s string) float64 {
|
|||||||
return v
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
|
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
|
return sampleGPUMetrics(gpuIndices)
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
@@ -370,6 +375,162 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|||||||
return strings.TrimRight(b.String(), "\n")
|
return strings.TrimRight(b.String(), "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RenderGPULiveChart renders all GPU metrics on a single combined chart per GPU.
|
||||||
|
// Each series is normalised to its own min–max and drawn in a different colour.
|
||||||
|
// chartWidth controls the width of the plot area (Y-axis label uses 5 extra chars).
|
||||||
|
func RenderGPULiveChart(rows []GPUMetricRow, chartWidth int) string {
|
||||||
|
if chartWidth < 20 {
|
||||||
|
chartWidth = 70
|
||||||
|
}
|
||||||
|
const chartHeight = 14
|
||||||
|
|
||||||
|
seen := make(map[int]bool)
|
||||||
|
var order []int
|
||||||
|
gpuMap := make(map[int][]GPUMetricRow)
|
||||||
|
for _, r := range rows {
|
||||||
|
if !seen[r.GPUIndex] {
|
||||||
|
seen[r.GPUIndex] = true
|
||||||
|
order = append(order, r.GPUIndex)
|
||||||
|
}
|
||||||
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
|
}
|
||||||
|
|
||||||
|
type seriesDef struct {
|
||||||
|
label string
|
||||||
|
color string
|
||||||
|
unit string
|
||||||
|
fn func(GPUMetricRow) float64
|
||||||
|
}
|
||||||
|
defs := []seriesDef{
|
||||||
|
{"Usage", ansiBlue, "%", func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||||
|
{"Temp", ansiRed, "°C", func(r GPUMetricRow) float64 { return r.TempC }},
|
||||||
|
{"Power", ansiGreen, "W", func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
for _, gpuIdx := range order {
|
||||||
|
gr := gpuMap[gpuIdx]
|
||||||
|
if len(gr) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
elapsed := gr[len(gr)-1].ElapsedSec
|
||||||
|
|
||||||
|
// Build value slices for each series.
|
||||||
|
type seriesData struct {
|
||||||
|
seriesDef
|
||||||
|
vals []float64
|
||||||
|
mn float64
|
||||||
|
mx float64
|
||||||
|
}
|
||||||
|
var series []seriesData
|
||||||
|
for _, d := range defs {
|
||||||
|
vals := extractGPUField(gr, d.fn)
|
||||||
|
mn, mx := gpuMinMax(vals)
|
||||||
|
if mn == mx {
|
||||||
|
mx = mn + 1
|
||||||
|
}
|
||||||
|
series = append(series, seriesData{d, vals, mn, mx})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shared character grid: row 0 = top (max), row chartHeight = bottom (min).
|
||||||
|
type cell struct {
|
||||||
|
ch rune
|
||||||
|
color string
|
||||||
|
}
|
||||||
|
grid := make([][]cell, chartHeight+1)
|
||||||
|
for r := range grid {
|
||||||
|
grid[r] = make([]cell, chartWidth)
|
||||||
|
for c := range grid[r] {
|
||||||
|
grid[r][c] = cell{' ', ""}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Plot each series onto the shared grid.
|
||||||
|
for _, s := range series {
|
||||||
|
w := chartWidth
|
||||||
|
if len(s.vals) < w {
|
||||||
|
w = len(s.vals)
|
||||||
|
}
|
||||||
|
data := gpuDownsample(s.vals, w)
|
||||||
|
prevRow := -1
|
||||||
|
for x, v := range data {
|
||||||
|
row := chartHeight - int(math.Round((v-s.mn)/(s.mx-s.mn)*float64(chartHeight)))
|
||||||
|
if row < 0 {
|
||||||
|
row = 0
|
||||||
|
}
|
||||||
|
if row > chartHeight {
|
||||||
|
row = chartHeight
|
||||||
|
}
|
||||||
|
if prevRow < 0 || prevRow == row {
|
||||||
|
grid[row][x] = cell{'─', s.color}
|
||||||
|
} else {
|
||||||
|
lo, hi := prevRow, row
|
||||||
|
if lo > hi {
|
||||||
|
lo, hi = hi, lo
|
||||||
|
}
|
||||||
|
for y := lo + 1; y < hi; y++ {
|
||||||
|
grid[y][x] = cell{'│', s.color}
|
||||||
|
}
|
||||||
|
if prevRow < row {
|
||||||
|
grid[prevRow][x] = cell{'╮', s.color}
|
||||||
|
grid[row][x] = cell{'╰', s.color}
|
||||||
|
} else {
|
||||||
|
grid[prevRow][x] = cell{'╯', s.color}
|
||||||
|
grid[row][x] = cell{'╭', s.color}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prevRow = row
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render: Y axis + data rows.
|
||||||
|
fmt.Fprintf(&b, "GPU %d (%.0fs) each series normalised to its range\n", gpuIdx, elapsed)
|
||||||
|
for r := 0; r <= chartHeight; r++ {
|
||||||
|
// Y axis label: 100% at top, 50% in middle, 0% at bottom.
|
||||||
|
switch r {
|
||||||
|
case 0:
|
||||||
|
fmt.Fprintf(&b, "%4s┤", "100%")
|
||||||
|
case chartHeight / 2:
|
||||||
|
fmt.Fprintf(&b, "%4s┤", "50%")
|
||||||
|
case chartHeight:
|
||||||
|
fmt.Fprintf(&b, "%4s┤", "0%")
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(&b, "%4s│", "")
|
||||||
|
}
|
||||||
|
for c := 0; c < chartWidth; c++ {
|
||||||
|
cl := grid[r][c]
|
||||||
|
if cl.color != "" {
|
||||||
|
b.WriteString(cl.color)
|
||||||
|
b.WriteRune(cl.ch)
|
||||||
|
b.WriteString(ansiReset)
|
||||||
|
} else {
|
||||||
|
b.WriteRune(' ')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteRune('\n')
|
||||||
|
}
|
||||||
|
// Bottom axis.
|
||||||
|
b.WriteString(" └")
|
||||||
|
b.WriteString(strings.Repeat("─", chartWidth))
|
||||||
|
b.WriteRune('\n')
|
||||||
|
|
||||||
|
// Legend with current (last) values.
|
||||||
|
b.WriteString(" ")
|
||||||
|
for i, s := range series {
|
||||||
|
last := s.vals[len(s.vals)-1]
|
||||||
|
b.WriteString(s.color)
|
||||||
|
fmt.Fprintf(&b, "▐ %s: %.0f%s", s.label, last, s.unit)
|
||||||
|
b.WriteString(ansiReset)
|
||||||
|
if i < len(series)-1 {
|
||||||
|
b.WriteString(" ")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteRune('\n')
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimRight(b.String(), "\n")
|
||||||
|
}
|
||||||
|
|
||||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||||
|
|||||||
@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use minimum GPU memory size to fit all GPUs.
|
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
|
||||||
sizeMB := 64
|
sizeMB := 64
|
||||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||||
for _, g := range gpus {
|
for _, g := range gpus {
|
||||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
free := g.MemoryMB - 512
|
||||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
if free > 0 && (sizeMB == 64 || free < sizeMB) {
|
||||||
|
sizeMB = free
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
|
|||||||
body string
|
body string
|
||||||
err error
|
err error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type gpuLiveTickMsg struct {
|
||||||
|
rows []platform.GPUMetricRow
|
||||||
|
indices []int
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,8 +3,10 @@ package tui
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os/exec"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
tea "github.com/charmbracelet/bubbletea"
|
||||||
)
|
)
|
||||||
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
|
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
|
||||||
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
|
|
||||||
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
m.gpuStressCancel = cancel
|
m.gpuStressCancel = cancel
|
||||||
m.gpuStressAborted = false
|
m.gpuStressAborted = false
|
||||||
|
m.gpuLiveRows = nil
|
||||||
|
m.gpuLiveIndices = opts.GPUIndices
|
||||||
|
m.gpuLiveStart = time.Now()
|
||||||
m.screen = screenGPUStressRunning
|
m.screen = screenGPUStressRunning
|
||||||
m.nvidiaSATCursor = 0
|
m.nvidiaSATCursor = 0
|
||||||
|
|
||||||
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
|||||||
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||||
}
|
}
|
||||||
|
|
||||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
|
||||||
if lookErr != nil {
|
}
|
||||||
return m, stressCmd
|
|
||||||
}
|
|
||||||
|
|
||||||
return m, tea.Batch(
|
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
|
||||||
stressCmd,
|
// The update handler reschedules it to achieve continuous 1s polling.
|
||||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
func pollGPULive(indices []int) tea.Cmd {
|
||||||
return nvtopClosedMsg{}
|
return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
|
||||||
}),
|
rows, _ := platform.SampleGPUMetrics(indices)
|
||||||
)
|
return gpuLiveTickMsg{rows: rows, indices: indices}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
||||||
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||||
switch msg.String() {
|
switch msg.String() {
|
||||||
case "o", "O":
|
|
||||||
nvtopPath, err := exec.LookPath("nvtop")
|
|
||||||
if err != nil {
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
|
||||||
return nvtopClosedMsg{}
|
|
||||||
})
|
|
||||||
case "a", "A":
|
case "a", "A":
|
||||||
if m.gpuStressCancel != nil {
|
if m.gpuStressCancel != nil {
|
||||||
m.gpuStressCancel()
|
m.gpuStressCancel()
|
||||||
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderGPUStressRunning() string {
|
func renderGPUStressRunning(m model) string {
|
||||||
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
var b strings.Builder
|
||||||
|
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
|
||||||
|
fmt.Fprintln(&b)
|
||||||
|
if len(m.gpuLiveRows) == 0 {
|
||||||
|
fmt.Fprintln(&b, "Collecting metrics...")
|
||||||
|
} else {
|
||||||
|
chartWidth := m.width - 8
|
||||||
|
if chartWidth < 40 {
|
||||||
|
chartWidth = 70
|
||||||
|
}
|
||||||
|
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
|
||||||
|
}
|
||||||
|
fmt.Fprintln(&b)
|
||||||
|
b.WriteString("[a] Abort test [ctrl+c] quit")
|
||||||
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||||
|
|||||||
@@ -97,6 +97,9 @@ type model struct {
|
|||||||
// GPU Platform Stress Test running
|
// GPU Platform Stress Test running
|
||||||
gpuStressCancel func()
|
gpuStressCancel func()
|
||||||
gpuStressAborted bool
|
gpuStressAborted bool
|
||||||
|
gpuLiveRows []platform.GPUMetricRow
|
||||||
|
gpuLiveIndices []int
|
||||||
|
gpuLiveStart time.Time
|
||||||
|
|
||||||
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
||||||
progressLines []string
|
progressLines []string
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package tui
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
tea "github.com/charmbracelet/bubbletea"
|
||||||
)
|
)
|
||||||
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
|||||||
m.body = msg.body
|
m.body = msg.body
|
||||||
}
|
}
|
||||||
return m, m.refreshSnapshotCmd()
|
return m, m.refreshSnapshotCmd()
|
||||||
|
case gpuLiveTickMsg:
|
||||||
|
if m.screen == screenGPUStressRunning {
|
||||||
|
if len(msg.rows) > 0 {
|
||||||
|
elapsed := time.Since(m.gpuLiveStart).Seconds()
|
||||||
|
for i := range msg.rows {
|
||||||
|
msg.rows[i].ElapsedSec = elapsed
|
||||||
|
}
|
||||||
|
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
|
||||||
|
n := max(1, len(msg.indices))
|
||||||
|
if len(m.gpuLiveRows) > 60*n {
|
||||||
|
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return m, pollGPULive(msg.indices)
|
||||||
|
}
|
||||||
|
return m, nil
|
||||||
case nvidiaSATDoneMsg:
|
case nvidiaSATDoneMsg:
|
||||||
if m.nvidiaSATAborted {
|
if m.nvidiaSATAborted {
|
||||||
return m, nil
|
return m, nil
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ func (m model) View() string {
|
|||||||
case screenNvidiaSATRunning:
|
case screenNvidiaSATRunning:
|
||||||
body = renderNvidiaSATRunning()
|
body = renderNvidiaSATRunning()
|
||||||
case screenGPUStressRunning:
|
case screenGPUStressRunning:
|
||||||
body = renderGPUStressRunning()
|
body = renderGPUStressRunning(m)
|
||||||
case screenOutput:
|
case screenOutput:
|
||||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||||
default:
|
default:
|
||||||
|
|||||||
Reference in New Issue
Block a user