feat(tui): live GPU chart during stress test, full VRAM allocation
- GPU Platform Stress Test now shows a live in-TUI chart instead of nvtop.
  nvidia-smi is polled every second; up to 60 data points per GPU are kept.
  All three metrics (Usage %, Temp °C, Power W) are drawn on a single plot,
  each normalised to its own range and rendered in a different colour.
- Memory allocation changed from MemoryMB/16 to MemoryMB-512 (full VRAM
  minus 512 MB driver overhead) so bee-gpu-stress actually stresses memory.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,12 +182,13 @@ func hcFanStressOpts(hcMode int, application interface {
|
||||
}
|
||||
}
|
||||
|
||||
// Use minimum GPU memory size to fit all GPUs.
|
||||
// Use nearly full GPU memory on the smallest GPU (leave 512 MB for driver overhead).
|
||||
sizeMB := 64
|
||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
||||
for _, g := range gpus {
|
||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
||||
free := g.MemoryMB - 512
|
||||
if free > 0 && (sizeMB == 64 || free < sizeMB) {
|
||||
sizeMB = free
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,3 +50,8 @@ type gpuStressDoneMsg struct {
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
// gpuLiveTickMsg is the message produced by pollGPULive after each sampling
// tick. It carries the freshly sampled metric rows together with the GPU
// indices that were sampled, so the receiver can schedule the next poll with
// the same indices.
type gpuLiveTickMsg struct {
|
||||
rows []platform.GPUMetricRow // result of platform.SampleGPUMetrics for this tick
|
||||
indices []int // GPU indices that were passed to pollGPULive
|
||||
}
|
||||
|
||||
@@ -3,8 +3,10 @@ package tui
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -156,14 +158,16 @@ func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startGPUStressTest launches the GPU Platform Stress Test and nvtop concurrently.
|
||||
// nvtop occupies the full terminal as a live chart; the stress test runs in background.
|
||||
// startGPUStressTest launches the GPU Platform Stress Test with a live in-TUI chart.
|
||||
func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.gpuStressCancel = cancel
|
||||
m.gpuStressAborted = false
|
||||
m.gpuLiveRows = nil
|
||||
m.gpuLiveIndices = opts.GPUIndices
|
||||
m.gpuLiveStart = time.Now()
|
||||
m.screen = screenGPUStressRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
@@ -172,30 +176,21 @@ func (m model) startGPUStressTest() (tea.Model, tea.Cmd) {
|
||||
return gpuStressDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
return m, stressCmd
|
||||
}
|
||||
return m, tea.Batch(stressCmd, pollGPULive(opts.GPUIndices))
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
stressCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
// pollGPULive samples nvidia-smi once after one second and returns a gpuLiveTickMsg.
|
||||
// The update handler reschedules it to achieve continuous 1s polling.
|
||||
func pollGPULive(indices []int) tea.Cmd {
|
||||
return tea.Tick(time.Second, func(_ time.Time) tea.Msg {
|
||||
rows, _ := platform.SampleGPUMetrics(indices)
|
||||
return gpuLiveTickMsg{rows: rows, indices: indices}
|
||||
})
|
||||
}
|
||||
|
||||
// updateGPUStressRunning handles keys on the GPU stress running screen.
|
||||
func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.gpuStressCancel != nil {
|
||||
m.gpuStressCancel()
|
||||
@@ -210,8 +205,22 @@ func (m model) updateGPUStressRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func renderGPUStressRunning() string {
|
||||
return "GPU PLATFORM STRESS TEST\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
func renderGPUStressRunning(m model) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintln(&b, "GPU PLATFORM STRESS TEST")
|
||||
fmt.Fprintln(&b)
|
||||
if len(m.gpuLiveRows) == 0 {
|
||||
fmt.Fprintln(&b, "Collecting metrics...")
|
||||
} else {
|
||||
chartWidth := m.width - 8
|
||||
if chartWidth < 40 {
|
||||
chartWidth = 70
|
||||
}
|
||||
b.WriteString(platform.RenderGPULiveChart(m.gpuLiveRows, chartWidth))
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
b.WriteString("[a] Abort test [ctrl+c] quit")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
|
||||
@@ -97,6 +97,9 @@ type model struct {
|
||||
// GPU Platform Stress Test running
|
||||
gpuStressCancel func()
|
||||
gpuStressAborted bool
|
||||
gpuLiveRows []platform.GPUMetricRow
|
||||
gpuLiveIndices []int
|
||||
gpuLiveStart time.Time
|
||||
|
||||
// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
|
||||
progressLines []string
|
||||
|
||||
@@ -3,6 +3,7 @@ package tui
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -130,6 +131,22 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, m.refreshSnapshotCmd()
|
||||
case gpuLiveTickMsg:
|
||||
if m.screen == screenGPUStressRunning {
|
||||
if len(msg.rows) > 0 {
|
||||
elapsed := time.Since(m.gpuLiveStart).Seconds()
|
||||
for i := range msg.rows {
|
||||
msg.rows[i].ElapsedSec = elapsed
|
||||
}
|
||||
m.gpuLiveRows = append(m.gpuLiveRows, msg.rows...)
|
||||
n := max(1, len(msg.indices))
|
||||
if len(m.gpuLiveRows) > 60*n {
|
||||
m.gpuLiveRows = m.gpuLiveRows[len(m.gpuLiveRows)-60*n:]
|
||||
}
|
||||
}
|
||||
return m, pollGPULive(msg.indices)
|
||||
}
|
||||
return m, nil
|
||||
case nvidiaSATDoneMsg:
|
||||
if m.nvidiaSATAborted {
|
||||
return m, nil
|
||||
|
||||
@@ -79,7 +79,7 @@ func (m model) View() string {
|
||||
case screenNvidiaSATRunning:
|
||||
body = renderNvidiaSATRunning()
|
||||
case screenGPUStressRunning:
|
||||
body = renderGPUStressRunning()
|
||||
body = renderGPUStressRunning(m)
|
||||
case screenOutput:
|
||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user