feat(tui): NVIDIA SAT with nvtop, GPU selection, metrics and chart — v1.0.0
- TUI: duration presets (10m/1h/8h/24h), GPU multi-select checkboxes - nvtop launched concurrently with SAT via tea.ExecProcess; can reopen or abort - GPU metrics collected per-second during bee-gpu-stress (temp/usage/power/clock) - Outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG), gpu-metrics-term.txt - Terminal chart: asciigraph-style line chart with box-drawing chars and ANSI colours - AUDIT_VERSION bumped 0.1.1 → 1.0.0; nvtop added to ISO package list - runtime-flows.md updated with full NVIDIA SAT TUI flow documentation Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
@@ -71,8 +72,10 @@ type toolManager interface {
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -395,6 +398,29 @@ func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
// Include terminal chart if available (runDir = archive path without .tar.gz).
|
||||
if path != "" {
|
||||
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
|
||||
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
|
||||
body += "\n\n" + string(chart)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
@@ -105,6 +106,14 @@ func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
577
audit/internal/platform/gpu_metrics.go
Normal file
577
audit/internal/platform/gpu_metrics.go
Normal file
@@ -0,0 +1,577 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64
|
||||
GPUIndex int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
rows = append(rows, GPUMetricRow{
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
func parseGPUFloat(s string) float64 {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "N/A" || s == "[Not Supported]" || s == "" {
|
||||
return 0
|
||||
}
|
||||
v, _ := strconv.ParseFloat(s, 64)
|
||||
return v
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
ts := time.Now().UTC().Format("2006-01-02 15:04:05 UTC")
|
||||
html := fmt.Sprintf(`<!DOCTYPE html>
|
||||
<html><head>
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||
</style>
|
||||
</head><body>
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
</body></html>`, ts, svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
const plotX2 = 840 // power axis / chart right border
|
||||
const plotY1 = 70 // top
|
||||
const plotY2 = 465 // bottom (PH = 395)
|
||||
const PW = plotX2 - plotX1
|
||||
const PH = plotY2 - plotY1
|
||||
// Outer axes
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const clockAxisX = 900 // clock axis line
|
||||
|
||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||
seriesLabel := [4]string{
|
||||
fmt.Sprintf("GPU %d Temp (°C)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Usage (%%)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Power (W)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Clock (MHz)", gpuIdx),
|
||||
}
|
||||
axisLabel := [4]string{"Temperature (°C)", "GPU Usage (%)", "Power (W)", "Clock (MHz)"}
|
||||
|
||||
// Extract series
|
||||
t := make([]float64, len(rows))
|
||||
vals := [4][]float64{}
|
||||
for i := range vals {
|
||||
vals[i] = make([]float64, len(rows))
|
||||
}
|
||||
for i, r := range rows {
|
||||
t[i] = r.ElapsedSec
|
||||
vals[0][i] = r.TempC
|
||||
vals[1][i] = r.UsagePct
|
||||
vals[2][i] = r.PowerW
|
||||
vals[3][i] = r.ClockMHz
|
||||
}
|
||||
|
||||
tMin, tMax := gpuMinMax(t)
|
||||
type axisScale struct {
|
||||
ticks []float64
|
||||
min, max float64
|
||||
}
|
||||
var axes [4]axisScale
|
||||
for i := 0; i < 4; i++ {
|
||||
mn, mx := gpuMinMax(vals[i])
|
||||
tks := gpuNiceTicks(mn, mx, 8)
|
||||
axes[i] = axisScale{ticks: tks, min: tks[0], max: tks[len(tks)-1]}
|
||||
}
|
||||
|
||||
xv := func(tv float64) float64 {
|
||||
if tMax == tMin {
|
||||
return float64(plotX1)
|
||||
}
|
||||
return float64(plotX1) + (tv-tMin)/(tMax-tMin)*float64(PW)
|
||||
}
|
||||
yv := func(v float64, ai int) float64 {
|
||||
a := axes[ai]
|
||||
if a.max == a.min {
|
||||
return float64(plotY1 + PH/2)
|
||||
}
|
||||
return float64(plotY2) - (v-a.min)/(a.max-a.min)*float64(PH)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
fmt.Fprintf(&b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d"`+
|
||||
` style="background:#fff;border-radius:8px;display:block;margin:0 auto 24px;`+
|
||||
`box-shadow:0 2px 12px rgba(0,0,0,.12)">`+"\n", W, H)
|
||||
|
||||
// Title
|
||||
fmt.Fprintf(&b, `<text x="%d" y="22" text-anchor="middle" font-family="sans-serif"`+
|
||||
` font-size="14" font-weight="bold" fill="#333">GPU Stress Test Metrics — GPU %d</text>`+"\n",
|
||||
plotX1+PW/2, gpuIdx)
|
||||
|
||||
// Horizontal grid (align to temp axis ticks)
|
||||
b.WriteString(`<g stroke="#e0e0e0" stroke-width="0.5">` + "\n")
|
||||
for _, tick := range axes[0].ticks {
|
||||
y := yv(tick, 0)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
plotX1, y, plotX2, y)
|
||||
}
|
||||
// Vertical grid
|
||||
xTicks := gpuNiceTicks(tMin, tMax, 10)
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, plotY1, x, plotY2)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
plotX1, plotY1, PW, PH)
|
||||
|
||||
// X axis ticks and labels
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#333" text-anchor="middle">` + "\n")
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, plotY2+18, gpuFormatTick(tv))
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
x, plotY2, x, plotY2+4)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="13"`+
|
||||
` fill="#333" text-anchor="middle">Time (seconds)</text>`+"\n",
|
||||
plotX1+PW/2, plotY2+38)
|
||||
|
||||
// Y axes: [tempAxisX, plotX1, plotX2, clockAxisX]
|
||||
axisLineX := [4]int{tempAxisX, plotX1, plotX2, clockAxisX}
|
||||
axisRight := [4]bool{false, false, true, true}
|
||||
// Label x positions (for rotated vertical text)
|
||||
axisLabelX := [4]int{10, 68, 868, 950}
|
||||
|
||||
for i := 0; i < 4; i++ {
|
||||
ax := axisLineX[i]
|
||||
right := axisRight[i]
|
||||
color := colors[i]
|
||||
|
||||
// Axis line
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, plotY1, ax, plotY2, color)
|
||||
|
||||
// Ticks and tick labels
|
||||
fmt.Fprintf(&b, `<g font-family="sans-serif" font-size="10" fill="%s">`+"\n", color)
|
||||
for _, tick := range axes[i].ticks {
|
||||
y := yv(tick, i)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
dx := -5
|
||||
textX := ax - 8
|
||||
anchor := "end"
|
||||
if right {
|
||||
dx = 5
|
||||
textX = ax + 8
|
||||
anchor = "start"
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, y, ax+dx, y, color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="%s" dy="4">%s</text>`+"\n",
|
||||
textX, y, anchor, gpuFormatTick(tick))
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Axis label (rotated)
|
||||
lx := axisLabelX[i]
|
||||
fmt.Fprintf(&b, `<text transform="translate(%d,%d) rotate(-90)"`+
|
||||
` font-family="sans-serif" font-size="12" fill="%s" text-anchor="middle">%s</text>`+"\n",
|
||||
lx, plotY1+PH/2, color, axisLabel[i])
|
||||
}
|
||||
|
||||
// Data lines
|
||||
for i := 0; i < 4; i++ {
|
||||
var pts strings.Builder
|
||||
for j := range rows {
|
||||
x := xv(t[j])
|
||||
y := yv(vals[i][j], i)
|
||||
if j == 0 {
|
||||
fmt.Fprintf(&pts, "%.1f,%.1f", x, y)
|
||||
} else {
|
||||
fmt.Fprintf(&pts, " %.1f,%.1f", x, y)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="1.5"/>`+"\n",
|
||||
pts.String(), colors[i])
|
||||
}
|
||||
|
||||
// Legend
|
||||
const legendY = 42
|
||||
for i := 0; i < 4; i++ {
|
||||
lx := plotX1 + i*(PW/4) + 10
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="2"/>`+"\n",
|
||||
lx, legendY, lx+20, legendY, colors[i])
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="12" fill="#333">%s</text>`+"\n",
|
||||
lx+25, legendY+4, seriesLabel[i])
|
||||
}
|
||||
|
||||
b.WriteString("</svg>\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
const (
|
||||
termChartWidth = 70
|
||||
termChartHeight = 12
|
||||
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Suitable for display in the TUI screenOutput.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
caption string
|
||||
color string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||
for _, d := range defs {
|
||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||
termChartHeight, termChartWidth))
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
if len(vals) == 0 {
|
||||
return caption + "\n"
|
||||
}
|
||||
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
|
||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
||||
w := width
|
||||
if len(vals) < w {
|
||||
w = len(vals)
|
||||
}
|
||||
data := gpuDownsample(vals, w)
|
||||
|
||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
||||
row := make([]int, w)
|
||||
for i, v := range data {
|
||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
||||
if r < 0 {
|
||||
r = 0
|
||||
}
|
||||
if r > height {
|
||||
r = height
|
||||
}
|
||||
row[i] = r
|
||||
}
|
||||
|
||||
// Fill the character grid.
|
||||
grid := make([][]rune, height+1)
|
||||
for i := range grid {
|
||||
grid[i] = make([]rune, w)
|
||||
for j := range grid[i] {
|
||||
grid[i][j] = ' '
|
||||
}
|
||||
}
|
||||
for x := 0; x < w; x++ {
|
||||
r := row[x]
|
||||
if x == 0 {
|
||||
grid[r][0] = '─'
|
||||
continue
|
||||
}
|
||||
p := row[x-1]
|
||||
switch {
|
||||
case r == p:
|
||||
grid[r][x] = '─'
|
||||
case r < p: // value went up (row index decreased toward top)
|
||||
grid[r][x] = '╭'
|
||||
grid[p][x] = '╯'
|
||||
for y := r + 1; y < p; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
default: // r > p, value went down
|
||||
grid[p][x] = '╮'
|
||||
grid[r][x] = '╰'
|
||||
for y := p + 1; y < r; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Y axis tick labels.
|
||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
||||
tickAtRow := make(map[int]string)
|
||||
labelWidth := 4
|
||||
for _, t := range ticks {
|
||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
||||
if r < 0 || r > height {
|
||||
continue
|
||||
}
|
||||
s := gpuFormatTick(t)
|
||||
tickAtRow[r] = s
|
||||
if len(s) > labelWidth {
|
||||
labelWidth = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for r := 0; r <= height; r++ {
|
||||
label := tickAtRow[r]
|
||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
||||
switch {
|
||||
case label != "":
|
||||
b.WriteRune('┤')
|
||||
case r == height:
|
||||
b.WriteRune('┼')
|
||||
default:
|
||||
b.WriteRune('│')
|
||||
}
|
||||
b.WriteString(color)
|
||||
b.WriteString(string(grid[r]))
|
||||
b.WriteString(ansiReset)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
// Bottom axis.
|
||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
||||
b.WriteRune('└')
|
||||
b.WriteString(strings.Repeat("─", w))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Caption centered under the chart.
|
||||
if caption != "" {
|
||||
total := labelWidth + 1 + w
|
||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
||||
b.WriteString(strings.Repeat(" ", pad))
|
||||
}
|
||||
b.WriteString(caption)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||
v := make([]float64, len(rows))
|
||||
for i, r := range rows {
|
||||
v[i] = fn(r)
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
||||
func gpuDownsample(vals []float64, w int) []float64 {
|
||||
n := len(vals)
|
||||
if n == 0 {
|
||||
return make([]float64, w)
|
||||
}
|
||||
result := make([]float64, w)
|
||||
if n >= w {
|
||||
counts := make([]int, w)
|
||||
for i, v := range vals {
|
||||
bucket := i * w / n
|
||||
if bucket >= w {
|
||||
bucket = w - 1
|
||||
}
|
||||
result[bucket] += v
|
||||
counts[bucket]++
|
||||
}
|
||||
for i := range result {
|
||||
if counts[i] > 0 {
|
||||
result[i] /= float64(counts[i])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Nearest-neighbour upsample.
|
||||
for i := range result {
|
||||
src := i * (n - 1) / (w - 1)
|
||||
if src >= n {
|
||||
src = n - 1
|
||||
}
|
||||
result[i] = vals[src]
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
mn, mx := vals[0], vals[0]
|
||||
for _, v := range vals[1:] {
|
||||
if v < mn {
|
||||
mn = v
|
||||
}
|
||||
if v > mx {
|
||||
mx = v
|
||||
}
|
||||
}
|
||||
return mn, mx
|
||||
}
|
||||
|
||||
func gpuNiceTicks(mn, mx float64, targetCount int) []float64 {
|
||||
if mn == mx {
|
||||
mn -= 1
|
||||
mx += 1
|
||||
}
|
||||
r := mx - mn
|
||||
step := math.Pow(10, math.Floor(math.Log10(r/float64(targetCount))))
|
||||
for _, f := range []float64{1, 2, 5, 10} {
|
||||
if r/(f*step) <= float64(targetCount)*1.5 {
|
||||
step = f * step
|
||||
break
|
||||
}
|
||||
}
|
||||
lo := math.Floor(mn/step) * step
|
||||
hi := math.Ceil(mx/step) * step
|
||||
var ticks []float64
|
||||
for v := lo; v <= hi+step*0.001; v += step {
|
||||
ticks = append(ticks, math.Round(v*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func gpuFormatTick(v float64) string {
|
||||
if v == math.Trunc(v) {
|
||||
return strconv.Itoa(int(v))
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
@@ -3,6 +3,7 @@ package platform
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
@@ -14,10 +15,55 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
Name string
|
||||
MemoryMB int
|
||||
}
|
||||
|
||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
out, err := exec.Command("nvidia-smi",
|
||||
"--query-gpu=index,name,memory.total",
|
||||
"--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var gpus []NvidiaGPU
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, ", ", 3)
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
||||
gpus = append(gpus, NvidiaGPU{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
MemoryMB: memMB,
|
||||
})
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
}
|
||||
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
||||
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
@@ -84,8 +130,11 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
name string
|
||||
cmd []string
|
||||
name string
|
||||
cmd []string
|
||||
env []string // extra env vars (appended to os.Environ)
|
||||
collectGPU bool // collect GPU metrics via nvidia-smi while this job runs
|
||||
gpuIndices []int // GPU indices to collect metrics for (empty = all)
|
||||
}
|
||||
|
||||
type satStats struct {
|
||||
@@ -147,6 +196,109 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
||||
var env []string
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{
|
||||
name: "05-bee-gpu-stress.log",
|
||||
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
||||
env: env,
|
||||
collectGPU: true,
|
||||
gpuIndices: gpuIndices,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
}
|
||||
|
||||
var out []byte
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
)
|
||||
|
||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
out, err := c.CombinedOutput()
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
if err != nil {
|
||||
@@ -251,6 +403,51 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return out, err
|
||||
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
var metricRows []GPUMetricRow
|
||||
start := time.Now()
|
||||
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
samples, err := sampleGPUMetrics(gpuIndices)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
elapsed := time.Since(start).Seconds()
|
||||
for i := range samples {
|
||||
samples[i].ElapsedSec = elapsed
|
||||
}
|
||||
metricRows = append(metricRows, samples...)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
||||
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
if len(metricRows) > 0 {
|
||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||
chart := RenderGPUTerminalChart(metricRows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
||||
}
|
||||
|
||||
return out, err
|
||||
}
|
||||
|
||||
func appendSATVerboseLog(path string, lines ...string) {
|
||||
if path == "" {
|
||||
return
|
||||
|
||||
@@ -27,3 +27,16 @@ type exportTargetsMsg struct {
|
||||
type bannerMsg struct {
|
||||
text string
|
||||
}
|
||||
|
||||
type nvidiaGPUsMsg struct {
|
||||
gpus []platform.NvidiaGPU
|
||||
err error
|
||||
}
|
||||
|
||||
type nvtopClosedMsg struct{}
|
||||
|
||||
type nvidiaSATDoneMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ func (m model) handleAcceptanceMenu() (tea.Model, tea.Cmd) {
|
||||
}
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
return m.enterNvidiaSATSetup()
|
||||
case 1:
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
case 2:
|
||||
|
||||
238
audit/internal/tui/screen_nvidia_sat.go
Normal file
238
audit/internal/tui/screen_nvidia_sat.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
var nvidiaDurationOptions = []struct {
|
||||
label string
|
||||
seconds int
|
||||
}{
|
||||
{"10 minutes", 600},
|
||||
{"1 hour", 3600},
|
||||
{"8 hours", 28800},
|
||||
{"24 hours", 86400},
|
||||
}
|
||||
|
||||
// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
|
||||
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
|
||||
m.screen = screenNvidiaSATSetup
|
||||
m.nvidiaGPUs = nil
|
||||
m.nvidiaGPUSel = nil
|
||||
m.nvidiaDurIdx = 0
|
||||
m.nvidiaSATCursor = 0
|
||||
m.busy = true
|
||||
m.busyTitle = "NVIDIA SAT"
|
||||
return m, func() tea.Msg {
|
||||
gpus, err := m.app.ListNvidiaGPUs()
|
||||
return nvidiaGPUsMsg{gpus: gpus, err: err}
|
||||
}
|
||||
}
|
||||
|
||||
// handleNvidiaGPUsMsg processes the GPU list response.
|
||||
func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "NVIDIA SAT"
|
||||
m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
|
||||
m.prevScreen = screenAcceptance
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.nvidiaGPUs = msg.gpus
|
||||
m.nvidiaGPUSel = make([]bool, len(msg.gpus))
|
||||
for i := range m.nvidiaGPUSel {
|
||||
m.nvidiaGPUSel[i] = true // all selected by default
|
||||
}
|
||||
m.nvidiaSATCursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// updateNvidiaSATSetup handles keys on the setup screen.
|
||||
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
numDur := len(nvidiaDurationOptions)
|
||||
numGPU := len(m.nvidiaGPUs)
|
||||
totalItems := numDur + numGPU + 2 // +2: Start, Cancel
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.nvidiaSATCursor > 0 {
|
||||
m.nvidiaSATCursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.nvidiaSATCursor < totalItems-1 {
|
||||
m.nvidiaSATCursor++
|
||||
}
|
||||
case " ":
|
||||
switch {
|
||||
case m.nvidiaSATCursor < numDur:
|
||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
||||
case m.nvidiaSATCursor < numDur+numGPU:
|
||||
i := m.nvidiaSATCursor - numDur
|
||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
||||
}
|
||||
case "enter":
|
||||
startIdx := numDur + numGPU
|
||||
cancelIdx := startIdx + 1
|
||||
switch {
|
||||
case m.nvidiaSATCursor < numDur:
|
||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
||||
case m.nvidiaSATCursor < startIdx:
|
||||
i := m.nvidiaSATCursor - numDur
|
||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
||||
case m.nvidiaSATCursor == startIdx:
|
||||
return m.startNvidiaSAT()
|
||||
case m.nvidiaSATCursor == cancelIdx:
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
}
|
||||
case "esc":
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
case "ctrl+c", "q":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startNvidiaSAT launches the SAT and nvtop.
|
||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
var selectedGPUs []platform.NvidiaGPU
|
||||
for i, sel := range m.nvidiaGPUSel {
|
||||
if sel {
|
||||
selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
|
||||
}
|
||||
}
|
||||
if len(selectedGPUs) == 0 {
|
||||
selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
|
||||
}
|
||||
|
||||
sizeMB := 0
|
||||
for _, g := range selectedGPUs {
|
||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
||||
sizeMB = g.MemoryMB
|
||||
}
|
||||
}
|
||||
if sizeMB == 0 {
|
||||
sizeMB = 64
|
||||
}
|
||||
|
||||
var gpuIndices []int
|
||||
for _, g := range selectedGPUs {
|
||||
gpuIndices = append(gpuIndices, g.Index)
|
||||
}
|
||||
|
||||
durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.nvidiaSATCancel = cancel
|
||||
m.nvidiaSATAborted = false
|
||||
m.screen = screenNvidiaSATRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
satCmd := func() tea.Msg {
|
||||
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
|
||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
// nvtop not available: just run the SAT, show running screen
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
satCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
// updateNvidiaSATRunning handles keys on the running screen.
|
||||
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.nvidiaSATCancel != nil {
|
||||
m.nvidiaSATCancel()
|
||||
m.nvidiaSATCancel = nil
|
||||
}
|
||||
m.nvidiaSATAborted = true
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// renderNvidiaSATSetup renders the setup screen.
|
||||
func renderNvidiaSATSetup(m model) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintln(&b, "NVIDIA SAT")
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "Duration:")
|
||||
for i, opt := range nvidiaDurationOptions {
|
||||
radio := "( )"
|
||||
if i == m.nvidiaDurIdx {
|
||||
radio = "(*)"
|
||||
}
|
||||
prefix := " "
|
||||
if m.nvidiaSATCursor == i {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
if len(m.nvidiaGPUs) == 0 {
|
||||
fmt.Fprintln(&b, "GPUs: (none detected)")
|
||||
} else {
|
||||
fmt.Fprintln(&b, "GPUs:")
|
||||
for i, gpu := range m.nvidiaGPUs {
|
||||
check := "[ ]"
|
||||
if m.nvidiaGPUSel[i] {
|
||||
check = "[x]"
|
||||
}
|
||||
prefix := " "
|
||||
if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
|
||||
startPfx := " "
|
||||
cancelPfx := " "
|
||||
if m.nvidiaSATCursor == startIdx {
|
||||
startPfx = "> "
|
||||
}
|
||||
if m.nvidiaSATCursor == startIdx+1 {
|
||||
cancelPfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%sStart\n", startPfx)
|
||||
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
|
||||
fmt.Fprintln(&b)
|
||||
b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderNvidiaSATRunning renders the running screen.
|
||||
func renderNvidiaSATRunning() string {
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
}
|
||||
@@ -255,7 +255,7 @@ func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcceptanceConfirmFlow(t *testing.T) {
|
||||
func TestAcceptanceNvidiaSATOpensSetup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
@@ -265,17 +265,15 @@ func TestAcceptanceConfirmFlow(t *testing.T) {
|
||||
next, cmd := m.handleAcceptanceMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
if cmd == nil {
|
||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionRunNvidiaSAT {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunNvidiaSAT)
|
||||
if got.screen != screenNvidiaSATSetup {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
||||
}
|
||||
|
||||
next, _ = got.updateConfirm(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
// esc from setup returns to acceptance
|
||||
next, _ = got.updateNvidiaSATSetup(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got = next.(model)
|
||||
if got.screen != screenAcceptance {
|
||||
t.Fatalf("screen after esc=%q want %q", got.screen, screenAcceptance)
|
||||
@@ -289,7 +287,6 @@ func TestAcceptanceMenuMapsNewTargets(t *testing.T) {
|
||||
cursor int
|
||||
want actionKind
|
||||
}{
|
||||
{cursor: 0, want: actionRunNvidiaSAT},
|
||||
{cursor: 1, want: actionRunMemorySAT},
|
||||
{cursor: 2, want: actionRunStorageSAT},
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
@@ -12,16 +14,18 @@ import (
|
||||
type screen string
|
||||
|
||||
const (
|
||||
screenMain screen = "main"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
screenServices screen = "services"
|
||||
screenServiceAction screen = "service_action"
|
||||
screenAcceptance screen = "acceptance"
|
||||
screenExportTargets screen = "export_targets"
|
||||
screenOutput screen = "output"
|
||||
screenStaticForm screen = "static_form"
|
||||
screenConfirm screen = "confirm"
|
||||
screenMain screen = "main"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
screenServices screen = "services"
|
||||
screenServiceAction screen = "service_action"
|
||||
screenAcceptance screen = "acceptance"
|
||||
screenExportTargets screen = "export_targets"
|
||||
screenOutput screen = "output"
|
||||
screenStaticForm screen = "static_form"
|
||||
screenConfirm screen = "confirm"
|
||||
screenNvidiaSATSetup screen = "nvidia_sat_setup"
|
||||
screenNvidiaSATRunning screen = "nvidia_sat_running"
|
||||
)
|
||||
|
||||
type actionKind string
|
||||
@@ -63,6 +67,16 @@ type model struct {
|
||||
|
||||
formFields []formField
|
||||
formIndex int
|
||||
|
||||
// NVIDIA SAT setup
|
||||
nvidiaGPUs []platform.NvidiaGPU
|
||||
nvidiaGPUSel []bool
|
||||
nvidiaDurIdx int // index into nvidiaDurationOptions
|
||||
nvidiaSATCursor int
|
||||
|
||||
// NVIDIA SAT running
|
||||
nvidiaSATCancel context.CancelFunc
|
||||
nvidiaSATAborted bool
|
||||
}
|
||||
|
||||
type formField struct {
|
||||
|
||||
@@ -87,6 +87,33 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
case bannerMsg:
|
||||
m.banner = strings.TrimSpace(msg.text)
|
||||
return m, nil
|
||||
case nvidiaGPUsMsg:
|
||||
return m.handleNvidiaGPUsMsg(msg)
|
||||
case nvtopClosedMsg:
|
||||
// nvtop closed — stay on running screen (or result if SAT is already done)
|
||||
return m, nil
|
||||
case nvidiaSATDoneMsg:
|
||||
if m.nvidiaSATAborted {
|
||||
return m, nil
|
||||
}
|
||||
if m.nvidiaSATCancel != nil {
|
||||
m.nvidiaSATCancel()
|
||||
m.nvidiaSATCancel = nil
|
||||
}
|
||||
m.prevScreen = screenAcceptance
|
||||
m.screen = screenOutput
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
body := strings.TrimSpace(msg.body)
|
||||
if body == "" {
|
||||
m.body = fmt.Sprintf("ERROR: %v", msg.err)
|
||||
} else {
|
||||
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
|
||||
}
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
return m, nil
|
||||
@@ -104,6 +131,10 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
|
||||
case screenAcceptance:
|
||||
return m.updateMenu(msg, 4, m.handleAcceptanceMenu)
|
||||
case screenNvidiaSATSetup:
|
||||
return m.updateNvidiaSATSetup(msg)
|
||||
case screenNvidiaSATRunning:
|
||||
return m.updateNvidiaSATRunning(msg)
|
||||
case screenExportTargets:
|
||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
||||
case screenInterfacePick:
|
||||
|
||||
@@ -43,6 +43,10 @@ func (m model) View() string {
|
||||
case screenConfirm:
|
||||
title, body := m.confirmBody()
|
||||
return renderConfirm(title, body, m.cursor)
|
||||
case screenNvidiaSATSetup:
|
||||
return renderNvidiaSATSetup(m)
|
||||
case screenNvidiaSATRunning:
|
||||
return renderNvidiaSATRunning()
|
||||
case screenOutput:
|
||||
return fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
|
||||
@@ -140,3 +140,34 @@ Acceptance flows:
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
- `BEE_MEMTESTER_SIZE_MB`
|
||||
- `BEE_MEMTESTER_PASSES`
|
||||
|
||||
## NVIDIA SAT TUI flow (v1.0.0+)
|
||||
|
||||
```
|
||||
TUI: Acceptance tests → NVIDIA command pack
|
||||
1. screenNvidiaSATSetup
|
||||
a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total`
|
||||
b. user selects duration preset: 10 min / 1 h / 8 h / 24 h
|
||||
c. user selects GPUs via checkboxes (all selected by default)
|
||||
d. memory size = max(selected GPU memory) — auto-detected, not exposed to user
|
||||
2. Start → screenNvidiaSATRunning
|
||||
a. CUDA_VISIBLE_DEVICES set to selected GPU indices
|
||||
b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently
|
||||
c. nvtop occupies full terminal; SAT result queues in background
|
||||
d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress)
|
||||
3. GPU metrics collection (during bee-gpu-stress)
|
||||
- background goroutine polls `nvidia-smi` every second
|
||||
- per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz
|
||||
- outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt
|
||||
4. After SAT completes
|
||||
- result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt)
|
||||
- chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU,
|
||||
Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock)
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO).
|
||||
- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel.
|
||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||
- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation).
|
||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||
|
||||
@@ -2,4 +2,4 @@ DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=6.1.0-43
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=0.1.1
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -27,6 +27,7 @@ less
|
||||
vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
|
||||
Reference in New Issue
Block a user