Files
bee/audit/internal/tui/screen_nvidia_sat.go
Michael Chus eea98e6d76 feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager
  from NVIDIA apt repo during live-build
- Enable nvidia-dcgm.service in chroot setup hook
- Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT
- TUI: replace GPU checkbox + duration UI with DCGM level selection
- Remove console=tty2 from boot params: KVM/VGA now shows tty1
  where bee-tui runs, fixing unresponsive console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 23:08:12 +03:00

138 lines
3.5 KiB
Go

package tui
import (
"context"
"fmt"
"strings"
tea "github.com/charmbracelet/bubbletea"
)
// nvidiaDCGMOptions lists the diagnostic levels offered on the DCGM setup
// screen, in display order. Each entry carries the menu label, the numeric
// level passed to the diagnostic run, and a short note that is rendered in
// parentheses after the label.
var nvidiaDCGMOptions = []struct {
	label string
	level int
	note  string
}{
	{
		label: "Level 1 — Quick",
		level: 1,
		note:  "~1 min, configuration check",
	},
	{
		label: "Level 2 — Medium",
		level: 2,
		note:  "~2 min, memory test",
	},
	{
		label: "Level 3 — Targeted stress",
		level: 3,
		note:  "~10 min, SM + memory + PCIe [recommended]",
	},
	{
		label: "Level 4 — Extended stress",
		level: 4,
		note:  "~30 min, extended burn-in",
	},
}
// enterNvidiaSATSetup switches to the DCGM level selection screen, clears the
// busy flag, and preselects the default level with the cursor placed on it.
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
	// Index 2 in nvidiaDCGMOptions is Level 3, the entry marked recommended.
	const defaultIdx = 2

	m.busy = false
	m.nvidiaSATCursor = defaultIdx
	m.nvidiaDurIdx = defaultIdx
	m.screen = screenNvidiaSATSetup
	return m, nil
}
// updateNvidiaSATSetup processes a key press on the DCGM setup screen.
//
// The menu consists of one row per diagnostic level followed by two action
// rows, Start and Cancel. Up/down (or k/j) move the cursor without wrapping;
// space/enter activates the row under the cursor; esc goes back to
// screenHealthCheck; ctrl+c or q quits the program.
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
	levelCount := len(nvidiaDCGMOptions)
	startRow := levelCount      // first action row, right after the levels
	cancelRow := levelCount + 1 // last row of the menu

	switch key := msg.String(); key {
	case "ctrl+c", "q":
		return m, tea.Quit

	case "esc":
		m.screen = screenHealthCheck
		m.cursor = 0

	case "k", "up":
		if m.nvidiaSATCursor > 0 {
			m.nvidiaSATCursor--
		}

	case "j", "down":
		if m.nvidiaSATCursor < cancelRow {
			m.nvidiaSATCursor++
		}

	case "enter", " ":
		if m.nvidiaSATCursor == startRow {
			return m.startNvidiaSAT()
		}
		if m.nvidiaSATCursor == cancelRow {
			m.screen = screenHealthCheck
			m.cursor = 0
		}
		if m.nvidiaSATCursor < levelCount {
			// A level row: remember it as the selected diagnostic level.
			m.nvidiaDurIdx = m.nvidiaSATCursor
		}
	}
	return m, nil
}
// startNvidiaSAT launches the DCGM diagnostic at the currently selected level.
//
// It switches to the running screen, stores the cancel function of the run's
// context on the model so the run can be aborted later, and returns a command
// that executes the diagnostic and reports its outcome as nvidiaSATDoneMsg.
//
// If the stored selection does not index nvidiaDCGMOptions, nothing is started
// and the setup screen is shown again with its defaults, instead of panicking
// on an out-of-range slice access.
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
	if m.nvidiaDurIdx < 0 || m.nvidiaDurIdx >= len(nvidiaDCGMOptions) {
		return m.enterNvidiaSATSetup()
	}
	diagLevel := nvidiaDCGMOptions[m.nvidiaDurIdx].level

	// Release a context left over from an earlier run before its cancel
	// function is overwritten; calling a CancelFunc again is a no-op.
	if m.nvidiaSATCancel != nil {
		m.nvidiaSATCancel()
	}

	ctx, cancel := context.WithCancel(context.Background())
	m.nvidiaSATCancel = cancel
	m.nvidiaSATAborted = false
	m.screen = screenNvidiaSATRunning
	m.nvidiaSATCursor = 0

	satCmd := func() tea.Msg {
		result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, nil)
		return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
	}
	return m, satCmd
}
// updateNvidiaSATRunning handles keys while the DCGM diagnostic is running.
//
// "a"/"A" aborts: the run's context is cancelled, the model is flagged as
// aborted, and screenHealthCheck is shown. "ctrl+c" quits the program and
// cancels the run's context first, so the diagnostic is signalled to stop
// rather than left with nobody holding its cancel function. Other keys are
// ignored.
//
// NOTE(review): this assumes RunNvidiaAcceptancePackWithOptions stops its work
// when its context is cancelled — confirm against its implementation.
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
	switch msg.String() {
	case "a", "A":
		if m.nvidiaSATCancel != nil {
			m.nvidiaSATCancel()
			m.nvidiaSATCancel = nil
		}
		m.nvidiaSATAborted = true
		m.screen = screenHealthCheck
		m.cursor = 0
	case "ctrl+c":
		// Cancel before quitting, matching the abort path above.
		if m.nvidiaSATCancel != nil {
			m.nvidiaSATCancel()
			m.nvidiaSATCancel = nil
		}
		return m, tea.Quit
	}
	return m, nil
}
// renderNvidiaSATSetup draws the DCGM level selection screen: a title, one
// radio-style row per diagnostic level, the Start and Cancel rows, and a key
// hint line. The row under the cursor is prefixed with "> " and the currently
// selected level is marked "(*)".
func renderNvidiaSATSetup(m model) string {
	// cursorMark returns the line prefix for the menu row at index row.
	cursorMark := func(row int) string {
		if row == m.nvidiaSATCursor {
			return "> "
		}
		return " "
	}

	var out strings.Builder
	for _, heading := range []string{"NVIDIA Diagnostics (DCGM)", "", "Diagnostic level:"} {
		out.WriteString(heading + "\n")
	}

	for idx := range nvidiaDCGMOptions {
		opt := nvidiaDCGMOptions[idx]
		selected := "( )"
		if m.nvidiaDurIdx == idx {
			selected = "(*)"
		}
		fmt.Fprintf(&out, "%s%s %s (%s)\n", cursorMark(idx), selected, opt.label, opt.note)
	}
	out.WriteString("\n")

	// The two action rows follow the level rows in cursor order.
	startRow := len(nvidiaDCGMOptions)
	fmt.Fprintf(&out, "%sStart\n", cursorMark(startRow))
	fmt.Fprintf(&out, "%sCancel\n", cursorMark(startRow+1))
	out.WriteString("\n")

	out.WriteString("[↑/↓] move [space/enter] select [esc] cancel\n")
	return out.String()
}
// renderNvidiaSATRunning returns the static text shown while the DCGM
// diagnostic is in progress, including the abort and quit key hints.
func renderNvidiaSATRunning() string {
	const (
		title  = "NVIDIA Diagnostics (DCGM)"
		status = "Test is running..."
		hints  = "[a] Abort test [ctrl+c] quit"
	)
	return title + "\n\n" + status + "\n\n" + hints + "\n"
}