feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console

- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager
  from NVIDIA apt repo during live-build
- Enable nvidia-dcgm.service in chroot setup hook
- Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT
- TUI: replace GPU checkbox + duration UI with DCGM level selection
- Remove console=tty2 from boot params: KVM/VGA now shows tty1
  where bee-tui runs, fixing unresponsive console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:08:12 +03:00
parent 967455194c
commit eea98e6d76
12 changed files with 121 additions and 170 deletions

View File

@@ -72,7 +72,7 @@ type toolManager interface {
type satRunner interface {
RunNvidiaAcceptancePack(baseDir string) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
RunMemoryAcceptancePack(baseDir string) (string, error)
RunStorageAcceptancePack(baseDir string) (string, error)
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
@@ -423,23 +423,16 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
return a.sat.ListNvidiaGPUs()
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA acceptance pack through
// a.sat and wraps the outcome in an ActionResult.
//
// A blank (whitespace-only) baseDir is replaced with DefaultSATBaseDir. The
// result body names the archive path when a.sat returns one. The error from
// a.sat is returned unchanged next to the result.
//
// NOTE(review): the body is set to "Archive written." even when err is
// non-nil and path is empty; callers are expected to inspect err themselves.
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
// Generic message first; refined below when the archive path is known.
body := "Archive written."
if path != "" {
body = "Archive written to " + path
}
// Include terminal chart if available (runDir = archive path without .tar.gz).
if path != "" {
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
body += "\n\n" + string(chart)
}
}
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
}
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {

View File

@@ -123,7 +123,7 @@ func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return f.runNvidiaFn(baseDir)
}
// RunNvidiaAcceptancePackWithOptions is the test double for the options
// variant: it forwards only baseDir to runNvidiaFn and ignores the context and
// every other argument.
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
return f.runNvidiaFn(baseDir)
}

View File

@@ -125,10 +125,12 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress;
// any value outside 1-4 is replaced with 3 by nvidiaDCGMJobs.
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job.
// It returns whatever runAcceptancePackCtx returns for the "gpu-nvidia" prefix.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
}
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
@@ -275,27 +277,23 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
return archive, nil
}
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
var env []string
// nvidiaDCGMJobs builds the job list for the NVIDIA acceptance pack:
// nvidia-smi and dmidecode captures followed by one `dcgmi diag -r <level>` run.
//
// A diagLevel outside 1-4 is replaced with 3. When gpuIndices is non-empty the
// indices are passed to dcgmi via -i as a comma-separated list; otherwise no
// -i flag is added.
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
// Fall back to level 3 for any out-of-range request.
if diagLevel < 1 || diagLevel > 4 {
diagLevel = 3
}
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
// Convert the selected indices to strings so they can be comma-joined.
if len(gpuIndices) > 0 {
ids := make([]string, len(gpuIndices))
for i, idx := range gpuIndices {
ids[i] = strconv.Itoa(idx)
}
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
}
// Job names carry a numeric prefix that matches their position in the list.
return []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
{
name: "05-bee-gpu-stress.log",
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
env: env,
collectGPU: true,
gpuIndices: gpuIndices,
},
{name: "04-dcgmi-diag.log", cmd: diagArgs},
}
}

View File

@@ -32,11 +32,6 @@ type snapshotMsg struct {
panel app.HardwarePanelData
}
// nvidiaGPUsMsg carries the result of an asynchronous ListNvidiaGPUs call:
// the detected GPUs, or the error that prevented listing them.
type nvidiaGPUsMsg struct {
gpus []platform.NvidiaGPU
err error
}
type nvtopClosedMsg struct{}
type nvidiaSATDoneMsg struct {

View File

@@ -33,8 +33,6 @@ const (
hcCurTotal = 9
)
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
var hcModeDurations = [3]int{600, 3600, 28800}
// hcCPUDurations maps mode index to CPU stress-ng seconds.
var hcCPUDurations = [3]int{60, 300, 900}
@@ -232,7 +230,6 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) {
}
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
durationSec := hcModeDurations[m.hcMode]
durationIdx := m.hcMode
sel := m.hcSel
app := m.app
@@ -250,28 +247,14 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) {
}
parts = append(parts, "=== GPU (AMD) ===\n"+body)
} else {
gpus, err := app.ListNvidiaGPUs()
if err != nil || len(gpus) == 0 {
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
} else {
var indices []int
sizeMB := 0
for _, g := range gpus {
indices = append(indices, g.Index)
if sizeMB == 0 || g.MemoryMB < sizeMB {
sizeMB = g.MemoryMB
}
}
if sizeMB == 0 {
sizeMB = 64
}
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
body := r.Body
if err != nil {
body += "\nERROR: " + err.Error()
}
parts = append(parts, "=== GPU ===\n"+body)
// Map hcMode (0=Quick,1=Standard,2=Express) to DCGM level (1,2,3)
diagLevel := durationIdx + 1
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", diagLevel, nil)
body := r.Body
if err != nil {
body += "\nERROR: " + err.Error()
}
parts = append(parts, "=== GPU (DCGM) ===\n"+body)
}
}
if sel[hcMemory] {

View File

@@ -5,61 +5,33 @@ import (
"fmt"
"strings"
"bee/audit/internal/platform"
tea "github.com/charmbracelet/bubbletea"
)
var nvidiaDurationOptions = []struct {
// nvidiaDCGMOptions lists the DCGM diagnostic levels offered on the setup
// screen. label is the text shown next to the radio button, level is the value
// passed to the diagnostic run, and note is rendered in parentheses after the
// label.
// NOTE(review): the run times in the notes are estimates written into the
// strings; nothing in this file enforces them.
var nvidiaDCGMOptions = []struct {
label string
seconds int
level int
note string
}{
{"10 minutes", 600},
{"1 hour", 3600},
{"8 hours", 28800},
{"24 hours", 86400},
{"Level 1 — Quick", 1, "~1 min, configuration check"},
{"Level 2 — Medium", 2, "~2 min, memory test"},
{"Level 3 — Targeted stress", 3, "~10 min, SM + memory + PCIe [recommended]"},
{"Level 4 — Extended stress", 4, "~30 min, extended burn-in"},
}
// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
// enterNvidiaSATSetup switches to the DCGM level selection screen. It
// preselects index 2 (Level 3) for both the chosen option and the cursor,
// clears the busy state, and returns no command.
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
m.screen = screenNvidiaSATSetup
m.nvidiaGPUs = nil
m.nvidiaGPUSel = nil
m.nvidiaDurIdx = 0
m.nvidiaSATCursor = 0
m.busy = true
m.busyTitle = "NVIDIA SAT"
return m, func() tea.Msg {
gpus, err := m.app.ListNvidiaGPUs()
return nvidiaGPUsMsg{gpus: gpus, err: err}
}
}
// handleNvidiaGPUsMsg processes the GPU list response.
func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
// nvidiaDurIdx is reused as the index into nvidiaDCGMOptions.
m.nvidiaDurIdx = 2 // default: Level 3
// Place the cursor on the default option as well.
m.nvidiaSATCursor = 2
m.busy = false
m.busyTitle = ""
if msg.err != nil {
m.title = "NVIDIA SAT"
m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
m.prevScreen = screenHealthCheck
m.screen = screenOutput
return m, nil
}
m.nvidiaGPUs = msg.gpus
m.nvidiaGPUSel = make([]bool, len(msg.gpus))
for i := range m.nvidiaGPUSel {
m.nvidiaGPUSel[i] = true // all selected by default
}
m.nvidiaSATCursor = 0
return m, nil
}
// updateNvidiaSATSetup handles keys on the setup screen.
// updateNvidiaSATSetup handles keys on the DCGM setup screen.
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
numDur := len(nvidiaDurationOptions)
numGPU := len(m.nvidiaGPUs)
totalItems := numDur + numGPU + 2 // +2: Start, Cancel
numOpts := len(nvidiaDCGMOptions)
totalItems := numOpts + 2 // +2: Start, Cancel
switch msg.String() {
case "up", "k":
if m.nvidiaSATCursor > 0 {
@@ -69,23 +41,12 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
if m.nvidiaSATCursor < totalItems-1 {
m.nvidiaSATCursor++
}
case " ":
switch {
case m.nvidiaSATCursor < numDur:
m.nvidiaDurIdx = m.nvidiaSATCursor
case m.nvidiaSATCursor < numDur+numGPU:
i := m.nvidiaSATCursor - numDur
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
}
case "enter":
startIdx := numDur + numGPU
case " ", "enter":
startIdx := numOpts
cancelIdx := startIdx + 1
switch {
case m.nvidiaSATCursor < numDur:
case m.nvidiaSATCursor < numOpts:
m.nvidiaDurIdx = m.nvidiaSATCursor
case m.nvidiaSATCursor < startIdx:
i := m.nvidiaSATCursor - numDur
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
case m.nvidiaSATCursor == startIdx:
return m.startNvidiaSAT()
case m.nvidiaSATCursor == cancelIdx:
@@ -101,34 +62,9 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
// startNvidiaSAT launches the NVIDIA acceptance pack.
// startNvidiaSAT launches the DCGM diagnostic.
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
var selectedGPUs []platform.NvidiaGPU
for i, sel := range m.nvidiaGPUSel {
if sel {
selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
}
}
if len(selectedGPUs) == 0 {
selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
}
sizeMB := 0
for _, g := range selectedGPUs {
if sizeMB == 0 || g.MemoryMB < sizeMB {
sizeMB = g.MemoryMB
}
}
if sizeMB == 0 {
sizeMB = 64
}
var gpuIndices []int
for _, g := range selectedGPUs {
gpuIndices = append(gpuIndices, g.Index)
}
durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
diagLevel := nvidiaDCGMOptions[m.nvidiaDurIdx].level
ctx, cancel := context.WithCancel(context.Background())
m.nvidiaSATCancel = cancel
@@ -137,7 +73,7 @@ func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
m.nvidiaSATCursor = 0
satCmd := func() tea.Msg {
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, nil)
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
}
@@ -161,13 +97,13 @@ func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
return m, nil
}
// renderNvidiaSATSetup renders the setup screen.
// renderNvidiaSATSetup renders the DCGM level selection screen.
func renderNvidiaSATSetup(m model) string {
var b strings.Builder
fmt.Fprintln(&b, "NVIDIA SAT")
fmt.Fprintln(&b, "NVIDIA Diagnostics (DCGM)")
fmt.Fprintln(&b)
fmt.Fprintln(&b, "Duration:")
for i, opt := range nvidiaDurationOptions {
fmt.Fprintln(&b, "Diagnostic level:")
for i, opt := range nvidiaDCGMOptions {
radio := "( )"
if i == m.nvidiaDurIdx {
radio = "(*)"
@@ -176,27 +112,10 @@ func renderNvidiaSATSetup(m model) string {
if m.nvidiaSATCursor == i {
prefix = "> "
}
fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
fmt.Fprintf(&b, "%s%s %s (%s)\n", prefix, radio, opt.label, opt.note)
}
fmt.Fprintln(&b)
if len(m.nvidiaGPUs) == 0 {
fmt.Fprintln(&b, "GPUs: (none detected)")
} else {
fmt.Fprintln(&b, "GPUs:")
for i, gpu := range m.nvidiaGPUs {
check := "[ ]"
if m.nvidiaGPUSel[i] {
check = "[x]"
}
prefix := " "
if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
prefix = "> "
}
fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
}
}
fmt.Fprintln(&b)
startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
startIdx := len(nvidiaDCGMOptions)
startPfx := " "
cancelPfx := " "
if m.nvidiaSATCursor == startIdx {
@@ -208,11 +127,11 @@ func renderNvidiaSATSetup(m model) string {
fmt.Fprintf(&b, "%sStart\n", startPfx)
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
fmt.Fprintln(&b)
b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n")
b.WriteString("[↑/↓] move [space/enter] select [esc] cancel\n")
return b.String()
}
// renderNvidiaSATRunning returns the static text shown while the diagnostic
// is running: a title, a status line, and the abort/quit key hints.
func renderNvidiaSATRunning() string {
return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
return "NVIDIA Diagnostics (DCGM)\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
}

View File

@@ -91,8 +91,6 @@ type model struct {
burnInitialized bool
// NVIDIA SAT setup
nvidiaGPUs []platform.NvidiaGPU
nvidiaGPUSel []bool
nvidiaDurIdx int
nvidiaSATCursor int

View File

@@ -112,8 +112,6 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.screen = screenExportTargets
m.cursor = 0
return m, m.refreshSnapshotCmd()
case nvidiaGPUsMsg:
return m.handleNvidiaGPUsMsg(msg)
case nvtopClosedMsg:
return m, nil
case gpuStressDoneMsg: