feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager from the NVIDIA apt repo during live-build
- Enable nvidia-dcgm.service in the chroot setup hook
- Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT
- TUI: replace GPU checkbox + duration UI with DCGM level selection
- Remove console=tty2 from boot params: KVM/VGA now shows tty1, where bee-tui runs, fixing the unresponsive console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,7 +72,7 @@ type toolManager interface {
|
|||||||
|
|
||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
|
||||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
||||||
@@ -423,23 +423,16 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
|
||||||
body := "Archive written."
|
body := "Archive written."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Archive written to " + path
|
body = "Archive written to " + path
|
||||||
}
|
}
|
||||||
// Include terminal chart if available (runDir = archive path without .tar.gz).
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
if path != "" {
|
|
||||||
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
|
|
||||||
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
|
|
||||||
body += "\n\n" + string(chart)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) {
|
||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -125,10 +125,12 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
|||||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||||
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
// ctx cancellation kills the running job.
|
||||||
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||||
@@ -275,27 +277,23 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
|||||||
return archive, nil
|
return archive, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
var env []string
|
if diagLevel < 1 || diagLevel > 4 {
|
||||||
|
diagLevel = 3
|
||||||
|
}
|
||||||
|
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
ids := make([]string, len(gpuIndices))
|
ids := make([]string, len(gpuIndices))
|
||||||
for i, idx := range gpuIndices {
|
for i, idx := range gpuIndices {
|
||||||
ids[i] = strconv.Itoa(idx)
|
ids[i] = strconv.Itoa(idx)
|
||||||
}
|
}
|
||||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||||
{
|
|
||||||
name: "05-bee-gpu-stress.log",
|
|
||||||
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
|
||||||
env: env,
|
|
||||||
collectGPU: true,
|
|
||||||
gpuIndices: gpuIndices,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,11 +32,6 @@ type snapshotMsg struct {
|
|||||||
panel app.HardwarePanelData
|
panel app.HardwarePanelData
|
||||||
}
|
}
|
||||||
|
|
||||||
type nvidiaGPUsMsg struct {
|
|
||||||
gpus []platform.NvidiaGPU
|
|
||||||
err error
|
|
||||||
}
|
|
||||||
|
|
||||||
type nvtopClosedMsg struct{}
|
type nvtopClosedMsg struct{}
|
||||||
|
|
||||||
type nvidiaSATDoneMsg struct {
|
type nvidiaSATDoneMsg struct {
|
||||||
|
|||||||
@@ -33,8 +33,6 @@ const (
|
|||||||
hcCurTotal = 9
|
hcCurTotal = 9
|
||||||
)
|
)
|
||||||
|
|
||||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
|
||||||
var hcModeDurations = [3]int{600, 3600, 28800}
|
|
||||||
|
|
||||||
// hcCPUDurations maps mode index to CPU stress-ng seconds.
|
// hcCPUDurations maps mode index to CPU stress-ng seconds.
|
||||||
var hcCPUDurations = [3]int{60, 300, 900}
|
var hcCPUDurations = [3]int{60, 300, 900}
|
||||||
@@ -232,7 +230,6 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
||||||
durationSec := hcModeDurations[m.hcMode]
|
|
||||||
durationIdx := m.hcMode
|
durationIdx := m.hcMode
|
||||||
sel := m.hcSel
|
sel := m.hcSel
|
||||||
app := m.app
|
app := m.app
|
||||||
@@ -250,28 +247,14 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
|||||||
}
|
}
|
||||||
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
||||||
} else {
|
} else {
|
||||||
gpus, err := app.ListNvidiaGPUs()
|
// Map hcMode (0=Quick,1=Standard,2=Express) to DCGM level (1,2,3)
|
||||||
if err != nil || len(gpus) == 0 {
|
diagLevel := durationIdx + 1
|
||||||
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
|
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", diagLevel, nil)
|
||||||
} else {
|
body := r.Body
|
||||||
var indices []int
|
if err != nil {
|
||||||
sizeMB := 0
|
body += "\nERROR: " + err.Error()
|
||||||
for _, g := range gpus {
|
|
||||||
indices = append(indices, g.Index)
|
|
||||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
|
||||||
sizeMB = g.MemoryMB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if sizeMB == 0 {
|
|
||||||
sizeMB = 64
|
|
||||||
}
|
|
||||||
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== GPU ===\n"+body)
|
|
||||||
}
|
}
|
||||||
|
parts = append(parts, "=== GPU (DCGM) ===\n"+body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if sel[hcMemory] {
|
if sel[hcMemory] {
|
||||||
|
|||||||
@@ -5,61 +5,33 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
tea "github.com/charmbracelet/bubbletea"
|
||||||
)
|
)
|
||||||
|
|
||||||
var nvidiaDurationOptions = []struct {
|
var nvidiaDCGMOptions = []struct {
|
||||||
label string
|
label string
|
||||||
seconds int
|
level int
|
||||||
|
note string
|
||||||
}{
|
}{
|
||||||
{"10 minutes", 600},
|
{"Level 1 — Quick", 1, "~1 min, configuration check"},
|
||||||
{"1 hour", 3600},
|
{"Level 2 — Medium", 2, "~2 min, memory test"},
|
||||||
{"8 hours", 28800},
|
{"Level 3 — Targeted stress", 3, "~10 min, SM + memory + PCIe [recommended]"},
|
||||||
{"24 hours", 86400},
|
{"Level 4 — Extended stress", 4, "~30 min, extended burn-in"},
|
||||||
}
|
}
|
||||||
|
|
||||||
// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
|
// enterNvidiaSATSetup resets and shows the DCGM level selection screen.
|
||||||
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
|
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
|
||||||
m.screen = screenNvidiaSATSetup
|
m.screen = screenNvidiaSATSetup
|
||||||
m.nvidiaGPUs = nil
|
m.nvidiaDurIdx = 2 // default: Level 3
|
||||||
m.nvidiaGPUSel = nil
|
m.nvidiaSATCursor = 2
|
||||||
m.nvidiaDurIdx = 0
|
|
||||||
m.nvidiaSATCursor = 0
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "NVIDIA SAT"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
gpus, err := m.app.ListNvidiaGPUs()
|
|
||||||
return nvidiaGPUsMsg{gpus: gpus, err: err}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleNvidiaGPUsMsg processes the GPU list response.
|
|
||||||
func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
|
|
||||||
m.busy = false
|
m.busy = false
|
||||||
m.busyTitle = ""
|
|
||||||
if msg.err != nil {
|
|
||||||
m.title = "NVIDIA SAT"
|
|
||||||
m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
|
|
||||||
m.prevScreen = screenHealthCheck
|
|
||||||
m.screen = screenOutput
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
m.nvidiaGPUs = msg.gpus
|
|
||||||
m.nvidiaGPUSel = make([]bool, len(msg.gpus))
|
|
||||||
for i := range m.nvidiaGPUSel {
|
|
||||||
m.nvidiaGPUSel[i] = true // all selected by default
|
|
||||||
}
|
|
||||||
m.nvidiaSATCursor = 0
|
|
||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// updateNvidiaSATSetup handles keys on the setup screen.
|
// updateNvidiaSATSetup handles keys on the DCGM setup screen.
|
||||||
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||||
numDur := len(nvidiaDurationOptions)
|
numOpts := len(nvidiaDCGMOptions)
|
||||||
numGPU := len(m.nvidiaGPUs)
|
totalItems := numOpts + 2 // +2: Start, Cancel
|
||||||
totalItems := numDur + numGPU + 2 // +2: Start, Cancel
|
|
||||||
switch msg.String() {
|
switch msg.String() {
|
||||||
case "up", "k":
|
case "up", "k":
|
||||||
if m.nvidiaSATCursor > 0 {
|
if m.nvidiaSATCursor > 0 {
|
||||||
@@ -69,23 +41,12 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
if m.nvidiaSATCursor < totalItems-1 {
|
if m.nvidiaSATCursor < totalItems-1 {
|
||||||
m.nvidiaSATCursor++
|
m.nvidiaSATCursor++
|
||||||
}
|
}
|
||||||
case " ":
|
case " ", "enter":
|
||||||
switch {
|
startIdx := numOpts
|
||||||
case m.nvidiaSATCursor < numDur:
|
|
||||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
|
||||||
case m.nvidiaSATCursor < numDur+numGPU:
|
|
||||||
i := m.nvidiaSATCursor - numDur
|
|
||||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
startIdx := numDur + numGPU
|
|
||||||
cancelIdx := startIdx + 1
|
cancelIdx := startIdx + 1
|
||||||
switch {
|
switch {
|
||||||
case m.nvidiaSATCursor < numDur:
|
case m.nvidiaSATCursor < numOpts:
|
||||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
m.nvidiaDurIdx = m.nvidiaSATCursor
|
||||||
case m.nvidiaSATCursor < startIdx:
|
|
||||||
i := m.nvidiaSATCursor - numDur
|
|
||||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
|
||||||
case m.nvidiaSATCursor == startIdx:
|
case m.nvidiaSATCursor == startIdx:
|
||||||
return m.startNvidiaSAT()
|
return m.startNvidiaSAT()
|
||||||
case m.nvidiaSATCursor == cancelIdx:
|
case m.nvidiaSATCursor == cancelIdx:
|
||||||
@@ -101,34 +62,9 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// startNvidiaSAT launches the NVIDIA acceptance pack.
|
// startNvidiaSAT launches the DCGM diagnostic.
|
||||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||||
var selectedGPUs []platform.NvidiaGPU
|
diagLevel := nvidiaDCGMOptions[m.nvidiaDurIdx].level
|
||||||
for i, sel := range m.nvidiaGPUSel {
|
|
||||||
if sel {
|
|
||||||
selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(selectedGPUs) == 0 {
|
|
||||||
selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
|
|
||||||
}
|
|
||||||
|
|
||||||
sizeMB := 0
|
|
||||||
for _, g := range selectedGPUs {
|
|
||||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
|
||||||
sizeMB = g.MemoryMB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if sizeMB == 0 {
|
|
||||||
sizeMB = 64
|
|
||||||
}
|
|
||||||
|
|
||||||
var gpuIndices []int
|
|
||||||
for _, g := range selectedGPUs {
|
|
||||||
gpuIndices = append(gpuIndices, g.Index)
|
|
||||||
}
|
|
||||||
|
|
||||||
durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
m.nvidiaSATCancel = cancel
|
m.nvidiaSATCancel = cancel
|
||||||
@@ -137,7 +73,7 @@ func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
|||||||
m.nvidiaSATCursor = 0
|
m.nvidiaSATCursor = 0
|
||||||
|
|
||||||
satCmd := func() tea.Msg {
|
satCmd := func() tea.Msg {
|
||||||
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
|
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, nil)
|
||||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,13 +97,13 @@ func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderNvidiaSATSetup renders the setup screen.
|
// renderNvidiaSATSetup renders the DCGM level selection screen.
|
||||||
func renderNvidiaSATSetup(m model) string {
|
func renderNvidiaSATSetup(m model) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintln(&b, "NVIDIA SAT")
|
fmt.Fprintln(&b, "NVIDIA Diagnostics (DCGM)")
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
fmt.Fprintln(&b, "Duration:")
|
fmt.Fprintln(&b, "Diagnostic level:")
|
||||||
for i, opt := range nvidiaDurationOptions {
|
for i, opt := range nvidiaDCGMOptions {
|
||||||
radio := "( )"
|
radio := "( )"
|
||||||
if i == m.nvidiaDurIdx {
|
if i == m.nvidiaDurIdx {
|
||||||
radio = "(*)"
|
radio = "(*)"
|
||||||
@@ -176,27 +112,10 @@ func renderNvidiaSATSetup(m model) string {
|
|||||||
if m.nvidiaSATCursor == i {
|
if m.nvidiaSATCursor == i {
|
||||||
prefix = "> "
|
prefix = "> "
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
|
fmt.Fprintf(&b, "%s%s %s (%s)\n", prefix, radio, opt.label, opt.note)
|
||||||
}
|
}
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
if len(m.nvidiaGPUs) == 0 {
|
startIdx := len(nvidiaDCGMOptions)
|
||||||
fmt.Fprintln(&b, "GPUs: (none detected)")
|
|
||||||
} else {
|
|
||||||
fmt.Fprintln(&b, "GPUs:")
|
|
||||||
for i, gpu := range m.nvidiaGPUs {
|
|
||||||
check := "[ ]"
|
|
||||||
if m.nvidiaGPUSel[i] {
|
|
||||||
check = "[x]"
|
|
||||||
}
|
|
||||||
prefix := " "
|
|
||||||
if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
|
|
||||||
prefix = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
|
|
||||||
startPfx := " "
|
startPfx := " "
|
||||||
cancelPfx := " "
|
cancelPfx := " "
|
||||||
if m.nvidiaSATCursor == startIdx {
|
if m.nvidiaSATCursor == startIdx {
|
||||||
@@ -208,11 +127,11 @@ func renderNvidiaSATSetup(m model) string {
|
|||||||
fmt.Fprintf(&b, "%sStart\n", startPfx)
|
fmt.Fprintf(&b, "%sStart\n", startPfx)
|
||||||
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
|
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
|
||||||
fmt.Fprintln(&b)
|
fmt.Fprintln(&b)
|
||||||
b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n")
|
b.WriteString("[↑/↓] move [space/enter] select [esc] cancel\n")
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderNvidiaSATRunning renders the running screen.
|
// renderNvidiaSATRunning renders the running screen.
|
||||||
func renderNvidiaSATRunning() string {
|
func renderNvidiaSATRunning() string {
|
||||||
return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
|
return "NVIDIA Diagnostics (DCGM)\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -91,8 +91,6 @@ type model struct {
|
|||||||
burnInitialized bool
|
burnInitialized bool
|
||||||
|
|
||||||
// NVIDIA SAT setup
|
// NVIDIA SAT setup
|
||||||
nvidiaGPUs []platform.NvidiaGPU
|
|
||||||
nvidiaGPUSel []bool
|
|
||||||
nvidiaDurIdx int
|
nvidiaDurIdx int
|
||||||
nvidiaSATCursor int
|
nvidiaSATCursor int
|
||||||
|
|
||||||
|
|||||||
@@ -112,8 +112,6 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
|||||||
m.screen = screenExportTargets
|
m.screen = screenExportTargets
|
||||||
m.cursor = 0
|
m.cursor = 0
|
||||||
return m, m.refreshSnapshotCmd()
|
return m, m.refreshSnapshotCmd()
|
||||||
case nvidiaGPUsMsg:
|
|
||||||
return m.handleNvidiaGPUsMsg(msg)
|
|
||||||
case nvtopClosedMsg:
|
case nvtopClosedMsg:
|
||||||
return m, nil
|
return m, nil
|
||||||
case gpuStressDoneMsg:
|
case gpuStressDoneMsg:
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...456c1f022c
@@ -32,6 +32,6 @@ lb config noauto \
|
|||||||
--memtest none \
|
--memtest none \
|
||||||
--iso-volume "EASY-BEE" \
|
--iso-volume "EASY-BEE" \
|
||||||
--iso-application "EASY-BEE" \
|
--iso-application "EASY-BEE" \
|
||||||
--bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ ensure_bee_console_user() {
|
|||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable bee services
|
# Enable bee services
|
||||||
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
|
|||||||
66
iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot
Executable file
66
iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot
Executable file
@@ -0,0 +1,66 @@
|
|||||||
|
#!/bin/sh
# 9002-nvidia-dcgm.hook.chroot — install NVIDIA DCGM inside the live-build chroot.
# DCGM (Data Center GPU Manager) provides dcgmi diag for acceptance testing.
# Adds NVIDIA's CUDA apt repository (debian12/x86_64) and installs datacenter-gpu-manager.
#
# The install is best-effort: every failure (missing prerequisites, key
# download, repository refresh, package install) prints a WARN line and the
# hook still exits 0, so the ISO build continues without DCGM.

set -e

NVIDIA_KEYRING="/usr/share/keyrings/nvidia-cuda.gpg"
NVIDIA_LIST="/etc/apt/sources.list.d/nvidia-cuda.list"
NVIDIA_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub"
NVIDIA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/"
APT_UPDATED=0
KEY_TMP=""

# cleanup runs on every exit path: it removes the temporary NVIDIA source
# entry and the downloaded key scratch file, and clears the apt package cache
# to keep ISO size down.
cleanup() {
    rm -f "${NVIDIA_LIST}"
    if [ -n "${KEY_TMP}" ]; then
        rm -f "${KEY_TMP}"
    fi
    apt-get clean || true
}
trap cleanup EXIT

mkdir -p /usr/share/keyrings /etc/apt/sources.list.d

# apt_update_once refreshes the package index at most once per hook run.
apt_update_once() {
    if [ "${APT_UPDATED}" -eq 0 ]; then
        apt-get update -qq
        APT_UPDATED=1
    fi
}

# ensure_tool TOOL PKG — install PKG unless TOOL is already on PATH.
ensure_tool() {
    tool="$1"
    pkg="$2"
    if command -v "${tool}" >/dev/null 2>&1; then
        return 0
    fi
    apt_update_once
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}"
}

# ensure_cert_bundle — install ca-certificates unless a non-empty CA bundle
# already exists (needed for the HTTPS downloads below).
ensure_cert_bundle() {
    if [ -s /etc/ssl/certs/ca-certificates.crt ]; then
        return 0
    fi
    apt_update_once
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates
}

if ! ensure_cert_bundle || ! ensure_tool wget wget || ! ensure_tool gpg gpg; then
    echo "WARN: prerequisites missing — skipping DCGM install"
    exit 0
fi

# Download and import NVIDIA GPG key.
# The key is fetched to a scratch file first: in a `wget | gpg` pipeline only
# gpg's exit status is visible, so a failed download would go unnoticed.
if ! KEY_TMP="$(mktemp)"; then
    echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install"
    exit 0
fi
if ! wget -qO "${KEY_TMP}" "${NVIDIA_KEY_URL}" || [ ! -s "${KEY_TMP}" ]; then
    echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install"
    exit 0
fi
if ! gpg --dearmor --yes --output "${NVIDIA_KEYRING}" "${KEY_TMP}"; then
    rm -f "${NVIDIA_KEYRING}"
    echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install"
    exit 0
fi

cat > "${NVIDIA_LIST}" <<EOF
deb [signed-by=${NVIDIA_KEYRING}] ${NVIDIA_REPO} /
EOF

# Guarded explicitly: under `set -e` a bare failure here would abort the whole
# ISO build, unlike every other failure path in this hook.
if ! apt-get update -qq; then
    echo "WARN: apt-get update failed with NVIDIA repo — skipping DCGM install"
    exit 0
fi

if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends datacenter-gpu-manager; then
    echo "=== DCGM: datacenter-gpu-manager installed ==="
    # Informational only; a dcgmi that cannot report its version is not fatal.
    dcgmi --version 2>/dev/null || true
else
    echo "WARN: datacenter-gpu-manager install failed — DCGM unavailable"
fi

# Source entry, scratch key file and apt cache are removed by the EXIT trap.
|
||||||
Reference in New Issue
Block a user