diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 7418811..a0a173d 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -72,7 +72,7 @@ type toolManager interface { type satRunner interface { RunNvidiaAcceptancePack(baseDir string) (string, error) - RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) + RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) RunMemoryAcceptancePack(baseDir string) (string, error) RunStorageAcceptancePack(baseDir string) (string, error) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) @@ -423,23 +423,16 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { return a.sat.ListNvidiaGPUs() } -func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) { +func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir } - path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices) + path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices) body := "Archive written." if path != "" { body = "Archive written to " + path } - // Include terminal chart if available (runDir = archive path without .tar.gz). 
- if path != "" { - termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt") - if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 { - body += "\n\n" + string(chart) - } - } - return ActionResult{Title: "NVIDIA SAT", Body: body}, err + return ActionResult{Title: "NVIDIA DCGM", Body: body}, err } func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) { diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 0af6708..67df2fd 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -123,7 +123,7 @@ func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) { return f.runNvidiaFn(baseDir) } -func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) { +func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int) (string, error) { return f.runNvidiaFn(baseDir) } diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 0b88f24..9b0d274 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -125,10 +125,12 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs()) } -// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration, -// GPU memory size, and GPU index selection. ctx cancellation kills the running job. -func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) { - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices)) +// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM. +// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress. 
+// gpuIndices: specific GPU indices to test (empty = all GPUs). +// ctx cancellation kills the running job. +func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices)) } func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) { @@ -275,27 +277,23 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) { return archive, nil } -func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob { - var env []string +func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob { + if diagLevel < 1 || diagLevel > 4 { + diagLevel = 3 + } + diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)} if len(gpuIndices) > 0 { ids := make([]string, len(gpuIndices)) for i, idx := range gpuIndices { ids[i] = strconv.Itoa(idx) } - env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")} + diagArgs = append(diagArgs, "-i", strings.Join(ids, ",")) } return []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, - {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, - { - name: "05-bee-gpu-stress.log", - cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)}, - env: env, - collectGPU: true, - gpuIndices: gpuIndices, - }, + {name: "04-dcgmi-diag.log", cmd: diagArgs}, } } diff --git a/audit/internal/tui/messages.go b/audit/internal/tui/messages.go index 31b5b73..b599dd9 100644 --- a/audit/internal/tui/messages.go +++ b/audit/internal/tui/messages.go @@ -32,11 +32,6 @@ type snapshotMsg struct { panel app.HardwarePanelData } 
-type nvidiaGPUsMsg struct { - gpus []platform.NvidiaGPU - err error -} - type nvtopClosedMsg struct{} type nvidiaSATDoneMsg struct { diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index 9f0def1..5916242 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -33,8 +33,6 @@ const ( hcCurTotal = 9 ) -// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. -var hcModeDurations = [3]int{600, 3600, 28800} // hcCPUDurations maps mode index to CPU stress-ng seconds. var hcCPUDurations = [3]int{60, 300, 900} @@ -232,7 +230,6 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) { } func (m model) executeRunAll() (tea.Model, tea.Cmd) { - durationSec := hcModeDurations[m.hcMode] durationIdx := m.hcMode sel := m.hcSel app := m.app @@ -250,28 +247,14 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) { } parts = append(parts, "=== GPU (AMD) ===\n"+body) } else { - gpus, err := app.ListNvidiaGPUs() - if err != nil || len(gpus) == 0 { - parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.") - } else { - var indices []int - sizeMB := 0 - for _, g := range gpus { - indices = append(indices, g.Index) - if sizeMB == 0 || g.MemoryMB < sizeMB { - sizeMB = g.MemoryMB - } - } - if sizeMB == 0 { - sizeMB = 64 - } - r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices) - body := r.Body - if err != nil { - body += "\nERROR: " + err.Error() - } - parts = append(parts, "=== GPU ===\n"+body) + // Map hcMode (0=Quick,1=Standard,2=Express) to DCGM level (1,2,3) + diagLevel := durationIdx + 1 + r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", diagLevel, nil) + body := r.Body + if err != nil { + body += "\nERROR: " + err.Error() } + parts = append(parts, "=== GPU (DCGM) ===\n"+body) } } if sel[hcMemory] { diff --git a/audit/internal/tui/screen_nvidia_sat.go 
b/audit/internal/tui/screen_nvidia_sat.go index d29521d..dc9ca81 100644 --- a/audit/internal/tui/screen_nvidia_sat.go +++ b/audit/internal/tui/screen_nvidia_sat.go @@ -5,61 +5,33 @@ import ( "fmt" "strings" - "bee/audit/internal/platform" - tea "github.com/charmbracelet/bubbletea" ) -var nvidiaDurationOptions = []struct { +var nvidiaDCGMOptions = []struct { label string - seconds int + level int + note string }{ - {"10 minutes", 600}, - {"1 hour", 3600}, - {"8 hours", 28800}, - {"24 hours", 86400}, + {"Level 1 — Quick", 1, "~1 min, configuration check"}, + {"Level 2 — Medium", 2, "~2 min, memory test"}, + {"Level 3 — Targeted stress", 3, "~10 min, SM + memory + PCIe [recommended]"}, + {"Level 4 — Extended stress", 4, "~30 min, extended burn-in"}, } -// enterNvidiaSATSetup resets the setup screen and starts loading GPU list. +// enterNvidiaSATSetup resets and shows the DCGM level selection screen. func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) { m.screen = screenNvidiaSATSetup - m.nvidiaGPUs = nil - m.nvidiaGPUSel = nil - m.nvidiaDurIdx = 0 - m.nvidiaSATCursor = 0 - m.busy = true - m.busyTitle = "NVIDIA SAT" - return m, func() tea.Msg { - gpus, err := m.app.ListNvidiaGPUs() - return nvidiaGPUsMsg{gpus: gpus, err: err} - } -} - -// handleNvidiaGPUsMsg processes the GPU list response. -func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) { + m.nvidiaDurIdx = 2 // default: Level 3 + m.nvidiaSATCursor = 2 m.busy = false - m.busyTitle = "" - if msg.err != nil { - m.title = "NVIDIA SAT" - m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err) - m.prevScreen = screenHealthCheck - m.screen = screenOutput - return m, nil - } - m.nvidiaGPUs = msg.gpus - m.nvidiaGPUSel = make([]bool, len(msg.gpus)) - for i := range m.nvidiaGPUSel { - m.nvidiaGPUSel[i] = true // all selected by default - } - m.nvidiaSATCursor = 0 return m, nil } -// updateNvidiaSATSetup handles keys on the setup screen. 
+// updateNvidiaSATSetup handles keys on the DCGM setup screen. func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) { - numDur := len(nvidiaDurationOptions) - numGPU := len(m.nvidiaGPUs) - totalItems := numDur + numGPU + 2 // +2: Start, Cancel + numOpts := len(nvidiaDCGMOptions) + totalItems := numOpts + 2 // +2: Start, Cancel switch msg.String() { case "up", "k": if m.nvidiaSATCursor > 0 { @@ -69,23 +41,12 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) { if m.nvidiaSATCursor < totalItems-1 { m.nvidiaSATCursor++ } - case " ": - switch { - case m.nvidiaSATCursor < numDur: - m.nvidiaDurIdx = m.nvidiaSATCursor - case m.nvidiaSATCursor < numDur+numGPU: - i := m.nvidiaSATCursor - numDur - m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i] - } - case "enter": - startIdx := numDur + numGPU + case " ", "enter": + startIdx := numOpts cancelIdx := startIdx + 1 switch { - case m.nvidiaSATCursor < numDur: + case m.nvidiaSATCursor < numOpts: m.nvidiaDurIdx = m.nvidiaSATCursor - case m.nvidiaSATCursor < startIdx: - i := m.nvidiaSATCursor - numDur - m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i] case m.nvidiaSATCursor == startIdx: return m.startNvidiaSAT() case m.nvidiaSATCursor == cancelIdx: @@ -101,34 +62,9 @@ func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m, nil } -// startNvidiaSAT launches the NVIDIA acceptance pack. +// startNvidiaSAT launches the DCGM diagnostic. 
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) { - var selectedGPUs []platform.NvidiaGPU - for i, sel := range m.nvidiaGPUSel { - if sel { - selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i]) - } - } - if len(selectedGPUs) == 0 { - selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected - } - - sizeMB := 0 - for _, g := range selectedGPUs { - if sizeMB == 0 || g.MemoryMB < sizeMB { - sizeMB = g.MemoryMB - } - } - if sizeMB == 0 { - sizeMB = 64 - } - - var gpuIndices []int - for _, g := range selectedGPUs { - gpuIndices = append(gpuIndices, g.Index) - } - - durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds + diagLevel := nvidiaDCGMOptions[m.nvidiaDurIdx].level ctx, cancel := context.WithCancel(context.Background()) m.nvidiaSATCancel = cancel @@ -137,7 +73,7 @@ func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) { m.nvidiaSATCursor = 0 satCmd := func() tea.Msg { - result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices) + result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, nil) return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err} } @@ -161,13 +97,13 @@ func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m, nil } -// renderNvidiaSATSetup renders the setup screen. +// renderNvidiaSATSetup renders the DCGM level selection screen. 
func renderNvidiaSATSetup(m model) string { var b strings.Builder - fmt.Fprintln(&b, "NVIDIA SAT") + fmt.Fprintln(&b, "NVIDIA Diagnostics (DCGM)") fmt.Fprintln(&b) - fmt.Fprintln(&b, "Duration:") - for i, opt := range nvidiaDurationOptions { + fmt.Fprintln(&b, "Diagnostic level:") + for i, opt := range nvidiaDCGMOptions { radio := "( )" if i == m.nvidiaDurIdx { radio = "(*)" @@ -176,27 +112,10 @@ func renderNvidiaSATSetup(m model) string { if m.nvidiaSATCursor == i { prefix = "> " } - fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label) + fmt.Fprintf(&b, "%s%s %s (%s)\n", prefix, radio, opt.label, opt.note) } fmt.Fprintln(&b) - if len(m.nvidiaGPUs) == 0 { - fmt.Fprintln(&b, "GPUs: (none detected)") - } else { - fmt.Fprintln(&b, "GPUs:") - for i, gpu := range m.nvidiaGPUs { - check := "[ ]" - if m.nvidiaGPUSel[i] { - check = "[x]" - } - prefix := " " - if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i { - prefix = "> " - } - fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB) - } - } - fmt.Fprintln(&b) - startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs) + startIdx := len(nvidiaDCGMOptions) startPfx := " " cancelPfx := " " if m.nvidiaSATCursor == startIdx { @@ -208,11 +127,11 @@ func renderNvidiaSATSetup(m model) string { fmt.Fprintf(&b, "%sStart\n", startPfx) fmt.Fprintf(&b, "%sCancel\n", cancelPfx) fmt.Fprintln(&b) - b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n") + b.WriteString("[↑/↓] move [space/enter] select [esc] cancel\n") return b.String() } // renderNvidiaSATRunning renders the running screen. 
func renderNvidiaSATRunning() string { - return "NVIDIA SAT\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n" + return "NVIDIA Diagnostics (DCGM)\n\nTest is running...\n\n[a] Abort test [ctrl+c] quit\n" } diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index 3f0c148..a094611 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -91,8 +91,6 @@ type model struct { burnInitialized bool // NVIDIA SAT setup - nvidiaGPUs []platform.NvidiaGPU - nvidiaGPUSel []bool nvidiaDurIdx int nvidiaSATCursor int diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go index ce7d79f..3d64096 100644 --- a/audit/internal/tui/update.go +++ b/audit/internal/tui/update.go @@ -112,8 +112,6 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.screen = screenExportTargets m.cursor = 0 return m, m.refreshSnapshotCmd() - case nvidiaGPUsMsg: - return m.handleNvidiaGPUsMsg(msg) case nvtopClosedMsg: return m, nil case gpuStressDoneMsg: diff --git a/bible b/bible index 688b87e..456c1f0 160000 --- a/bible +++ b/bible @@ -1 +1 @@ -Subproject commit 688b87e98deed5fadd71e10e123073640d92c15a +Subproject commit 456c1f022c17499ab059ae753684174b2621d74c diff --git a/iso/builder/auto/config b/iso/builder/auto/config index a0845cb..22991bf 100755 --- a/iso/builder/auto/config +++ b/iso/builder/auto/config @@ -32,6 +32,6 @@ lb config noauto \ --memtest none \ --iso-volume "EASY-BEE" \ --iso-application "EASY-BEE" \ - --bootappend-live "boot=live components console=tty2 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ + --bootappend-live "boot=live components console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --apt-recommends false \ "${@}" diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index a83eed1..776cda5 100755 --- 
a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -21,6 +21,7 @@ ensure_bee_console_user() { ensure_bee_console_user # Enable bee services +systemctl enable nvidia-dcgm.service 2>/dev/null || true systemctl enable bee-network.service systemctl enable bee-nvidia.service systemctl enable bee-preflight.service diff --git a/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot b/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot new file mode 100755 index 0000000..4ef15af --- /dev/null +++ b/iso/builder/config/hooks/normal/9002-nvidia-dcgm.hook.chroot @@ -0,0 +1,66 @@ +#!/bin/sh +# 9002-nvidia-dcgm.hook.chroot — install NVIDIA DCGM inside the live-build chroot. +# DCGM (Data Center GPU Manager) provides dcgmi diag for acceptance testing. +# Adds NVIDIA's CUDA apt repository (debian12/x86_64) and installs datacenter-gpu-manager. + +set -e + +NVIDIA_KEYRING="/usr/share/keyrings/nvidia-cuda.gpg" +NVIDIA_LIST="/etc/apt/sources.list.d/nvidia-cuda.list" +NVIDIA_KEY_URL="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub" +NVIDIA_REPO="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/" +APT_UPDATED=0 + +mkdir -p /usr/share/keyrings /etc/apt/sources.list.d + +ensure_tool() { + tool="$1" + pkg="$2" + if command -v "${tool}" >/dev/null 2>&1; then + return 0 + fi + if [ "${APT_UPDATED}" -eq 0 ]; then + apt-get update -qq + APT_UPDATED=1 + fi + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends "${pkg}" +} + +ensure_cert_bundle() { + if [ -s /etc/ssl/certs/ca-certificates.crt ]; then + return 0 + fi + if [ "${APT_UPDATED}" -eq 0 ]; then + apt-get update -qq + APT_UPDATED=1 + fi + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates +} + +if ! ensure_cert_bundle || ! ensure_tool wget wget || ! 
ensure_tool gpg gpg; then + echo "WARN: prerequisites missing — skipping DCGM install" + exit 0 +fi + +# Download and import NVIDIA GPG key +if ! wget -qO- "${NVIDIA_KEY_URL}" | gpg --dearmor --yes --output "${NVIDIA_KEYRING}"; then + echo "WARN: failed to fetch NVIDIA GPG key — skipping DCGM install" + exit 0 +fi + +cat > "${NVIDIA_LIST}" <<EOF +deb [signed-by=${NVIDIA_KEYRING}] ${NVIDIA_REPO} / +EOF + +apt-get update -qq + +if DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends datacenter-gpu-manager; then + systemctl enable nvidia-dcgm.service 2>/dev/null || true +else + echo "WARN: datacenter-gpu-manager install failed — DCGM unavailable" +fi + +# Clean up apt lists to keep ISO size down +rm -f "${NVIDIA_LIST}" +apt-get clean