feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager from NVIDIA apt repo during live-build - Enable nvidia-dcgm.service in chroot setup hook - Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT - TUI: replace GPU checkbox + duration UI with DCGM level selection - Remove console=tty2 from boot params: KVM/VGA now shows tty1 where bee-tui runs, fixing unresponsive console Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,8 +33,6 @@ const (
|
||||
hcCurTotal = 9
|
||||
)
|
||||
|
||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||
var hcModeDurations = [3]int{600, 3600, 28800}
|
||||
|
||||
// hcCPUDurations maps mode index to CPU stress-ng seconds.
|
||||
var hcCPUDurations = [3]int{60, 300, 900}
|
||||
@@ -232,7 +230,6 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
}
|
||||
|
||||
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
||||
durationSec := hcModeDurations[m.hcMode]
|
||||
durationIdx := m.hcMode
|
||||
sel := m.hcSel
|
||||
app := m.app
|
||||
@@ -250,28 +247,14 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
||||
}
|
||||
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
||||
} else {
|
||||
gpus, err := app.ListNvidiaGPUs()
|
||||
if err != nil || len(gpus) == 0 {
|
||||
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
|
||||
} else {
|
||||
var indices []int
|
||||
sizeMB := 0
|
||||
for _, g := range gpus {
|
||||
indices = append(indices, g.Index)
|
||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
||||
sizeMB = g.MemoryMB
|
||||
}
|
||||
}
|
||||
if sizeMB == 0 {
|
||||
sizeMB = 64
|
||||
}
|
||||
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== GPU ===\n"+body)
|
||||
// Map hcMode (0=Quick,1=Standard,2=Express) to DCGM level (1,2,3)
|
||||
diagLevel := durationIdx + 1
|
||||
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", diagLevel, nil)
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== GPU (DCGM) ===\n"+body)
|
||||
}
|
||||
}
|
||||
if sel[hcMemory] {
|
||||
|
||||
Reference in New Issue
Block a user