feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console

- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager
  from NVIDIA apt repo during live-build
- Enable nvidia-dcgm.service in chroot setup hook
- Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT
- TUI: replace GPU checkbox + duration UI with DCGM level selection
- Remove console=tty2 from boot params: KVM/VGA now shows tty1
  where bee-tui runs, fixing unresponsive console

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:08:12 +03:00
parent 967455194c
commit eea98e6d76
12 changed files with 121 additions and 170 deletions

View File

@@ -125,10 +125,12 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
}
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
@@ -275,27 +277,23 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
return archive, nil
}
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
var env []string
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
if diagLevel < 1 || diagLevel > 4 {
diagLevel = 3
}
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
if len(gpuIndices) > 0 {
ids := make([]string, len(gpuIndices))
for i, idx := range gpuIndices {
ids[i] = strconv.Itoa(idx)
}
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
}
return []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
{
name: "05-bee-gpu-stress.log",
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
env: env,
collectGPU: true,
gpuIndices: gpuIndices,
},
{name: "04-dcgmi-diag.log", cmd: diagArgs},
}
}