feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager from NVIDIA apt repo during live-build - Enable nvidia-dcgm.service in chroot setup hook - Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT - TUI: replace GPU checkbox + duration UI with DCGM level selection - Remove console=tty2 from boot params: KVM/VGA now shows tty1 where bee-tui runs, fixing unresponsive console Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -125,10 +125,12 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
}
|
||||
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
||||
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
@@ -275,27 +277,23 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
||||
var env []string
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
if diagLevel < 1 || diagLevel > 4 {
|
||||
diagLevel = 3
|
||||
}
|
||||
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{
|
||||
name: "05-bee-gpu-stress.log",
|
||||
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
||||
env: env,
|
||||
collectGPU: true,
|
||||
gpuIndices: gpuIndices,
|
||||
},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user