feat(dcgm): add NVIDIA DCGM diagnostics, fix KVM console
- Add 9002-nvidia-dcgm.hook.chroot: installs datacenter-gpu-manager from NVIDIA apt repo during live-build - Enable nvidia-dcgm.service in chroot setup hook - Replace bee-gpu-stress with dcgmi diag (levels 1-4) in NVIDIA SAT - TUI: replace GPU checkbox + duration UI with DCGM level selection - Remove console=tty2 from boot params: KVM/VGA now shows tty1 where bee-tui runs, fixing unresponsive console Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,7 +72,7 @@ type toolManager interface {
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
||||
@@ -423,23 +423,16 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
// Include terminal chart if available (runDir = archive path without .tar.gz).
|
||||
if path != "" {
|
||||
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
|
||||
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
|
||||
body += "\n\n" + string(chart)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
|
||||
Reference in New Issue
Block a user