From 0c16616cc918467e6c2dd6f3e06b88deed2fd00e Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 25 Mar 2026 17:54:27 +0300 Subject: [PATCH] 1. Verbose live progress during SAT tests (CPU, Memory, Storage, AMD GPU) - New tui/sat_progress.go: polls {DefaultSATBaseDir}/{prefix}-*/verbose.log every 300ms and parses completed/in-progress steps - Busy screen now shows each step as PASS lscpu (234ms) / FAIL stress-ng (60.0s) / ... sensors-after instead of just "Working..." MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2. Test results shown on screen (instead of just "Archive written to /path") - RunCPUAcceptancePackResult, RunMemoryAcceptancePackResult, RunStorageAcceptancePackResult, RunAMDAcceptancePackResult now read summary.txt from the run directory and return a formatted per-step result: Run: 2025-03-25T10:00:00Z PASS lscpu PASS sensors-before FAIL stress-ng PASS sensors-after Overall: FAILED (ok=3 failed=1) 3. AMD GPU SAT with auto-detection - platform.System.DetectGPUVendor(): checks /dev/nvidia0 → "nvidia", /dev/kfd → "amd" - platform.System.RunAMDAcceptancePack(): runs rocm-smi, rocm-smi --showallinfo, dmidecode - GPU SAT (G key / GPU row enter) automatically routes to AMD or NVIDIA based on detected vendor - "Run All" also auto-detects vendor 4. Panel detail view - GPU detail now shows the most recent (NVIDIA or AMD) SAT result, whichever is newer - All SAT detail views use the same human-readable formatSATDetail format --- audit/internal/app/app.go | 53 +++++++++++---- audit/internal/app/app_test.go | 10 ++- audit/internal/app/panel.go | 82 ++++++++++++++++++++++- audit/internal/platform/sat.go | 50 ++++++++++++++ audit/internal/tui/forms.go | 66 ++++++++++++++---- audit/internal/tui/screen_health_check.go | 52 +++++++++----- audit/internal/tui/types.go | 25 ++++--- audit/internal/tui/update.go | 10 +++ audit/internal/tui/view.go | 9 +++ 9 files changed, 300 insertions(+), 57 deletions(-) diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 5aca605..6b05d6a 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -77,6 +77,9 @@ type satRunner interface { RunStorageAcceptancePack(baseDir string) (string, error) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) + DetectGPUVendor() string + ListAMDGPUs() ([]platform.AMDGPUInfo, error) + RunAMDAcceptancePack(baseDir string) (string, error) } type runtimeChecker interface { @@ -431,11 +434,7 @@ func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) { func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { path, err := a.RunMemoryAcceptancePack(baseDir) - body := "Archive written." - if path != "" { - body = "Archive written to " + path - } - return ActionResult{Title: "Memory SAT", Body: body}, err + return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err } func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { @@ -447,11 +446,7 @@ func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, err func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) { path, err := a.RunCPUAcceptancePack(baseDir, durationSec) - body := "Archive written." - if path != "" { - body = "Archive written to " + path - } - return ActionResult{Title: "CPU SAT", Body: body}, err + return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err } func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) { @@ -463,11 +458,41 @@ func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) { func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { path, err := a.RunStorageAcceptancePack(baseDir) - body := "Archive written." - if path != "" { - body = "Archive written to " + path + return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err +} + +func (a *App) DetectGPUVendor() string { + return a.sat.DetectGPUVendor() +} + +func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { + return a.sat.ListAMDGPUs() +} + +func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir } - return ActionResult{Title: "Storage SAT", Body: body}, err + return a.sat.RunAMDAcceptancePack(baseDir) +} + +func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) { + path, err := a.RunAMDAcceptancePack(baseDir) + return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err +} + +// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz) +// and returns a formatted human-readable result. Falls back to a plain message if unreadable. +func satResultBody(archivePath string) string { + if archivePath == "" { + return "No output produced." + } + runDir := strings.TrimSuffix(archivePath, ".tar.gz") + raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) + if err != nil { + return "Archive written to " + archivePath + } + return formatSATDetail(strings.TrimSpace(string(raw))) } func (a *App) HealthSummaryResult() ActionResult { diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 6a9efc5..6bcdd86 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -130,6 +130,12 @@ func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, return "", nil } +func (f fakeSAT) DetectGPUVendor() string { return "" } + +func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil } + +func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil } + func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { t.Parallel() @@ -380,10 +386,10 @@ func TestActionResultsUseFallbackBody(t *testing.T) { if got, _ := a.RunNvidiaAcceptancePackResult(""); got.Body != "Archive written." { t.Fatalf("sat body=%q", got.Body) } - if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "Archive written." { + if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "No output produced." { t.Fatalf("memory sat body=%q", got.Body) } - if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "Archive written." { + if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "No output produced." { t.Fatalf("storage sat body=%q", got.Body) } } diff --git a/audit/internal/app/panel.go b/audit/internal/app/panel.go index 81d8c2d..d771213 100644 --- a/audit/internal/app/panel.go +++ b/audit/internal/app/panel.go @@ -103,6 +103,22 @@ func (a *App) ComponentDetailResult(key string) ActionResult { case "MEM": return a.satDetailResult("memory", "memory-", "MEM detail") case "GPU": + // Prefer whichever GPU SAT was run most recently. + nv, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-nvidia-*/summary.txt")) + am, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-amd-*/summary.txt")) + sort.Strings(nv) + sort.Strings(am) + latestNV := "" + if len(nv) > 0 { + latestNV = nv[len(nv)-1] + } + latestAM := "" + if len(am) > 0 { + latestAM = am[len(am)-1] + } + if latestAM > latestNV { + return a.satDetailResult("gpu", "gpu-amd-", "GPU detail") + } return a.satDetailResult("gpu", "gpu-nvidia-", "GPU detail") case "DISK": return a.satDetailResult("storage", "storage-", "DISK detail") @@ -190,7 +206,70 @@ func (a *App) satDetailResult(statusKey, prefix, title string) ActionResult { if err != nil { return ActionResult{Title: title, Body: "Could not read test results."} } - return ActionResult{Title: title, Body: strings.TrimSpace(string(raw))} + return ActionResult{Title: title, Body: formatSATDetail(strings.TrimSpace(string(raw)))} +} + +// formatSATDetail converts raw summary.txt key=value content to a human-readable per-step display. +func formatSATDetail(raw string) string { + var b strings.Builder + kv := parseKeyValueSummary(raw) + + if t, ok := kv["run_at_utc"]; ok { + fmt.Fprintf(&b, "Run: %s\n\n", t) + } + + // Collect step names in order they appear in the file + lines := strings.Split(raw, "\n") + var stepKeys []string + seenStep := map[string]bool{} + for _, line := range lines { + if idx := strings.Index(line, "_status="); idx >= 0 { + key := line[:idx] + if !seenStep[key] && key != "overall" { + seenStep[key] = true + stepKeys = append(stepKeys, key) + } + } + } + + for _, key := range stepKeys { + status := kv[key+"_status"] + display := cleanSummaryKey(key) + switch status { + case "OK": + fmt.Fprintf(&b, "PASS %s\n", display) + case "FAILED": + fmt.Fprintf(&b, "FAIL %s\n", display) + case "UNSUPPORTED": + fmt.Fprintf(&b, "SKIP %s\n", display) + default: + fmt.Fprintf(&b, "? %s\n", display) + } + } + + if overall, ok := kv["overall_status"]; ok { + ok2 := kv["job_ok"] + failed := kv["job_failed"] + fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed) + } + + return strings.TrimSpace(b.String()) +} + +// cleanSummaryKey strips the leading numeric prefix from a SAT step key. +// "1-lscpu" → "lscpu", "3-stress-ng" → "stress-ng" +func cleanSummaryKey(key string) string { + idx := strings.Index(key, "-") + if idx <= 0 { + return key + } + prefix := key[:idx] + for _, c := range prefix { + if c < '0' || c > '9' { + return key + } + } + return key[idx+1:] } func (a *App) psuDetailResult() ActionResult { @@ -247,6 +326,7 @@ func satStatuses() map[string]string { prefix string }{ {"gpu", "gpu-nvidia-"}, + {"gpu", "gpu-amd-"}, {"memory", "memory-"}, {"storage", "storage-"}, {"cpu", "cpu-"}, diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 44f2aef..90768db 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -22,6 +22,56 @@ type NvidiaGPU struct { MemoryMB int } +// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. +type AMDGPUInfo struct { + Index int + Name string +} + +// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise. +func (s *System) DetectGPUVendor() string { + if _, err := os.Stat("/dev/nvidia0"); err == nil { + return "nvidia" + } + if _, err := os.Stat("/dev/kfd"); err == nil { + return "amd" + } + return "" +} + +// ListAMDGPUs returns AMD GPUs visible to rocm-smi. +func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) { + out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output() + if err != nil { + return nil, fmt.Errorf("rocm-smi: %w", err) + } + var gpus []AMDGPUInfo + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(strings.ToLower(line), "device") { + continue + } + parts := strings.SplitN(line, ",", 2) + name := "" + if len(parts) >= 2 { + name = strings.TrimSpace(parts[1]) + } + idx := len(gpus) + gpus = append(gpus, AMDGPUInfo{Index: idx, Name: name}) + } + return gpus, nil +} + +// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi. +func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) { + return runAcceptancePack(baseDir, "gpu-amd", []satJob{ + {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, + {name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}}, + {name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, + {name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, + }) +} + // ListNvidiaGPUs returns GPUs visible to nvidia-smi. func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) { out, err := exec.Command("nvidia-smi", diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go index 1e58833..950aa21 100644 --- a/audit/internal/tui/forms.go +++ b/audit/internal/tui/forms.go @@ -1,6 +1,10 @@ package tui -import tea "github.com/charmbracelet/bubbletea" +import ( + "time" + + tea "github.com/charmbracelet/bubbletea" +) func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) { switch msg.String() { @@ -82,23 +86,57 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) { return m.executeRunAll() case actionRunMemorySAT: m.busyTitle = "Memory test" - return m, func() tea.Msg { - result, err := m.app.RunMemoryAcceptancePackResult("") - return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} - } + m.progressPrefix = "memory" + m.progressSince = time.Now() + m.progressLines = nil + since := m.progressSince + return m, tea.Batch( + func() tea.Msg { + result, err := m.app.RunMemoryAcceptancePackResult("") + return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} + }, + pollSATProgress("memory", since), + ) case actionRunStorageSAT: m.busyTitle = "Storage test" - return m, func() tea.Msg { - result, err := m.app.RunStorageAcceptancePackResult("") - return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} - } + m.progressPrefix = "storage" + m.progressSince = time.Now() + m.progressLines = nil + since := m.progressSince + return m, tea.Batch( + func() tea.Msg { + result, err := m.app.RunStorageAcceptancePackResult("") + return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} + }, + pollSATProgress("storage", since), + ) case actionRunCPUSAT: m.busyTitle = "CPU test" + m.progressPrefix = "cpu" + m.progressSince = time.Now() + m.progressLines = nil + since := m.progressSince durationSec := hcCPUDurations[m.hcMode] - return m, func() tea.Msg { - result, err := m.app.RunCPUAcceptancePackResult("", durationSec) - return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} - } + return m, tea.Batch( + func() tea.Msg { + result, err := m.app.RunCPUAcceptancePackResult("", durationSec) + return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} + }, + pollSATProgress("cpu", since), + ) + case actionRunAMDGPUSAT: + m.busyTitle = "AMD GPU test" + m.progressPrefix = "gpu-amd" + m.progressSince = time.Now() + m.progressLines = nil + since := m.progressSince + return m, tea.Batch( + func() tea.Msg { + result, err := m.app.RunAMDAcceptancePackResult("") + return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} + }, + pollSATProgress("gpu-amd", since), + ) } case "ctrl+c": return m, tea.Quit @@ -110,7 +148,7 @@ func (m model) confirmCancelTarget() screen { switch m.pendingAction { case actionExportBundle: return screenExportTargets - case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT: + case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT: return screenHealthCheck default: return screenMain diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index d90b97c..28c8ed3 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -116,6 +116,12 @@ func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) { func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) { switch idx { case hcGPU: + if m.app.DetectGPUVendor() == "amd" { + m.pendingAction = actionRunAMDGPUSAT + m.screen = screenConfirm + m.cursor = 0 + return m, nil + } m.nvidiaDurIdx = m.hcMode return m.enterNvidiaSATSetup() case hcMemory: @@ -159,27 +165,37 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) { return m, func() tea.Msg { var parts []string if sel[hcGPU] { - gpus, err := app.ListNvidiaGPUs() - if err != nil || len(gpus) == 0 { - parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.") - } else { - var indices []int - sizeMB := 0 - for _, g := range gpus { - indices = append(indices, g.Index) - if sizeMB == 0 || g.MemoryMB < sizeMB { - sizeMB = g.MemoryMB - } - } - if sizeMB == 0 { - sizeMB = 64 - } - r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices) + vendor := app.DetectGPUVendor() + if vendor == "amd" { + r, err := app.RunAMDAcceptancePackResult("") body := r.Body if err != nil { body += "\nERROR: " + err.Error() } - parts = append(parts, "=== GPU ===\n"+body) + parts = append(parts, "=== GPU (AMD) ===\n"+body) + } else { + gpus, err := app.ListNvidiaGPUs() + if err != nil || len(gpus) == 0 { + parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.") + } else { + var indices []int + sizeMB := 0 + for _, g := range gpus { + indices = append(indices, g.Index) + if sizeMB == 0 || g.MemoryMB < sizeMB { + sizeMB = g.MemoryMB + } + } + if sizeMB == 0 { + sizeMB = 64 + } + r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices) + body := r.Body + if err != nil { + body += "\nERROR: " + err.Error() + } + parts = append(parts, "=== GPU ===\n"+body) + } } } if sel[hcMemory] { @@ -225,7 +241,7 @@ func renderHealthCheck(m model) string { type comp struct{ name, desc, key string } comps := []comp{ - {"GPU", "nvidia-smi + bee-gpu-stress", "G"}, + {"GPU", "nvidia/amd auto-detect", "G"}, {"MEMORY", "memtester", "M"}, {"STORAGE", "smartctl + NVMe self-test", "S"}, {"CPU", "audit diagnostics", "C"}, diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index 7702131..a37f65e 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -2,6 +2,7 @@ package tui import ( "strings" + "time" "bee/audit/internal/app" "bee/audit/internal/platform" @@ -31,14 +32,15 @@ const ( type actionKind string const ( - actionNone actionKind = "" - actionDHCPOne actionKind = "dhcp_one" - actionStaticIPv4 actionKind = "static_ipv4" - actionExportBundle actionKind = "export_bundle" - actionRunAll actionKind = "run_all" - actionRunMemorySAT actionKind = "run_memory_sat" - actionRunStorageSAT actionKind = "run_storage_sat" - actionRunCPUSAT actionKind = "run_cpu_sat" + actionNone actionKind = "" + actionDHCPOne actionKind = "dhcp_one" + actionStaticIPv4 actionKind = "static_ipv4" + actionExportBundle actionKind = "export_bundle" + actionRunAll actionKind = "run_all" + actionRunMemorySAT actionKind = "run_memory_sat" + actionRunStorageSAT actionKind = "run_storage_sat" + actionRunCPUSAT actionKind = "run_cpu_sat" + actionRunAMDGPUSAT actionKind = "run_amd_gpu_sat" ) type model struct { @@ -88,6 +90,11 @@ type model struct { // NVIDIA SAT running nvidiaSATCancel func() nvidiaSATAborted bool + + // SAT verbose progress (CPU / Memory / Storage / AMD GPU) + progressLines []string + progressPrefix string + progressSince time.Time } type formField struct { @@ -177,6 +184,8 @@ func (m model) confirmBody() (string, string) { case actionRunCPUSAT: modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"} return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode] + case actionRunAMDGPUSAT: + return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?" default: return "Confirm", "Proceed?" } diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go index 1a71aef..5af9942 100644 --- a/audit/internal/tui/update.go +++ b/audit/internal/tui/update.go @@ -17,9 +17,19 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { return m, nil } return m.updateKey(msg) + case satProgressMsg: + if m.busy && m.progressPrefix != "" { + if len(msg.lines) > 0 { + m.progressLines = msg.lines + } + return m, pollSATProgress(m.progressPrefix, m.progressSince) + } + return m, nil case resultMsg: m.busy = false m.busyTitle = "" + m.progressLines = nil + m.progressPrefix = "" m.title = msg.title if msg.err != nil { body := strings.TrimSpace(msg.body) diff --git a/audit/internal/tui/view.go b/audit/internal/tui/view.go index 07db1cc..3fc5855 100644 --- a/audit/internal/tui/view.go +++ b/audit/internal/tui/view.go @@ -39,6 +39,15 @@ func (m model) View() string { if m.busyTitle != "" { title = m.busyTitle } + if len(m.progressLines) > 0 { + var b strings.Builder + fmt.Fprintf(&b, "%s\n\n", title) + for _, l := range m.progressLines { + fmt.Fprintf(&b, " %s\n", l) + } + b.WriteString("\n[ctrl+c] quit\n") + return b.String() + } return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title) } switch m.screen {