diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index bbe5972..7ab78fb 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -70,7 +70,7 @@ func printRootUsage(w io.Writer) { bee export --target bee support-bundle --output stdout|file: bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` - bee sat nvidia|memory|storage + bee sat nvidia|memory|storage|cpu [--duration ] bee version bee help [command]`) } @@ -346,43 +346,58 @@ func runWeb(args []string, stdout, stderr io.Writer) int { func runSAT(args []string, stdout, stderr io.Writer) int { if len(args) == 0 { - fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage") + fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration ]") return 2 } if args[0] == "help" || args[0] == "--help" || args[0] == "-h" { - fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage") + fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration ]") return 0 } - if args[0] != "nvidia" && args[0] != "memory" && args[0] != "storage" { - fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0]) - fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage") + + fs := flag.NewFlagSet("sat", flag.ContinueOnError) + fs.SetOutput(stderr) + duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)") + if err := fs.Parse(args[1:]); err != nil { + if err == flag.ErrHelp { + return 0 + } return 2 } - if len(args) > 1 { - fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage") + if fs.NArg() != 0 { + fmt.Fprintf(stderr, "bee sat: unexpected arguments\n") return 2 } + + target := args[0] + if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" { + fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target) + fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration ]") + return 2 + } + application := app.New(platform.New()) var ( archive string err error - label string ) - switch args[0] { + switch target { case "nvidia": - label = "nvidia" archive, err = application.RunNvidiaAcceptancePack("") case "memory": - label = "memory" archive, err = application.RunMemoryAcceptancePack("") case "storage": - label = "storage" archive, err = application.RunStorageAcceptancePack("") + case "cpu": + dur := *duration + if dur <= 0 { + dur = 60 + } + archive, err = application.RunCPUAcceptancePack("", dur) } if err != nil { - slog.Error("run sat", "target", label, "err", err) + slog.Error("run sat", "target", target, "err", err) return 1 } - slog.Info("sat archive written", "target", label, "path", archive) + slog.Info("sat archive written", "target", target, "path", archive) return 0 } diff --git a/audit/cmd/bee/main_test.go b/audit/cmd/bee/main_test.go index c45a56c..8493200 100644 --- a/audit/cmd/bee/main_test.go +++ b/audit/cmd/bee/main_test.go @@ -164,7 +164,7 @@ func TestRunSATHelp(t *testing.T) { if rc != 0 { t.Fatalf("rc=%d want 0", rc) } - if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage") { + if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage|cpu") { t.Fatalf("stdout missing sat help:\n%s", stdout.String()) } } @@ -177,8 +177,8 @@ func TestRunSATRejectsExtraArgs(t *testing.T) { if rc != 2 { t.Fatalf("rc=%d want 2", rc) } - if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") { - t.Fatalf("stderr missing sat usage:\n%s", stderr.String()) + if !strings.Contains(stderr.String(), "bee sat: unexpected arguments") { + t.Fatalf("stderr missing sat error:\n%s", stderr.String()) } } diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 120a493..65dd98c 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -75,6 +75,7 @@ type satRunner interface { RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) RunMemoryAcceptancePack(baseDir string) (string, error) RunStorageAcceptancePack(baseDir string) (string, error) + RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) } @@ -437,6 +438,22 @@ func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error return ActionResult{Title: "Memory SAT", Body: body}, err } +func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunCPUAcceptancePack(baseDir, durationSec) +} + +func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) { + path, err := a.RunCPUAcceptancePack(baseDir, durationSec) + body := "Archive written." + if path != "" { + body = "Archive written to " + path + } + return ActionResult{Title: "CPU SAT", Body: body}, err +} + func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir @@ -592,6 +609,7 @@ func latestSATSummaries() []string { {label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, {label: "Memory SAT", prefix: "memory-"}, {label: "Storage SAT", prefix: "storage-"}, + {label: "CPU SAT", prefix: "cpu-"}, } var out []string for _, item := range patterns { diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 6f1dab7..6a9efc5 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -100,6 +100,7 @@ type fakeSAT struct { runNvidiaFn func(string) (string, error) runMemoryFn func(string) (string, error) runStorageFn func(string) (string, error) + runCPUFn func(string, int) (string, error) } func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) { @@ -122,6 +123,13 @@ func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) { return f.runStorageFn(baseDir) } +func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { + if f.runCPUFn != nil { + return f.runCPUFn(baseDir, durationSec) + } + return "", nil +} + func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { t.Parallel() diff --git a/audit/internal/app/panel.go b/audit/internal/app/panel.go index a531d3c..81d8c2d 100644 --- a/audit/internal/app/panel.go +++ b/audit/internal/app/panel.go @@ -45,7 +45,9 @@ func (a *App) LoadHardwarePanel() HardwarePanelData { for _, fw := range snap.Hardware.Firmware { if fw.DeviceName == "BIOS" && fw.Version != "" { header = append(header, "BIOS: "+fw.Version) - break + } + if fw.DeviceName == "BMC" && fw.Version != "" { + header = append(header, "BMC: "+fw.Version) } } if ip := formatIPLine(a.network.ListInterfaces); ip != "" { @@ -57,7 +59,7 @@ func (a *App) LoadHardwarePanel() HardwarePanelData { if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" { rows = append(rows, ComponentRow{ Key: "CPU", - Status: "N/A", + Status: statuses["cpu"], Detail: strings.TrimPrefix(cpu, "CPU: "), }) } @@ -97,7 +99,7 @@ func (a *App) LoadHardwarePanel() HardwarePanelData { func (a *App) ComponentDetailResult(key string) ActionResult { switch key { case "CPU": - return a.cpuDetailResult() + return a.cpuDetailResult(false) case "MEM": return a.satDetailResult("memory", "memory-", "MEM detail") case "GPU": @@ -111,19 +113,37 @@ func (a *App) ComponentDetailResult(key string) ActionResult { } } -func (a *App) cpuDetailResult() ActionResult { +func (a *App) cpuDetailResult(satOnly bool) ActionResult { + var b strings.Builder + + // Show latest SAT summary if available. + satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT") + if satResult.Body != "No test results found. Run a test first." { + fmt.Fprintln(&b, "=== Last SAT ===") + fmt.Fprintln(&b, satResult.Body) + fmt.Fprintln(&b) + } + + if satOnly { + body := strings.TrimSpace(b.String()) + if body == "" { + body = "No CPU SAT results found. Run a test first." + } + return ActionResult{Title: "CPU SAT", Body: body} + } + raw, err := os.ReadFile(DefaultAuditJSONPath) if err != nil { - return ActionResult{Title: "CPU", Body: "No audit data."} + return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())} } var snap schema.HardwareIngestRequest if err := json.Unmarshal(raw, &snap); err != nil { - return ActionResult{Title: "CPU", Body: "Audit data unreadable."} + return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())} } if len(snap.Hardware.CPUs) == 0 { - return ActionResult{Title: "CPU", Body: "No CPU data in last audit."} + return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())} } - var b strings.Builder + fmt.Fprintln(&b, "=== Audit ===") for i, cpu := range snap.Hardware.CPUs { fmt.Fprintf(&b, "CPU %d\n", i) if cpu.Model != nil { @@ -220,6 +240,7 @@ func satStatuses() map[string]string { "gpu": "N/A", "memory": "N/A", "storage": "N/A", + "cpu": "N/A", } patterns := []struct { key string @@ -228,6 +249,7 @@ func satStatuses() map[string]string { {"gpu", "gpu-nvidia-"}, {"memory", "memory-"}, {"storage", "storage-"}, + {"cpu", "cpu-"}, } for _, item := range patterns { matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt")) diff --git a/audit/internal/collector/board.go b/audit/internal/collector/board.go index bac4421..de7c4d6 100644 --- a/audit/internal/collector/board.go +++ b/audit/internal/collector/board.go @@ -4,6 +4,7 @@ import ( "bee/audit/internal/schema" "bufio" "log/slog" + "os" "os/exec" "strings" ) @@ -16,6 +17,14 @@ var execDmidecode = func(typeNum string) (string, error) { return string(out), nil } +var execIpmitool = func(args ...string) (string, error) { + out, err := exec.Command("ipmitool", args...).Output() + if err != nil { + return "", err + } + return string(out), nil +} + // collectBoard runs dmidecode for types 0, 1, 2 and returns the board record // plus the BIOS firmware entry. Any failure is logged and returns zero values. func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) { @@ -69,6 +78,45 @@ func parseBoard(type1, type2 string) schema.HardwareBoard { return board } +// collectBMCFirmware collects BMC firmware version via ipmitool mc info. +// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs. +func collectBMCFirmware() []schema.HardwareFirmwareRecord { + if _, err := exec.LookPath("ipmitool"); err != nil { + return nil + } + if _, err := os.Stat("/dev/ipmi0"); err != nil { + return nil + } + out, err := execIpmitool("mc", "info") + if err != nil { + slog.Info("bmc: ipmitool mc info unavailable", "err", err) + return nil + } + version := parseBMCFirmwareRevision(out) + if version == "" { + return nil + } + slog.Info("bmc: collected", "version", version) + return []schema.HardwareFirmwareRecord{ + {DeviceName: "BMC", Version: version}, + } +} + +// parseBMCFirmwareRevision extracts the "Firmware Revision" field from ipmitool mc info output. +func parseBMCFirmwareRevision(out string) string { + for _, line := range strings.Split(out, "\n") { + line = strings.TrimSpace(line) + key, val, ok := strings.Cut(line, ":") + if !ok { + continue + } + if strings.TrimSpace(key) == "Firmware Revision" { + return strings.TrimSpace(val) + } + } + return "" +} + // parseBIOSFirmware extracts BIOS version from dmidecode type 0 output. func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord { fields := parseDMIFields(type0, "BIOS Information") diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index 6815582..16f042e 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -23,6 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { board, biosFW := collectBoard() snap.Board = board snap.Firmware = append(snap.Firmware, biosFW...) + snap.Firmware = append(snap.Firmware, collectBMCFirmware()...) snap.CPUs = collectCPUs() diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 10d7e0a..2d90723 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -74,6 +74,18 @@ func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) { }) } +func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { + if durationSec <= 0 { + durationSec = 60 + } + return runAcceptancePack(baseDir, "cpu", []satJob{ + {name: "01-lscpu.log", cmd: []string{"lscpu"}}, + {name: "02-sensors-before.log", cmd: []string{"sensors"}}, + {name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}}, + {name: "04-sensors-after.log", cmd: []string{"sensors"}}, + }) +} + func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" diff --git a/audit/internal/tui/forms.go b/audit/internal/tui/forms.go index 43a68da..1e58833 100644 --- a/audit/internal/tui/forms.go +++ b/audit/internal/tui/forms.go @@ -92,6 +92,13 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) { result, err := m.app.RunStorageAcceptancePackResult("") return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} } + case actionRunCPUSAT: + m.busyTitle = "CPU test" + durationSec := hcCPUDurations[m.hcMode] + return m, func() tea.Msg { + result, err := m.app.RunCPUAcceptancePackResult("", durationSec) + return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck} + } } case "ctrl+c": return m, tea.Quit @@ -103,7 +110,7 @@ func (m model) confirmCancelTarget() screen { switch m.pendingAction { case actionExportBundle: return screenExportTargets - case actionRunAll, actionRunMemorySAT, actionRunStorageSAT: + case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT: return screenHealthCheck default: return screenMain diff --git a/audit/internal/tui/screen_health_check.go b/audit/internal/tui/screen_health_check.go index c96d1d7..d90b97c 100644 --- a/audit/internal/tui/screen_health_check.go +++ b/audit/internal/tui/screen_health_check.go @@ -33,6 +33,9 @@ const ( // hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds. var hcModeDurations = [3]int{600, 3600, 28800} +// hcCPUDurations maps mode index to CPU stress-ng seconds. +var hcCPUDurations = [3]int{60, 300, 900} + func (m model) enterHealthCheck() (tea.Model, tea.Cmd) { m.screen = screenHealthCheck if !m.hcInitialized { @@ -126,12 +129,10 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) { m.cursor = 0 return m, nil case hcCPU: - m.busy = true - m.busyTitle = "CPU" - return m, func() tea.Msg { - r := m.app.ComponentDetailResult("CPU") - return resultMsg{title: r.Title, body: r.Body, back: screenHealthCheck} - } + m.pendingAction = actionRunCPUSAT + m.screen = screenConfirm + m.cursor = 0 + return m, nil } return m, nil } @@ -150,6 +151,7 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) { func (m model) executeRunAll() (tea.Model, tea.Cmd) { durationSec := hcModeDurations[m.hcMode] + durationIdx := m.hcMode sel := m.hcSel app := m.app m.busy = true @@ -197,8 +199,13 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) { parts = append(parts, "=== STORAGE ===\n"+body) } if sel[hcCPU] { - r := app.ComponentDetailResult("CPU") - parts = append(parts, "=== CPU ===\n"+r.Body) + cpuDur := hcCPUDurations[durationIdx] + r, err := app.RunCPUAcceptancePackResult("", cpuDur) + body := r.Body + if err != nil { + body += "\nERROR: " + err.Error() + } + parts = append(parts, "=== CPU ===\n"+body) } combined := strings.Join(parts, "\n\n") if combined == "" { diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index 5285415..7702131 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -38,6 +38,7 @@ const ( actionRunAll actionKind = "run_all" actionRunMemorySAT actionKind = "run_memory_sat" actionRunStorageSAT actionKind = "run_storage_sat" + actionRunCPUSAT actionKind = "run_cpu_sat" ) type model struct { @@ -173,6 +174,9 @@ func (m model) confirmBody() (string, string) { return "Memory test", "Run memtester?" case actionRunStorageSAT: return "Storage test", "Run storage diagnostic pack?" + case actionRunCPUSAT: + modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"} + return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode] default: return "Confirm", "Proceed?" } diff --git a/bible-local/backlog.md b/bible-local/backlog.md index 96c94ea..1238d8c 100644 --- a/bible-local/backlog.md +++ b/bible-local/backlog.md @@ -2,7 +2,7 @@ ## BMC версия через IPMI -**Статус:** не реализовано. +**Статус:** реализовано. Добавить сбор версии BMC firmware в board collector: - Команда: `ipmitool mc info` → поле `Firmware Revision` @@ -12,7 +12,7 @@ ## CPU acceptance test через stress-ng -**Статус:** не реализовано. CPU в Health Check всегда `N/A`. +**Статус:** реализовано. CPU в Health Check получает PASS/FAIL из summary.txt. Добавить CPU SAT на базе `stress-ng`: - Bake `stress-ng` в ISO (добавить в `bee.list.chroot`)