feat: CPU SAT via stress-ng + BMC version via ipmitool

BMC:
- collector/board.go: collectBMCFirmware() via ipmitool mc info, graceful skip if /dev/ipmi0 absent
- collector/collector.go: append BMC firmware record to snap.Firmware
- app/panel.go: show BMC version in TUI right-panel header alongside BIOS

CPU SAT:
- platform/sat.go: RunCPUAcceptancePack(baseDir, durationSec) — lscpu + sensors before/after + stress-ng
- app/app.go: RunCPUAcceptancePack + RunCPUAcceptancePackResult methods, satRunner interface updated
- app/panel.go: CPU row now reads real PASS/FAIL from cpu-*/summary.txt via satStatuses(); cpuDetailResult shows last SAT summary + audit data
- tui/types.go: actionRunCPUSAT, confirmBody for CPU test with mode label
- tui/screen_health_check.go: hcCPUDurations [60,300,900]s; hcRunSingle(CPU)→confirm screen; executeRunAll uses RunCPUAcceptancePackResult
- tui/forms.go: actionRunCPUSAT → RunCPUAcceptancePackResult with mode duration
- cmd/bee/main.go: bee sat cpu [--duration N] subcommand

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-25 11:06:12 +03:00
parent 1c80906c1f
commit 36dff6e584
12 changed files with 179 additions and 37 deletions

View File

@@ -70,7 +70,7 @@ func printRootUsage(w io.Writer) {
bee export --target <device>
bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
bee sat nvidia|memory|storage
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee version
bee help [command]`)
}
@@ -346,43 +346,58 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
func runSAT(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
return 2
}
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage")
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
return 0
}
if args[0] != "nvidia" && args[0] != "memory" && args[0] != "storage" {
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0])
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
fs.SetOutput(stderr)
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
if err := fs.Parse(args[1:]); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if len(args) > 1 {
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
if fs.NArg() != 0 {
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
return 2
}
target := args[0]
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
return 2
}
application := app.New(platform.New())
var (
archive string
err error
label string
)
switch args[0] {
switch target {
case "nvidia":
label = "nvidia"
archive, err = application.RunNvidiaAcceptancePack("")
case "memory":
label = "memory"
archive, err = application.RunMemoryAcceptancePack("")
case "storage":
label = "storage"
archive, err = application.RunStorageAcceptancePack("")
case "cpu":
dur := *duration
if dur <= 0 {
dur = 60
}
archive, err = application.RunCPUAcceptancePack("", dur)
}
if err != nil {
slog.Error("run sat", "target", label, "err", err)
slog.Error("run sat", "target", target, "err", err)
return 1
}
slog.Info("sat archive written", "target", label, "path", archive)
slog.Info("sat archive written", "target", target, "path", archive)
return 0
}

View File

@@ -164,7 +164,7 @@ func TestRunSATHelp(t *testing.T) {
if rc != 0 {
t.Fatalf("rc=%d want 0", rc)
}
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage") {
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage|cpu") {
t.Fatalf("stdout missing sat help:\n%s", stdout.String())
}
}
@@ -177,8 +177,8 @@ func TestRunSATRejectsExtraArgs(t *testing.T) {
if rc != 2 {
t.Fatalf("rc=%d want 2", rc)
}
if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") {
t.Fatalf("stderr missing sat usage:\n%s", stderr.String())
if !strings.Contains(stderr.String(), "bee sat: unexpected arguments") {
t.Fatalf("stderr missing sat error:\n%s", stderr.String())
}
}

View File

@@ -75,6 +75,7 @@ type satRunner interface {
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
RunMemoryAcceptancePack(baseDir string) (string, error)
RunStorageAcceptancePack(baseDir string) (string, error)
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
}
@@ -437,6 +438,22 @@ func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error
return ActionResult{Title: "Memory SAT", Body: body}, err
}
// RunCPUAcceptancePack forwards a CPU acceptance run to the configured SAT
// runner and returns the runner's result unchanged. A baseDir that is empty or
// whitespace-only is replaced with DefaultSATBaseDir before the call.
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
	dir := baseDir
	if len(strings.TrimSpace(dir)) == 0 {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunCPUAcceptancePack(dir, durationSec)
}
// RunCPUAcceptancePackResult runs the CPU acceptance pack and wraps the outcome
// in an ActionResult titled "CPU SAT". The body names the archive path when the
// run returned one and is a generic message otherwise. The error from the run
// is returned as-is alongside the result.
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
	archive, err := a.RunCPUAcceptancePack(baseDir, durationSec)
	result := ActionResult{Title: "CPU SAT", Body: "Archive written."}
	if archive != "" {
		result.Body = "Archive written to " + archive
	}
	return result, err
}
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
@@ -592,6 +609,7 @@ func latestSATSummaries() []string {
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
{label: "Memory SAT", prefix: "memory-"},
{label: "Storage SAT", prefix: "storage-"},
{label: "CPU SAT", prefix: "cpu-"},
}
var out []string
for _, item := range patterns {

View File

@@ -100,6 +100,7 @@ type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
runCPUFn func(string, int) (string, error)
}
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
@@ -122,6 +123,13 @@ func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
return f.runStorageFn(baseDir)
}
// RunCPUAcceptancePack delegates to runCPUFn when one is set; with no hook
// configured it returns an empty archive path and a nil error.
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
	if f.runCPUFn == nil {
		return "", nil
	}
	return f.runCPUFn(baseDir, durationSec)
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -45,7 +45,9 @@ func (a *App) LoadHardwarePanel() HardwarePanelData {
for _, fw := range snap.Hardware.Firmware {
if fw.DeviceName == "BIOS" && fw.Version != "" {
header = append(header, "BIOS: "+fw.Version)
break
}
if fw.DeviceName == "BMC" && fw.Version != "" {
header = append(header, "BMC: "+fw.Version)
}
}
if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
@@ -57,7 +59,7 @@ func (a *App) LoadHardwarePanel() HardwarePanelData {
if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" {
rows = append(rows, ComponentRow{
Key: "CPU",
Status: "N/A",
Status: statuses["cpu"],
Detail: strings.TrimPrefix(cpu, "CPU: "),
})
}
@@ -97,7 +99,7 @@ func (a *App) LoadHardwarePanel() HardwarePanelData {
func (a *App) ComponentDetailResult(key string) ActionResult {
switch key {
case "CPU":
return a.cpuDetailResult()
return a.cpuDetailResult(false)
case "MEM":
return a.satDetailResult("memory", "memory-", "MEM detail")
case "GPU":
@@ -111,19 +113,37 @@ func (a *App) ComponentDetailResult(key string) ActionResult {
}
}
func (a *App) cpuDetailResult() ActionResult {
func (a *App) cpuDetailResult(satOnly bool) ActionResult {
var b strings.Builder
// Show latest SAT summary if available.
satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT")
if satResult.Body != "No test results found. Run a test first." {
fmt.Fprintln(&b, "=== Last SAT ===")
fmt.Fprintln(&b, satResult.Body)
fmt.Fprintln(&b)
}
if satOnly {
body := strings.TrimSpace(b.String())
if body == "" {
body = "No CPU SAT results found. Run a test first."
}
return ActionResult{Title: "CPU SAT", Body: body}
}
raw, err := os.ReadFile(DefaultAuditJSONPath)
if err != nil {
return ActionResult{Title: "CPU", Body: "No audit data."}
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
}
var snap schema.HardwareIngestRequest
if err := json.Unmarshal(raw, &snap); err != nil {
return ActionResult{Title: "CPU", Body: "Audit data unreadable."}
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
}
if len(snap.Hardware.CPUs) == 0 {
return ActionResult{Title: "CPU", Body: "No CPU data in last audit."}
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
}
var b strings.Builder
fmt.Fprintln(&b, "=== Audit ===")
for i, cpu := range snap.Hardware.CPUs {
fmt.Fprintf(&b, "CPU %d\n", i)
if cpu.Model != nil {
@@ -220,6 +240,7 @@ func satStatuses() map[string]string {
"gpu": "N/A",
"memory": "N/A",
"storage": "N/A",
"cpu": "N/A",
}
patterns := []struct {
key string
@@ -228,6 +249,7 @@ func satStatuses() map[string]string {
{"gpu", "gpu-nvidia-"},
{"memory", "memory-"},
{"storage", "storage-"},
{"cpu", "cpu-"},
}
for _, item := range patterns {
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))

View File

@@ -4,6 +4,7 @@ import (
"bee/audit/internal/schema"
"bufio"
"log/slog"
"os"
"os/exec"
"strings"
)
@@ -16,6 +17,14 @@ var execDmidecode = func(typeNum string) (string, error) {
return string(out), nil
}
// execIpmitool runs ipmitool with the given arguments and returns its standard
// output as a string. On failure it returns an empty string and the error.
// It is a package-level variable so it can be reassigned.
var execIpmitool = func(args ...string) (string, error) {
	stdout, err := exec.Command("ipmitool", args...).Output()
	if err == nil {
		return string(stdout), nil
	}
	return "", err
}
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
@@ -69,6 +78,45 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
return board
}
// collectBMCFirmware collects the BMC firmware version via `ipmitool mc info`.
//
// It returns nil (and the caller simply records no BMC entry) when ipmitool is
// not in PATH, when no IPMI device node exists, when the command fails, or when
// the output carries no "Firmware Revision" field. On success it returns a
// single firmware record with DeviceName "BMC".
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
	if _, err := exec.LookPath("ipmitool"); err != nil {
		return nil
	}

	// ipmitool's default "open" interface tries these device nodes in turn, so
	// the BMC is reachable if any of them exists, not only /dev/ipmi0.
	deviceFound := false
	for _, dev := range []string{"/dev/ipmi0", "/dev/ipmi/0", "/dev/ipmidev/0"} {
		if _, err := os.Stat(dev); err == nil {
			deviceFound = true
			break
		}
	}
	if !deviceFound {
		return nil
	}

	out, err := execIpmitool("mc", "info")
	if err != nil {
		slog.Info("bmc: ipmitool mc info unavailable", "err", err)
		return nil
	}

	version := parseBMCFirmwareRevision(out)
	if version == "" {
		// Leave a trace so a missing BMC entry can be told apart from a host
		// without IPMI.
		slog.Info("bmc: firmware revision not found in ipmitool mc info output")
		return nil
	}

	slog.Info("bmc: collected", "version", version)
	return []schema.HardwareFirmwareRecord{
		{DeviceName: "BMC", Version: version},
	}
}
// parseBMCFirmwareRevision scans `ipmitool mc info` output line by line and
// returns the trimmed value of the first "Firmware Revision" field. Lines
// without a colon are ignored; only the text before the first colon is treated
// as the field name. It returns "" when the field is not present.
func parseBMCFirmwareRevision(out string) string {
	const wanted = "Firmware Revision"

	remaining := out
	for remaining != "" {
		var current string
		current, remaining, _ = strings.Cut(remaining, "\n")

		name, value, hasColon := strings.Cut(strings.TrimSpace(current), ":")
		if hasColon && strings.TrimSpace(name) == wanted {
			return strings.TrimSpace(value)
		}
	}
	return ""
}
// parseBIOSFirmware extracts BIOS version from dmidecode type 0 output.
func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord {
fields := parseDMIFields(type0, "BIOS Information")

View File

@@ -23,6 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
board, biosFW := collectBoard()
snap.Board = board
snap.Firmware = append(snap.Firmware, biosFW...)
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
snap.CPUs = collectCPUs()

View File

@@ -74,6 +74,18 @@ func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
})
}
// RunCPUAcceptancePack runs the "cpu" acceptance pack via runAcceptancePack.
// The pack consists of four jobs, each with its own log name: lscpu, sensors,
// stress-ng, and sensors again. A non-positive durationSec falls back to 60;
// the value is passed to stress-ng as its --timeout argument.
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
	seconds := durationSec
	if seconds <= 0 {
		seconds = 60
	}

	stressCmd := []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprint(seconds)}
	jobs := []satJob{
		{name: "01-lscpu.log", cmd: []string{"lscpu"}},
		{name: "02-sensors-before.log", cmd: []string{"sensors"}},
		{name: "03-stress-ng.log", cmd: stressCmd},
		{name: "04-sensors-after.log", cmd: []string{"sensors"}},
	}
	return runAcceptancePack(baseDir, "cpu", jobs)
}
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"

View File

@@ -92,6 +92,13 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
result, err := m.app.RunStorageAcceptancePackResult("")
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
}
case actionRunCPUSAT:
m.busyTitle = "CPU test"
durationSec := hcCPUDurations[m.hcMode]
return m, func() tea.Msg {
result, err := m.app.RunCPUAcceptancePackResult("", durationSec)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
}
}
case "ctrl+c":
return m, tea.Quit
@@ -103,7 +110,7 @@ func (m model) confirmCancelTarget() screen {
switch m.pendingAction {
case actionExportBundle:
return screenExportTargets
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT:
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT:
return screenHealthCheck
default:
return screenMain

View File

@@ -33,6 +33,9 @@ const (
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
var hcModeDurations = [3]int{600, 3600, 28800}
// hcCPUDurations maps mode index (0=Quick, 1=Standard, 2=Express) to the CPU
// stress-ng duration in seconds. It is indexed by the same mode index as
// hcModeDurations.
var hcCPUDurations = [3]int{60, 300, 900}
func (m model) enterHealthCheck() (tea.Model, tea.Cmd) {
m.screen = screenHealthCheck
if !m.hcInitialized {
@@ -126,12 +129,10 @@ func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
m.cursor = 0
return m, nil
case hcCPU:
m.busy = true
m.busyTitle = "CPU"
return m, func() tea.Msg {
r := m.app.ComponentDetailResult("CPU")
return resultMsg{title: r.Title, body: r.Body, back: screenHealthCheck}
}
m.pendingAction = actionRunCPUSAT
m.screen = screenConfirm
m.cursor = 0
return m, nil
}
return m, nil
}
@@ -150,6 +151,7 @@ func (m model) hcRunAll() (tea.Model, tea.Cmd) {
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
durationSec := hcModeDurations[m.hcMode]
durationIdx := m.hcMode
sel := m.hcSel
app := m.app
m.busy = true
@@ -197,8 +199,13 @@ func (m model) executeRunAll() (tea.Model, tea.Cmd) {
parts = append(parts, "=== STORAGE ===\n"+body)
}
if sel[hcCPU] {
r := app.ComponentDetailResult("CPU")
parts = append(parts, "=== CPU ===\n"+r.Body)
cpuDur := hcCPUDurations[durationIdx]
r, err := app.RunCPUAcceptancePackResult("", cpuDur)
body := r.Body
if err != nil {
body += "\nERROR: " + err.Error()
}
parts = append(parts, "=== CPU ===\n"+body)
}
combined := strings.Join(parts, "\n\n")
if combined == "" {

View File

@@ -38,6 +38,7 @@ const (
actionRunAll actionKind = "run_all"
actionRunMemorySAT actionKind = "run_memory_sat"
actionRunStorageSAT actionKind = "run_storage_sat"
actionRunCPUSAT actionKind = "run_cpu_sat"
)
type model struct {
@@ -173,6 +174,9 @@ func (m model) confirmBody() (string, string) {
return "Memory test", "Run memtester?"
case actionRunStorageSAT:
return "Storage test", "Run storage diagnostic pack?"
case actionRunCPUSAT:
modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"}
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
default:
return "Confirm", "Proceed?"
}