Add health verdicts and acceptance tests

This commit is contained in:
Mikhail Chusavitin
2026-03-14 17:53:58 +03:00
parent 17f0bda45e
commit b483e2ce35
28 changed files with 1688 additions and 82 deletions

10
PLAN.md
View File

@@ -23,8 +23,10 @@ Fills the gaps where logpile/Redfish is blind: NVMe, DIMM serials, GPU serials,
- 1.7 PSU collector — **DONE (basic FRU path)** - 1.7 PSU collector — **DONE (basic FRU path)**
- 1.8 NVIDIA GPU enrichment — **DONE** - 1.8 NVIDIA GPU enrichment — **DONE**
- 1.8b Component wear / age telemetry — **DONE** (storage + NVMe + NVIDIA + NIC SFP/DOM + NIC packet stats) - 1.8b Component wear / age telemetry — **DONE** (storage + NVMe + NVIDIA + NIC SFP/DOM + NIC packet stats)
- 1.8c Storage health verdicts — **DONE** (SMART/NVMe warning/failed status derivation)
- 1.9 Mellanox/NVIDIA NIC enrichment — **DONE** (mstflint + ethtool firmware fallback) - 1.9 Mellanox/NVIDIA NIC enrichment — **DONE** (mstflint + ethtool firmware fallback)
- 1.10 RAID controller enrichment — **DONE (initial multi-tool support)** (storcli + sas2/3ircu + arcconf + ssacli + VROC/mdstat) - 1.10 RAID controller enrichment — **DONE (initial multi-tool support)** (storcli + sas2/3ircu + arcconf + ssacli + VROC/mdstat)
- 1.11 PSU SDR health — **DONE** (`ipmitool sdr` merged with FRU inventory)
- 1.11 Output and export workflow — **DONE** (explicit file output + manual removable export via TUI) - 1.11 Output and export workflow — **DONE** (explicit file output + manual removable export via TUI)
- 1.12 Integration test (local) — **DONE** (`scripts/test-local.sh`) - 1.12 Integration test (local) — **DONE** (`scripts/test-local.sh`)
@@ -343,6 +345,8 @@ Planned code shape:
- `menu` launches the LiveCD wrapper `bee-tui`, which escalates to `root` via `sudo -n` - `menu` launches the LiveCD wrapper `bee-tui`, which escalates to `root` via `sudo -n`
- `bee tui` can rerun the audit manually - `bee tui` can rerun the audit manually
- `bee tui` can export the latest audit JSON to removable media - `bee tui` can export the latest audit JSON to removable media
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup - removable export requires explicit target selection, mount, confirmation, copy, and cleanup
### 2.6 — Vendor utilities and optional assets ### 2.6 — Vendor utilities and optional assets
@@ -350,7 +354,9 @@ Planned code shape:
Optional binaries live in `iso/vendor/` and are included when present: Optional binaries live in `iso/vendor/` and are included when present:
- `storcli64` - `storcli64`
- `sas2ircu`, `sas3ircu` - `sas2ircu`, `sas3ircu`
- `mstflint` - `arcconf`
- `ssacli`
- `mstflint` (via Debian package set)
Missing optional tools do not fail the build or boot. Missing optional tools do not fail the build or boot.
@@ -405,7 +411,7 @@ No "works on my Mac" drift.
2.4 NVIDIA driver build → driver compiled into overlay 2.4 NVIDIA driver build → driver compiled into overlay
2.5 network bring-up on boot → DHCP on all interfaces 2.5 network bring-up on boot → DHCP on all interfaces
2.6 systemd boot service → audit runs on boot automatically 2.6 systemd boot service → audit runs on boot automatically
2.7 vendor utilities → storcli/sas2ircu/mstflint in image 2.7 vendor utilities → storcli/sas2ircu/arcconf/ssacli in image
2.8 release workflow → versioning + release notes 2.8 release workflow → versioning + release notes
2.9 operator export flow → explicit TUI export to removable media 2.9 operator export flow → explicit TUI export to removable media
``` ```

View File

@@ -27,11 +27,14 @@ func run(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 { if len(args) == 0 {
printRootUsage(stderr) printRootUsage(stderr)
return 1 return 2
} }
switch args[0] { switch args[0] {
case "help", "--help", "-h": case "help", "--help", "-h":
if len(args) > 1 {
return runHelp(args[1:], stdout, stderr)
}
printRootUsage(stdout) printRootUsage(stdout)
return 0 return 0
case "audit": case "audit":
@@ -48,7 +51,7 @@ func run(args []string, stdout, stderr io.Writer) int {
default: default:
fmt.Fprintf(stderr, "bee: unknown command %q\n\n", args[0]) fmt.Fprintf(stderr, "bee: unknown command %q\n\n", args[0])
printRootUsage(stderr) printRootUsage(stderr)
return 1 return 2
} }
} }
@@ -57,8 +60,29 @@ func printRootUsage(w io.Writer) {
bee audit --runtime auto|local|livecd --output stdout|file:<path> bee audit --runtime auto|local|livecd --output stdout|file:<path>
bee tui --runtime auto|local|livecd bee tui --runtime auto|local|livecd
bee export --target <device> bee export --target <device>
bee sat nvidia bee sat nvidia|memory|storage
bee version`) bee version
bee help [command]`)
}
func runHelp(args []string, stdout, stderr io.Writer) int {
switch args[0] {
case "audit":
return runAudit([]string{"--help"}, stdout, stdout)
case "tui":
return runTUI([]string{"--help"}, stdout, stdout)
case "export":
return runExport([]string{"--help"}, stdout, stdout)
case "sat":
return runSAT([]string{"--help"}, stdout, stderr)
case "version":
fmt.Fprintln(stdout, "usage: bee version")
return 0
default:
fmt.Fprintf(stderr, "bee help: unknown command %q\n\n", args[0])
printRootUsage(stderr)
return 2
}
} }
func runAudit(args []string, stdout, stderr io.Writer) int { func runAudit(args []string, stdout, stderr io.Writer) int {
@@ -72,6 +96,13 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
fs.PrintDefaults() fs.PrintDefaults()
} }
if err := fs.Parse(args); err != nil { if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2 return 2
} }
if *showVersion { if *showVersion {
@@ -107,6 +138,13 @@ func runTUI(args []string, stdout, stderr io.Writer) int {
fs.PrintDefaults() fs.PrintDefaults()
} }
if err := fs.Parse(args); err != nil { if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2 return 2
} }
@@ -137,6 +175,13 @@ func runExport(args []string, stdout, stderr io.Writer) int {
fs.PrintDefaults() fs.PrintDefaults()
} }
if err := fs.Parse(args); err != nil { if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2 return 2
} }
if strings.TrimSpace(*targetDevice) == "" { if strings.TrimSpace(*targetDevice) == "" {
@@ -169,21 +214,44 @@ func runExport(args []string, stdout, stderr io.Writer) int {
} }
func runSAT(args []string, stdout, stderr io.Writer) int { func runSAT(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 || args[0] == "help" || args[0] == "--help" || args[0] == "-h" { if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee sat nvidia") fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
return 2 return 2
} }
if args[0] != "nvidia" { if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage")
return 0
}
if args[0] != "nvidia" && args[0] != "memory" && args[0] != "storage" {
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0]) fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0])
fmt.Fprintln(stderr, "usage: bee sat nvidia") fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
return 2
}
if len(args) > 1 {
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
return 2 return 2
} }
application := app.New(platform.New()) application := app.New(platform.New())
archive, err := application.RunNvidiaAcceptancePack("") var (
archive string
err error
label string
)
switch args[0] {
case "nvidia":
label = "nvidia"
archive, err = application.RunNvidiaAcceptancePack("")
case "memory":
label = "memory"
archive, err = application.RunMemoryAcceptancePack("")
case "storage":
label = "storage"
archive, err = application.RunStorageAcceptancePack("")
}
if err != nil { if err != nil {
slog.Error("run nvidia sat", "err", err) slog.Error("run sat", "target", label, "err", err)
return 1 return 1
} }
slog.Info("nvidia sat archive written", "path", archive) slog.Info("sat archive written", "target", label, "path", archive)
return 0 return 0
} }

View File

@@ -24,8 +24,8 @@ func TestRunNoArgsPrintsUsage(t *testing.T) {
var stdout, stderr bytes.Buffer var stdout, stderr bytes.Buffer
rc := run(nil, &stdout, &stderr) rc := run(nil, &stdout, &stderr)
if rc != 1 { if rc != 2 {
t.Fatalf("rc=%d want 1", rc) t.Fatalf("rc=%d want 2", rc)
} }
if !strings.Contains(stderr.String(), "bee commands:") { if !strings.Contains(stderr.String(), "bee commands:") {
t.Fatalf("stderr missing root usage:\n%s", stderr.String()) t.Fatalf("stderr missing root usage:\n%s", stderr.String())
@@ -37,8 +37,8 @@ func TestRunUnknownCommand(t *testing.T) {
var stdout, stderr bytes.Buffer var stdout, stderr bytes.Buffer
rc := run([]string{"wat"}, &stdout, &stderr) rc := run([]string{"wat"}, &stdout, &stderr)
if rc != 1 { if rc != 2 {
t.Fatalf("rc=%d want 1", rc) t.Fatalf("rc=%d want 2", rc)
} }
if !strings.Contains(stderr.String(), `unknown command "wat"`) { if !strings.Contains(stderr.String(), `unknown command "wat"`) {
t.Fatalf("stderr missing unknown command message:\n%s", stderr.String()) t.Fatalf("stderr missing unknown command message:\n%s", stderr.String())
@@ -86,11 +86,37 @@ func TestRunSATUsage(t *testing.T) {
if rc != 2 { if rc != 2 {
t.Fatalf("rc=%d want 2", rc) t.Fatalf("rc=%d want 2", rc)
} }
if !strings.Contains(stderr.String(), "usage: bee sat nvidia") { if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") {
t.Fatalf("stderr missing sat usage:\n%s", stderr.String()) t.Fatalf("stderr missing sat usage:\n%s", stderr.String())
} }
} }
// TestRunHelpForSubcommand verifies that "bee help export" exits with 0 and
// writes the export usage line to stdout.
func TestRunHelpForSubcommand(t *testing.T) {
	t.Parallel()

	var outBuf, errBuf bytes.Buffer
	if code := run([]string{"help", "export"}, &outBuf, &errBuf); code != 0 {
		t.Fatalf("rc=%d want 0", code)
	}

	printed := outBuf.String()
	if !strings.Contains(printed, "usage: bee export --target <device>") {
		t.Fatalf("stdout missing export help:\n%s", printed)
	}
}
// TestRunHelpUnknownSubcommand verifies that "bee help wat" exits with 2 and
// reports the unknown command on stderr.
func TestRunHelpUnknownSubcommand(t *testing.T) {
	t.Parallel()

	var outBuf, errBuf bytes.Buffer
	code := run([]string{"help", "wat"}, &outBuf, &errBuf)
	if code != 2 {
		t.Fatalf("rc=%d want 2", code)
	}

	if msg := errBuf.String(); !strings.Contains(msg, `bee help: unknown command "wat"`) {
		t.Fatalf("stderr missing help error:\n%s", msg)
	}
}
func TestRunSATUnknownTarget(t *testing.T) { func TestRunSATUnknownTarget(t *testing.T) {
t.Parallel() t.Parallel()
@@ -104,6 +130,32 @@ func TestRunSATUnknownTarget(t *testing.T) {
} }
} }
// TestRunSATHelp verifies that "bee sat --help" exits with 0 and prints the
// sat usage line on stdout.
func TestRunSATHelp(t *testing.T) {
	t.Parallel()

	var outBuf, errBuf bytes.Buffer
	code := run([]string{"sat", "--help"}, &outBuf, &errBuf)
	if code != 0 {
		t.Fatalf("rc=%d want 0", code)
	}

	printed := outBuf.String()
	if !strings.Contains(printed, "usage: bee sat nvidia|memory|storage") {
		t.Fatalf("stdout missing sat help:\n%s", printed)
	}
}
// TestRunSATRejectsExtraArgs verifies that a trailing argument after a valid
// sat target is rejected with exit code 2 and the sat usage line on stderr.
func TestRunSATRejectsExtraArgs(t *testing.T) {
	t.Parallel()

	argv := []string{"sat", "memory", "extra"}
	var outBuf, errBuf bytes.Buffer
	if code := run(argv, &outBuf, &errBuf); code != 2 {
		t.Fatalf("rc=%d want 2", code)
	}

	if msg := errBuf.String(); !strings.Contains(msg, "usage: bee sat nvidia|memory|storage") {
		t.Fatalf("stderr missing sat usage:\n%s", msg)
	}
}
func TestRunAuditInvalidRuntime(t *testing.T) { func TestRunAuditInvalidRuntime(t *testing.T) {
t.Parallel() t.Parallel()
@@ -113,3 +165,29 @@ func TestRunAuditInvalidRuntime(t *testing.T) {
t.Fatalf("rc=%d want 1", rc) t.Fatalf("rc=%d want 1", rc)
} }
} }
// TestRunAuditRejectsExtraArgs verifies that "bee audit extra" exits with 2
// and prints the audit usage on stderr.
func TestRunAuditRejectsExtraArgs(t *testing.T) {
	t.Parallel()

	var outBuf, errBuf bytes.Buffer
	code := run([]string{"audit", "extra"}, &outBuf, &errBuf)
	if code != 2 {
		t.Fatalf("rc=%d want 2", code)
	}

	msg := errBuf.String()
	if !strings.Contains(msg, "usage: bee audit") {
		t.Fatalf("stderr missing audit usage:\n%s", msg)
	}
}
// TestRunExportRejectsExtraArgs verifies that a positional argument after the
// export flags is rejected with exit code 2 and the export usage on stderr.
func TestRunExportRejectsExtraArgs(t *testing.T) {
	t.Parallel()

	argv := []string{"export", "--target", "/dev/sdb1", "extra"}
	var outBuf, errBuf bytes.Buffer
	if code := run(argv, &outBuf, &errBuf); code != 2 {
		t.Fatalf("rc=%d want 2", code)
	}

	if msg := errBuf.String(); !strings.Contains(msg, "usage: bee export --target <device>") {
		t.Fatalf("stderr missing export usage:\n%s", msg)
	}
}

View File

@@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"os" "os"
"path/filepath" "path/filepath"
"sort"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -58,6 +59,8 @@ type toolManager interface {
type satRunner interface { type satRunner interface {
RunNvidiaAcceptancePack(baseDir string) (string, error) RunNvidiaAcceptancePack(baseDir string) (string, error)
RunMemoryAcceptancePack(baseDir string) (string, error)
RunStorageAcceptancePack(baseDir string) (string, error)
} }
func New(platform *platform.System) *App { func New(platform *platform.System) *App {
@@ -124,7 +127,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
path, err := a.ExportLatestAudit(target) path, err := a.ExportLatestAudit(target)
return ActionResult{Title: "Export audit", Body: "Audit exported to " + path}, err body := "Audit exported."
if path != "" {
body = "Audit exported to " + path
}
return ActionResult{Title: "Export audit", Body: body}, err
} }
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) { func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
@@ -141,7 +148,7 @@ func (a *App) DHCPOne(iface string) (string, error) {
func (a *App) DHCPOneResult(iface string) (ActionResult, error) { func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
body, err := a.network.DHCPOne(iface) body, err := a.network.DHCPOne(iface)
return ActionResult{Title: "DHCP on " + iface, Body: body}, err return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
} }
func (a *App) DHCPAll() (string, error) { func (a *App) DHCPAll() (string, error) {
@@ -150,7 +157,7 @@ func (a *App) DHCPAll() (string, error) {
func (a *App) DHCPAllResult() (ActionResult, error) { func (a *App) DHCPAllResult() (ActionResult, error) {
body, err := a.network.DHCPAll() body, err := a.network.DHCPAll()
return ActionResult{Title: "DHCP all interfaces", Body: body}, err return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
} }
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) { func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
@@ -159,7 +166,7 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) { func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
body, err := a.network.SetStaticIPv4(cfg) body, err := a.network.SetStaticIPv4(cfg)
return ActionResult{Title: "Static IPv4 on " + cfg.Interface, Body: body}, err return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
} }
func (a *App) NetworkStatus() (ActionResult, error) { func (a *App) NetworkStatus() (ActionResult, error) {
@@ -167,6 +174,9 @@ func (a *App) NetworkStatus() (ActionResult, error) {
if err != nil { if err != nil {
return ActionResult{Title: "Network status"}, err return ActionResult{Title: "Network status"}, err
} }
if len(ifaces) == 0 {
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
}
var body strings.Builder var body strings.Builder
for _, iface := range ifaces { for _, iface := range ifaces {
ipv4 := "(no IPv4)" ipv4 := "(no IPv4)"
@@ -216,7 +226,7 @@ func (a *App) ServiceStatus(name string) (string, error) {
func (a *App) ServiceStatusResult(name string) (ActionResult, error) { func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
body, err := a.services.ServiceStatus(name) body, err := a.services.ServiceStatus(name)
return ActionResult{Title: "service: " + name, Body: body}, err return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
} }
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) { func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
@@ -225,7 +235,7 @@ func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, err
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) { func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
body, err := a.services.ServiceDo(name, action) body, err := a.services.ServiceDo(name, action)
return ActionResult{Title: "service: " + name, Body: body}, err return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
} }
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) { func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
@@ -241,6 +251,9 @@ func (a *App) CheckTools(names []string) []platform.ToolStatus {
} }
func (a *App) ToolCheckResult(names []string) ActionResult { func (a *App) ToolCheckResult(names []string) ActionResult {
if len(names) == 0 {
return ActionResult{Title: "Required tools", Body: "No tools checked."}
}
var body strings.Builder var body strings.Builder
for _, tool := range a.tools.CheckTools(names) { for _, tool := range a.tools.CheckTools(names) {
status := "MISSING" status := "MISSING"
@@ -253,7 +266,12 @@ func (a *App) ToolCheckResult(names []string) ActionResult {
} }
func (a *App) AuditLogTailResult() ActionResult { func (a *App) AuditLogTailResult() ActionResult {
body := a.tools.TailFile(DefaultAuditLogPath, 40) + "\n\n" + a.tools.TailFile(DefaultAuditJSONPath, 20) logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
if body == "" {
body = "No audit logs found."
}
return ActionResult{Title: "Audit log tail", Body: body} return ActionResult{Title: "Audit log tail", Body: body}
} }
@@ -263,7 +281,88 @@ func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) { func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
path, err := a.sat.RunNvidiaAcceptancePack(baseDir) path, err := a.sat.RunNvidiaAcceptancePack(baseDir)
return ActionResult{Title: "NVIDIA SAT", Body: "Archive written to " + path}, err body := "Archive written."
if path != "" {
body = "Archive written to " + path
}
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
}
// RunMemoryAcceptancePack delegates to the configured SAT runner's
// RunMemoryAcceptancePack, passing baseDir through and returning the runner's
// string result and error unchanged.
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
return a.sat.RunMemoryAcceptancePack(baseDir)
}
// RunMemoryAcceptancePackResult runs the memory acceptance pack through the
// SAT runner and wraps the outcome in an ActionResult titled "Memory SAT".
// The body names the returned path when it is non-empty and falls back to a
// generic message otherwise. The runner's error is returned alongside the
// result as-is.
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
	archive, err := a.sat.RunMemoryAcceptancePack(baseDir)

	result := ActionResult{Title: "Memory SAT", Body: "Archive written."}
	if archive != "" {
		result.Body = "Archive written to " + archive
	}
	return result, err
}
// RunStorageAcceptancePack delegates to the configured SAT runner's
// RunStorageAcceptancePack, passing baseDir through and returning the runner's
// string result and error unchanged.
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
return a.sat.RunStorageAcceptancePack(baseDir)
}
// RunStorageAcceptancePackResult runs the storage acceptance pack through the
// SAT runner and wraps the outcome in an ActionResult titled "Storage SAT".
// The body names the returned path when it is non-empty and falls back to a
// generic message otherwise. The runner's error is returned alongside the
// result as-is.
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
	archive, err := a.sat.RunStorageAcceptancePack(baseDir)

	result := ActionResult{Title: "Storage SAT", Body: "Archive written."}
	if archive != "" {
		result.Body = "Archive written to " + archive
	}
	return result, err
}
func (a *App) HealthSummaryResult() ActionResult {
type auditFile struct {
Hardware struct {
Summary struct {
Status string `json:"status"`
Warnings []string `json:"warnings"`
Failures []string `json:"failures"`
StorageWarn int `json:"storage_warn"`
StorageFail int `json:"storage_fail"`
PCIeWarn int `json:"pcie_warn"`
PCIeFail int `json:"pcie_fail"`
PSUWarn int `json:"psu_warn"`
PSUFail int `json:"psu_fail"`
MemoryWarn int `json:"memory_warn"`
MemoryFail int `json:"memory_fail"`
} `json:"summary"`
} `json:"hardware"`
}
raw, err := os.ReadFile(DefaultAuditJSONPath)
if err != nil {
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
}
var snapshot auditFile
if err := json.Unmarshal(raw, &snapshot); err != nil {
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
}
summary := snapshot.Hardware.Summary
var body strings.Builder
status := summary.Status
if status == "" {
status = "UNKNOWN"
}
fmt.Fprintf(&body, "Overall: %s\n", status)
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
fmt.Fprintf(&body, "PCIe: warn=%d fail=%d\n", summary.PCIeWarn, summary.PCIeFail)
fmt.Fprintf(&body, "PSU: warn=%d fail=%d\n", summary.PSUWarn, summary.PSUFail)
fmt.Fprintf(&body, "Memory: warn=%d fail=%d\n", summary.MemoryWarn, summary.MemoryFail)
for _, item := range latestSATSummaries() {
fmt.Fprintf(&body, "\n%s", item)
}
if len(summary.Failures) > 0 {
fmt.Fprintf(&body, "\n\nFailures:\n- %s", strings.Join(summary.Failures, "\n- "))
}
if len(summary.Warnings) > 0 {
fmt.Fprintf(&body, "\n\nWarnings:\n- %s", strings.Join(summary.Warnings, "\n- "))
}
return ActionResult{Title: "Health summary", Body: strings.TrimSpace(body.String())}
} }
func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string { func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string {
@@ -309,3 +408,37 @@ func sanitizeFilename(v string) string {
} }
return string(out) return string(out)
} }
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing but whitespace remains.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
// latestSATSummaries collects one summary per SAT kind from /var/log/bee-sat.
//
// For each kind it globs "<prefix>*/summary.txt", sorts the matches as strings,
// reads the last one and emits "<label>:\n<trimmed file content>". Kinds with
// no match, a glob error or an unreadable file are skipped silently. Picking
// the lexically last match presumably selects the newest run; that depends on
// how the run directories are named, which is not visible here.
func latestSATSummaries() []string {
	const baseDir = "/var/log/bee-sat"
	// Each entry is {label, directory-name prefix}.
	kinds := [][2]string{
		{"NVIDIA SAT", "gpu-nvidia-"},
		{"Memory SAT", "memory-"},
		{"Storage SAT", "storage-"},
	}

	var summaries []string
	for _, kind := range kinds {
		label, prefix := kind[0], kind[1]

		found, globErr := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
		if globErr != nil || len(found) == 0 {
			continue
		}
		sort.Strings(found)
		last := found[len(found)-1]

		data, readErr := os.ReadFile(last)
		if readErr != nil {
			continue
		}
		summaries = append(summaries, label+":\n"+strings.TrimSpace(string(data)))
	}
	return summaries
}

View File

@@ -76,11 +76,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
} }
type fakeSAT struct { type fakeSAT struct {
runFn func(string) (string, error) runNvidiaFn func(string) (string, error)
runMemoryFn func(string) (string, error)
runStorageFn func(string) (string, error)
} }
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) { func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return f.runFn(baseDir) return f.runNvidiaFn(baseDir)
}
// RunMemoryAcceptancePack forwards to the test-supplied runMemoryFn stub.
// It panics if the stub was left nil.
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
return f.runMemoryFn(baseDir)
}
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
return f.runStorageFn(baseDir)
} }
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) { func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
@@ -116,6 +126,25 @@ func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
} }
} }
func TestNetworkStatusHandlesNoInterfaces(t *testing.T) {
t.Parallel()
a := &App{
network: fakeNetwork{
listInterfacesFn: func() ([]platform.InterfaceInfo, error) { return nil, nil },
defaultRouteFn: func() string { return "" },
},
}
result, err := a.NetworkStatus()
if err != nil {
t.Fatalf("NetworkStatus error: %v", err)
}
if result.Body != "No physical interfaces found." {
t.Fatalf("body=%q want %q", result.Body, "No physical interfaces found.")
}
}
func TestNetworkStatusPropagatesListError(t *testing.T) { func TestNetworkStatusPropagatesListError(t *testing.T) {
t.Parallel() t.Parallel()
@@ -192,7 +221,7 @@ func TestServiceActionResults(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("ServiceStatusResult error: %v", err) t.Fatalf("ServiceStatusResult error: %v", err)
} }
if statusResult.Title != "service: bee-audit" || statusResult.Body != "active" { if statusResult.Title != "service status: bee-audit" || statusResult.Body != "active" {
t.Fatalf("unexpected status result: %#v", statusResult) t.Fatalf("unexpected status result: %#v", statusResult)
} }
@@ -200,7 +229,7 @@ func TestServiceActionResults(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("ServiceActionResult error: %v", err) t.Fatalf("ServiceActionResult error: %v", err)
} }
if actionResult.Title != "service: bee-audit" || actionResult.Body != "restart ok" { if actionResult.Title != "service restart: bee-audit" || actionResult.Body != "restart ok" {
t.Fatalf("unexpected action result: %#v", actionResult) t.Fatalf("unexpected action result: %#v", actionResult)
} }
} }
@@ -242,17 +271,79 @@ func TestToolCheckAndLogTailResults(t *testing.T) {
} }
} }
// TestActionResultsUseFallbackBody verifies that every *Result helper
// substitutes its fallback body when the underlying call returns empty or
// whitespace-only output.
func TestActionResultsUseFallbackBody(t *testing.T) {
	t.Parallel()

	// Stub returning no output and no error, shared by the single-string hooks.
	emptyRun := func(string) (string, error) { return "", nil }

	app := &App{
		network: fakeNetwork{
			dhcpOneFn:        func(string) (string, error) { return " ", nil },
			dhcpAllFn:        func() (string, error) { return "", nil },
			setStaticIPv4Fn:  func(platform.StaticIPv4Config) (string, error) { return "", nil },
			listInterfacesFn: func() ([]platform.InterfaceInfo, error) { return nil, nil },
			defaultRouteFn:   func() string { return "" },
		},
		services: fakeServices{
			serviceStatusFn: emptyRun,
			serviceDoFn:     func(string, platform.ServiceAction) (string, error) { return "", nil },
		},
		tools: fakeTools{
			tailFileFn:   func(string, int) string { return " " },
			checkToolsFn: func([]string) []platform.ToolStatus { return nil },
		},
		sat: fakeSAT{
			runNvidiaFn:  emptyRun,
			runMemoryFn:  emptyRun,
			runStorageFn: emptyRun,
		},
	}

	res, _ := app.DHCPOneResult("eth0")
	if res.Body != "DHCP completed." {
		t.Fatalf("dhcp one body=%q", res.Body)
	}

	res, _ = app.DHCPAllResult()
	if res.Body != "DHCP completed." {
		t.Fatalf("dhcp all body=%q", res.Body)
	}

	res, _ = app.SetStaticIPv4Result(platform.StaticIPv4Config{Interface: "eth0"})
	if res.Body != "Static IPv4 updated." {
		t.Fatalf("static body=%q", res.Body)
	}

	res, _ = app.ServiceStatusResult("bee-audit")
	if res.Body != "No status output." {
		t.Fatalf("status body=%q", res.Body)
	}

	res, _ = app.ServiceActionResult("bee-audit", platform.ServiceRestart)
	if res.Body != "Action completed." {
		t.Fatalf("action body=%q", res.Body)
	}

	res = app.ToolCheckResult(nil)
	if res.Body != "No tools checked." {
		t.Fatalf("tool body=%q", res.Body)
	}

	res = app.AuditLogTailResult()
	if res.Body != "No audit logs found." {
		t.Fatalf("log body=%q", res.Body)
	}

	res, _ = app.RunNvidiaAcceptancePackResult("")
	if res.Body != "Archive written." {
		t.Fatalf("sat body=%q", res.Body)
	}

	res, _ = app.RunMemoryAcceptancePackResult("")
	if res.Body != "Archive written." {
		t.Fatalf("memory sat body=%q", res.Body)
	}

	res, _ = app.RunStorageAcceptancePackResult("")
	if res.Body != "Archive written." {
		t.Fatalf("storage sat body=%q", res.Body)
	}
}
func TestRunNvidiaAcceptancePackResult(t *testing.T) { func TestRunNvidiaAcceptancePackResult(t *testing.T) {
t.Parallel() t.Parallel()
a := &App{ a := &App{
sat: fakeSAT{ sat: fakeSAT{
runFn: func(baseDir string) (string, error) { runNvidiaFn: func(baseDir string) (string, error) {
if baseDir != "/tmp/sat" { if baseDir != "/tmp/sat" {
t.Fatalf("baseDir=%q want %q", baseDir, "/tmp/sat") t.Fatalf("baseDir=%q want %q", baseDir, "/tmp/sat")
} }
return "/tmp/sat/out.tar.gz", nil return "/tmp/sat/out.tar.gz", nil
}, },
runMemoryFn: func(string) (string, error) { return "", nil },
runStorageFn: func(string) (string, error) { return "", nil },
}, },
} }

View File

@@ -35,6 +35,7 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices) snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices)) snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs() snap.PowerSupplies = collectPSUs()
snap.Summary = buildHealthSummary(snap)
// remaining collectors added in steps 1.8 1.10 // remaining collectors added in steps 1.8 1.10

View File

@@ -4,6 +4,7 @@ import (
"bee/audit/internal/schema" "bee/audit/internal/schema"
"log/slog" "log/slog"
"os/exec" "os/exec"
"regexp"
"strconv" "strconv"
"strings" "strings"
) )
@@ -16,6 +17,9 @@ func collectPSUs() []schema.HardwarePowerSupply {
return nil return nil
} }
psus := parseFRU(string(out)) psus := parseFRU(string(out))
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
mergePSUSDR(psus, parsePSUSDR(string(sdrOut)))
}
slog.Info("psu: collected", "count", len(psus)) slog.Info("psu: collected", "count", len(psus))
return psus return psus
} }
@@ -116,6 +120,135 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
return psu, true return psu, true
} }
// psuSDR holds the per-slot PSU data that parsePSUSDR accumulates from SDR
// text and that mergePSUSDR copies onto the FRU-derived PSU records.
type psuSDR struct {
slot int // slot number parsed from the sensor name (always > 0)
status string // "OK" or "FAILED" as assigned by parsePSUSDR
inputPowerW *float64 // reading of an "input power" sensor; nil when none was parsed
outputPowerW *float64 // reading of an "output power" sensor; nil when none was parsed
inputVoltage *float64 // reading of an "input voltage" / "ac input" sensor; nil when none was parsed
}
// psuSlotRe locates a PSU slot number inside a sensor name: the word "ps" or
// "psu" (case-insensitive), optional whitespace, then the digits, e.g. "PS1"
// or "PSU 2". The digits are captured in whichever alternative matched.
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
// parsePSUSDR turns pipe-separated SDR text into per-slot PSU records keyed by
// the slot number found in each sensor name.
//
// Rows with fewer than three columns, or whose name carries no PSU slot, are
// ignored. A slot starts out "OK"; any row whose state column is something
// other than "ok" or "ns" flips it to "FAILED", and it never flips back.
// Power and voltage readings are picked up by substring match on the
// lower-cased sensor name, with a later row overwriting an earlier one.
func parsePSUSDR(raw string) map[int]psuSDR {
	bySlot := make(map[int]psuSDR)

	for _, row := range strings.Split(raw, "\n") {
		cols := splitSDRFields(row)
		if len(cols) < 3 {
			continue
		}
		sensor, reading := cols[0], cols[1]

		slot, found := parsePSUSlot(sensor)
		if !found {
			continue
		}

		rec := bySlot[slot]
		rec.slot = slot
		if rec.status == "" {
			rec.status = "OK"
		}
		switch strings.ToLower(cols[2]) {
		case "", "ok", "ns":
			// Healthy or no-reading states leave the current verdict alone.
		default:
			rec.status = "FAILED"
		}

		// Order matters: "input power" is tested before the voltage patterns.
		sensorLower := strings.ToLower(sensor)
		if strings.Contains(sensorLower, "input power") {
			rec.inputPowerW = parseFloatPtr(reading)
		} else if strings.Contains(sensorLower, "output power") {
			rec.outputPowerW = parseFloatPtr(reading)
		} else if strings.Contains(sensorLower, "input voltage") || strings.Contains(sensorLower, "ac input") {
			rec.inputVoltage = parseFloatPtr(reading)
		}

		bySlot[slot] = rec
	}
	return bySlot
}
// mergePSUSDR overlays SDR readings onto the PSU records in place.
//
// A PSU is matched to the SDR entry keyed by its numeric Slot value plus one;
// PSUs whose Slot is nil or non-numeric, or that have no SDR entry, are left
// untouched. Non-nil readings and a non-empty status replace the existing
// values. A PSU that is still "OK" while its SDR entry carries neither a
// status nor any reading is downgraded to "UNKNOWN".
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
	for i := range psus {
		psu := &psus[i]

		idx, err := strconv.Atoi(derefPSUSlot(psu.Slot))
		if err != nil {
			continue
		}
		// reading is a fresh copy per iteration, so taking the address of its
		// status field below gives every PSU its own string.
		reading, found := sdr[idx+1]
		if !found {
			continue
		}

		if reading.inputPowerW != nil {
			psu.InputPowerW = reading.inputPowerW
		}
		if reading.outputPowerW != nil {
			psu.OutputPowerW = reading.outputPowerW
		}
		if reading.inputVoltage != nil {
			psu.InputVoltage = reading.inputVoltage
		}
		if reading.status != "" {
			psu.Status = &reading.status
		}

		noReadings := reading.inputPowerW == nil && reading.outputPowerW == nil && reading.inputVoltage == nil
		if psu.Status != nil && *psu.Status == "OK" && noReadings && reading.status == "" {
			unknown := "UNKNOWN"
			psu.Status = &unknown
		}
	}
}
// splitSDRFields splits one pipe-separated SDR row into whitespace-trimmed
// columns.
//
// Empty columns at the start and end of the row are dropped, so blank lines
// yield no fields and rows framed by pipes still parse. Empty columns in the
// middle are kept: callers read the columns by position (name, value, state),
// and dropping an empty value column would shift the state into the wrong
// slot and hide the row.
func splitSDRFields(line string) []string {
	cols := strings.Split(line, "|")
	for i := range cols {
		cols[i] = strings.TrimSpace(cols[i])
	}

	start, end := 0, len(cols)
	for start < end && cols[start] == "" {
		start++
	}
	for end > start && cols[end-1] == "" {
		end--
	}
	return cols[start:end]
}
// parsePSUSlot extracts the PSU slot number from a sensor name using
// psuSlotRe. It reports false when the name has no match or when the captured
// number is not a positive integer.
func parsePSUSlot(name string) (int, bool) {
	groups := psuSlotRe.FindStringSubmatch(strings.ToLower(name))
	if len(groups) == 0 {
		return 0, false
	}

	// Index 0 is the whole match; the capture groups follow.
	for idx := 1; idx < len(groups); idx++ {
		digits := groups[idx]
		if digits == "" {
			continue
		}
		if slot, err := strconv.Atoi(digits); err == nil && slot > 0 {
			return slot, true
		}
	}
	return 0, false
}
// parseFloatPtr returns a pointer to the first numeric token in raw, e.g. 215
// for "215 Watts". It returns nil when raw is empty, is "na" (any case), or
// contains no parsable number.
//
// Tokens without a digit are skipped before parsing. strconv.ParseFloat would
// otherwise accept the words "NaN", "Inf" and "Infinity", and a non-finite
// reading cannot be encoded by encoding/json.
func parseFloatPtr(raw string) *float64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "na") {
		return nil
	}
	for _, field := range strings.Fields(raw) {
		if !strings.ContainsAny(field, "0123456789") {
			continue
		}
		n, err := strconv.ParseFloat(field, 64)
		if err == nil {
			return &n
		}
	}
	return nil
}
// derefPSUSlot returns the string slot points to, or "" for a nil pointer.
func derefPSUSlot(slot *string) string {
	if slot != nil {
		return *slot
	}
	return ""
}
// parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM". // parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM".
func parseWattage(s string) int { func parseWattage(s string) int {
s = strings.ToUpper(s) s = strings.ToUpper(s)

View File

@@ -0,0 +1,32 @@
package collector
import "testing"
// TestParsePSUSDR feeds a small SDR listing through parsePSUSDR and checks the
// per-slot status and numeric readings that come out.
func TestParsePSUSDR(t *testing.T) {
	raw := `
PS1 Input Power | 215 Watts | ok
PS1 Output Power | 198 Watts | ok
PS1 Input Voltage | 229 Volts | ok
PS2 Input Power | 0 Watts | cr
`
	got := parsePSUSDR(raw)
	if n := len(got); n != 2 {
		t.Fatalf("len(got)=%d want 2", n)
	}

	ps1, ps2 := got[1], got[2]

	if ps1.status != "OK" {
		t.Fatalf("ps1 status=%q", ps1.status)
	}
	if p := ps1.inputPowerW; p == nil || *p != 215 {
		t.Fatalf("ps1 input power=%v", p)
	}
	if p := ps1.outputPowerW; p == nil || *p != 198 {
		t.Fatalf("ps1 output power=%v", p)
	}
	if p := ps1.inputVoltage; p == nil || *p != 229 {
		t.Fatalf("ps1 input voltage=%v", p)
	}
	// A "cr" reading on PS2 is expected to surface as FAILED.
	if ps2.status != "FAILED" {
		t.Fatalf("ps2 status=%q", ps2.status)
	}
}

View File

@@ -67,6 +67,9 @@ type smartctlInfo struct {
SerialNumber string `json:"serial_number"` SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"` FirmwareVer string `json:"firmware_version"`
RotationRate int `json:"rotation_rate"` RotationRate int `json:"rotation_rate"`
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
UserCapacity struct { UserCapacity struct {
Bytes int64 `json:"bytes"` Bytes int64 `json:"bytes"`
} `json:"user_capacity"` } `json:"user_capacity"`
@@ -127,7 +130,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s return s
} }
var info smartctlInfo var info smartctlInfo
if err := json.Unmarshal(out, &info); err == nil { if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" { if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v s.Model = &v
@@ -158,37 +161,65 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
if info.PowerCycleCount > 0 { if info.PowerCycleCount > 0 {
tel["power_cycles"] = info.PowerCycleCount tel["power_cycles"] = info.PowerCycleCount
} }
reallocated := int64(0)
pending := int64(0)
uncorrectable := int64(0)
lifeRemaining := int64(0)
for _, attr := range info.AtaSmartAttributes.Table { for _, attr := range info.AtaSmartAttributes.Table {
switch attr.ID { switch attr.ID {
case 5: case 5:
reallocated = attr.Raw.Value
tel["reallocated_sectors"] = attr.Raw.Value tel["reallocated_sectors"] = attr.Raw.Value
case 177: case 177:
tel["wear_leveling_pct"] = attr.Raw.Value tel["wear_leveling_pct"] = attr.Raw.Value
case 231: case 231:
lifeRemaining = attr.Raw.Value
tel["life_remaining_pct"] = attr.Raw.Value tel["life_remaining_pct"] = attr.Raw.Value
case 241: case 241:
tel["total_lba_written"] = attr.Raw.Value tel["total_lba_written"] = attr.Raw.Value
case 197:
pending = attr.Raw.Value
tel["current_pending_sectors"] = attr.Raw.Value
case 198:
uncorrectable = attr.Raw.Value
tel["offline_uncorrectable"] = attr.Raw.Value
} }
} }
if len(tel) > 0 { if len(tel) > 0 {
s.Telemetry = tel s.Telemetry = tel
} }
status := storageHealthStatus{
overallPassed: info.SmartStatus.Passed,
hasOverall: true,
reallocatedSectors: reallocated,
pendingSectors: pending,
offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining,
}
setStorageHealthStatus(&s, status)
return s
} }
s.Type = &devType s.Type = &devType
status := "OK" status := "UNKNOWN"
s.Status = &status s.Status = &status
return s return s
} }
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about. // nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct { type nvmeSmartLog struct {
PercentageUsed int `json:"percentage_used"` CriticalWarning int `json:"critical_warning"`
PowerOnHours int64 `json:"power_on_hours"` PercentageUsed int `json:"percentage_used"`
PowerCycles int64 `json:"power_cycles"` AvailableSpare int `json:"available_spare"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"` SpareThreshold int `json:"spare_thresh"`
DataUnitsWritten int64 `json:"data_units_written"` PowerOnHours int64 `json:"power_on_hours"`
ControllerBusy int64 `json:"controller_busy_time"` PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
} }
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output. // nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
@@ -238,6 +269,9 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
var log nvmeSmartLog var log nvmeSmartLog
if json.Unmarshal(out, &log) == nil { if json.Unmarshal(out, &log) == nil {
tel := map[string]any{} tel := map[string]any{}
if log.CriticalWarning > 0 {
tel["critical_warning"] = log.CriticalWarning
}
if log.PowerOnHours > 0 { if log.PowerOnHours > 0 {
tel["power_on_hours"] = log.PowerOnHours tel["power_on_hours"] = log.PowerOnHours
} }
@@ -256,11 +290,78 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
if log.ControllerBusy > 0 { if log.ControllerBusy > 0 {
tel["controller_busy_time"] = log.ControllerBusy tel["controller_busy_time"] = log.ControllerBusy
} }
if log.AvailableSpare > 0 {
tel["available_spare_pct"] = log.AvailableSpare
}
if log.SpareThreshold > 0 {
tel["available_spare_threshold_pct"] = log.SpareThreshold
}
if log.MediaErrors > 0 {
tel["media_errors"] = log.MediaErrors
}
if log.NumErrLogEntries > 0 {
tel["error_log_entries"] = log.NumErrLogEntries
}
if len(tel) > 0 { if len(tel) > 0 {
s.Telemetry = tel s.Telemetry = tel
} }
setStorageHealthStatus(&s, storageHealthStatus{
criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries,
})
return s
} }
} }
status = "UNKNOWN"
s.Status = &status
return s return s
} }
// storageHealthStatus gathers the health indicators that
// setStorageHealthStatus turns into a single verdict. Zero values mean
// "not reported" for every counter.
type storageHealthStatus struct {
	// hasOverall reports whether overallPassed carries a real overall
	// self-assessment result; overallPassed is ignored otherwise.
	hasOverall, overallPassed bool

	// Counters filled from the ATA SMART attribute table (IDs 5, 197, 198, 231).
	reallocatedSectors, pendingSectors, offlineUncorrectable, lifeRemainingPct int64

	// criticalWarning is the NVMe smart-log critical_warning value.
	criticalWarning int

	// Remaining fields are filled from the NVMe smart-log.
	percentageUsed, availableSpare, spareThreshold, unsafeShutdowns, mediaErrors, errorLogEntries int64
}
// setStorageHealthStatus derives a verdict from health and stores it in
// s.Status. The verdict is "FAILED" when any hard-failure indicator is set,
// otherwise "WARNING" when any degradation indicator is set, otherwise "OK".
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
	// Hard failures: overall self-assessment failed, NVMe critical warning,
	// or sectors that are pending / offline-uncorrectable.
	failed := (health.hasOverall && !health.overallPassed) ||
		health.criticalWarning > 0 ||
		health.pendingSectors > 0 ||
		health.offlineUncorrectable > 0

	// Degradation: error counters, wear close to end of life, spare capacity
	// at or below its threshold, or more than 100 unsafe shutdowns.
	lowLife := health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10
	spareExhausted := health.availableSpare > 0 && health.spareThreshold > 0 &&
		health.availableSpare <= health.spareThreshold
	degraded := health.mediaErrors > 0 ||
		health.reallocatedSectors > 0 ||
		health.errorLogEntries > 0 ||
		lowLife ||
		health.percentageUsed >= 95 ||
		spareExhausted ||
		health.unsafeShutdowns > 100

	verdict := "OK"
	if failed {
		verdict = "FAILED"
	} else if degraded {
		verdict = "WARNING"
	}
	s.Status = &verdict
}

View File

@@ -0,0 +1,63 @@
package collector
import (
"testing"
"bee/audit/internal/schema"
)
// TestSetStorageHealthStatus checks the verdict setStorageHealthStatus derives
// for each individual health indicator, plus the FAILED-over-WARNING
// precedence and the all-zero "OK" case.
func TestSetStorageHealthStatus(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name   string
		health storageHealthStatus
		want   string
	}{
		{
			name:   "smart overall failed",
			health: storageHealthStatus{hasOverall: true, overallPassed: false},
			want:   "FAILED",
		},
		{
			name:   "nvme critical warning",
			health: storageHealthStatus{criticalWarning: 1},
			want:   "FAILED",
		},
		{
			name:   "pending sectors",
			health: storageHealthStatus{pendingSectors: 1},
			want:   "FAILED",
		},
		{
			name:   "offline uncorrectable",
			health: storageHealthStatus{offlineUncorrectable: 1},
			want:   "FAILED",
		},
		{
			name:   "failure outranks warning",
			health: storageHealthStatus{criticalWarning: 1, mediaErrors: 1},
			want:   "FAILED",
		},
		{
			name:   "media errors warning",
			health: storageHealthStatus{mediaErrors: 2},
			want:   "WARNING",
		},
		{
			name:   "reallocated warning",
			health: storageHealthStatus{reallocatedSectors: 1},
			want:   "WARNING",
		},
		{
			name:   "error log entries warning",
			health: storageHealthStatus{errorLogEntries: 1},
			want:   "WARNING",
		},
		{
			name:   "life remaining low",
			health: storageHealthStatus{lifeRemainingPct: 8},
			want:   "WARNING",
		},
		{
			name:   "percentage used high",
			health: storageHealthStatus{percentageUsed: 95},
			want:   "WARNING",
		},
		{
			name:   "spare at threshold",
			health: storageHealthStatus{availableSpare: 10, spareThreshold: 10},
			want:   "WARNING",
		},
		{
			name:   "many unsafe shutdowns",
			health: storageHealthStatus{unsafeShutdowns: 101},
			want:   "WARNING",
		},
		{
			name:   "healthy",
			health: storageHealthStatus{},
			want:   "OK",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var disk schema.HardwareStorage
			setStorageHealthStatus(&disk, tt.health)
			if disk.Status == nil {
				t.Fatalf("status=<nil> want %q", tt.want)
			}
			// Dereference before formatting: %v on a *string prints the
			// pointer address, which makes a failure unreadable.
			if *disk.Status != tt.want {
				t.Fatalf("status=%q want %q", *disk.Status, tt.want)
			}
		})
	}
}

View File

@@ -0,0 +1,114 @@
package collector
import (
"bee/audit/internal/schema"
"fmt"
"time"
)
// buildHealthSummary rolls the per-component statuses in snap up into one
// summary: per-category warning/failure counters, a human-readable line for
// each affected component, counts of empty DIMMs and absent PSUs, and an
// overall status of "FAILED", "WARNING" or "OK". CollectedAt is stamped with
// the current UTC time in RFC 3339 form.
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
	summary := &schema.HardwareHealthSummary{
		Status:      "OK",
		CollectedAt: time.Now().UTC().Format(time.RFC3339),
	}

	// tally bumps the matching counter and records a description for
	// WARNING / FAILED components; any other status is ignored.
	tally := func(status string, warn, fail *int, describe func() string) {
		switch status {
		case "WARNING":
			*warn++
			summary.Warnings = append(summary.Warnings, describe())
		case "FAILED":
			*fail++
			summary.Failures = append(summary.Failures, describe())
		}
	}

	for _, dimm := range snap.Memory {
		status := derefString(dimm.Status)
		if status == "EMPTY" {
			summary.EmptyDIMMs++
			continue
		}
		tally(status, &summary.MemoryWarn, &summary.MemoryFail, func() string {
			return formatMemorySummary(dimm)
		})
	}

	for _, disk := range snap.Storage {
		tally(derefString(disk.Status), &summary.StorageWarn, &summary.StorageFail, func() string {
			return formatStorageSummary(disk)
		})
	}

	for _, dev := range snap.PCIeDevices {
		tally(derefString(dev.Status), &summary.PCIeWarn, &summary.PCIeFail, func() string {
			return formatPCIeSummary(dev)
		})
	}

	for _, psu := range snap.PowerSupplies {
		// A PSU explicitly marked not present is counted as missing,
		// independently of whatever status it carries.
		if psu.Present != nil && !*psu.Present {
			summary.MissingPSUs++
		}
		tally(derefString(psu.Status), &summary.PSUWarn, &summary.PSUFail, func() string {
			return formatPSUSummary(psu)
		})
	}

	anyFailure := len(summary.Failures) > 0 ||
		summary.StorageFail > 0 || summary.PCIeFail > 0 ||
		summary.PSUFail > 0 || summary.MemoryFail > 0
	anyWarning := len(summary.Warnings) > 0 ||
		summary.StorageWarn > 0 || summary.PCIeWarn > 0 ||
		summary.PSUWarn > 0 || summary.MemoryWarn > 0
	switch {
	case anyFailure:
		summary.Status = "FAILED"
	case anyWarning:
		summary.Status = "WARNING"
	}

	// Keep empty lists nil.
	if len(summary.Warnings) == 0 {
		summary.Warnings = nil
	}
	if len(summary.Failures) == 0 {
		summary.Failures = nil
	}
	return summary
}
// derefString returns the string value points to, or "" when value is nil.
func derefString(value *string) string {
	if value != nil {
		return *value
	}
	return ""
}
// preferredName picks the first non-nil, non-empty value among model, serial
// and slot (in that order) and returns "unknown" when none qualifies.
func preferredName(model, serial, slot *string) string {
	for _, candidate := range []*string{model, serial, slot} {
		if candidate != nil && *candidate != "" {
			return *candidate
		}
	}
	return "unknown"
}
// formatStorageSummary renders a one-line description of a storage device:
// its model, serial number or slot (first one available) and its status.
func formatStorageSummary(disk schema.HardwareStorage) string {
	label := preferredName(disk.Model, disk.SerialNumber, disk.Slot)
	status := derefString(disk.Status)
	return fmt.Sprintf("storage %s status=%s", label, status)
}
// formatPCIeSummary renders a one-line description of a PCIe device: its
// model, serial number or BDF (first one available) and its status.
func formatPCIeSummary(dev schema.HardwarePCIeDevice) string {
	label := preferredName(dev.Model, dev.SerialNumber, dev.BDF)
	status := derefString(dev.Status)
	return fmt.Sprintf("pcie %s status=%s", label, status)
}
// formatPSUSummary renders a one-line description of a power supply: its
// model, serial number or slot (first one available) and its status.
func formatPSUSummary(psu schema.HardwarePowerSupply) string {
	label := preferredName(psu.Model, psu.SerialNumber, psu.Slot)
	status := derefString(psu.Status)
	return fmt.Sprintf("psu %s status=%s", label, status)
}
// formatMemorySummary renders a one-line description of a DIMM: its part
// number, serial number or slot (first one available) and its status.
func formatMemorySummary(dimm schema.HardwareMemory) string {
	label := preferredName(dimm.PartNumber, dimm.SerialNumber, dimm.Slot)
	status := derefString(dimm.Status)
	return fmt.Sprintf("memory %s status=%s", label, status)
}

View File

@@ -8,35 +8,107 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"sort"
"strings" "strings"
"time" "time"
) )
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
}
// RunMemoryAcceptancePack runs the memory acceptance jobs through
// runAcceptancePack under the "memory" prefix and returns its result.
// The pack records `free -h` before and after a single 128M memtester pass.
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
	jobs := []satJob{
		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
		{name: "02-memtester.log", cmd: []string{"memtester", "128M", "1"}},
		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
	}
	return runAcceptancePack(baseDir, "memory", jobs)
}
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
if baseDir == "" { if baseDir == "" {
baseDir = "/var/log/bee-sat" baseDir = "/var/log/bee-sat"
} }
ts := time.Now().UTC().Format("20060102-150405") ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "gpu-nvidia-"+ts) runDir := filepath.Join(baseDir, "storage-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil { if err := os.MkdirAll(runDir, 0755); err != nil {
return "", err return "", err
} }
type job struct { devices, err := listStorageDevices()
name string if err != nil {
cmd []string return "", err
} }
jobs := []job{ sort.Strings(devices)
var summary strings.Builder
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
if len(devices) == 0 {
fmt.Fprintln(&summary, "devices=0")
} else {
fmt.Fprintf(&summary, "devices=%d\n", len(devices))
}
for index, devPath := range devices {
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
commands := storageSATCommands(devPath)
for cmdIndex, job := range commands {
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
out, err := exec.Command(job.cmd[0], job.cmd[1:]...).CombinedOutput()
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
return "", writeErr
}
rc := 0
if err != nil {
rc = 1
}
fmt.Fprintf(&summary, "%s_%s_rc=%d\n", filepath.Base(devPath), strings.ReplaceAll(job.name, "-", "_"), rc)
}
}
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
return "", err
}
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", err
}
return archive, nil
}
// satJob describes one external command run as part of an acceptance pack.
type satJob struct {
	// name labels the job; the pack runners use it to build the log file name.
	name string
	// cmd is the argv to execute: cmd[0] is the program, the rest its arguments.
	cmd []string
}
func nvidiaSATJobs() []satJob {
return []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output", filepath.Join(runDir, "nvidia-bug-report.log")}}, {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output", "{{run_dir}}/nvidia-bug-report.log"}},
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", "5", "--size-mb", "64"}},
}
}
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, prefix+"-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", err
} }
var summary strings.Builder var summary strings.Builder
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
for _, job := range jobs { for _, job := range jobs {
out, err := exec.Command(job.cmd[0], job.cmd[1:]...).CombinedOutput() cmd := make([]string, 0, len(job.cmd))
for _, arg := range job.cmd {
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
}
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
return "", writeErr return "", writeErr
} }
@@ -50,13 +122,43 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return "", err return "", err
} }
archive := filepath.Join(baseDir, "gpu-nvidia-"+ts+".tar.gz") archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil { if err := createTarGz(archive, runDir); err != nil {
return "", err return "", err
} }
return archive, nil return archive, nil
} }
// listStorageDevices asks lsblk for top-level block devices and returns the
// /dev paths of those whose TYPE column is "disk". Rows that do not consist
// of exactly two columns are ignored. The lsblk error, if any, is returned
// unchanged.
func listStorageDevices() ([]string, error) {
	raw, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
	if err != nil {
		return nil, err
	}

	var disks []string
	rows := strings.Split(strings.TrimSpace(string(raw)), "\n")
	for i := range rows {
		cols := strings.Fields(rows[i])
		if len(cols) == 2 && cols[1] == "disk" {
			disks = append(disks, "/dev/"+cols[0])
		}
	}
	return disks, nil
}
// storageSATCommands returns the acceptance-test jobs to run against the
// block device at devPath. Devices whose base name contains "nvme" get the
// nvme-cli set (identify, SMART log, self-test); everything else gets the
// smartctl set (health + attributes, short self-test).
func storageSATCommands(devPath string) []satJob {
	if strings.Contains(filepath.Base(devPath), "nvme") {
		return []satJob{
			{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
			{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
			// nvme-cli selects the self-test via --self-test-code (-s); it has
			// no --start option. Code 1 starts the short device self-test.
			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "--self-test-code", "1"}},
		}
	}
	return []satJob{
		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
	}
}
func createTarGz(dst, srcDir string) error { func createTarGz(dst, srcDir string) error {
file, err := os.Create(dst) file, err := os.Create(dst)
if err != nil { if err != nil {

View File

@@ -0,0 +1,30 @@
package platform
import "testing"
// TestStorageSATCommands checks that NVMe devices get three nvme-cli jobs and
// other disks get two smartctl jobs.
func TestStorageSATCommands(t *testing.T) {
	t.Parallel()

	nvme := storageSATCommands("/dev/nvme0n1")
	nvmeOK := len(nvme) == 3 && nvme[2].cmd[0] == "nvme"
	if !nvmeOK {
		t.Fatalf("unexpected nvme commands: %#v", nvme)
	}

	sata := storageSATCommands("/dev/sda")
	sataOK := len(sata) == 2 && sata[0].cmd[0] == "smartctl"
	if !sataOK {
		t.Fatalf("unexpected sata commands: %#v", sata)
	}
}
// TestRunNvidiaAcceptancePackIncludesGPUStress checks that the NVIDIA job list
// has five entries and that the last one runs bee-gpu-stress.
func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
	t.Parallel()

	jobs := nvidiaSATJobs()
	if n := len(jobs); n != 5 {
		t.Fatalf("jobs=%d want 5", n)
	}

	last := jobs[4]
	if last.cmd[0] != "bee-gpu-stress" {
		t.Fatalf("gpu stress command=%q want bee-gpu-stress", last.cmd[0])
	}
}

View File

@@ -21,6 +21,24 @@ type HardwareSnapshot struct {
Storage []HardwareStorage `json:"storage,omitempty"` Storage []HardwareStorage `json:"storage,omitempty"`
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"` PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"` PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
Summary *HardwareHealthSummary `json:"summary,omitempty"`
}
// HardwareHealthSummary is the roll-up health section of a hardware snapshot:
// an overall status, per-category warning/failure counters and one
// human-readable line per affected component.
type HardwareHealthSummary struct {
	// Status is the overall status string of the snapshot.
	Status string `json:"status"`
	// Warnings and Failures hold one description line per affected component.
	Warnings []string `json:"warnings,omitempty"`
	Failures []string `json:"failures,omitempty"`
	// Per-category counters; zero values are omitted from the JSON output.
	StorageWarn int `json:"storage_warn,omitempty"`
	StorageFail int `json:"storage_fail,omitempty"`
	PCIeWarn int `json:"pcie_warn,omitempty"`
	PCIeFail int `json:"pcie_fail,omitempty"`
	PSUWarn int `json:"psu_warn,omitempty"`
	PSUFail int `json:"psu_fail,omitempty"`
	MemoryWarn int `json:"memory_warn,omitempty"`
	MemoryFail int `json:"memory_fail,omitempty"`
	// EmptyDIMMs and MissingPSUs count unpopulated memory slots and absent PSUs.
	EmptyDIMMs int `json:"empty_dimms,omitempty"`
	MissingPSUs int `json:"missing_psus,omitempty"`
	// CollectedAt is the timestamp at which the summary was produced.
	CollectedAt string `json:"collected_at,omitempty"`
} }
type HardwareBoard struct { type HardwareBoard struct {

View File

@@ -29,6 +29,7 @@ func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.formFields[3].Value, m.formFields[3].Value,
}) })
m.busy = true m.busy = true
m.busyTitle = "Static IPv4: " + m.selectedIface
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.SetStaticIPv4Result(cfg) result, err := m.app.SetStaticIPv4Result(cfg)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
@@ -59,26 +60,42 @@ func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
case "esc": case "esc":
m.screen = m.confirmCancelTarget() m.screen = m.confirmCancelTarget()
m.cursor = 0 m.cursor = 0
m.pendingAction = actionNone
return m, nil return m, nil
case "enter": case "enter":
if m.cursor == 1 { if m.cursor == 1 {
m.screen = m.confirmCancelTarget() m.screen = m.confirmCancelTarget()
m.cursor = 0 m.cursor = 0
m.pendingAction = actionNone
return m, nil return m, nil
} }
m.busy = true m.busy = true
switch m.pendingAction { switch m.pendingAction {
case actionExportAudit: case actionExportAudit:
m.busyTitle = "Export audit"
target := *m.selectedTarget target := *m.selectedTarget
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.ExportLatestAuditResult(target) result, err := m.app.ExportLatestAuditResult(target)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
} }
case actionRunNvidiaSAT: case actionRunNvidiaSAT:
m.busyTitle = "NVIDIA SAT"
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.RunNvidiaAcceptancePackResult("") result, err := m.app.RunNvidiaAcceptancePackResult("")
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
} }
case actionRunMemorySAT:
m.busyTitle = "Memory SAT"
return m, func() tea.Msg {
result, err := m.app.RunMemoryAcceptancePackResult("")
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
}
case actionRunStorageSAT:
m.busyTitle = "Storage SAT"
return m, func() tea.Msg {
result, err := m.app.RunStorageAcceptancePackResult("")
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
}
} }
case "ctrl+c": case "ctrl+c":
return m, tea.Quit return m, tea.Quit
@@ -91,6 +108,10 @@ func (m model) confirmCancelTarget() screen {
case actionExportAudit: case actionExportAudit:
return screenExportTargets return screenExportTargets
case actionRunNvidiaSAT: case actionRunNvidiaSAT:
fallthrough
case actionRunMemorySAT:
fallthrough
case actionRunStorageSAT:
return screenAcceptance return screenAcceptance
default: default:
return screenMain return screenMain

View File

@@ -3,12 +3,19 @@ package tui
import tea "github.com/charmbracelet/bubbletea" import tea "github.com/charmbracelet/bubbletea"
func (m model) handleAcceptanceMenu() (tea.Model, tea.Cmd) { func (m model) handleAcceptanceMenu() (tea.Model, tea.Cmd) {
if m.cursor == 1 { if m.cursor == 3 {
m.screen = screenMain m.screen = screenMain
m.cursor = 0 m.cursor = 0
return m, nil return m, nil
} }
m.pendingAction = actionRunNvidiaSAT switch m.cursor {
case 0:
m.pendingAction = actionRunNvidiaSAT
case 1:
m.pendingAction = actionRunMemorySAT
case 2:
m.pendingAction = actionRunStorageSAT
}
m.screen = screenConfirm m.screen = screenConfirm
return m, nil return m, nil
} }

View File

@@ -12,6 +12,7 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
return m, nil return m, nil
case 1: case 1:
m.busy = true m.busy = true
m.busyTitle = "Services"
return m, func() tea.Msg { return m, func() tea.Msg {
services, err := m.app.ListBeeServices() services, err := m.app.ListBeeServices()
return servicesMsg{services: services, err: err} return servicesMsg{services: services, err: err}
@@ -22,29 +23,40 @@ func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
return m, nil return m, nil
case 3: case 3:
m.busy = true m.busy = true
m.busyTitle = "Run audit"
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.RunAuditNow(m.runtimeMode) result, err := m.app.RunAuditNow(m.runtimeMode)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
} }
case 4: case 4:
m.busy = true m.busy = true
m.busyTitle = "Export audit"
return m, func() tea.Msg { return m, func() tea.Msg {
targets, err := m.app.ListRemovableTargets() targets, err := m.app.ListRemovableTargets()
return exportTargetsMsg{targets: targets, err: err} return exportTargetsMsg{targets: targets, err: err}
} }
case 5: case 5:
m.busy = true m.busy = true
m.busyTitle = "Required tools"
return m, func() tea.Msg { return m, func() tea.Msg {
result := m.app.ToolCheckResult([]string{"dmidecode", "smartctl", "nvme", "ipmitool", "lspci", "bee", "nvidia-smi", "dhclient", "lsblk", "mount"}) result := m.app.ToolCheckResult([]string{"dmidecode", "smartctl", "nvme", "ipmitool", "lspci", "ethtool", "bee", "nvidia-smi", "bee-gpu-stress", "memtester", "dhclient", "lsblk", "mount"})
return resultMsg{title: result.Title, body: result.Body, back: screenMain} return resultMsg{title: result.Title, body: result.Body, back: screenMain}
} }
case 6: case 6:
m.busy = true m.busy = true
m.busyTitle = "Health summary"
return m, func() tea.Msg {
result := m.app.HealthSummaryResult()
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
}
case 7:
m.busy = true
m.busyTitle = "Audit logs"
return m, func() tea.Msg { return m, func() tea.Msg {
result := m.app.AuditLogTailResult() result := m.app.AuditLogTailResult()
return resultMsg{title: result.Title, body: result.Body, back: screenMain} return resultMsg{title: result.Title, body: result.Body, back: screenMain}
} }
case 7: case 8:
return m, tea.Quit return m, tea.Quit
} }
return m, nil return m, nil

View File

@@ -10,12 +10,14 @@ func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
switch m.cursor { switch m.cursor {
case 0: case 0:
m.busy = true m.busy = true
m.busyTitle = "Network status"
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.NetworkStatus() result, err := m.app.NetworkStatus()
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
} }
case 1: case 1:
m.busy = true m.busy = true
m.busyTitle = "DHCP all interfaces"
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.DHCPAllResult() result, err := m.app.DHCPAllResult()
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
@@ -23,6 +25,7 @@ func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
case 2: case 2:
m.pendingAction = actionDHCPOne m.pendingAction = actionDHCPOne
m.busy = true m.busy = true
m.busyTitle = "Interfaces"
return m, func() tea.Msg { return m, func() tea.Msg {
ifaces, err := m.app.ListInterfaces() ifaces, err := m.app.ListInterfaces()
return interfacesMsg{ifaces: ifaces, err: err} return interfacesMsg{ifaces: ifaces, err: err}
@@ -30,6 +33,7 @@ func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
case 3: case 3:
m.pendingAction = actionStaticIPv4 m.pendingAction = actionStaticIPv4
m.busy = true m.busy = true
m.busyTitle = "Interfaces"
return m, func() tea.Msg { return m, func() tea.Msg {
ifaces, err := m.app.ListInterfaces() ifaces, err := m.app.ListInterfaces()
return interfacesMsg{ifaces: ifaces, err: err} return interfacesMsg{ifaces: ifaces, err: err}
@@ -50,6 +54,7 @@ func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
switch m.pendingAction { switch m.pendingAction {
case actionDHCPOne: case actionDHCPOne:
m.busy = true m.busy = true
m.busyTitle = "DHCP on " + m.selectedIface
return m, func() tea.Msg { return m, func() tea.Msg {
result, err := m.app.DHCPOneResult(m.selectedIface) result, err := m.app.DHCPOneResult(m.selectedIface)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}

View File

@@ -8,7 +8,7 @@ import (
func (m model) handleServicesMenu() (tea.Model, tea.Cmd) { func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
if len(m.services) == 0 { if len(m.services) == 0 {
return m, resultCmd("bee services", "No bee-* services found", nil, screenMain) return m, resultCmd("Services", "No bee-* services found.", nil, screenMain)
} }
m.selectedService = m.services[m.cursor] m.selectedService = m.services[m.cursor]
m.screen = screenServiceAction m.screen = screenServiceAction
@@ -25,22 +25,23 @@ func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
} }
m.busy = true m.busy = true
m.busyTitle = "service: " + m.selectedService
return m, func() tea.Msg { return m, func() tea.Msg {
switch action { switch action {
case "status": case "Status":
result, err := m.app.ServiceStatusResult(m.selectedService) result, err := m.app.ServiceStatusResult(m.selectedService)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
case "restart": case "Restart":
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart) result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
case "start": case "Start":
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart) result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
case "stop": case "Stop":
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop) result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction} return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
default: default:
return resultMsg{title: "service", body: "unknown action", back: screenServiceAction} return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
} }
} }
} }

View File

@@ -1,6 +1,7 @@
package tui package tui
import ( import (
"strings"
"testing" "testing"
"bee/audit/internal/app" "bee/audit/internal/app"
@@ -153,7 +154,8 @@ func TestMainMenuAsyncActionsSetBusy(t *testing.T) {
{name: "run audit", cursor: 3}, {name: "run audit", cursor: 3},
{name: "export", cursor: 4}, {name: "export", cursor: 4},
{name: "check tools", cursor: 5}, {name: "check tools", cursor: 5},
{name: "log tail", cursor: 6}, {name: "health summary", cursor: 6},
{name: "log tail", cursor: 7},
} }
for _, test := range tests { for _, test := range tests {
@@ -262,6 +264,31 @@ func TestAcceptanceConfirmFlow(t *testing.T) {
} }
} }
// TestAcceptanceMenuMapsNewTargets checks that acceptance-menu rows 0, 1 and 2
// select the NVIDIA, memory and storage acceptance actions respectively.
func TestAcceptanceMenuMapsNewTargets(t *testing.T) {
	t.Parallel()

	// The index in this slice is the menu cursor position.
	wantByCursor := []actionKind{
		actionRunNvidiaSAT,
		actionRunMemorySAT,
		actionRunStorageSAT,
	}

	for cursor, want := range wantByCursor {
		m := newTestModel()
		m.screen = screenAcceptance
		m.cursor = cursor

		next, _ := m.handleAcceptanceMenu()
		if got := next.(model); got.pendingAction != want {
			t.Fatalf("cursor=%d pendingAction=%q want %q", cursor, got.pendingAction, want)
		}
	}
}
func TestExportTargetSelectionOpensConfirm(t *testing.T) { func TestExportTargetSelectionOpensConfirm(t *testing.T) {
t.Parallel() t.Parallel()
@@ -347,3 +374,197 @@ func TestConfirmCancelTarget(t *testing.T) {
t.Fatalf("default cancel target=%q want %q", got, screenMain) t.Fatalf("default cancel target=%q want %q", got, screenMain)
} }
} }
// TestViewMainMenuRendersSelectedItem renders the main menu with the cursor on
// the second row and checks the title, the selection marker and the key hints.
func TestViewMainMenuRendersSelectedItem(t *testing.T) {
	t.Parallel()

	m := newTestModel()
	m.cursor = 1
	rendered := m.View()

	expected := []string{
		"bee",
		"Select action",
		" Network",
		"> Services",
		"Acceptance tests",
		"[↑/↓] move [enter] select [esc] back [ctrl+c] quit",
	}
	for i := range expected {
		if strings.Contains(rendered, expected[i]) {
			continue
		}
		t.Fatalf("view missing %q\nview:\n%s", expected[i], rendered)
	}
}
// TestViewBusyStateIsMinimal pins the exact busy-screen text that is rendered
// when no busyTitle has been set on the model.
func TestViewBusyStateIsMinimal(t *testing.T) {
	t.Parallel()

	const expected = "bee\n\nWorking...\n\n[ctrl+c] quit\n"

	mdl := newTestModel()
	mdl.busy = true

	if rendered := mdl.View(); rendered != expected {
		t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", expected, rendered)
	}
}
// TestViewBusyStateUsesBusyTitle checks that a busy model with a busyTitle
// shows that title together with the progress line and the quit hint.
func TestViewBusyStateUsesBusyTitle(t *testing.T) {
	t.Parallel()

	mdl := newTestModel()
	mdl.busy = true
	mdl.busyTitle = "Export audit"
	rendered := mdl.View()

	expected := []string{
		"Export audit",
		"Working...",
		"[ctrl+c] quit",
	}
	for i := 0; i < len(expected); i++ {
		if strings.Contains(rendered, expected[i]) {
			continue
		}
		t.Fatalf("view missing %q\nview:\n%s", expected[i], rendered)
	}
}
// TestViewOutputScreenRendersBodyAndBackHint checks that the output screen
// shows the stored title and body along with the back/quit key hints.
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
	t.Parallel()

	mdl := newTestModel()
	mdl.screen = screenOutput
	mdl.title = "Run audit"
	mdl.body = "audit output: /var/log/bee-audit.json\n"
	rendered := mdl.View()

	expected := []string{
		"Run audit",
		"audit output: /var/log/bee-audit.json",
		"[enter/esc] back [ctrl+c] quit",
	}
	for i := 0; i < len(expected); i++ {
		if strings.Contains(rendered, expected[i]) {
			continue
		}
		t.Fatalf("view missing %q\nview:\n%s", expected[i], rendered)
	}
}
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenExportTargets
m.targets = []platform.RemovableTarget{
{
Device: "/dev/sdb1",
FSType: "vfat",
Size: "29G",
Label: "BEEUSB",
Mountpoint: "/media/bee",
},
}
view := m.View()
for _, want := range []string{
"Export audit",
"Select removable filesystem",
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
func TestViewStaticFormRendersFields(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenStaticForm
m.selectedIface = "enp1s0"
m.formFields = []formField{
{Label: "Address", Value: "192.0.2.10/24"},
{Label: "Gateway", Value: "192.0.2.1"},
{Label: "DNS", Value: "1.1.1.1"},
}
m.formIndex = 1
view := m.View()
for _, want := range []string{
"Static IPv4: enp1s0",
" Address: 192.0.2.10/24",
"> Gateway: 192.0.2.1",
" DNS: 1.1.1.1",
"[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
t.Parallel()
m := newTestModel()
m.screen = screenConfirm
m.pendingAction = actionExportAudit
m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
view := m.View()
for _, want := range []string{
"Export audit",
"Copy latest audit JSON to /dev/sdb1?",
"> Confirm",
" Cancel",
} {
if !strings.Contains(view, want) {
t.Fatalf("view missing %q\nview:\n%s", want, view)
}
}
}
// TestResultMsgClearsBusyAndPendingAction feeds a resultMsg into Update and
// checks that the busy flag, the busy title and the pending action are all
// reset afterwards.
func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
	t.Parallel()

	mdl := newTestModel()
	mdl.busy = true
	mdl.busyTitle = "Export audit"
	mdl.pendingAction = actionExportAudit
	mdl.screen = screenConfirm

	updated, _ := mdl.Update(resultMsg{title: "Export audit", body: "done", back: screenMain})
	after := updated.(model)

	// Each branch aborts the test, so only the first failing check reports.
	switch {
	case after.busy:
		t.Fatal("busy=true want false")
	case after.busyTitle != "":
		t.Fatalf("busyTitle=%q want empty", after.busyTitle)
	case after.pendingAction != actionNone:
		t.Fatalf("pendingAction=%q want empty", after.pendingAction)
	}
}
// TestResultMsgErrorWithoutBodyFormatsCleanly checks that a resultMsg carrying
// only an error produces a body holding just the "ERROR: ..." line.
func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
	t.Parallel()

	const expected = "ERROR: boom"

	mdl := newTestModel()
	updated, _ := mdl.Update(resultMsg{title: "Export audit", err: assertErr("boom"), back: screenMain})

	if body := updated.(model).body; body != expected {
		t.Fatalf("body=%q want %q", body, expected)
	}
}
type assertErr string
func (e assertErr) Error() string { return string(e) }

View File

@@ -31,6 +31,8 @@ const (
actionStaticIPv4 actionKind = "static_ipv4" actionStaticIPv4 actionKind = "static_ipv4"
actionExportAudit actionKind = "export_audit" actionExportAudit actionKind = "export_audit"
actionRunNvidiaSAT actionKind = "run_nvidia_sat" actionRunNvidiaSAT actionKind = "run_nvidia_sat"
actionRunMemorySAT actionKind = "run_memory_sat"
actionRunStorageSAT actionKind = "run_storage_sat"
) )
type model struct { type model struct {
@@ -41,6 +43,7 @@ type model struct {
prevScreen screen prevScreen screen
cursor int cursor int
busy bool busy bool
busyTitle string
title string title string
body string body string
mainMenu []string mainMenu []string
@@ -80,28 +83,29 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
runtimeMode: runtimeMode, runtimeMode: runtimeMode,
screen: screenMain, screen: screenMain,
mainMenu: []string{ mainMenu: []string{
"Network setup", "Network",
"bee service management", "Services",
"System acceptance tests", "Acceptance tests",
"Run audit now", "Run audit",
"Export audit to removable drive", "Export audit",
"Check required tools", "Check tools",
"Show last audit log tail", "Show health summary",
"Show audit logs",
"Exit", "Exit",
}, },
networkMenu: []string{ networkMenu: []string{
"Show network status", "Show status",
"DHCP on all interfaces", "DHCP on all interfaces",
"DHCP on one interface", "DHCP on one interface",
"Set static IPv4 on one interface", "Set static IPv4",
"Back", "Back",
}, },
serviceMenu: []string{ serviceMenu: []string{
"status", "Status",
"restart", "Restart",
"start", "Start",
"stop", "Stop",
"back", "Back",
}, },
} }
} }

View File

@@ -21,12 +21,19 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m.updateKey(msg) return m.updateKey(msg)
case resultMsg: case resultMsg:
m.busy = false m.busy = false
m.busyTitle = ""
m.title = msg.title m.title = msg.title
if msg.err != nil { if msg.err != nil {
m.body = fmt.Sprintf("%s\n\nERROR: %v", strings.TrimSpace(msg.body), msg.err) body := strings.TrimSpace(msg.body)
if body == "" {
m.body = fmt.Sprintf("ERROR: %v", msg.err)
} else {
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
}
} else { } else {
m.body = msg.body m.body = msg.body
} }
m.pendingAction = actionNone
if msg.back != "" { if msg.back != "" {
m.prevScreen = msg.back m.prevScreen = msg.back
} else { } else {
@@ -37,8 +44,9 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m, nil return m, nil
case servicesMsg: case servicesMsg:
m.busy = false m.busy = false
m.busyTitle = ""
if msg.err != nil { if msg.err != nil {
m.title = "bee services" m.title = "Services"
m.body = msg.err.Error() m.body = msg.err.Error()
m.prevScreen = screenMain m.prevScreen = screenMain
m.screen = screenOutput m.screen = screenOutput
@@ -50,6 +58,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m, nil return m, nil
case interfacesMsg: case interfacesMsg:
m.busy = false m.busy = false
m.busyTitle = ""
if msg.err != nil { if msg.err != nil {
m.title = "interfaces" m.title = "interfaces"
m.body = msg.err.Error() m.body = msg.err.Error()
@@ -63,6 +72,7 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
return m, nil return m, nil
case exportTargetsMsg: case exportTargetsMsg:
m.busy = false m.busy = false
m.busyTitle = ""
if msg.err != nil { if msg.err != nil {
m.title = "export" m.title = "export"
m.body = msg.err.Error() m.body = msg.err.Error()
@@ -90,7 +100,7 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
case screenServiceAction: case screenServiceAction:
return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu) return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
case screenAcceptance: case screenAcceptance:
return m.updateMenu(msg, 2, m.handleAcceptanceMenu) return m.updateMenu(msg, 4, m.handleAcceptanceMenu)
case screenExportTargets: case screenExportTargets:
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu) return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
case screenInterfacePick: case screenInterfacePick:
@@ -101,6 +111,7 @@ func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
m.screen = m.prevScreen m.screen = m.prevScreen
m.body = "" m.body = ""
m.title = "" m.title = ""
m.pendingAction = actionNone
return m, nil return m, nil
case "ctrl+c": case "ctrl+c":
return m, tea.Quit return m, tea.Quit

View File

@@ -11,7 +11,11 @@ import (
func (m model) View() string { func (m model) View() string {
if m.busy { if m.busy {
return "bee\n\nWorking...\n" title := "bee"
if m.busyTitle != "" {
title = m.busyTitle
}
return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
} }
switch m.screen { switch m.screen {
case screenMain: case screenMain:
@@ -19,13 +23,13 @@ func (m model) View() string {
case screenNetwork: case screenNetwork:
return renderMenu("Network", "Select action", m.networkMenu, m.cursor) return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
case screenServices: case screenServices:
return renderMenu("bee services", "Select service", m.services, m.cursor) return renderMenu("Services", "Select service", m.services, m.cursor)
case screenServiceAction: case screenServiceAction:
items := make([]string, len(m.serviceMenu)) items := make([]string, len(m.serviceMenu))
copy(items, m.serviceMenu) copy(items, m.serviceMenu)
return renderMenu("Service: "+m.selectedService, "Select action", items, m.cursor) return renderMenu("Service: "+m.selectedService, "Select action", items, m.cursor)
case screenAcceptance: case screenAcceptance:
return renderMenu("System acceptance tests", "Select action", []string{"Run NVIDIA command pack", "Back"}, m.cursor) return renderMenu("Acceptance tests", "Select action", []string{"Run NVIDIA command pack", "Run memory test", "Run storage diagnostic pack", "Back"}, m.cursor)
case screenExportTargets: case screenExportTargets:
return renderMenu("Export audit", "Select removable filesystem", renderTargetItems(m.targets), m.cursor) return renderMenu("Export audit", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
case screenInterfacePick: case screenInterfacePick:
@@ -51,6 +55,10 @@ func (m model) confirmBody() (string, string) {
return "Export audit", fmt.Sprintf("Copy latest audit JSON to %s?", m.selectedTarget.Device) return "Export audit", fmt.Sprintf("Copy latest audit JSON to %s?", m.selectedTarget.Device)
case actionRunNvidiaSAT: case actionRunNvidiaSAT:
return "NVIDIA SAT", "Run NVIDIA acceptance command pack?" return "NVIDIA SAT", "Run NVIDIA acceptance command pack?"
case actionRunMemorySAT:
return "Memory SAT", "Run runtime memory test with memtester?"
case actionRunStorageSAT:
return "Storage SAT", "Run storage diagnostic pack and start short self-tests where supported?"
default: default:
return "Confirm", "Proceed?" return "Confirm", "Proceed?"
} }

View File

@@ -29,6 +29,7 @@ local-fs.target
Reason: the modules are shipped in the ISO overlay under `/usr/local/lib/nvidia/`, not in the host module tree. Reason: the modules are shipped in the ISO overlay under `/usr/local/lib/nvidia/`, not in the host module tree.
- `bee-audit.service` does not wait for `network-online.target`; audit is local and must run even if DHCP is broken. - `bee-audit.service` does not wait for `network-online.target`; audit is local and must run even if DHCP is broken.
- `bee-audit.service` logs audit failures but does not turn partial collector problems into a boot blocker. - `bee-audit.service` logs audit failures but does not turn partial collector problems into a boot blocker.
- Audit JSON now includes a `hardware.summary` block with overall verdict and warning/failure counts.
## Console and login flow ## Console and login flow
@@ -59,7 +60,7 @@ build.sh [--authorized-keys /path/to/keys]
3. inject authorized_keys into staged `root/.ssh/` (or set password fallback marker) 3. inject authorized_keys into staged `root/.ssh/` (or set password fallback marker)
4. copy `bee` binary → staged `/usr/local/bin/bee` 4. copy `bee` binary → staged `/usr/local/bin/bee`
5. copy vendor binaries from `iso/vendor/` → staged `/usr/local/bin/` 5. copy vendor binaries from `iso/vendor/` → staged `/usr/local/bin/`
(`storcli64`, `sas2ircu`, `sas3ircu`, `mstflint` — each optional) (`storcli64`, `sas2ircu`, `sas3ircu`, `arcconf`, `ssacli` — optional; `mstflint` comes from the Debian package set)
6. `build-nvidia-module.sh`: 6. `build-nvidia-module.sh`:
a. install Debian kernel headers if missing a. install Debian kernel headers if missing
b. download NVIDIA `.run` installer (sha256 verified, cached in `dist/`) b. download NVIDIA `.run` installer (sha256 verified, cached in `dist/`)
@@ -119,10 +120,15 @@ Current validation state:
3. memory collector (dmidecode -t 17) 3. memory collector (dmidecode -t 17)
4. storage collector (lsblk -J, smartctl -j, nvme id-ctrl, nvme smart-log) 4. storage collector (lsblk -J, smartctl -j, nvme id-ctrl, nvme smart-log)
5. pcie collector (lspci -vmm -D, /sys/bus/pci/devices/) 5. pcie collector (lspci -vmm -D, /sys/bus/pci/devices/)
6. psu collector (ipmitool fru — silent if no /dev/ipmi0) 6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded) 7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
8. output JSON → /var/log/bee-audit.json 8. output JSON → /var/log/bee-audit.json
9. QR summary to stdout (qrencode if available) 9. QR summary to stdout (qrencode if available)
``` ```
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
Acceptance flows:
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
- `bee sat memory` → `memtester` archive
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported

View File

@@ -19,6 +19,9 @@ Fills gaps where Redfish/logpile is blind:
## In scope ## In scope
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID - Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
- Machine-readable health summary derived from collector verdicts
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
- Automatic boot audit with operator-facing local console and SSH access - Automatic boot audit with operator-facing local console and SSH access
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi` - NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
- SSH access (OpenSSH) always available for inspection and debugging - SSH access (OpenSSH) always available for inspection and debugging
@@ -81,7 +84,7 @@ Fills gaps where Redfish/logpile is blind:
| `audit/internal/schema/` | HardwareIngestRequest types | | `audit/internal/schema/` | HardwareIngestRequest types |
| `iso/builder/` | ISO build scripts and `live-build` profile | | `iso/builder/` | ISO build scripts and `live-build` profile |
| `iso/overlay/` | Source overlay copied into a staged build overlay | | `iso/overlay/` | Source overlay copied into a staged build overlay |
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, mstflint, …) | | `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI | | `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO | | `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
| `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy | | `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy |

View File

@@ -0,0 +1,314 @@
#define _POSIX_C_SOURCE 200809L
#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Local declarations of the CUDA Driver API handle and result types used in
 * this file. No CUDA header is included; the driver library is opened at run
 * time with dlopen(), so only these minimal definitions are needed.
 */
typedef int CUdevice;
typedef uint64_t CUdeviceptr;
typedef int CUresult;
typedef void *CUcontext;
typedef void *CUmodule;
typedef void *CUfunction;
typedef void *CUstream;
/* Result code that check_rc() and the error helpers treat as success. */
#define CU_SUCCESS 0
/*
 * PTX text of the "burn" kernel, handed to cuModuleLoadDataEx() in main().
 *
 * Parameters: data (u64 device pointer), words (u32), rounds (u32).
 * Each thread computes its global index as ctaid.x * ntid.x + tid.x and exits
 * immediately when that index is >= words. Otherwise it loads the 32-bit word
 * at data[index], applies value = value * 1664525 + 1013904223 once per
 * round, and stores the result back to the same location.
 */
static const char *ptx_source =
".version 6.0\n"
".target sm_30\n"
".address_size 64\n"
"\n"
".visible .entry burn(\n"
" .param .u64 data,\n"
" .param .u32 words,\n"
" .param .u32 rounds\n"
")\n"
"{\n"
" .reg .pred %p<2>;\n"
" .reg .b32 %r<8>;\n"
" .reg .b64 %rd<5>;\n"
"\n"
" ld.param.u64 %rd1, [data];\n"
" ld.param.u32 %r1, [words];\n"
" ld.param.u32 %r2, [rounds];\n"
" mov.u32 %r3, %ctaid.x;\n"
" mov.u32 %r4, %ntid.x;\n"
" mov.u32 %r5, %tid.x;\n"
" mad.lo.s32 %r0, %r3, %r4, %r5;\n"
" setp.ge.u32 %p0, %r0, %r1;\n"
" @%p0 bra DONE;\n"
" mul.wide.u32 %rd2, %r0, 4;\n"
" add.s64 %rd3, %rd1, %rd2;\n"
" ld.global.u32 %r6, [%rd3];\n"
"LOOP:\n"
" setp.eq.u32 %p1, %r2, 0;\n"
" @%p1 bra STORE;\n"
" mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
" sub.u32 %r2, %r2, 1;\n"
" bra LOOP;\n"
"STORE:\n"
" st.global.u32 [%rd3], %r6;\n"
"DONE:\n"
" ret;\n"
"}\n";
/*
 * Function-pointer types for the Driver API entry points that are resolved
 * with dlsym(). One typedef per entry point; each is named after the API
 * function it points at.
 */
typedef CUresult (*cuInit_fn)(unsigned int);
typedef CUresult (*cuDeviceGetCount_fn)(int *);
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
typedef CUresult (*cuCtxSynchronize_fn)(void);
typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *);
/* main() passes: kernel, grid x/y/z, block x/y/z, 0, NULL stream, params, NULL. */
typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
CUstream,
void **,
void **);
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
/*
 * Handle to the dynamically loaded driver library plus the entry points
 * resolved from it. load_cuda() zeroes the struct and fills every member
 * except cuGetErrorName and cuGetErrorString; main() looks those two up
 * separately and they may remain NULL.
 */
struct cuda_api {
void *lib; /* handle returned by dlopen("libcuda.so.1") */
cuInit_fn cuInit;
cuDeviceGetCount_fn cuDeviceGetCount;
cuDeviceGet_fn cuDeviceGet;
cuDeviceGetName_fn cuDeviceGetName;
cuCtxCreate_fn cuCtxCreate; /* resolved from the "cuCtxCreate_v2" symbol */
cuCtxDestroy_fn cuCtxDestroy; /* resolved from "cuCtxDestroy_v2" */
cuCtxSynchronize_fn cuCtxSynchronize;
cuMemAlloc_fn cuMemAlloc; /* resolved from "cuMemAlloc_v2" */
cuMemFree_fn cuMemFree; /* resolved from "cuMemFree_v2" */
cuMemcpyHtoD_fn cuMemcpyHtoD; /* resolved from "cuMemcpyHtoD_v2" */
cuMemcpyDtoH_fn cuMemcpyDtoH; /* resolved from "cuMemcpyDtoH_v2" */
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
cuModuleGetFunction_fn cuModuleGetFunction;
cuLaunchKernel_fn cuLaunchKernel;
cuGetErrorName_fn cuGetErrorName; /* optional; NULL-checked by cu_error_name() */
cuGetErrorString_fn cuGetErrorString; /* optional; NULL-checked by cu_error_string() */
};
/*
 * Look up `name` in the library handle `lib` and store the address in *out.
 * Returns 1 when the symbol was found and 0 otherwise; *out is set to NULL
 * on failure.
 */
static int load_symbol(void *lib, const char *name, void **out) {
    void *address = dlsym(lib, name);

    *out = address;
    if (address == NULL) {
        return 0;
    }
    return 1;
}
/*
 * Open libcuda.so.1 and resolve every mandatory Driver API entry point into
 * *api. The struct is zeroed first. Returns 1 when the library and all
 * listed symbols were found, 0 as soon as anything is missing. The two
 * optional error-text functions are not resolved here.
 */
static int load_cuda(struct cuda_api *api) {
    void *handle;

    memset(api, 0, sizeof(*api));

    handle = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
    api->lib = handle;
    if (handle == NULL) {
        return 0;
    }

    /* Resolved in the same order as before; stop at the first missing symbol. */
    if (!load_symbol(handle, "cuInit", (void **)&api->cuInit)) return 0;
    if (!load_symbol(handle, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount)) return 0;
    if (!load_symbol(handle, "cuDeviceGet", (void **)&api->cuDeviceGet)) return 0;
    if (!load_symbol(handle, "cuDeviceGetName", (void **)&api->cuDeviceGetName)) return 0;
    if (!load_symbol(handle, "cuCtxCreate_v2", (void **)&api->cuCtxCreate)) return 0;
    if (!load_symbol(handle, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy)) return 0;
    if (!load_symbol(handle, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize)) return 0;
    if (!load_symbol(handle, "cuMemAlloc_v2", (void **)&api->cuMemAlloc)) return 0;
    if (!load_symbol(handle, "cuMemFree_v2", (void **)&api->cuMemFree)) return 0;
    if (!load_symbol(handle, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD)) return 0;
    if (!load_symbol(handle, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH)) return 0;
    if (!load_symbol(handle, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx)) return 0;
    if (!load_symbol(handle, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction)) return 0;
    if (!load_symbol(handle, "cuLaunchKernel", (void **)&api->cuLaunchKernel)) return 0;
    return 1;
}
/*
 * Return the driver's symbolic name for `rc`, or the fixed string
 * "CUDA_ERROR" when cuGetErrorName is unavailable, fails, or yields NULL.
 */
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
    const char *text = NULL;

    if (api->cuGetErrorName == NULL) {
        return "CUDA_ERROR";
    }
    if (api->cuGetErrorName(rc, &text) != CU_SUCCESS) {
        return "CUDA_ERROR";
    }
    return text != NULL ? text : "CUDA_ERROR";
}
/*
 * Return the driver's description for `rc`, or the fixed string "unknown"
 * when cuGetErrorString is unavailable, fails, or yields NULL.
 */
static const char *cu_error_string(struct cuda_api *api, CUresult rc) {
    const char *text = NULL;

    if (api->cuGetErrorString == NULL) {
        return "unknown";
    }
    if (api->cuGetErrorString(rc, &text) != CU_SUCCESS) {
        return "unknown";
    }
    return text != NULL ? text : "unknown";
}
/*
 * Report a failed Driver API call. Returns 1 when `rc` is CU_SUCCESS;
 * otherwise prints "<step> failed: <name> (<description>)" to stderr and
 * returns 0.
 */
static int check_rc(struct cuda_api *api, const char *step, CUresult rc) {
    if (rc != CU_SUCCESS) {
        const char *label = cu_error_name(api, rc);
        const char *detail = cu_error_string(api, rc);

        fprintf(stderr, "%s failed: %s (%s)\n", step, label, detail);
        return 0;
    }
    return 1;
}
/* Read CLOCK_MONOTONIC and return it as seconds in a double. */
static double now_seconds(void) {
    struct timespec mono;
    double whole;
    double fraction;

    clock_gettime(CLOCK_MONOTONIC, &mono);
    whole = (double)mono.tv_sec;
    fraction = (double)mono.tv_nsec / 1000000000.0;
    return whole + fraction;
}
int main(int argc, char **argv) {
int seconds = 5;
int size_mb = 64;
for (int i = 1; i < argc; i++) {
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
seconds = atoi(argv[++i]);
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
size_mb = atoi(argv[++i]);
} else {
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
return 2;
}
}
if (seconds <= 0) {
seconds = 5;
}
if (size_mb <= 0) {
size_mb = 64;
}
struct cuda_api api;
if (!load_cuda(&api)) {
fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
return 1;
}
load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName);
load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString);
if (!check_rc(&api, "cuInit", api.cuInit(0))) {
return 1;
}
int count = 0;
if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) {
return 1;
}
if (count <= 0) {
fprintf(stderr, "no CUDA devices found\n");
return 1;
}
CUdevice dev = 0;
if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) {
return 1;
}
char name[128] = {0};
if (!check_rc(&api, "cuDeviceGetName", api.cuDeviceGetName(name, (int)sizeof(name), dev))) {
return 1;
}
CUcontext ctx = NULL;
if (!check_rc(&api, "cuCtxCreate", api.cuCtxCreate(&ctx, 0, dev))) {
return 1;
}
size_t bytes = (size_t)size_mb * 1024 * 1024;
uint32_t words = (uint32_t)(bytes / sizeof(uint32_t));
if (words < 1024) {
words = 1024;
bytes = (size_t)words * sizeof(uint32_t);
}
uint32_t *host = (uint32_t *)malloc(bytes);
if (!host) {
fprintf(stderr, "malloc failed\n");
api.cuCtxDestroy(ctx);
return 1;
}
for (uint32_t i = 0; i < words; i++) {
host[i] = i ^ 0x12345678u;
}
CUdeviceptr device_mem = 0;
if (!check_rc(&api, "cuMemAlloc", api.cuMemAlloc(&device_mem, bytes))) {
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
if (!check_rc(&api, "cuMemcpyHtoD", api.cuMemcpyHtoD(device_mem, host, bytes))) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
CUmodule module = NULL;
if (!check_rc(&api, "cuModuleLoadDataEx", api.cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
CUfunction kernel = NULL;
if (!check_rc(&api, "cuModuleGetFunction", api.cuModuleGetFunction(&kernel, module, "burn"))) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
unsigned int threads = 256;
unsigned int blocks = (words + threads - 1) / threads;
uint32_t rounds = 256;
void *params[] = {&device_mem, &words, &rounds};
double start = now_seconds();
double deadline = start + (double)seconds;
unsigned long iterations = 0;
while (now_seconds() < deadline) {
if (!check_rc(&api, "cuLaunchKernel",
api.cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
iterations++;
}
if (!check_rc(&api, "cuCtxSynchronize", api.cuCtxSynchronize())) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
if (!check_rc(&api, "cuMemcpyDtoH", api.cuMemcpyDtoH(host, device_mem, bytes))) {
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 1;
}
uint64_t checksum = 0;
for (uint32_t i = 0; i < words; i += words / 256 ? words / 256 : 1) {
checksum += host[i];
}
double elapsed = now_seconds() - start;
printf("device=%s\n", name);
printf("duration_s=%.2f\n", elapsed);
printf("buffer_mb=%d\n", size_mb);
printf("iterations=%lu\n", iterations);
printf("checksum=%llu\n", (unsigned long long)checksum);
printf("status=OK\n");
api.cuMemFree(device_mem);
free(host);
api.cuCtxDestroy(ctx);
return 0;
}

View File

@@ -41,6 +41,7 @@ echo ""
# --- compile bee binary (static, Linux amd64) --- # --- compile bee binary (static, Linux amd64) ---
BEE_BIN="${DIST_DIR}/bee-linux-amd64" BEE_BIN="${DIST_DIR}/bee-linux-amd64"
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
NEED_BUILD=1 NEED_BUILD=1
if [ -f "$BEE_BIN" ]; then if [ -f "$BEE_BIN" ]; then
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1) NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
@@ -70,6 +71,22 @@ else
echo "=== bee binary up to date, skipping build ===" echo "=== bee binary up to date, skipping build ==="
fi fi
# --- compile bee-gpu-stress (C helper) ---
# Rebuild by default; skip only when the binary already exists and the C
# source is older than it.
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0
fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ==="
# -s strips the binary; -ldl is needed because the program uses dlopen()/dlsym().
gcc -O2 -s -Wall -Wextra \
-o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl
echo "binary: $GPU_STRESS_BIN"
else
echo "=== bee-gpu-stress up to date, skipping build ==="
fi
echo "=== preparing staged overlay ===" echo "=== preparing staged overlay ==="
rm -rf "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}" rm -rf "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}" mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
@@ -80,6 +97,7 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
# --- inject authorized_keys for SSH access --- # --- inject authorized_keys for SSH access ---
@@ -119,13 +137,15 @@ fi
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
# --- inject smoketest into overlay so it runs directly on the live CD --- # --- inject smoketest into overlay so it runs directly on the live CD ---
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
# --- vendor utilities (optional pre-fetched binaries) --- # --- vendor utilities (optional pre-fetched binaries) ---
for tool in storcli64 sas2ircu sas3ircu mstflint; do for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
if [ -f "${VENDOR_DIR}/${tool}" ]; then if [ -f "${VENDOR_DIR}/${tool}" ]; then
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true

View File

@@ -11,6 +11,7 @@ lshw
iproute2 iproute2
isc-dhcp-client isc-dhcp-client
iputils-ping iputils-ping
ethtool
qemu-guest-agent qemu-guest-agent
# SSH # SSH
@@ -27,6 +28,8 @@ mc
htop htop
sudo sudo
zstd zstd
mstflint
memtester
# QR codes (for displaying audit results) # QR codes (for displaying audit results)
qrencode qrencode