Add TUI hardware banner and polish SAT summaries
This commit is contained in:
2
PLAN.md
2
PLAN.md
@@ -347,6 +347,8 @@ Planned code shape:
|
||||
- `bee tui` can export the latest audit JSON to removable media
|
||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||
|
||||
### 2.6 — Vendor utilities and optional assets
|
||||
|
||||
@@ -13,11 +13,13 @@ import (
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
const (
|
||||
var (
|
||||
DefaultAuditJSONPath = "/var/log/bee-audit.json"
|
||||
DefaultAuditLogPath = "/var/log/bee-audit.log"
|
||||
DefaultSATBaseDir = "/var/log/bee-sat"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
@@ -354,7 +356,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
fmt.Fprintf(&body, "PSU: warn=%d fail=%d\n", summary.PSUWarn, summary.PSUFail)
|
||||
fmt.Fprintf(&body, "Memory: warn=%d fail=%d\n", summary.MemoryWarn, summary.MemoryFail)
|
||||
for _, item := range latestSATSummaries() {
|
||||
fmt.Fprintf(&body, "\n%s", item)
|
||||
fmt.Fprintf(&body, "\n\n%s", item)
|
||||
}
|
||||
if len(summary.Failures) > 0 {
|
||||
fmt.Fprintf(&body, "\n\nFailures:\n- %s", strings.Join(summary.Failures, "\n- "))
|
||||
@@ -365,6 +367,40 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
return ActionResult{Title: "Health summary", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) MainBanner() string {
|
||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var snapshot schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
lines = append(lines, system)
|
||||
}
|
||||
if cpu := formatCPULine(snapshot.Hardware.CPUs); cpu != "" {
|
||||
lines = append(lines, cpu)
|
||||
}
|
||||
if memory := formatMemoryLine(snapshot.Hardware.Memory); memory != "" {
|
||||
lines = append(lines, memory)
|
||||
}
|
||||
if storage := formatStorageLine(snapshot.Hardware.Storage); storage != "" {
|
||||
lines = append(lines, storage)
|
||||
}
|
||||
if gpu := formatGPULine(snapshot.Hardware.PCIeDevices); gpu != "" {
|
||||
lines = append(lines, gpu)
|
||||
}
|
||||
if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
|
||||
lines = append(lines, ip)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(strings.Join(lines, "\n"))
|
||||
}
|
||||
|
||||
func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string {
|
||||
var body strings.Builder
|
||||
for _, tool := range statuses {
|
||||
@@ -418,7 +454,6 @@ func bodyOr(body, fallback string) string {
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
baseDir := "/var/log/bee-sat"
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
@@ -429,7 +464,7 @@ func latestSATSummaries() []string {
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, item.prefix+"*/summary.txt"))
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
@@ -438,7 +473,273 @@ func latestSATSummaries() []string {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, item.label+":\n"+strings.TrimSpace(string(raw)))
|
||||
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func formatSATSummary(label, raw string) string {
|
||||
values := parseKeyValueSummary(raw)
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s:", label)
|
||||
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||
fmt.Fprintf(&body, " %s", overall)
|
||||
}
|
||||
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||
fmt.Fprintf(&body, " ok=%s", ok)
|
||||
}
|
||||
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||
fmt.Fprintf(&body, " failed=%s", failed)
|
||||
}
|
||||
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||
}
|
||||
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||
}
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func formatSystemLine(board schema.HardwareBoard) string {
|
||||
model := strings.TrimSpace(strings.Join([]string{
|
||||
trimPtr(board.Manufacturer),
|
||||
trimPtr(board.ProductName),
|
||||
}, " "))
|
||||
serial := strings.TrimSpace(board.SerialNumber)
|
||||
switch {
|
||||
case model != "" && serial != "":
|
||||
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||
case model != "":
|
||||
return "System: " + model
|
||||
case serial != "":
|
||||
return "System S/N: " + serial
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||
if len(cpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
modelCounts := map[string]int{}
|
||||
unknown := 0
|
||||
for _, cpu := range cpus {
|
||||
model := trimPtr(cpu.Model)
|
||||
if model == "" {
|
||||
unknown++
|
||||
continue
|
||||
}
|
||||
modelCounts[model]++
|
||||
}
|
||||
if len(modelCounts) == 1 && unknown == 0 {
|
||||
for model, count := range modelCounts {
|
||||
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||
}
|
||||
}
|
||||
parts := make([]string, 0, len(modelCounts)+1)
|
||||
if len(modelCounts) > 0 {
|
||||
keys := make([]string, 0, len(modelCounts))
|
||||
for key := range modelCounts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||
}
|
||||
}
|
||||
if unknown > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||
}
|
||||
return "CPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||
totalMB := 0
|
||||
present := 0
|
||||
types := map[string]struct{}{}
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||
continue
|
||||
}
|
||||
present++
|
||||
totalMB += *dimm.SizeMB
|
||||
if value := trimPtr(dimm.Type); value != "" {
|
||||
types[value] = struct{}{}
|
||||
}
|
||||
}
|
||||
if totalMB == 0 {
|
||||
return ""
|
||||
}
|
||||
typeText := joinSortedKeys(types)
|
||||
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||
if typeText != "" {
|
||||
line += " " + typeText
|
||||
}
|
||||
if present > 0 {
|
||||
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||
count := 0
|
||||
totalGB := 0
|
||||
for _, disk := range disks {
|
||||
if disk.Present != nil && !*disk.Present {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||
totalGB += *disk.SizeGB
|
||||
}
|
||||
}
|
||||
if count == 0 {
|
||||
return ""
|
||||
}
|
||||
line := fmt.Sprintf("Storage: %d drives", count)
|
||||
if totalGB > 0 {
|
||||
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||
gpus := map[string]int{}
|
||||
for _, dev := range devices {
|
||||
if !isGPUDevice(dev) {
|
||||
continue
|
||||
}
|
||||
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||
gpus[name]++
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(gpus))
|
||||
for key := range gpus {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||
}
|
||||
return "GPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
if list == nil {
|
||||
return ""
|
||||
}
|
||||
ifaces, err := list()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
var ips []string
|
||||
for _, iface := range ifaces {
|
||||
for _, ip := range iface.IPv4 {
|
||||
ip = strings.TrimSpace(ip)
|
||||
if ip == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[ip]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ip] = struct{}{}
|
||||
ips = append(ips, ip)
|
||||
}
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sort.Strings(ips)
|
||||
return "IP: " + strings.Join(ips, ", ")
|
||||
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
class := strings.ToLower(trimPtr(dev.DeviceClass))
|
||||
model := strings.ToLower(trimPtr(dev.Model))
|
||||
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
||||
return strings.Contains(class, "vga") ||
|
||||
strings.Contains(class, "3d") ||
|
||||
strings.Contains(class, "display") ||
|
||||
strings.Contains(model, "nvidia") ||
|
||||
strings.Contains(vendor, "nvidia") ||
|
||||
strings.Contains(vendor, "amd")
|
||||
}
|
||||
|
||||
// trimPtr dereferences value and strips surrounding whitespace; a nil pointer
// yields "".
func trimPtr(value *string) string {
	var text string
	if value != nil {
		text = *value
	}
	return strings.TrimSpace(text)
}
|
||||
|
||||
// joinSortedKeys returns the keys of values in ascending order, joined by "/".
// An empty or nil set yields "".
func joinSortedKeys(values map[string]struct{}) string {
	ordered := make([]string, 0, len(values))
	for key := range values {
		ordered = append(ordered, key)
	}
	sort.Strings(ordered)
	// Joining zero elements produces "", which covers the empty-set case.
	return strings.Join(ordered, "/")
}
|
||||
|
||||
// humanizeMB renders a size given in MB using 1024-based units: "<x.y> TB"
// once the size reaches 1024 GB, "<n> GB" for whole gigabytes, and "<x.y> GB"
// otherwise. Non-positive sizes yield "".
func humanizeMB(totalMB int) string {
	const mbPerGB = 1024
	const mbPerTB = 1024 * 1024

	switch {
	case totalMB <= 0:
		return ""
	case totalMB >= mbPerTB:
		return fmt.Sprintf("%.1f TB", float64(totalMB)/1024.0/1024.0)
	case totalMB%mbPerGB == 0:
		// Whole number of gigabytes: print without a fractional part.
		return fmt.Sprintf("%d GB", totalMB/mbPerGB)
	default:
		return fmt.Sprintf("%.1f GB", float64(totalMB)/1024.0)
	}
}
|
||||
|
||||
// humanizeGB renders a size given in GB: "<x.y> TB" (1024-based) from 1024 GB
// upwards, "<n> GB" below that, and "" for non-positive sizes.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
||||
|
||||
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
// Each line is split at its first "="; key and value are whitespace-trimmed.
// Lines without "=" (including blank lines) are ignored, and a later
// occurrence of a key overwrites an earlier one.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		pairs[strings.TrimSpace(entry[:sep])] = strings.TrimSpace(entry[sep+1:])
	}
	return pairs
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is not blank, with surrounding
// whitespace removed. It returns "" when every argument is blank or when no
// arguments are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); len(trimmed) > 0 {
			return trimmed
		}
	}
	return ""
}
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
type fakeNetwork struct {
|
||||
@@ -356,6 +360,124 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatSATSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := formatSATSummary("Memory SAT", "overall_status=PARTIAL\njob_ok=2\njob_failed=0\njob_unsupported=1\ndevices=3\n")
|
||||
want := "Memory SAT: PARTIAL ok=2 failed=0 unsupported=1\nDevices: 3"
|
||||
if got != want {
|
||||
t.Fatalf("got %q want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath })
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
satDir := filepath.Join(DefaultSATBaseDir, "memory-testcase")
|
||||
if err := os.MkdirAll(satDir, 0755); err != nil {
|
||||
t.Fatalf("mkdir sat dir: %v", err)
|
||||
}
|
||||
|
||||
raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}`
|
||||
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
|
||||
t.Fatalf("write audit json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(satDir, "summary.txt"), []byte("overall_status=OK\njob_ok=3\njob_failed=0\njob_unsupported=0\n"), 0644); err != nil {
|
||||
t.Fatalf("write sat summary: %v", err)
|
||||
}
|
||||
|
||||
result := (&App{}).HealthSummaryResult()
|
||||
if !contains(result.Body, "Memory SAT: OK ok=3 failed=0") {
|
||||
t.Fatalf("body missing compact sat summary:\n%s", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
|
||||
t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath })
|
||||
|
||||
trueValue := true
|
||||
manufacturer := "Dell"
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
gpuClass := "VGA compatible controller"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
payload := schema.HardwareIngestRequest{
|
||||
Hardware: schema.HardwareSnapshot{
|
||||
Board: schema.HardwareBoard{
|
||||
Manufacturer: &manufacturer,
|
||||
ProductName: &product,
|
||||
SerialNumber: "SRV123",
|
||||
},
|
||||
CPUs: []schema.HardwareCPU{
|
||||
{Model: &cpuModel},
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
raw, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(DefaultAuditJSONPath, raw, 0644); err != nil {
|
||||
t.Fatalf("write audit json: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
network: fakeNetwork{
|
||||
listInterfacesFn: func() ([]platform.InterfaceInfo, error) {
|
||||
return []platform.InterfaceInfo{
|
||||
{Name: "eth0", IPv4: []string{"10.0.0.10"}},
|
||||
{Name: "eth1", IPv4: []string{"192.168.1.10"}},
|
||||
}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := a.MainBanner()
|
||||
for _, want := range []string{
|
||||
"System: Dell PowerEdge R760 | S/N SRV123",
|
||||
"CPU: 2 x Intel Xeon Gold 6430",
|
||||
"Memory: 1.0 TB DDR5 (2 DIMMs)",
|
||||
"Storage: 2 drives / 7.5 TB",
|
||||
"GPU: 2 x NVIDIA H100",
|
||||
"IP: 10.0.0.10, 192.168.1.10",
|
||||
} {
|
||||
if !contains(got, want) {
|
||||
t.Fatalf("banner missing %q:\n%s", want, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// intPtr returns a pointer to a fresh int holding v.
func intPtr(v int) *int {
	p := new(int)
	*p = v
	return p
}
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
return len(needle) == 0 || (len(haystack) >= len(needle) && (haystack == needle || containsAt(haystack, needle)))
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@@ -18,9 +19,11 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", "128M", "1"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
})
|
||||
}
|
||||
@@ -42,9 +45,11 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
sort.Strings(devices)
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
if len(devices) == 0 {
|
||||
fmt.Fprintln(&summary, "devices=0")
|
||||
stats.Unsupported++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "devices=%d\n", len(devices))
|
||||
}
|
||||
@@ -58,14 +63,15 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
fmt.Fprintf(&summary, "%s_%s_rc=%d\n", filepath.Base(devPath), strings.ReplaceAll(job.name, "-", "_"), rc)
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
}
|
||||
}
|
||||
|
||||
writeSATStats(&summary, stats)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -81,13 +87,21 @@ type satJob struct {
|
||||
cmd []string
|
||||
}
|
||||
|
||||
// satStats tallies SAT job outcomes by status.
type satStats struct {
	// OK, Failed and Unsupported count the jobs recorded with each outcome.
	OK, Failed, Unsupported int
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", "5", "--size-mb", "64"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,6 +116,7 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
}
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
@@ -112,12 +127,13 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log"), rc)
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -159,6 +175,69 @@ func storageSATCommands(devPath string) []satJob {
|
||||
}
|
||||
}
|
||||
|
||||
func (s *satStats) Add(status string) {
|
||||
switch status {
|
||||
case "OK":
|
||||
s.OK++
|
||||
case "UNSUPPORTED":
|
||||
s.Unsupported++
|
||||
default:
|
||||
s.Failed++
|
||||
}
|
||||
}
|
||||
|
||||
func (s satStats) Overall() string {
|
||||
if s.Failed > 0 {
|
||||
return "FAILED"
|
||||
}
|
||||
if s.Unsupported > 0 {
|
||||
return "PARTIAL"
|
||||
}
|
||||
return "OK"
|
||||
}
|
||||
|
||||
func writeSATStats(summary *strings.Builder, stats satStats) {
|
||||
fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
|
||||
fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
|
||||
fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
|
||||
fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
|
||||
}
|
||||
|
||||
// classifySATResult maps a job's outcome to a status and a return code.
//
// A nil err yields ("OK", 0). Otherwise the return code is 1 and the status is
// "UNSUPPORTED" when the lower-cased output contains one of the known
// "not supported" phrases, or when the job name contains "self-test" and the
// output contains "aborted"; any other failure is "FAILED".
func classifySATResult(name string, out []byte, err error) (string, int) {
	if err == nil {
		return "OK", 0
	}

	lowered := strings.ToLower(string(out))
	unsupportedMarkers := []string{
		"unsupported",
		"not supported",
		"invalid opcode",
		"unknown command",
		"not implemented",
		"not available",
		"no such device",
	}
	for _, marker := range unsupportedMarkers {
		if strings.Contains(lowered, marker) {
			return "UNSUPPORTED", 1
		}
	}
	if strings.Contains(name, "self-test") && strings.Contains(lowered, "aborted") {
		return "UNSUPPORTED", 1
	}
	return "FAILED", 1
}
|
||||
|
||||
// envInt reads the environment variable name as a positive integer. It
// returns fallback when the variable is unset or blank, is not a valid
// integer, or is zero or negative.
func envInt(name string, fallback int) int {
	// strconv.Atoi rejects the empty string, so unset and blank values take
	// the same fallback path as malformed ones.
	parsed, err := strconv.Atoi(strings.TrimSpace(os.Getenv(name)))
	if err != nil || parsed <= 0 {
		return fallback
	}
	return parsed
}
|
||||
|
||||
func createTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -28,3 +32,58 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
t.Fatalf("got %d want 123", got)
|
||||
}
|
||||
t.Setenv("BEE_MEMTESTER_SIZE_MB", "bad")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
t.Fatalf("got %d want 123", got)
|
||||
}
|
||||
t.Setenv("BEE_MEMTESTER_SIZE_MB", "256")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 256 {
|
||||
t.Fatalf("got %d want 256", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifySATResult(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
job string
|
||||
out string
|
||||
err error
|
||||
status string
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, _ := classifySATResult(tt.job, []byte(tt.out), tt.err)
|
||||
if got != tt.status {
|
||||
t.Fatalf("status=%q want %q", got, tt.status)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,3 +23,7 @@ type exportTargetsMsg struct {
|
||||
targets []platform.RemovableTarget
|
||||
err error
|
||||
}
|
||||
|
||||
// bannerMsg carries banner text to the model as a message.
type bannerMsg struct {
	text string // banner text
}
|
||||
|
||||
@@ -179,6 +179,24 @@ func TestMainMenuAsyncActionsSetBusy(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainViewIncludesBanner(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.banner = "System: Test Server | S/N ABC123\nIP: 10.0.0.10"
|
||||
|
||||
view := m.View()
|
||||
if !strings.Contains(view, "System: Test Server | S/N ABC123") {
|
||||
t.Fatalf("view missing system banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "IP: 10.0.0.10") {
|
||||
t.Fatalf("view missing ip banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "Select action") {
|
||||
t.Fatalf("view missing menu subtitle:\n%s", view)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapeNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
@@ -46,6 +47,7 @@ type model struct {
|
||||
busyTitle string
|
||||
title string
|
||||
body string
|
||||
banner string
|
||||
mainMenu []string
|
||||
networkMenu []string
|
||||
serviceMenu []string
|
||||
@@ -111,5 +113,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
}
|
||||
|
||||
func (m model) Init() tea.Cmd {
|
||||
return nil
|
||||
return func() tea.Msg {
|
||||
return bannerMsg{text: strings.TrimSpace(m.app.MainBanner())}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,6 +84,9 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
m.screen = screenExportTargets
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case bannerMsg:
|
||||
m.banner = strings.TrimSpace(msg.text)
|
||||
return m, nil
|
||||
}
|
||||
|
||||
return m, nil
|
||||
|
||||
@@ -19,7 +19,7 @@ func (m model) View() string {
|
||||
}
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return renderMenu("bee", "Select action", m.mainMenu, m.cursor)
|
||||
return renderMainMenu("bee", m.banner, "Select action", m.mainMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
||||
case screenServices:
|
||||
@@ -109,6 +109,30 @@ func renderMenu(title, subtitle string, items []string, cursor int) string {
|
||||
return body.String()
|
||||
}
|
||||
|
||||
// renderMainMenu renders the main screen as text: the title, the trimmed
// banner (only when banner is non-empty), the subtitle, one line per menu item
// with a "> " marker on the item at cursor, and a key-binding footer. An empty
// item list is rendered as "(no items)".
func renderMainMenu(title, banner, subtitle string, items []string, cursor int) string {
	var out strings.Builder
	fmt.Fprintf(&out, "%s\n\n", title)
	if len(banner) > 0 {
		out.WriteString(strings.TrimSpace(banner) + "\n\n")
	}
	out.WriteString(subtitle + "\n\n")

	for idx, entry := range items {
		marker := " "
		if idx == cursor {
			marker = "> "
		}
		out.WriteString(marker + entry + "\n")
	}
	if len(items) == 0 {
		out.WriteString("(no items)\n")
	}

	out.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
	return out.String()
}
|
||||
|
||||
func renderForm(title string, fields []formField, idx int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
|
||||
@@ -132,3 +132,9 @@ Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
- `BEE_MEMTESTER_SIZE_MB`
|
||||
- `BEE_MEMTESTER_PASSES`
|
||||
|
||||
@@ -1,22 +1,20 @@
|
||||
# Backlog
|
||||
|
||||
## GPU stress test (H100)
|
||||
## Real hardware validation
|
||||
|
||||
**Статус:** отложено. В текущем ISO `gpu_burn` не включается и не запускается.
|
||||
**Статус:** ожидает доступа к железу.
|
||||
|
||||
**Почему задача всё ещё в backlog:**
|
||||
- `gpu_burn` остаётся тяжёлым и неудобным с точки зрения зависимостей
|
||||
- хочется штатный lightweight stress tool без `libcublas.so` и без заметного раздувания ISO
|
||||
- для H100 нужен предсказуемый offline-инструмент, который можно стабильно возить внутри ISO
|
||||
Что осталось подтвердить на практике:
|
||||
- `bee sat nvidia` на реальном NVIDIA GPU host
|
||||
- `bee sat storage` на NVMe/SATA/RAID host
|
||||
- `ipmitool sdr` parsing на сервере с реальным BMC/IPMI
|
||||
- vendor RAID tooling (`storcli64`, `sas2ircu`, `sas3ircu`, `arcconf`, `ssacli`) в живом ISO
|
||||
|
||||
**Желаемый следующий шаг:** написать минимальный stress tool на CUDA Driver API
|
||||
- использует только `libcuda.so`, уже присутствующий в ISO
|
||||
- выполняет простой compute / memory workload через `cuLaunchKernel`
|
||||
- собирается отдельно на builder VM и кладётся в `iso/vendor/`
|
||||
- в будущем может вызываться из `bee tui` как предпочтительный встроенный GPU SAT/stress path
|
||||
## SAT result polish
|
||||
|
||||
**Отклонённые / проблемные варианты:**
|
||||
- `gpu_burn` — нужен libcublas (~500MB)
|
||||
- `nvbandwidth` — только bandwidth, не жжёт FLOPs; нужен libcudart (~8MB)
|
||||
- DCGM diag — правильный инструмент для H100 но ~100MB установка
|
||||
- Download on demand — нужен libcublas, проблема та же
|
||||
**Статус:** частично закрыто.
|
||||
|
||||
Что ещё можно улучшить после полевой проверки:
|
||||
- точнее классифицировать vendor-specific self-test outputs в `storage SAT`
|
||||
- подобрать дефолты `memtester` по объёму RAM на целевых машинах
|
||||
- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке
|
||||
|
||||
Reference in New Issue
Block a user