diff --git a/PLAN.md b/PLAN.md index 9fcdb77..2cf359e 100644 --- a/PLAN.md +++ b/PLAN.md @@ -347,6 +347,8 @@ Planned code shape: - `bee tui` can export the latest audit JSON to removable media - `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests - NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress` +- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED` +- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*` - removable export requires explicit target selection, mount, confirmation, copy, and cleanup ### 2.6 — Vendor utilities and optional assets diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index bf93506..d7091ae 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -13,11 +13,13 @@ import ( "bee/audit/internal/collector" "bee/audit/internal/platform" "bee/audit/internal/runtimeenv" + "bee/audit/internal/schema" ) -const ( +var ( DefaultAuditJSONPath = "/var/log/bee-audit.json" DefaultAuditLogPath = "/var/log/bee-audit.log" + DefaultSATBaseDir = "/var/log/bee-sat" ) type App struct { @@ -354,7 +356,7 @@ func (a *App) HealthSummaryResult() ActionResult { fmt.Fprintf(&body, "PSU: warn=%d fail=%d\n", summary.PSUWarn, summary.PSUFail) fmt.Fprintf(&body, "Memory: warn=%d fail=%d\n", summary.MemoryWarn, summary.MemoryFail) for _, item := range latestSATSummaries() { - fmt.Fprintf(&body, "\n%s", item) + fmt.Fprintf(&body, "\n\n%s", item) } if len(summary.Failures) > 0 { fmt.Fprintf(&body, "\n\nFailures:\n- %s", strings.Join(summary.Failures, "\n- ")) @@ -365,6 +367,40 @@ func (a *App) HealthSummaryResult() ActionResult { return ActionResult{Title: "Health summary", Body: strings.TrimSpace(body.String())} } +func (a *App) MainBanner() string { + raw, err := os.ReadFile(DefaultAuditJSONPath) + if err != nil { + return "" + } + + var snapshot schema.HardwareIngestRequest + if err := json.Unmarshal(raw, 
&snapshot); err != nil { + return "" + } + + var lines []string + if system := formatSystemLine(snapshot.Hardware.Board); system != "" { + lines = append(lines, system) + } + if cpu := formatCPULine(snapshot.Hardware.CPUs); cpu != "" { + lines = append(lines, cpu) + } + if memory := formatMemoryLine(snapshot.Hardware.Memory); memory != "" { + lines = append(lines, memory) + } + if storage := formatStorageLine(snapshot.Hardware.Storage); storage != "" { + lines = append(lines, storage) + } + if gpu := formatGPULine(snapshot.Hardware.PCIeDevices); gpu != "" { + lines = append(lines, gpu) + } + if ip := formatIPLine(a.network.ListInterfaces); ip != "" { + lines = append(lines, ip) + } + + return strings.TrimSpace(strings.Join(lines, "\n")) +} + func (a *App) FormatToolStatuses(statuses []platform.ToolStatus) string { var body strings.Builder for _, tool := range statuses { @@ -418,7 +454,6 @@ func bodyOr(body, fallback string) string { } func latestSATSummaries() []string { - baseDir := "/var/log/bee-sat" patterns := []struct { label string prefix string @@ -429,7 +464,7 @@ func latestSATSummaries() []string { } var out []string for _, item := range patterns { - matches, err := filepath.Glob(filepath.Join(baseDir, item.prefix+"*/summary.txt")) + matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt")) if err != nil || len(matches) == 0 { continue } @@ -438,7 +473,273 @@ func latestSATSummaries() []string { if err != nil { continue } - out = append(out, item.label+":\n"+strings.TrimSpace(string(raw))) + out = append(out, formatSATSummary(item.label, string(raw))) } return out } + +func formatSATSummary(label, raw string) string { + values := parseKeyValueSummary(raw) + var body strings.Builder + fmt.Fprintf(&body, "%s:", label) + if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" { + fmt.Fprintf(&body, " %s", overall) + } + if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" { + fmt.Fprintf(&body, " 
ok=%s", ok) + } + if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" { + fmt.Fprintf(&body, " failed=%s", failed) + } + if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" { + fmt.Fprintf(&body, " unsupported=%s", unsupported) + } + if devices := strings.TrimSpace(values["devices"]); devices != "" { + fmt.Fprintf(&body, "\nDevices: %s", devices) + } + return body.String() +} + +func formatSystemLine(board schema.HardwareBoard) string { + model := strings.TrimSpace(strings.Join([]string{ + trimPtr(board.Manufacturer), + trimPtr(board.ProductName), + }, " ")) + serial := strings.TrimSpace(board.SerialNumber) + switch { + case model != "" && serial != "": + return fmt.Sprintf("System: %s | S/N %s", model, serial) + case model != "": + return "System: " + model + case serial != "": + return "System S/N: " + serial + default: + return "" + } +} + +func formatCPULine(cpus []schema.HardwareCPU) string { + if len(cpus) == 0 { + return "" + } + modelCounts := map[string]int{} + unknown := 0 + for _, cpu := range cpus { + model := trimPtr(cpu.Model) + if model == "" { + unknown++ + continue + } + modelCounts[model]++ + } + if len(modelCounts) == 1 && unknown == 0 { + for model, count := range modelCounts { + return fmt.Sprintf("CPU: %d x %s", count, model) + } + } + parts := make([]string, 0, len(modelCounts)+1) + if len(modelCounts) > 0 { + keys := make([]string, 0, len(modelCounts)) + for key := range modelCounts { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key)) + } + } + if unknown > 0 { + parts = append(parts, fmt.Sprintf("%d x unknown", unknown)) + } + return "CPU: " + strings.Join(parts, ", ") +} + +func formatMemoryLine(dimms []schema.HardwareMemory) string { + totalMB := 0 + present := 0 + types := map[string]struct{}{} + for _, dimm := range dimms { + if dimm.Present != nil && !*dimm.Present { + 
continue + } + if dimm.SizeMB == nil || *dimm.SizeMB <= 0 { + continue + } + present++ + totalMB += *dimm.SizeMB + if value := trimPtr(dimm.Type); value != "" { + types[value] = struct{}{} + } + } + if totalMB == 0 { + return "" + } + typeText := joinSortedKeys(types) + line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB)) + if typeText != "" { + line += " " + typeText + } + if present > 0 { + line += fmt.Sprintf(" (%d DIMMs)", present) + } + return line +} + +func formatStorageLine(disks []schema.HardwareStorage) string { + count := 0 + totalGB := 0 + for _, disk := range disks { + if disk.Present != nil && !*disk.Present { + continue + } + count++ + if disk.SizeGB != nil && *disk.SizeGB > 0 { + totalGB += *disk.SizeGB + } + } + if count == 0 { + return "" + } + line := fmt.Sprintf("Storage: %d drives", count) + if totalGB > 0 { + line += fmt.Sprintf(" / %s", humanizeGB(totalGB)) + } + return line +} + +func formatGPULine(devices []schema.HardwarePCIeDevice) string { + gpus := map[string]int{} + for _, dev := range devices { + if !isGPUDevice(dev) { + continue + } + name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown") + gpus[name]++ + } + if len(gpus) == 0 { + return "" + } + keys := make([]string, 0, len(gpus)) + for key := range gpus { + keys = append(keys, key) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key)) + } + return "GPU: " + strings.Join(parts, ", ") +} + +func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string { + if list == nil { + return "" + } + ifaces, err := list() + if err != nil { + return "" + } + seen := map[string]struct{}{} + var ips []string + for _, iface := range ifaces { + for _, ip := range iface.IPv4 { + ip = strings.TrimSpace(ip) + if ip == "" { + continue + } + if _, ok := seen[ip]; ok { + continue + } + seen[ip] = struct{}{} + ips = append(ips, ip) + } + } + if len(ips) == 0 { + 
return "" + } + sort.Strings(ips) + return "IP: " + strings.Join(ips, ", ") +} + +func isGPUDevice(dev schema.HardwarePCIeDevice) bool { + class := strings.ToLower(trimPtr(dev.DeviceClass)) + model := strings.ToLower(trimPtr(dev.Model)) + vendor := strings.ToLower(trimPtr(dev.Manufacturer)) + return strings.Contains(class, "vga") || + strings.Contains(class, "3d") || + strings.Contains(class, "display") || + strings.Contains(model, "nvidia") || + strings.Contains(vendor, "nvidia") || + strings.Contains(vendor, "amd") +} + +func trimPtr(value *string) string { + if value == nil { + return "" + } + return strings.TrimSpace(*value) +} + +func joinSortedKeys(values map[string]struct{}) string { + if len(values) == 0 { + return "" + } + keys := make([]string, 0, len(values)) + for key := range values { + keys = append(keys, key) + } + sort.Strings(keys) + return strings.Join(keys, "/") +} + +func humanizeMB(totalMB int) string { + if totalMB <= 0 { + return "" + } + gb := float64(totalMB) / 1024.0 + if gb >= 1024.0 { + tb := gb / 1024.0 + return fmt.Sprintf("%.1f TB", tb) + } + if gb == float64(int64(gb)) { + return fmt.Sprintf("%.0f GB", gb) + } + return fmt.Sprintf("%.1f GB", gb) +} + +func humanizeGB(totalGB int) string { + if totalGB <= 0 { + return "" + } + tb := float64(totalGB) / 1024.0 + if tb >= 1.0 { + return fmt.Sprintf("%.1f TB", tb) + } + return fmt.Sprintf("%d GB", totalGB) +} + +func parseKeyValueSummary(raw string) map[string]string { + out := map[string]string{} + for _, line := range strings.Split(raw, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + key, value, ok := strings.Cut(line, "=") + if !ok { + continue + } + out[strings.TrimSpace(key)] = strings.TrimSpace(value) + } + return out +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + value = strings.TrimSpace(value) + if value != "" { + return value + } + } + return "" +} diff --git a/audit/internal/app/app_test.go 
b/audit/internal/app/app_test.go index 7de164f..2e6c87f 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -1,10 +1,14 @@ package app import ( + "encoding/json" "errors" + "os" + "path/filepath" "testing" "bee/audit/internal/platform" + "bee/audit/internal/schema" ) type fakeNetwork struct { @@ -76,8 +80,8 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus { } type fakeSAT struct { - runNvidiaFn func(string) (string, error) - runMemoryFn func(string) (string, error) + runNvidiaFn func(string) (string, error) + runMemoryFn func(string) (string, error) runStorageFn func(string) (string, error) } @@ -293,8 +297,8 @@ func TestActionResultsUseFallbackBody(t *testing.T) { checkToolsFn: func([]string) []platform.ToolStatus { return nil }, }, sat: fakeSAT{ - runNvidiaFn: func(string) (string, error) { return "", nil }, - runMemoryFn: func(string) (string, error) { return "", nil }, + runNvidiaFn: func(string) (string, error) { return "", nil }, + runMemoryFn: func(string) (string, error) { return "", nil }, runStorageFn: func(string) (string, error) { return "", nil }, }, } @@ -342,7 +346,7 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) { } return "/tmp/sat/out.tar.gz", nil }, - runMemoryFn: func(string) (string, error) { return "", nil }, + runMemoryFn: func(string) (string, error) { return "", nil }, runStorageFn: func(string) (string, error) { return "", nil }, }, } @@ -356,6 +360,124 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) { } } +func TestFormatSATSummary(t *testing.T) { + t.Parallel() + + got := formatSATSummary("Memory SAT", "overall_status=PARTIAL\njob_ok=2\njob_failed=0\njob_unsupported=1\ndevices=3\n") + want := "Memory SAT: PARTIAL ok=2 failed=0 unsupported=1\nDevices: 3" + if got != want { + t.Fatalf("got %q want %q", got, want) + } +} + +func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) { + t.Parallel() + + tmp := t.TempDir() + oldAuditPath := DefaultAuditJSONPath + 
oldSATBaseDir := DefaultSATBaseDir + DefaultAuditJSONPath = filepath.Join(tmp, "audit.json") + DefaultSATBaseDir = filepath.Join(tmp, "sat") + t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath }) + t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir }) + + satDir := filepath.Join(DefaultSATBaseDir, "memory-testcase") + if err := os.MkdirAll(satDir, 0755); err != nil { + t.Fatalf("mkdir sat dir: %v", err) + } + + raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}` + if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil { + t.Fatalf("write audit json: %v", err) + } + if err := os.WriteFile(filepath.Join(satDir, "summary.txt"), []byte("overall_status=OK\njob_ok=3\njob_failed=0\njob_unsupported=0\n"), 0644); err != nil { + t.Fatalf("write sat summary: %v", err) + } + + result := (&App{}).HealthSummaryResult() + if !contains(result.Body, "Memory SAT: OK ok=3 failed=0") { + t.Fatalf("body missing compact sat summary:\n%s", result.Body) + } +} + +func TestMainBanner(t *testing.T) { + t.Parallel() + + tmp := t.TempDir() + oldAuditPath := DefaultAuditJSONPath + DefaultAuditJSONPath = filepath.Join(tmp, "audit.json") + t.Cleanup(func() { DefaultAuditJSONPath = oldAuditPath }) + + trueValue := true + manufacturer := "Dell" + product := "PowerEdge R760" + cpuModel := "Intel Xeon Gold 6430" + memoryType := "DDR5" + gpuClass := "VGA compatible controller" + gpuModel := "NVIDIA H100" + + payload := schema.HardwareIngestRequest{ + Hardware: schema.HardwareSnapshot{ + Board: schema.HardwareBoard{ + Manufacturer: &manufacturer, + ProductName: &product, + SerialNumber: "SRV123", + }, + CPUs: []schema.HardwareCPU{ + {Model: &cpuModel}, + {Model: &cpuModel}, + }, + Memory: []schema.HardwareMemory{ + {Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType}, + {Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType}, + }, + 
Storage: []schema.HardwareStorage{ + {Present: &trueValue, SizeGB: intPtr(3840)}, + {Present: &trueValue, SizeGB: intPtr(3840)}, + }, + PCIeDevices: []schema.HardwarePCIeDevice{ + {DeviceClass: &gpuClass, Model: &gpuModel}, + {DeviceClass: &gpuClass, Model: &gpuModel}, + }, + }, + } + + raw, err := json.Marshal(payload) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if err := os.WriteFile(DefaultAuditJSONPath, raw, 0644); err != nil { + t.Fatalf("write audit json: %v", err) + } + + a := &App{ + network: fakeNetwork{ + listInterfacesFn: func() ([]platform.InterfaceInfo, error) { + return []platform.InterfaceInfo{ + {Name: "eth0", IPv4: []string{"10.0.0.10"}}, + {Name: "eth1", IPv4: []string{"192.168.1.10"}}, + }, nil + }, + }, + } + + got := a.MainBanner() + for _, want := range []string{ + "System: Dell PowerEdge R760 | S/N SRV123", + "CPU: 2 x Intel Xeon Gold 6430", + "Memory: 1.0 TB DDR5 (2 DIMMs)", + "Storage: 2 drives / 7.5 TB", + "GPU: 2 x NVIDIA H100", + "IP: 10.0.0.10, 192.168.1.10", + } { + if !contains(got, want) { + t.Fatalf("banner missing %q:\n%s", want, got) + } + } +} + +func intPtr(v int) *int { return &v } + func contains(haystack, needle string) bool { return len(needle) == 0 || (len(haystack) >= len(needle) && (haystack == needle || containsAt(haystack, needle))) } diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 0deba9f..b402332 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -9,6 +9,7 @@ import ( "os/exec" "path/filepath" "sort" + "strconv" "strings" "time" ) @@ -18,9 +19,11 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { } func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) { + sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) + passes := envInt("BEE_MEMTESTER_PASSES", 1) return runAcceptancePack(baseDir, "memory", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, - {name: "02-memtester.log", cmd: 
// satStats tallies the per-job outcomes of one acceptance-test run.
type satStats struct {
	OK          int
	Failed      int
	Unsupported int
}

// Add records one job outcome. "OK" and "UNSUPPORTED" have dedicated
// counters; every other status is counted as a failure.
func (s *satStats) Add(status string) {
	switch status {
	case "OK":
		s.OK++
	case "UNSUPPORTED":
		s.Unsupported++
	default:
		s.Failed++
	}
}

// Overall reduces the tally to a single verdict: "FAILED" if any job failed,
// otherwise "PARTIAL" if any job was unsupported, otherwise "OK".
func (s satStats) Overall() string {
	switch {
	case s.Failed > 0:
		return "FAILED"
	case s.Unsupported > 0:
		return "PARTIAL"
	default:
		return "OK"
	}
}

// writeSATStats appends the aggregate key=value lines (overall_status,
// job_ok, job_failed, job_unsupported) to a summary under construction.
func writeSATStats(summary *strings.Builder, stats satStats) {
	fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
	fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
	fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
	fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
}

// classifySATResult maps the result of one SAT job to a status string and a
// return code.
//
// Status is "OK" when err is nil. Otherwise it is "UNSUPPORTED" when the
// lower-cased output contains one of the known "feature not available"
// phrases, or when a job whose name contains "self-test" reports "aborted".
// In all remaining cases it is "FAILED".
//
// The return code is 0 on success. On failure it is the process exit code
// when err is an *exec.ExitError carrying a positive code, so tool-specific
// exit statuses are kept in the summary; in every other case it is 1.
func classifySATResult(name string, out []byte, err error) (string, int) {
	if err == nil {
		return "OK", 0
	}

	rc := 1
	// A plain type assertion is used so that no extra import is needed; a
	// wrapped error simply keeps the generic code 1.
	if exitErr, ok := err.(*exec.ExitError); ok {
		// ExitCode reports -1 for a process that did not exit normally;
		// keep the generic code so rc stays positive on failure.
		if code := exitErr.ExitCode(); code > 0 {
			rc = code
		}
	}

	text := strings.ToLower(string(out))
	for _, marker := range []string{
		"unsupported",
		"not supported",
		"invalid opcode",
		"unknown command",
		"not implemented",
		"not available",
		"no such device",
	} {
		if strings.Contains(text, marker) {
			return "UNSUPPORTED", rc
		}
	}
	if strings.Contains(name, "self-test") && strings.Contains(text, "aborted") {
		return "UNSUPPORTED", rc
	}
	return "FAILED", rc
}

// envInt reads a positive integer from the environment variable name.
// It returns fallback when the variable is unset, blank, not an integer, or
// not greater than zero.
func envInt(name string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	value, err := strconv.Atoi(raw)
	if err != nil || value <= 0 {
		return fallback
	}
	return value
}
+func TestEnvIntFallback(t *testing.T) { + os.Unsetenv("BEE_MEMTESTER_SIZE_MB") + if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 { + t.Fatalf("got %d want 123", got) + } + t.Setenv("BEE_MEMTESTER_SIZE_MB", "bad") + if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 { + t.Fatalf("got %d want 123", got) + } + t.Setenv("BEE_MEMTESTER_SIZE_MB", "256") + if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 256 { + t.Fatalf("got %d want 256", got) + } +} + +func TestClassifySATResult(t *testing.T) { + tests := []struct { + name string + job string + out string + err error + status string + }{ + {name: "ok", job: "memtester", out: "done", err: nil, status: "OK"}, + {name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"}, + {name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, _ := classifySATResult(tt.job, []byte(tt.out), tt.err) + if got != tt.status { + t.Fatalf("status=%q want %q", got, tt.status) + } + }) + } +} diff --git a/audit/internal/tui/messages.go b/audit/internal/tui/messages.go index 6cc3c48..34e6460 100644 --- a/audit/internal/tui/messages.go +++ b/audit/internal/tui/messages.go @@ -23,3 +23,7 @@ type exportTargetsMsg struct { targets []platform.RemovableTarget err error } + +type bannerMsg struct { + text string +} diff --git a/audit/internal/tui/tui_test.go b/audit/internal/tui/tui_test.go index 4d5506a..d400734 100644 --- a/audit/internal/tui/tui_test.go +++ b/audit/internal/tui/tui_test.go @@ -179,6 +179,24 @@ func TestMainMenuAsyncActionsSetBusy(t *testing.T) { } } +func TestMainViewIncludesBanner(t *testing.T) { + t.Parallel() + + m := newTestModel() + m.banner = "System: Test Server | S/N ABC123\nIP: 10.0.0.10" + + view := m.View() + if !strings.Contains(view, "System: Test Server | S/N ABC123") { + t.Fatalf("view missing 
system banner:\n%s", view) + } + if !strings.Contains(view, "IP: 10.0.0.10") { + t.Fatalf("view missing ip banner:\n%s", view) + } + if !strings.Contains(view, "Select action") { + t.Fatalf("view missing menu subtitle:\n%s", view) + } +} + func TestEscapeNavigation(t *testing.T) { t.Parallel() diff --git a/audit/internal/tui/types.go b/audit/internal/tui/types.go index d88339e..07d2301 100644 --- a/audit/internal/tui/types.go +++ b/audit/internal/tui/types.go @@ -4,6 +4,7 @@ import ( "bee/audit/internal/app" "bee/audit/internal/platform" "bee/audit/internal/runtimeenv" + "strings" tea "github.com/charmbracelet/bubbletea" ) @@ -26,12 +27,12 @@ const ( type actionKind string const ( - actionNone actionKind = "" - actionDHCPOne actionKind = "dhcp_one" - actionStaticIPv4 actionKind = "static_ipv4" - actionExportAudit actionKind = "export_audit" - actionRunNvidiaSAT actionKind = "run_nvidia_sat" - actionRunMemorySAT actionKind = "run_memory_sat" + actionNone actionKind = "" + actionDHCPOne actionKind = "dhcp_one" + actionStaticIPv4 actionKind = "static_ipv4" + actionExportAudit actionKind = "export_audit" + actionRunNvidiaSAT actionKind = "run_nvidia_sat" + actionRunMemorySAT actionKind = "run_memory_sat" actionRunStorageSAT actionKind = "run_storage_sat" ) @@ -46,6 +47,7 @@ type model struct { busyTitle string title string body string + banner string mainMenu []string networkMenu []string serviceMenu []string @@ -111,5 +113,7 @@ func newModel(application *app.App, runtimeMode runtimeenv.Mode) model { } func (m model) Init() tea.Cmd { - return nil + return func() tea.Msg { + return bannerMsg{text: strings.TrimSpace(m.app.MainBanner())} + } } diff --git a/audit/internal/tui/update.go b/audit/internal/tui/update.go index 99f99f3..a993485 100644 --- a/audit/internal/tui/update.go +++ b/audit/internal/tui/update.go @@ -84,6 +84,9 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.screen = screenExportTargets m.cursor = 0 return m, nil + case bannerMsg: + 
// renderMainMenu renders the main screen: the title, an optional banner, the
// subtitle, the menu items with a "> " marker on the item at cursor, and the
// key-binding footer. An empty item list is rendered as "(no items)".
//
// The banner is trimmed before it is tested for emptiness, so a banner made
// only of whitespace is treated as absent instead of leaving an empty
// section between title and subtitle.
// NOTE(review): the unselected-item prefix and the footer spacing are copied
// as they appear in the reviewed text — confirm their exact width against
// renderMenu.
func renderMainMenu(title, banner, subtitle string, items []string, cursor int) string {
	var body strings.Builder
	fmt.Fprintf(&body, "%s\n\n", title)
	if banner = strings.TrimSpace(banner); banner != "" {
		body.WriteString(banner)
		body.WriteString("\n\n")
	}
	body.WriteString(subtitle)
	body.WriteString("\n\n")
	if len(items) == 0 {
		body.WriteString("(no items)\n")
	} else {
		for i, item := range items {
			prefix := " "
			if i == cursor {
				prefix = "> "
			}
			fmt.Fprintf(&body, "%s%s\n", prefix, item)
		}
	}
	body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
	return body.String()
}
`overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`) +- Runtime overrides: + - `BEE_GPU_STRESS_SECONDS` + - `BEE_GPU_STRESS_SIZE_MB` + - `BEE_MEMTESTER_SIZE_MB` + - `BEE_MEMTESTER_PASSES` diff --git a/bible-local/backlog.md b/bible-local/backlog.md index ae4b27f..007ca56 100644 --- a/bible-local/backlog.md +++ b/bible-local/backlog.md @@ -1,22 +1,20 @@ # Backlog -## GPU stress test (H100) +## Real hardware validation -**Статус:** отложено. В текущем ISO `gpu_burn` не включается и не запускается. +**Статус:** ожидает доступа к железу. -**Почему задача всё ещё в backlog:** -- `gpu_burn` остаётся тяжёлым и неудобным с точки зрения зависимостей -- хочется штатный lightweight stress tool без `libcublas.so` и без заметного раздувания ISO -- для H100 нужен предсказуемый offline-инструмент, который можно стабильно возить внутри ISO +Что осталось подтвердить на практике: +- `bee sat nvidia` на реальном NVIDIA GPU host +- `bee sat storage` на NVMe/SATA/RAID host +- `ipmitool sdr` parsing на сервере с реальным BMC/IPMI +- vendor RAID tooling (`storcli64`, `sas2ircu`, `sas3ircu`, `arcconf`, `ssacli`) в живом ISO -**Желаемый следующий шаг:** написать минимальный stress tool на CUDA Driver API -- использует только `libcuda.so`, уже присутствующий в ISO -- выполняет простой compute / memory workload через `cuLaunchKernel` -- собирается отдельно на builder VM и кладётся в `iso/vendor/` -- в будущем может вызываться из `bee tui` как предпочтительный встроенный GPU SAT/stress path +## SAT result polish -**Отклонённые / проблемные варианты:** -- `gpu_burn` — нужен libcublas (~500MB) -- `nvbandwidth` — только bandwidth, не жжёт FLOPs; нужен libcudart (~8MB) -- DCGM diag — правильный инструмент для H100 но ~100MB установка -- Download on demand — нужен libcublas, проблема та же +**Статус:** частично закрыто. 
+ +Что ещё можно улучшить после полевой проверки: +- точнее классифицировать vendor-specific self-test outputs в `storage SAT` +- подобрать дефолты `memtester` по объёму RAM на целевых машинах +- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке