diff --git a/audit/internal/collector/board.go b/audit/internal/collector/board.go index de7c4d6..7e7e158 100644 --- a/audit/internal/collector/board.go +++ b/audit/internal/collector/board.go @@ -3,6 +3,7 @@ package collector import ( "bee/audit/internal/schema" "bufio" + "context" "log/slog" "os" "os/exec" @@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) { return string(out), nil } -var execIpmitool = func(args ...string) (string, error) { - out, err := exec.Command("ipmitool", args...).Output() - if err != nil { - return "", err - } - return string(out), nil -} - // collectBoard runs dmidecode for types 0, 1, 2 and returns the board record // plus the BIOS firmware entry. Any failure is logged and returns zero values. func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) { @@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard { // collectBMCFirmware collects BMC firmware version via ipmitool mc info. // Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs. 
-func collectBMCFirmware() []schema.HardwareFirmwareRecord { +func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord { if _, err := exec.LookPath("ipmitool"); err != nil { return nil } if _, err := os.Stat("/dev/ipmi0"); err != nil { return nil } - out, err := execIpmitool("mc", "info") + profile := selectIPMIProfile(manufacturer) + ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout) + defer cancel() + cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info") + raw, err := cmd.Output() if err != nil { slog.Info("bmc: ipmitool mc info unavailable", "err", err) return nil } - version := parseBMCFirmwareRevision(out) + version := parseBMCFirmwareRevision(string(raw)) if version == "" { return nil } diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index 84213c8..b906e25 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { board, biosFW := collectBoard() snap.Board = board snap.Firmware = append(snap.Firmware, biosFW...) - snap.Firmware = append(snap.Firmware, collectBMCFirmware()...) + snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...) 
snap.CPUs = collectCPUs() @@ -45,7 +45,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices) snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices) snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices)) - snap.PowerSupplies = collectPSUs() + snap.VROCLicense = collectVROCLicense(snap.PCIeDevices) + snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer)) snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc) snap.Sensors = buildSensorsFromDoc(sensorDoc) finalizeSnapshot(&snap, collectedAt) diff --git a/audit/internal/collector/ipmi_profile.go b/audit/internal/collector/ipmi_profile.go new file mode 100644 index 0000000..02d6ba6 --- /dev/null +++ b/audit/internal/collector/ipmi_profile.go @@ -0,0 +1,92 @@ +package collector + +// Package-level IPMI tuning profiles. +// +// Each profile is matched by board manufacturer (already known before PSU +// collection runs). The profile drives two things: +// - Per-command timeouts — prevents infinite hangs on slow BMCs. +// - FRU early-exit — streaming parser stops reading once all PSU entries +// are found, avoiding the tail of non-PSU FRU records. +// +// To add a new vendor: append to ipmiProfiles. The first matching entry wins. + +import ( + "strings" + "time" +) + +// ipmiProfile holds tuning parameters for one or more board manufacturers. +type ipmiProfile struct { + // name is shown in log messages. + name string + // manufacturers is a list of lowercase substrings matched against the + // board manufacturer string from dmidecode type 1. + manufacturers []string + // fruTimeout is the hard deadline for the entire `ipmitool fru print` + // command. Must be positive: zero gives an already-expired context. + fruTimeout time.Duration + // sdrTimeout is the hard deadline for `ipmitool sdr`. + sdrTimeout time.Duration + // mcInfoTimeout is the hard deadline for `ipmitool mc info`. 
+ mcInfoTimeout time.Duration + // fruEarlyExit instructs the streaming FRU parser to stop reading + // after it has found at least one PSU entry and the current block is + // complete. Useful on servers with many non-PSU FRU devices. + fruEarlyExit bool +} + +// ipmiProfiles is the ordered list of profiles. First match wins. +var ipmiProfiles = []ipmiProfile{ + { + // Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series). + // SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the + // full `fru print` scan takes ~108 s on a loaded BMC. Enable early + // exit so collection stops once PSU records are found. + name: "lenovo", + manufacturers: []string{"lenovo"}, + fruTimeout: 90 * time.Second, + sdrTimeout: 45 * time.Second, + mcInfoTimeout: 15 * time.Second, + fruEarlyExit: true, + }, + { + // HPE iLO-based servers (ProLiant DL/ML/BL). + name: "hpe", + manufacturers: []string{"hp", "hewlett packard"}, + fruTimeout: 60 * time.Second, + sdrTimeout: 30 * time.Second, + mcInfoTimeout: 10 * time.Second, + fruEarlyExit: false, + }, + { + // Dell iDRAC-based servers. + name: "dell", + manufacturers: []string{"dell"}, + fruTimeout: 60 * time.Second, + sdrTimeout: 30 * time.Second, + mcInfoTimeout: 10 * time.Second, + fruEarlyExit: false, + }, +} + +// defaultIPMIProfile is used when no vendor profile matches. +var defaultIPMIProfile = ipmiProfile{ + name: "default", + fruTimeout: 60 * time.Second, + sdrTimeout: 30 * time.Second, + mcInfoTimeout: 10 * time.Second, + fruEarlyExit: false, +} + +// selectIPMIProfile returns the profile for the given board manufacturer. 
+func selectIPMIProfile(manufacturer string) ipmiProfile { + mfgLower := strings.ToLower(strings.TrimSpace(manufacturer)) + for _, p := range ipmiProfiles { + for _, m := range p.manufacturers { + if strings.Contains(mfgLower, m) { + return p + } + } + } + return defaultIPMIProfile +} diff --git a/audit/internal/collector/psu.go b/audit/internal/collector/psu.go index 8b9cf86..eee0b90 100644 --- a/audit/internal/collector/psu.go +++ b/audit/internal/collector/psu.go @@ -2,6 +2,8 @@ package collector import ( "bee/audit/internal/schema" + "bufio" + "context" "log/slog" "os/exec" "regexp" @@ -10,16 +12,29 @@ import ( "strings" ) -func collectPSUs() []schema.HardwarePowerSupply { +func collectPSUs(manufacturer string) []schema.HardwarePowerSupply { + profile := selectIPMIProfile(manufacturer) + var psus []schema.HardwarePowerSupply - if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil { - psus = parseFRU(string(out)) + fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout) + defer fruCancel() + + if profile.fruEarlyExit { + psus = collectFRUEarlyExit(fruCtx) } else { - slog.Info("psu: fru unavailable", "err", err) + cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print") + if out, err := cmd.Output(); err == nil { + psus = parseFRU(string(out)) + } else { + slog.Info("psu: fru unavailable", "err", err) + } } sdrData := map[int]psuSDR{} - if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil { + sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout) + defer sdrCancel() + cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr") + if sdrOut, err := cmd.Output(); err == nil { sdrData = parsePSUSDR(string(sdrOut)) if len(psus) == 0 { psus = synthesizePSUsFromSDR(sdrData) @@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply { slog.Info("psu: ipmitool unavailable, skipping", "err", err) return nil } - slog.Info("psu: collected", "count", len(psus)) + 
slog.Info("psu: collected", "count", len(psus), "profile", profile.name) + return psus +} + +// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading +// as soon as it has found all PSU blocks and the next block is not a PSU. +// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers. +func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply { + cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print") + pipe, err := cmd.StdoutPipe() + if err != nil { + slog.Info("psu: fru pipe unavailable", "err", err) + return nil + } + if err := cmd.Start(); err != nil { + slog.Info("psu: fru start failed", "err", err) + return nil + } + + var psus []schema.HardwarePowerSupply + var currentBlock strings.Builder + slot := 0 + psuFound := false + stoppedEarly := false + + scanner := bufio.NewScanner(pipe) + for scanner.Scan() { + line := scanner.Text() + + if strings.HasPrefix(line, "FRU Device Description") { + if currentBlock.Len() > 0 { + if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok { + psus = append(psus, psu) + psuFound = true + slot++ + } + currentBlock.Reset() + } + // Stop once we've collected PSUs and hit a non-PSU block header. + if psuFound && !isPSUHeader(strings.ToLower(line)) { + stoppedEarly = true + break + } + } + currentBlock.WriteString(line) + currentBlock.WriteByte('\n') + } + + if !stoppedEarly && currentBlock.Len() > 0 { + if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok { + psus = append(psus, psu) + } + } + + // Kill the process immediately on early exit rather than waiting for context timeout. 
+ if cmd.Process != nil { + cmd.Process.Kill() //nolint:errcheck + } + cmd.Wait() //nolint:errcheck + slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly) return psus } diff --git a/audit/internal/collector/raid.go b/audit/internal/collector/raid.go index 1ae7c23..f2bc71b 100644 --- a/audit/internal/collector/raid.go +++ b/audit/internal/collector/raid.go @@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray { return arrays } +// collectVROCLicense runs mdadm --detail-platform and extracts the License field. +// Returns nil when VROC is absent or the platform does not report a license. +func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string { + if !hasVROCController(pcie) { + return nil + } + out, err := raidToolQuery("mdadm", "--detail-platform") + if err != nil { + slog.Info("vroc: mdadm --detail-platform unavailable", "err", err) + return nil + } + return parseMDAdmPlatformLicense(string(out)) +} + +func parseMDAdmPlatformLicense(raw string) *string { + for _, line := range strings.Split(raw, "\n") { + trimmed := strings.TrimSpace(line) + if !strings.HasPrefix(strings.ToLower(trimmed), "license") { + continue + } + if idx := strings.Index(trimmed, ":"); idx >= 0 { + val := strings.TrimSpace(trimmed[idx+1:]) + if val != "" { + v := strings.ToLower(val) + return &v + } + } + } + return nil +} + func queryDeviceSerial(devPath string) string { if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil { var ctrl nvmeIDCtrl diff --git a/audit/internal/collector/vroc_test.go b/audit/internal/collector/vroc_test.go index b140421..e899f12 100644 --- a/audit/internal/collector/vroc_test.go +++ b/audit/internal/collector/vroc_test.go @@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1] } } +func TestParseMDAdmPlatformLicense(t *testing.T) { + premium := `Platform : Intel(R) Virtual RAID on CPU +Version : 1.3.0.1138 +RAID Levels : raid0 raid1 raid5 raid10 +Total 
Disks : 4 +License : Premium +` + got := parseMDAdmPlatformLicense(premium) + if got == nil || *got != "premium" { + t.Fatalf("expected 'premium', got %v", got) + } + + standard := `Platform : Intel(R) Virtual RAID on CPU +License : Standard +` + got = parseMDAdmPlatformLicense(standard) + if got == nil || *got != "standard" { + t.Fatalf("expected 'standard', got %v", got) + } + + noLicense := `Platform : Intel(R) Virtual RAID on CPU +Version : 1.0.0 +` + got = parseMDAdmPlatformLicense(noLicense) + if got != nil { + t.Fatalf("expected nil, got %v", *got) + } +} + func TestHasVROCController(t *testing.T) { intel := vendorIntel model := "Volume Management Device NVMe RAID Controller" diff --git a/audit/internal/schema/hardware.go b/audit/internal/schema/hardware.go index e446368..2699563 100644 --- a/audit/internal/schema/hardware.go +++ b/audit/internal/schema/hardware.go @@ -66,6 +66,7 @@ type HardwareSnapshot struct { PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"` Sensors *HardwareSensors `json:"sensors,omitempty"` EventLogs []HardwareEventLog `json:"event_logs,omitempty"` + VROCLicense *string `json:"vroc_license,omitempty"` } type HardwareHealthSummary struct { diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index cd1d367..192cf30 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -1295,7 +1295,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) var standardTools = []string{ "dmidecode", "smartctl", "nvme", "lspci", "ipmitool", "nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop", - "mstflint", "qrencode", + "mstflint", } func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) { diff --git a/bible-local/architecture/runtime-flows.md b/bible-local/architecture/runtime-flows.md index 626423d..00a5aec 100644 --- a/bible-local/architecture/runtime-flows.md +++ b/bible-local/architecture/runtime-flows.md @@ -149,7 
+149,6 @@ Current validation state: 6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0) 7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded) 8. output JSON → /var/log/bee-audit.json - 9. QR summary to stdout (qrencode if available) ``` Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal. diff --git a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot index fe337c7..78ee667 100755 --- a/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot +++ b/iso/builder/config/hooks/normal/9000-bee-setup.hook.chroot @@ -31,6 +31,7 @@ systemctl enable bee-preflight.service systemctl enable bee-audit.service systemctl enable bee-web.service systemctl enable bee-sshsetup.service +systemctl enable bee-blackbox.service systemctl enable bee-selfheal.timer systemctl enable bee-boot-status.service systemctl enable ssh.service diff --git a/iso/builder/config/package-lists/bee.list.chroot b/iso/builder/config/package-lists/bee.list.chroot index 4ac5112..7d4fd5d 100644 --- a/iso/builder/config/package-lists/bee.list.chroot +++ b/iso/builder/config/package-lists/bee.list.chroot @@ -66,9 +66,6 @@ jq curl net-tools -# QR codes (for displaying audit results) -qrencode - # Local desktop (openbox + chromium kiosk) gparted openbox