IPMI hang fix (Lenovo XCC SR650 V3):
- Add pluggable ipmi_profile system with per-vendor timeouts and fruEarlyExit flag
- Lenovo profile: 90s FRU timeout, streaming early-exit stops after PSU blocks found
- collectFRUEarlyExit streams ipmitool fru print and kills process once PSU blocks are followed by a non-PSU header (~6s instead of ~108s on 54-device FRU list)
- collectBMCFirmware and collectPSUs accept manufacturer and apply profile timeouts

VROC license detection:
- Detect VMD/VROC controller in PCIe list, run mdadm --detail-platform
- Parse "License:" line; store as snap.VROCLicense in HardwareSnapshot

Blackbox service fix:
- bee-blackbox.service was missing from systemctl enable list in ISO build hook
- Service never started on boot; state file never written; UI button stayed "Enable"

Drop qrencode:
- Remove from package list, standardTools API check, and runtime-flows doc

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
518 lines
14 KiB
Go
518 lines
14 KiB
Go
package collector
|
|
|
|
import (
	"bufio"
	"context"
	"log/slog"
	"math"
	"os/exec"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)
|
|
|
|
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
|
profile := selectIPMIProfile(manufacturer)
|
|
|
|
var psus []schema.HardwarePowerSupply
|
|
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
|
defer fruCancel()
|
|
|
|
if profile.fruEarlyExit {
|
|
psus = collectFRUEarlyExit(fruCtx)
|
|
} else {
|
|
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
|
if out, err := cmd.Output(); err == nil {
|
|
psus = parseFRU(string(out))
|
|
} else {
|
|
slog.Info("psu: fru unavailable", "err", err)
|
|
}
|
|
}
|
|
|
|
sdrData := map[int]psuSDR{}
|
|
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
|
defer sdrCancel()
|
|
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
|
if sdrOut, err := cmd.Output(); err == nil {
|
|
sdrData = parsePSUSDR(string(sdrOut))
|
|
if len(psus) == 0 {
|
|
psus = synthesizePSUsFromSDR(sdrData)
|
|
} else {
|
|
mergePSUSDR(psus, sdrData)
|
|
}
|
|
} else if len(psus) == 0 {
|
|
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
|
return nil
|
|
}
|
|
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
|
return psus
|
|
}
|
|
|
|
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
|
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
|
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
|
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
|
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
|
pipe, err := cmd.StdoutPipe()
|
|
if err != nil {
|
|
slog.Info("psu: fru pipe unavailable", "err", err)
|
|
return nil
|
|
}
|
|
if err := cmd.Start(); err != nil {
|
|
slog.Info("psu: fru start failed", "err", err)
|
|
return nil
|
|
}
|
|
|
|
var psus []schema.HardwarePowerSupply
|
|
var currentBlock strings.Builder
|
|
slot := 0
|
|
psuFound := false
|
|
stoppedEarly := false
|
|
|
|
scanner := bufio.NewScanner(pipe)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
|
|
if strings.HasPrefix(line, "FRU Device Description") {
|
|
if currentBlock.Len() > 0 {
|
|
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
|
psus = append(psus, psu)
|
|
psuFound = true
|
|
slot++
|
|
}
|
|
currentBlock.Reset()
|
|
}
|
|
// Stop once we've collected PSUs and hit a non-PSU block header.
|
|
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
|
stoppedEarly = true
|
|
break
|
|
}
|
|
}
|
|
currentBlock.WriteString(line)
|
|
currentBlock.WriteByte('\n')
|
|
}
|
|
|
|
if !stoppedEarly && currentBlock.Len() > 0 {
|
|
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
|
psus = append(psus, psu)
|
|
}
|
|
}
|
|
|
|
// Kill the process immediately on early exit rather than waiting for context timeout.
|
|
if cmd.Process != nil {
|
|
cmd.Process.Kill() //nolint:errcheck
|
|
}
|
|
cmd.Wait() //nolint:errcheck
|
|
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
|
return psus
|
|
}
|
|
|
|
// parseFRU parses ipmitool fru print output.
|
|
// Each FRU record starts with "FRU Device Description : <name> (ID <n>)"
|
|
// followed by indented key: value lines.
|
|
func parseFRU(output string) []schema.HardwarePowerSupply {
|
|
var psus []schema.HardwarePowerSupply
|
|
slot := 0
|
|
for _, block := range splitFRUBlocks(output) {
|
|
psu, ok := parseFRUBlock(block, slot)
|
|
if !ok {
|
|
continue
|
|
}
|
|
psus = append(psus, psu)
|
|
slot++
|
|
}
|
|
return psus
|
|
}
|
|
|
|
// splitFRUBlocks cuts `ipmitool fru print` output into one string per FRU
// record. A record starts at a line beginning with "FRU Device Description"
// and runs until the next such line; every line is kept and terminated
// with '\n'.
//
// Text before the first header forms its own block. Blocks made up solely of
// whitespace (empty input, leading blank lines) are dropped, so empty output
// yields no blocks.
func splitFRUBlocks(output string) []string {
	var blocks []string
	var cur strings.Builder

	// flush emits the accumulated block unless it is blank, then starts a new one.
	flush := func() {
		if strings.TrimSpace(cur.String()) != "" {
			blocks = append(blocks, cur.String())
		}
		cur.Reset()
	}

	for _, line := range strings.Split(output, "\n") {
		if strings.HasPrefix(line, "FRU Device Description") {
			flush()
		}
		cur.WriteString(line)
		cur.WriteByte('\n')
	}
	flush()
	return blocks
}
|
|
|
|
func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool) {
|
|
fields := map[string]string{}
|
|
header := ""
|
|
for _, line := range strings.Split(block, "\n") {
|
|
if strings.HasPrefix(line, "FRU Device Description") {
|
|
header = line
|
|
continue
|
|
}
|
|
idx := strings.Index(line, " : ")
|
|
if idx < 0 {
|
|
continue
|
|
}
|
|
key := strings.TrimSpace(line[:idx])
|
|
val := strings.TrimSpace(line[idx+3:])
|
|
fields[key] = val
|
|
}
|
|
|
|
// Only process PSU FRU records
|
|
headerLower := strings.ToLower(header)
|
|
if !isPSUHeader(headerLower) {
|
|
return schema.HardwarePowerSupply{}, false
|
|
}
|
|
|
|
present := true
|
|
psu := schema.HardwarePowerSupply{Present: &present}
|
|
|
|
slotStr := strconv.Itoa(slotIdx)
|
|
if slot, ok := parsePSUSlot(header); ok && slot > 0 {
|
|
slotStr = strconv.Itoa(slot - 1)
|
|
}
|
|
psu.Slot = &slotStr
|
|
|
|
if v := firstNonEmptyField(fields, "Board Product", "Product Name", "Product Part Number"); v != "" {
|
|
psu.Model = &v
|
|
}
|
|
if v := firstNonEmptyField(fields, "Board Mfg", "Product Manufacturer", "Product Manufacturer Name"); v != "" {
|
|
psu.Vendor = &v
|
|
}
|
|
if v := firstNonEmptyField(fields, "Board Serial", "Product Serial", "Product Serial Number"); v != "" {
|
|
psu.SerialNumber = &v
|
|
}
|
|
if v := firstNonEmptyField(fields, "Board Part Number", "Product Part Number", "Part Number"); v != "" {
|
|
psu.PartNumber = &v
|
|
}
|
|
if v := firstNonEmptyField(fields, "Board Extra", "Product Version", "Board Version"); v != "" {
|
|
psu.Firmware = &v
|
|
}
|
|
|
|
// wattage: some vendors put it in product name e.g. "PSU 800W"
|
|
if psu.Model != nil {
|
|
if w := parseWattage(*psu.Model); w > 0 {
|
|
psu.WattageW = &w
|
|
}
|
|
}
|
|
|
|
status := statusOK
|
|
psu.Status = &status
|
|
|
|
return psu, true
|
|
}
|
|
|
|
// isPSUHeader reports whether an already lower-cased FRU header line names a
// power supply, i.e. contains one of the known PSU markers.
func isPSUHeader(headerLower string) bool {
	for _, marker := range []string{"psu", "pws", "power supply", "power_supply", "power module"} {
		if strings.Contains(headerLower, marker) {
			return true
		}
	}
	return false
}
|
|
|
|
func firstNonEmptyField(fields map[string]string, keys ...string) string {
|
|
for _, key := range keys {
|
|
if value := cleanDMIValue(fields[key]); value != "" {
|
|
return value
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// psuSDR accumulates everything parsePSUSDR learned from `ipmitool sdr`
// about a single PSU slot.
type psuSDR struct {
	// slot is the PSU number taken from the sensor name; parsePSUSlot only
	// yields values greater than zero.
	slot int

	// status is the aggregated component status; reason explains a non-OK one.
	status string
	reason string

	// Readings are nil when no matching sensor supplied a usable value.
	inputPowerW  *float64
	outputPowerW *float64
	inputVoltage *float64
	temperatureC *float64
	healthPct    *float64
}
|
|
|
|
// psuSlotPatterns lists the regular expressions used to pull a PSU number out
// of a sensor or FRU name. parsePSUSlot tries them in order, so the more
// specific patterns come first and the generic fallback comes last. Every
// pattern captures the number in its single group.
var psuSlotPatterns = []*regexp.Regexp{
	// MSI/underscore style (PSU1_POWER_IN, PSU2_POWER_OUT). The underscore is
	// a word character, so \b does not fire after the digit; the underscore
	// terminator is therefore matched explicitly.
	regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
	// PSU1, PS1, ps 2
	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
	// PS 6, PS6
	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
	// PWS1
	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
	// Power Supply 1, Power Supply Bay 3
	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
	// Bay 1
	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
	// Must stay last: "power supply N" is already caught by an earlier pattern.
	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
}
|
|
|
|
// isPSUInputPower reports whether a lower-cased sensor name denotes AC-input
// power. Naming seen across vendors:
//
//	MSI:     PSU1_POWER_IN, PSU1_PIN
//	MLT:     PSU1_PIN
//	xFusion: (matched via default fallback — no explicit keyword)
//	HPE:     PS1 Input Power, PS1 Input Watts
func isPSUInputPower(name string) bool {
	keywords := [...]string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"_power_in",
		"power_in",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
|
|
|
// isPSUOutputPower reports whether a lower-cased sensor name denotes DC-output
// power. Naming seen across vendors:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Names containing "power supply bay" or "psu bay" are also treated as output
// power readings.
func isPSUOutputPower(name string) bool {
	keywords := [...]string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"_power_out",
		"power_out",
		"power supply bay",
		"psu bay",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
|
|
|
// parseBoundedFloat parses a numeric value from an SDR value field and
|
|
// validates it is within (0, max]. Returns nil for zero, negative, or
|
|
// out-of-range values — these indicate missing/off/fault sensor readings.
|
|
func parseBoundedFloat(raw string, max float64) *float64 {
|
|
v := parseFloatPtr(raw)
|
|
if v == nil || *v <= 0 || *v > max {
|
|
return nil
|
|
}
|
|
return v
|
|
}
|
|
|
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
|
out := map[int]psuSDR{}
|
|
for _, line := range strings.Split(raw, "\n") {
|
|
fields := splitSDRFields(line)
|
|
if len(fields) < 3 {
|
|
continue
|
|
}
|
|
name := fields[0]
|
|
value := fields[1]
|
|
state := strings.ToLower(fields[2])
|
|
slot, ok := parsePSUSlot(name)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
entry := out[slot]
|
|
entry.slot = slot
|
|
if entry.status == "" {
|
|
entry.status = statusOK
|
|
}
|
|
if state != "" && state != "ok" && state != "ns" {
|
|
entry.status = statusCritical
|
|
entry.reason = "PSU sensor reported non-OK state: " + state
|
|
}
|
|
|
|
lowerName := strings.ToLower(name)
|
|
switch {
|
|
case isPSUInputPower(lowerName):
|
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
|
case isPSUOutputPower(lowerName):
|
|
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
|
entry.inputVoltage = parseFloatPtr(value)
|
|
case strings.Contains(lowerName, "temp"):
|
|
entry.temperatureC = parseFloatPtr(value)
|
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
|
entry.healthPct = parsePercentPtr(value)
|
|
default:
|
|
// Generic PSU power reading: sensor matched a slot pattern but carries
|
|
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
|
// AC input if the value looks like wattage and no better data is set yet.
|
|
if entry.inputPowerW == nil {
|
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
|
}
|
|
}
|
|
out[slot] = entry
|
|
}
|
|
return out
|
|
}
|
|
|
|
// PSUSlotPower holds SDR power readings for one PSU slot.
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
// matching HardwarePowerSupply.Slot in the audit schema.
type PSUSlotPower struct {
	// InputW is the input power reading in watts; nil when unavailable.
	InputW *float64 `json:"input_w,omitempty"`

	// OutputW is the output power reading in watts; nil when unavailable.
	OutputW *float64 `json:"output_w,omitempty"`

	// Status is the slot status derived from the SDR sensor states.
	Status string `json:"status,omitempty"`
}
|
|
|
|
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
|
// using the same battle-tested slot patterns as the hardware audit collector.
|
|
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
|
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
|
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
|
sdr := parsePSUSDR(sdrOutput)
|
|
if len(sdr) == 0 {
|
|
return nil
|
|
}
|
|
out := make(map[string]PSUSlotPower, len(sdr))
|
|
for slot, entry := range sdr {
|
|
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
|
out[key] = PSUSlotPower{
|
|
InputW: entry.inputPowerW,
|
|
OutputW: entry.outputPowerW,
|
|
Status: entry.status,
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
|
if len(sdr) == 0 {
|
|
return nil
|
|
}
|
|
slots := make([]int, 0, len(sdr))
|
|
for slot := range sdr {
|
|
slots = append(slots, slot)
|
|
}
|
|
sort.Ints(slots)
|
|
|
|
out := make([]schema.HardwarePowerSupply, 0, len(slots))
|
|
for _, slot := range slots {
|
|
entry := sdr[slot]
|
|
present := true
|
|
status := entry.status
|
|
if status == "" {
|
|
status = statusUnknown
|
|
}
|
|
slotStr := strconv.Itoa(slot - 1)
|
|
model := "PSU"
|
|
psu := schema.HardwarePowerSupply{
|
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
|
Slot: &slotStr,
|
|
Present: &present,
|
|
Model: &model,
|
|
InputPowerW: entry.inputPowerW,
|
|
OutputPowerW: entry.outputPowerW,
|
|
InputVoltage: entry.inputVoltage,
|
|
TemperatureC: entry.temperatureC,
|
|
}
|
|
if entry.healthPct != nil {
|
|
psu.LifeRemainingPct = entry.healthPct
|
|
lifeUsed := 100 - *entry.healthPct
|
|
psu.LifeUsedPct = &lifeUsed
|
|
}
|
|
if entry.reason != "" {
|
|
psu.ErrorDescription = &entry.reason
|
|
}
|
|
out = append(out, psu)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
|
for i := range psus {
|
|
slotIdx, err := strconv.Atoi(derefPSUSlot(psus[i].Slot))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
entry, ok := sdr[slotIdx+1]
|
|
if !ok {
|
|
continue
|
|
}
|
|
if entry.inputPowerW != nil {
|
|
psus[i].InputPowerW = entry.inputPowerW
|
|
}
|
|
if entry.outputPowerW != nil {
|
|
psus[i].OutputPowerW = entry.outputPowerW
|
|
}
|
|
if entry.inputVoltage != nil {
|
|
psus[i].InputVoltage = entry.inputVoltage
|
|
}
|
|
if entry.temperatureC != nil {
|
|
psus[i].TemperatureC = entry.temperatureC
|
|
}
|
|
if entry.healthPct != nil {
|
|
psus[i].LifeRemainingPct = entry.healthPct
|
|
lifeUsed := 100 - *entry.healthPct
|
|
psus[i].LifeUsedPct = &lifeUsed
|
|
}
|
|
if entry.status != "" {
|
|
psus[i].Status = &entry.status
|
|
}
|
|
if entry.reason != "" {
|
|
psus[i].ErrorDescription = &entry.reason
|
|
}
|
|
if psus[i].Status != nil && *psus[i].Status == statusOK {
|
|
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
|
|
unknown := statusUnknown
|
|
psus[i].Status = &unknown
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// splitSDRFields splits one `ipmitool sdr` line on '|' and returns the
// whitespace-trimmed columns, omitting columns that are empty after trimming.
// The result is never nil.
func splitSDRFields(line string) []string {
	cols := strings.Split(line, "|")
	kept := make([]string, 0, len(cols))
	for i := range cols {
		if trimmed := strings.TrimSpace(cols[i]); trimmed != "" {
			kept = append(kept, trimmed)
		}
	}
	return kept
}
|
|
|
|
func parsePSUSlot(name string) (int, bool) {
|
|
for _, re := range psuSlotPatterns {
|
|
m := re.FindStringSubmatch(strings.ToLower(name))
|
|
if len(m) == 0 {
|
|
continue
|
|
}
|
|
for _, group := range m[1:] {
|
|
if group == "" {
|
|
continue
|
|
}
|
|
n, err := strconv.Atoi(group)
|
|
if err == nil && n > 0 {
|
|
return n, true
|
|
}
|
|
}
|
|
}
|
|
return 0, false
|
|
}
|
|
|
|
// parseFloatPtr returns the first whitespace-separated token of raw that
// parses as a finite floating-point number, e.g. 120 for "120 Watts".
//
// Returns nil when raw is empty, equals "na" (case-insensitive), or contains
// no such token. Tokens that parse to NaN or ±Inf ("nan", "inf", "infinity")
// are skipped: they are not real sensor readings, they defeat range checks
// (every comparison with NaN is false), and encoding/json refuses to marshal
// them.
func parseFloatPtr(raw string) *float64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "na") {
		return nil
	}
	for _, field := range strings.Fields(raw) {
		n, err := strconv.ParseFloat(field, 64)
		if err != nil || math.IsNaN(n) || math.IsInf(n, 0) {
			continue
		}
		return &n
	}
	return nil
}
|
|
|
|
// derefPSUSlot returns the slot string behind slot, or "" when slot is nil.
func derefPSUSlot(slot *string) string {
	if slot != nil {
		return *slot
	}
	return ""
}
|
|
|
|
// parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM".
//
// Matching is case-insensitive and works on whitespace-separated tokens. A
// token with an explicit "W" suffix is preferred, so an index number that
// precedes the rating ("PSU 2 800W") is not mistaken for the wattage. If no
// such token exists, the first bare number is used. Only values within
// (0, 5000] are accepted; returns 0 when nothing qualifies.
func parseWattage(s string) int {
	tokens := strings.Fields(strings.ToUpper(s))

	// inRange parses tok as an integer and checks the plausible-wattage bounds.
	inRange := func(tok string) (int, bool) {
		n, err := strconv.Atoi(tok)
		return n, err == nil && n > 0 && n <= 5000
	}

	// First pass: tokens that explicitly carry the watt unit.
	for _, tok := range tokens {
		if !strings.HasSuffix(tok, "W") {
			continue
		}
		if n, ok := inRange(strings.TrimSuffix(tok, "W")); ok {
			return n
		}
	}
	// Second pass: fall back to the first bare number.
	for _, tok := range tokens {
		if n, ok := inRange(tok); ok {
			return n
		}
	}
	return 0
}
|