Collect IPMI sensors, SEL and dmesg errors into audit JSON and support bundle

- audit JSON: IPMI sensor readings (ipmitool sensor) merged into hardware.sensors alongside lm-sensors data
- audit JSON: IPMI SEL entries (ipmitool sel list) in hardware.event_logs with source "ipmi-sel"
- audit JSON: dmesg error/warning lines in hardware.event_logs with source "dmesg" (filtered by error/warn/AER/Xid/NVRM/ECC/panic patterns)
- support bundle: added ipmitool-sensor.txt, ipmitool-sel.txt, ipmitool-sel-time.txt to techdump
- saa_dmi.go: fix dmiItemRE to accept SHN with parentheses (e.g. PS(4)LC for PSU fields)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-06-19 08:41:37 +03:00
parent bab941ccf1
commit cbb0d1e522
6 changed files with 442 additions and 2 deletions

View File

@@ -49,7 +49,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
snap.Sensors = buildSensorsFromDoc(sensorDoc)
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
finalizeSnapshot(&snap, collectedAt)
// remaining collectors added in steps 1.8 1.10

View File

@@ -0,0 +1,129 @@
package collector
import (
"bee/audit/internal/schema"
"log/slog"
"os/exec"
"regexp"
"strings"
"time"
)
// dmesgTimestampRE splits a dmesg line into its leading bracketed prefix
// (capture group 1) and the rest of the line (capture group 2).
// The two prefix shapes it is meant to handle:
// dmesg -T output: [Thu Jun 18 14:23:45 2026] message
// dmesg without -T: [ 123.456789] message
// Lines that do not start with "[...]" do not match at all.
var dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)
// dmesgErrorPatterns lists the keywords that indicate an error or hardware
// problem worth capturing. A dmesg message is kept by parseDmesgErrors when at
// least one of these patterns matches it.
//
// All patterns are case-insensitive ((?i)) and anchored on word boundaries, so
// a keyword embedded in a longer word does not match (e.g. "errors" is not
// matched by the err/error pattern).
var dmesgErrorPatterns = []*regexp.Regexp{
regexp.MustCompile(`(?i)\berr(or)?\b`),
regexp.MustCompile(`(?i)\bfail(ed|ure)?\b`),
regexp.MustCompile(`(?i)\bfault\b`),
regexp.MustCompile(`(?i)\bwarn(ing)?\b`),
regexp.MustCompile(`(?i)\bAER\b`),
regexp.MustCompile(`(?i)\bXid\b`),
regexp.MustCompile(`(?i)\bNVRM\b`),
regexp.MustCompile(`(?i)\bpanic\b`),
regexp.MustCompile(`(?i)\bcorrected\b`),
// No trailing \b here: this one is a prefix match, so it also covers
// "uncorrected" and "uncorrectable".
regexp.MustCompile(`(?i)\buncorrect`),
regexp.MustCompile(`(?i)\bECC\b`),
regexp.MustCompile(`(?i)\btimeout\b`),
regexp.MustCompile(`(?i)\breset\b`),
regexp.MustCompile(`(?i)\bdead\b`),
regexp.MustCompile(`(?i)\bhang\b`),
regexp.MustCompile(`(?i)\bstall\b`),
regexp.MustCompile(`(?i)\bdisabled\b`),
}
// collectDmesgErrors captures the kernel ring buffer and returns the lines that
// parseDmesgErrors classifies as errors or warnings.
//
// It first tries `dmesg -T`; if that fails or prints nothing it retries with a
// plain `dmesg`. It returns nil when neither invocation yields output or when
// no line matches.
func collectDmesgErrors() []schema.HardwareEventLog {
	var raw []byte
	// Attempt order matters: human-readable timestamps first, bare dmesg second.
	for _, args := range [][]string{{"-T"}, nil} {
		data, err := exec.Command("dmesg", args...).Output()
		if err == nil && len(data) > 0 {
			raw = data
			break
		}
	}
	if len(raw) == 0 {
		return nil
	}

	found := parseDmesgErrors(string(raw))
	if len(found) == 0 {
		return nil
	}
	slog.Info("dmesg: collected error entries", "count", len(found))
	return found
}
// parseDmesgErrors scans dmesg output line by line and returns one event per
// line whose message matches dmesgErrorPatterns.
//
// When a line starts with a bracketed prefix, the prefix text (trimmed) becomes
// the event time and the remainder becomes the message. Lines without such a
// prefix use the whole line as the message and the time of this call (UTC,
// RFC 3339) as the event time. Every event has Source "dmesg" and a severity
// from dmesgSeverity. The result is nil when nothing matches.
func parseDmesgErrors(output string) []schema.HardwareEventLog {
	// Shared fallback event time for lines that carry no usable prefix.
	fallbackTime := time.Now().UTC().Format(time.RFC3339)

	var events []schema.HardwareEventLog
	for _, raw := range strings.Split(output, "\n") {
		text := strings.TrimSpace(raw)
		if text == "" {
			continue
		}

		// Both variables are fresh on every iteration, so taking their
		// address below never aliases another event.
		stamp, body := "", text
		if groups := dmesgTimestampRE.FindStringSubmatch(text); groups != nil {
			stamp = strings.TrimSpace(groups[1])
			body = strings.TrimSpace(groups[2])
		}
		if body == "" || !matchesAny(body, dmesgErrorPatterns) {
			continue
		}

		when := &fallbackTime
		if stamp != "" {
			when = &stamp
		}
		level := dmesgSeverity(body)

		events = append(events, schema.HardwareEventLog{
			Source:    "dmesg",
			EventTime: when,
			Severity:  &level,
			Message:   body,
		})
	}
	return events
}
// matchesAny reports whether s is matched by at least one of patterns.
// An empty or nil pattern list never matches.
func matchesAny(s string, patterns []*regexp.Regexp) bool {
	for i := 0; i < len(patterns); i++ {
		if patterns[i].MatchString(s) {
			return true
		}
	}
	return false
}
func dmesgSeverity(msg string) string {
lower := strings.ToLower(msg)
switch {
case strings.Contains(lower, "panic") ||
strings.Contains(lower, "aer") ||
strings.Contains(lower, "uncorrect") ||
strings.Contains(lower, "xid") ||
strings.Contains(lower, "nvrm"):
return statusCritical
case strings.Contains(lower, "error") ||
strings.Contains(lower, "fault") ||
strings.Contains(lower, "fail") ||
strings.Contains(lower, "dead") ||
strings.Contains(lower, "hang"):
return statusCritical
default:
return statusWarning
}
}

View File

@@ -0,0 +1,90 @@
package collector
import (
"bee/audit/internal/schema"
"fmt"
"log/slog"
"os/exec"
"strings"
)
// collectIPMISEL runs `ipmitool sel list` and hands its output to
// parseIPMISELOutput.
//
// It returns nil when the command fails, prints nothing, or yields no
// parseable entries.
func collectIPMISEL() []schema.HardwareEventLog {
	raw, err := exec.Command("ipmitool", "sel", "list").Output()
	if len(raw) == 0 || err != nil {
		return nil
	}

	if events := parseIPMISELOutput(string(raw)); len(events) > 0 {
		slog.Info("ipmi sel: collected", "entries", len(events))
		return events
	}
	return nil
}
// parseIPMISELOutput parses `ipmitool sel list` output into event log entries.
//
// Expected line format (pipe-separated):
//
//	ID | date | time | sensor | event description | direction
//
// Example:
//
//	1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
//
// Lines with fewer than five fields are skipped. The direction is read from the
// sixth field only; anything after it (for example a trailing
// "Reading ... Threshold ..." column) is ignored, so it cannot hide a
// "Deasserted" marker. Each entry has Source "ipmi-sel", the record ID as
// MessageID, the sensor field as ComponentRef (when non-empty), and IsActive
// set to false only for deasserted events. The result is nil when no line
// parses.
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
	var entries []schema.HardwareEventLog
	for _, line := range strings.Split(output, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Split on every separator so that extra trailing columns do not get
		// glued onto the direction field.
		parts := strings.Split(line, "|")
		if len(parts) < 5 {
			continue
		}
		id := strings.TrimSpace(parts[0])
		date := strings.TrimSpace(parts[1])
		timeStr := strings.TrimSpace(parts[2])
		sensor := strings.TrimSpace(parts[3])
		event := strings.TrimSpace(parts[4])

		deasserted := len(parts) > 5 &&
			strings.EqualFold(strings.TrimSpace(parts[5]), "Deasserted")

		var eventTime *string
		if date != "" && timeStr != "" {
			t := fmt.Sprintf("%s %s", date, timeStr)
			eventTime = &t
		}

		message := event
		if deasserted {
			message = event + " (Deasserted)"
		}

		severity := ipmiSELSeverity(event)
		isActive := !deasserted
		entry := schema.HardwareEventLog{
			Source:    "ipmi-sel",
			EventTime: eventTime,
			Severity:  &severity,
			MessageID: &id,
			Message:   message,
			IsActive:  &isActive,
		}
		if sensor != "" {
			entry.ComponentRef = &sensor
		}
		entries = append(entries, entry)
	}
	return entries
}
// ipmiSELSeverity derives a severity from the text of a SEL event description.
//
// Matching is case-insensitive:
//   - "non-recoverable", or "critical" that is not part of "non-critical",
//     yields statusCritical;
//   - "non-critical", "warning" or "degraded" yields statusWarning;
//   - anything else yields "info".
func ipmiSELSeverity(event string) string {
	lower := strings.ToLower(event)
	// "non-critical" contains "critical", so remove it before testing for the
	// critical keyword; otherwise every non-critical event is over-reported.
	withoutNonCritical := strings.ReplaceAll(lower, "non-critical", "")
	switch {
	case strings.Contains(lower, "non-recoverable"),
		strings.Contains(withoutNonCritical, "critical"):
		return statusCritical
	case strings.Contains(lower, "non-critical"),
		strings.Contains(lower, "warning"),
		strings.Contains(lower, "degraded"):
		return statusWarning
	default:
		return "info"
	}
}

View File

@@ -0,0 +1,216 @@
package collector
import (
"bee/audit/internal/schema"
"log/slog"
"os/exec"
"strconv"
"strings"
)
// collectIPMISensors runs `ipmitool sensor` and hands its output to
// parseIPMISensorOutput.
//
// It returns nil when the command fails, prints nothing, or yields no usable
// readings.
func collectIPMISensors() *schema.HardwareSensors {
	raw, err := exec.Command("ipmitool", "sensor").Output()
	if len(raw) == 0 || err != nil {
		return nil
	}

	sensors := parseIPMISensorOutput(string(raw))
	if sensors != nil {
		slog.Info("ipmi sensors: collected",
			"fans", len(sensors.Fans),
			"temperatures", len(sensors.Temperatures),
			"power", len(sensors.Power),
			"other", len(sensors.Other),
		)
	}
	return sensors
}
// parseIPMISensorOutput parses the text output of `ipmitool sensor`.
//
// Each line is expected to be pipe-separated:
//
//	name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
//
// Rows with fewer than four fields, an empty name, or a value that is empty,
// "na", "N/A" or not parseable as a float are skipped. The unit column decides
// the bucket: RPM goes to Fans, "degrees C"/"C" to Temperatures,
// Volts/Watts/Amps (or V/W/A) to Power, everything else to Other. For
// temperatures the unc and ucr columns, when present and numeric, become the
// warning and critical thresholds. duplicateSensor is consulted once per row
// with the bucket kind and name; rows it reports as duplicates are dropped.
// Returns nil when no row produced a sensor.
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
	sensors := &schema.HardwareSensors{}
	dedup := map[string]struct{}{}

	// unitIs reports whether unit equals any alias, ignoring case.
	unitIs := func(unit string, aliases ...string) bool {
		for _, alias := range aliases {
			if strings.EqualFold(unit, alias) {
				return true
			}
		}
		return false
	}

	for _, row := range strings.Split(output, "\n") {
		row = strings.TrimSpace(row)
		if row == "" {
			continue
		}
		cols := strings.Split(row, "|")
		if len(cols) < 4 {
			continue
		}

		name := strings.TrimSpace(cols[0])
		reading := strings.TrimSpace(cols[1])
		unit := strings.TrimSpace(cols[2])
		if name == "" {
			continue
		}
		switch reading {
		case "", "na", "N/A":
			continue
		}
		value, err := strconv.ParseFloat(reading, 64)
		if err != nil {
			continue
		}

		// healthRef stays nil when the status column maps to no known state.
		health := normalizeIPMISensorStatus(strings.TrimSpace(cols[3]))
		var healthRef *string
		if health != "" {
			healthRef = &health
		}

		switch {
		case unitIs(unit, "RPM"):
			if duplicateSensor(dedup, "fan", name) {
				continue
			}
			speed := int(value)
			sensors.Fans = append(sensors.Fans, schema.HardwareFanSensor{
				Name:   name,
				RPM:    &speed,
				Status: healthRef,
			})

		case unitIs(unit, "degrees C", "C"):
			if duplicateSensor(dedup, "temp", name) {
				continue
			}
			temp := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
			if len(cols) >= 9 {
				// Column 7 is unc (warning), column 8 is ucr (critical).
				temp.ThresholdWarningCelsius = parseIPMIThreshold(cols[7])
				temp.ThresholdCriticalCelsius = parseIPMIThreshold(cols[8])
			}
			temp.Status = healthRef
			if healthRef == nil {
				temp.Status = deriveTemperatureStatus(temp.Celsius, temp.ThresholdWarningCelsius, temp.ThresholdCriticalCelsius)
			}
			sensors.Temperatures = append(sensors.Temperatures, temp)

		case unitIs(unit, "Volts", "V", "Watts", "W", "Amps", "A"):
			if duplicateSensor(dedup, "power", name) {
				continue
			}
			rail := schema.HardwarePowerSensor{Name: name, Status: healthRef}
			switch {
			case unitIs(unit, "Volts", "V"):
				rail.VoltageV = &value
			case unitIs(unit, "Watts", "W"):
				rail.PowerW = &value
			default:
				rail.CurrentA = &value
			}
			sensors.Power = append(sensors.Power, rail)

		default:
			if duplicateSensor(dedup, "other", name) {
				continue
			}
			misc := schema.HardwareOtherSensor{Name: name, Value: &value, Status: healthRef}
			if unit != "" {
				misc.Unit = &unit
			}
			sensors.Other = append(sensors.Other, misc)
		}
	}

	if len(sensors.Fans)+len(sensors.Temperatures)+len(sensors.Power)+len(sensors.Other) == 0 {
		return nil
	}
	return sensors
}
// parseIPMIThreshold converts one threshold column into a float.
// It returns nil when the trimmed text is empty, "na", "N/A", or not a valid
// floating-point number.
func parseIPMIThreshold(raw string) *float64 {
	switch trimmed := strings.TrimSpace(raw); trimmed {
	case "", "na", "N/A":
		return nil
	default:
		parsed, err := strconv.ParseFloat(trimmed, 64)
		if err != nil {
			return nil
		}
		return &parsed
	}
}
// normalizeIPMISensorStatus maps the status column of `ipmitool sensor` to one
// of the collector's status constants. Matching ignores case and surrounding
// whitespace.
//
//   - "ok" yields statusOK;
//   - critical and non-recoverable states ("cr", "ucr", "lcr", "nr", "unr",
//     "lnr") yield statusCritical, in line with ipmiSELSeverity, which also
//     treats non-recoverable events as critical;
//   - non-critical states ("nc", "unc", "lnc") yield statusWarning;
//   - everything else, including "ns" and "na", yields "" so the caller can
//     leave the status unset.
func normalizeIPMISensorStatus(s string) string {
	switch strings.ToLower(strings.TrimSpace(s)) {
	case "ok":
		return statusOK
	case "cr", "ucr", "lcr", "nr", "unr", "lnr":
		return statusCritical
	case "nc", "unc", "lnc":
		return statusWarning
	default:
		return ""
	}
}
// mergeIPMISensors folds IPMI readings into existing and returns the result.
//
// If ipmi is nil, existing is returned unchanged; if existing is nil, ipmi is
// returned as is. Otherwise every IPMI entry whose name is not already present
// in the same category (fan, temperature, power, other) of existing is
// appended to that category. existing is modified in place. Names are compared
// only against the entries existing held before the merge started.
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
	switch {
	case ipmi == nil:
		return existing
	case existing == nil:
		return ipmi
	}

	// Keys are "<kind>\x00<name>" so equal names in different categories
	// never collide.
	const sep = "\x00"
	known := map[string]struct{}{}
	for i := range existing.Fans {
		known["fan"+sep+existing.Fans[i].Name] = struct{}{}
	}
	for i := range existing.Temperatures {
		known["temp"+sep+existing.Temperatures[i].Name] = struct{}{}
	}
	for i := range existing.Power {
		known["power"+sep+existing.Power[i].Name] = struct{}{}
	}
	for i := range existing.Other {
		known["other"+sep+existing.Other[i].Name] = struct{}{}
	}
	isKnown := func(kind, name string) bool {
		_, hit := known[kind+sep+name]
		return hit
	}

	for i := range ipmi.Fans {
		if !isKnown("fan", ipmi.Fans[i].Name) {
			existing.Fans = append(existing.Fans, ipmi.Fans[i])
		}
	}
	for i := range ipmi.Temperatures {
		if !isKnown("temp", ipmi.Temperatures[i].Name) {
			existing.Temperatures = append(existing.Temperatures, ipmi.Temperatures[i])
		}
	}
	for i := range ipmi.Power {
		if !isKnown("power", ipmi.Power[i].Name) {
			existing.Power = append(existing.Power, ipmi.Power[i])
		}
	}
	for i := range ipmi.Other {
		if !isKnown("other", ipmi.Other[i].Name) {
			existing.Other = append(existing.Other, ipmi.Other[i])
		}
	}
	return existing
}

View File

@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
}

View File

@@ -28,7 +28,8 @@ var (
shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
// Item Name {SHN} = value // comment
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`)
// SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`)
dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
)