Collect IPMI sensors, SEL and dmesg errors into audit JSON and support bundle
- audit JSON: IPMI sensor readings (ipmitool sensor) merged into hardware.sensors alongside lm-sensors data
- audit JSON: IPMI SEL entries (ipmitool sel list) in hardware.event_logs with source "ipmi-sel"
- audit JSON: dmesg error/warning lines in hardware.event_logs with source "dmesg" (filtered by error/warn/AER/Xid/NVRM/ECC/panic patterns)
- support bundle: added ipmitool-sensor.txt, ipmitool-sel.txt, ipmitool-sel-time.txt to techdump
- saa_dmi.go: fix dmiItemRE to accept SHN with parentheses (e.g. PS(4)LC for PSU fields)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -49,7 +49,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
|
||||
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
129
audit/internal/collector/dmesg_events.go
Normal file
129
audit/internal/collector/dmesg_events.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dmesgTimestampRE splits one dmesg line into its leading bracketed prefix
// (capture 1) and the text that follows it (capture 2).
//
// Two prefix shapes are expected:
//
//	[Thu Jun 18 14:23:45 2026] message   (dmesg -T)
//	[ 123.456789] message                (dmesg without -T)
var dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)
|
||||
|
||||
// dmesgErrorPatterns lists the case-insensitive keyword expressions used to
// decide whether a dmesg message is worth recording. A message is kept when
// at least one of them matches.
var dmesgErrorPatterns = compileDmesgPatterns(
	`(?i)\berr(or)?\b`,
	`(?i)\bfail(ed|ure)?\b`,
	`(?i)\bfault\b`,
	`(?i)\bwarn(ing)?\b`,
	`(?i)\bAER\b`,
	`(?i)\bXid\b`,
	`(?i)\bNVRM\b`,
	`(?i)\bpanic\b`,
	`(?i)\bcorrected\b`,
	`(?i)\buncorrect`,
	`(?i)\bECC\b`,
	`(?i)\btimeout\b`,
	`(?i)\breset\b`,
	`(?i)\bdead\b`,
	`(?i)\bhang\b`,
	`(?i)\bstall\b`,
	`(?i)\bdisabled\b`,
)

// compileDmesgPatterns compiles every expression, preserving order. It panics
// on an invalid expression, so it is only meant for package-level literals.
func compileDmesgPatterns(exprs ...string) []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(exprs))
	for i, expr := range exprs {
		compiled[i] = regexp.MustCompile(expr)
	}
	return compiled
}
|
||||
|
||||
// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and
|
||||
// returns only lines that match known error/warning patterns.
|
||||
func collectDmesgErrors() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("dmesg", "-T").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
// Fallback: dmesg without human-readable timestamps
|
||||
out, err = exec.Command("dmesg").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
entries := parseDmesgErrors(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("dmesg: collected error entries", "count", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
func parseDmesgErrors(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var timestamp, message string
|
||||
if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil {
|
||||
timestamp = strings.TrimSpace(m[1])
|
||||
message = strings.TrimSpace(m[2])
|
||||
} else {
|
||||
message = line
|
||||
}
|
||||
|
||||
if message == "" {
|
||||
continue
|
||||
}
|
||||
if !matchesAny(message, dmesgErrorPatterns) {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := dmesgSeverity(message)
|
||||
source := "dmesg"
|
||||
|
||||
var eventTime *string
|
||||
if timestamp != "" {
|
||||
t := timestamp
|
||||
eventTime = &t
|
||||
} else {
|
||||
eventTime = &collectedAt
|
||||
}
|
||||
|
||||
entries = append(entries, schema.HardwareEventLog{
|
||||
Source: source,
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
// matchesAny reports whether s is matched by at least one of patterns.
// It returns false for an empty or nil pattern list.
func matchesAny(s string, patterns []*regexp.Regexp) bool {
	for i := range patterns {
		if !patterns[i].MatchString(s) {
			continue
		}
		return true
	}
	return false
}
|
||||
|
||||
func dmesgSeverity(msg string) string {
|
||||
lower := strings.ToLower(msg)
|
||||
switch {
|
||||
case strings.Contains(lower, "panic") ||
|
||||
strings.Contains(lower, "aer") ||
|
||||
strings.Contains(lower, "uncorrect") ||
|
||||
strings.Contains(lower, "xid") ||
|
||||
strings.Contains(lower, "nvrm"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "error") ||
|
||||
strings.Contains(lower, "fault") ||
|
||||
strings.Contains(lower, "fail") ||
|
||||
strings.Contains(lower, "dead") ||
|
||||
strings.Contains(lower, "hang"):
|
||||
return statusCritical
|
||||
default:
|
||||
return statusWarning
|
||||
}
|
||||
}
|
||||
90
audit/internal/collector/ipmi_sel.go
Normal file
90
audit/internal/collector/ipmi_sel.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries.
|
||||
// Returns nil if ipmitool is unavailable or the SEL is empty.
|
||||
func collectIPMISEL() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("ipmitool", "sel", "list").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := parseIPMISELOutput(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sel: collected", "entries", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
// parseIPMISELOutput parses `ipmitool sel list` output.
|
||||
// Line format: ID | date | time | sensor | event description | direction
|
||||
// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
|
||||
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "|", 6)
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimSpace(parts[0])
|
||||
date := strings.TrimSpace(parts[1])
|
||||
timeStr := strings.TrimSpace(parts[2])
|
||||
sensor := strings.TrimSpace(parts[3])
|
||||
event := strings.TrimSpace(parts[4])
|
||||
direction := ""
|
||||
if len(parts) == 6 {
|
||||
direction = strings.TrimSpace(parts[5])
|
||||
}
|
||||
|
||||
var eventTime *string
|
||||
if date != "" && timeStr != "" {
|
||||
t := fmt.Sprintf("%s %s", date, timeStr)
|
||||
eventTime = &t
|
||||
}
|
||||
|
||||
message := event
|
||||
if direction != "" && strings.EqualFold(direction, "Deasserted") {
|
||||
message = event + " (Deasserted)"
|
||||
}
|
||||
|
||||
severity := ipmiSELSeverity(event)
|
||||
isActive := !strings.EqualFold(direction, "Deasserted")
|
||||
|
||||
entry := schema.HardwareEventLog{
|
||||
Source: "ipmi-sel",
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
MessageID: &id,
|
||||
Message: message,
|
||||
IsActive: &isActive,
|
||||
}
|
||||
if sensor != "" {
|
||||
entry.ComponentRef = &sensor
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func ipmiSELSeverity(event string) string {
|
||||
lower := strings.ToLower(event)
|
||||
switch {
|
||||
case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"):
|
||||
return statusWarning
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
216
audit/internal/collector/ipmi_sensors.go
Normal file
216
audit/internal/collector/ipmi_sensors.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package collector
|
||||
|
||||
import (
	"log/slog"
	"math"
	"os/exec"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)
|
||||
|
||||
// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings.
|
||||
// Returns nil if ipmitool is unavailable or produces no output.
|
||||
func collectIPMISensors() *schema.HardwareSensors {
|
||||
out, err := exec.Command("ipmitool", "sensor").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := parseIPMISensorOutput(string(out))
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sensors: collected",
|
||||
"fans", len(result.Fans),
|
||||
"temperatures", len(result.Temperatures),
|
||||
"power", len(result.Power),
|
||||
"other", len(result.Other),
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMISensorOutput parses `ipmitool sensor` text output.
|
||||
// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
|
||||
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 4 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
rawVal := strings.TrimSpace(parts[1])
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
status := strings.TrimSpace(parts[3])
|
||||
|
||||
if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseFloat(rawVal, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr := normalizeIPMISensorStatus(status)
|
||||
|
||||
switch {
|
||||
case strings.EqualFold(unit, "RPM"):
|
||||
if duplicateSensor(seen, "fan", name) {
|
||||
continue
|
||||
}
|
||||
rpm := int(value)
|
||||
item := schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Fans = append(result.Fans, item)
|
||||
|
||||
case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"):
|
||||
if duplicateSensor(seen, "temp", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
|
||||
if len(parts) >= 9 {
|
||||
if unc := parseIPMIThreshold(parts[7]); unc != nil {
|
||||
item.ThresholdWarningCelsius = unc
|
||||
}
|
||||
if ucr := parseIPMIThreshold(parts[8]); ucr != nil {
|
||||
item.ThresholdCriticalCelsius = ucr
|
||||
}
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, item)
|
||||
|
||||
case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, VoltageV: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, PowerW: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, CurrentA: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
default:
|
||||
if duplicateSensor(seen, "other", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Other = append(result.Other, item)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMIThreshold converts one threshold column of `ipmitool sensor`
// output into a number.
//
// It returns nil when the trimmed text is empty, is a placeholder such as
// "na" or "N/A", is otherwise not numeric, or parses to NaN or an infinity.
// strconv.ParseFloat accepts spellings like "NaN" and "Inf" without error, so
// those are filtered out here rather than passed on as thresholds.
func parseIPMIThreshold(raw string) *float64 {
	v, err := strconv.ParseFloat(strings.TrimSpace(raw), 64)
	if err != nil || math.IsNaN(v) || math.IsInf(v, 0) {
		return nil
	}
	return &v
}
|
||||
|
||||
func normalizeIPMISensorStatus(s string) string {
|
||||
switch strings.ToLower(s) {
|
||||
case "ok":
|
||||
return statusOK
|
||||
case "cr", "ucr", "lcr":
|
||||
return statusCritical
|
||||
case "nc", "unc", "lnc", "nr", "unr", "lnr":
|
||||
return statusWarning
|
||||
case "ns", "na":
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present.
|
||||
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
|
||||
if ipmi == nil {
|
||||
return existing
|
||||
}
|
||||
if existing == nil {
|
||||
return ipmi
|
||||
}
|
||||
|
||||
existingNames := map[string]struct{}{}
|
||||
for _, s := range existing.Fans {
|
||||
existingNames["fan\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Temperatures {
|
||||
existingNames["temp\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Power {
|
||||
existingNames["power\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Other {
|
||||
existingNames["other\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, s := range ipmi.Fans {
|
||||
if _, ok := existingNames["fan\x00"+s.Name]; !ok {
|
||||
existing.Fans = append(existing.Fans, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Temperatures {
|
||||
if _, ok := existingNames["temp\x00"+s.Name]; !ok {
|
||||
existing.Temperatures = append(existing.Temperatures, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Power {
|
||||
if _, ok := existingNames["power\x00"+s.Name]; !ok {
|
||||
existing.Power = append(existing.Power, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Other {
|
||||
if _, ok := existingNames["other\x00"+s.Name]; !ok {
|
||||
existing.Other = append(existing.Other, s)
|
||||
}
|
||||
}
|
||||
return existing
|
||||
}
|
||||
@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
|
||||
@@ -28,7 +28,8 @@ var (
|
||||
shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
|
||||
dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
|
||||
// Item Name {SHN} = value // comment
|
||||
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`)
|
||||
// SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields
|
||||
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`)
|
||||
dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user