diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go index e91434e..c551054 100644 --- a/audit/internal/collector/collector.go +++ b/audit/internal/collector/collector.go @@ -49,7 +49,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest { snap.VROCLicense = collectVROCLicense(snap.PCIeDevices) snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer)) snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc) - snap.Sensors = buildSensorsFromDoc(sensorDoc) + snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors()) + snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...) finalizeSnapshot(&snap, collectedAt) // remaining collectors added in steps 1.8 – 1.10 diff --git a/audit/internal/collector/dmesg_events.go b/audit/internal/collector/dmesg_events.go new file mode 100644 index 0000000..7956ed5 --- /dev/null +++ b/audit/internal/collector/dmesg_events.go @@ -0,0 +1,129 @@ +package collector + +import ( + "bee/audit/internal/schema" + "log/slog" + "os/exec" + "regexp" + "strings" + "time" +) + +// dmesg -T output: [Thu Jun 18 14:23:45 2026] message +// dmesg without -T: [ 123.456789] message +var dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`) + +// Keywords that indicate an error or hardware problem worth capturing. +var dmesgErrorPatterns = []*regexp.Regexp{ + regexp.MustCompile(`(?i)\berr(or)?\b`), + regexp.MustCompile(`(?i)\bfail(ed|ure)?\b`), + regexp.MustCompile(`(?i)\bfault\b`), + regexp.MustCompile(`(?i)\bwarn(ing)?\b`), + regexp.MustCompile(`(?i)\bAER\b`), + regexp.MustCompile(`(?i)\bXid\b`), + regexp.MustCompile(`(?i)\bNVRM\b`), + regexp.MustCompile(`(?i)\bpanic\b`), + regexp.MustCompile(`(?i)\bcorrected\b`), + regexp.MustCompile(`(?i)\buncorrect`), + regexp.MustCompile(`(?i)\bECC\b`), + regexp.MustCompile(`(?i)\btimeout\b`), + regexp.MustCompile(`(?i)\breset\b`), + regexp.MustCompile(`(?i)\bdead\b`), + regexp.MustCompile(`(?i)\bhang\b`), + regexp.MustCompile(`(?i)\bstall\b`), + regexp.MustCompile(`(?i)\bdisabled\b`), +} + +// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and +// returns only lines that match known error/warning patterns. +func collectDmesgErrors() []schema.HardwareEventLog { + out, err := exec.Command("dmesg", "-T").Output() + if err != nil || len(out) == 0 { + // Fallback: dmesg without human-readable timestamps + out, err = exec.Command("dmesg").Output() + if err != nil || len(out) == 0 { + return nil + } + } + entries := parseDmesgErrors(string(out)) + if len(entries) == 0 { + return nil + } + slog.Info("dmesg: collected error entries", "count", len(entries)) + return entries +} + +func parseDmesgErrors(output string) []schema.HardwareEventLog { + var entries []schema.HardwareEventLog + collectedAt := time.Now().UTC().Format(time.RFC3339) + + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + + var timestamp, message string + if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil { + timestamp = strings.TrimSpace(m[1]) + message = strings.TrimSpace(m[2]) + } else { + message = line + } + + if message == "" { + continue + } + if !matchesAny(message, dmesgErrorPatterns) { + continue + } + + severity := dmesgSeverity(message) + source := "dmesg" + + var eventTime *string + if timestamp != "" { + t := timestamp + eventTime = &t + } else { + eventTime = &collectedAt + } + + entries = append(entries, schema.HardwareEventLog{ + Source: source, + EventTime: eventTime, + Severity: &severity, + Message: message, + }) + } + return entries +} + +func matchesAny(s string, patterns []*regexp.Regexp) bool { + for _, p := range patterns { + if p.MatchString(s) { + return true + } + } + return false +} + +func dmesgSeverity(msg string) string { + lower := strings.ToLower(msg) + switch { + case strings.Contains(lower, "panic") || + strings.Contains(lower, "aer") || + strings.Contains(lower, "uncorrect") || + strings.Contains(lower, "xid") || + strings.Contains(lower, "nvrm"): + return statusCritical + case strings.Contains(lower, "error") || + strings.Contains(lower, "fault") || + strings.Contains(lower, "fail") || + strings.Contains(lower, "dead") || + strings.Contains(lower, "hang"): + return statusCritical + default: + return statusWarning + } +} diff --git a/audit/internal/collector/ipmi_sel.go b/audit/internal/collector/ipmi_sel.go new file mode 100644 index 0000000..849c5b4 --- /dev/null +++ b/audit/internal/collector/ipmi_sel.go @@ -0,0 +1,90 @@ +package collector + +import ( + "bee/audit/internal/schema" + "fmt" + "log/slog" + "os/exec" + "strings" +) + +// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries. +// Returns nil if ipmitool is unavailable or the SEL is empty. +func collectIPMISEL() []schema.HardwareEventLog { + out, err := exec.Command("ipmitool", "sel", "list").Output() + if err != nil || len(out) == 0 { + return nil + } + entries := parseIPMISELOutput(string(out)) + if len(entries) == 0 { + return nil + } + slog.Info("ipmi sel: collected", "entries", len(entries)) + return entries +} + +// parseIPMISELOutput parses `ipmitool sel list` output. +// Line format: ID | date | time | sensor | event description | direction +// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted +func parseIPMISELOutput(output string) []schema.HardwareEventLog { + var entries []schema.HardwareEventLog + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.SplitN(line, "|", 6) + if len(parts) < 5 { + continue + } + id := strings.TrimSpace(parts[0]) + date := strings.TrimSpace(parts[1]) + timeStr := strings.TrimSpace(parts[2]) + sensor := strings.TrimSpace(parts[3]) + event := strings.TrimSpace(parts[4]) + direction := "" + if len(parts) == 6 { + direction = strings.TrimSpace(parts[5]) + } + + var eventTime *string + if date != "" && timeStr != "" { + t := fmt.Sprintf("%s %s", date, timeStr) + eventTime = &t + } + + message := event + if direction != "" && strings.EqualFold(direction, "Deasserted") { + message = event + " (Deasserted)" + } + + severity := ipmiSELSeverity(event) + isActive := !strings.EqualFold(direction, "Deasserted") + + entry := schema.HardwareEventLog{ + Source: "ipmi-sel", + EventTime: eventTime, + Severity: &severity, + MessageID: &id, + Message: message, + IsActive: &isActive, + } + if sensor != "" { + entry.ComponentRef = &sensor + } + entries = append(entries, entry) + } + return entries +} + +func ipmiSELSeverity(event string) string { + lower := strings.ToLower(event) + switch { + case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"): + return statusCritical + case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"): + return statusWarning + default: + return "info" + } +} diff --git a/audit/internal/collector/ipmi_sensors.go b/audit/internal/collector/ipmi_sensors.go new file mode 100644 index 0000000..333ea94 --- /dev/null +++ b/audit/internal/collector/ipmi_sensors.go @@ -0,0 +1,216 @@ +package collector + +import ( + "bee/audit/internal/schema" + "log/slog" + "os/exec" + "strconv" + "strings" +) + +// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings. +// Returns nil if ipmitool is unavailable or produces no output. +func collectIPMISensors() *schema.HardwareSensors { + out, err := exec.Command("ipmitool", "sensor").Output() + if err != nil || len(out) == 0 { + return nil + } + result := parseIPMISensorOutput(string(out)) + if result == nil { + return nil + } + slog.Info("ipmi sensors: collected", + "fans", len(result.Fans), + "temperatures", len(result.Temperatures), + "power", len(result.Power), + "other", len(result.Other), + ) + return result +} + +// parseIPMISensorOutput parses `ipmitool sensor` text output. +// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr +func parseIPMISensorOutput(output string) *schema.HardwareSensors { + result := &schema.HardwareSensors{} + seen := map[string]struct{}{} + + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.Split(line, "|") + if len(parts) < 4 { + continue + } + name := strings.TrimSpace(parts[0]) + rawVal := strings.TrimSpace(parts[1]) + unit := strings.TrimSpace(parts[2]) + status := strings.TrimSpace(parts[3]) + + if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" { + continue + } + + value, err := strconv.ParseFloat(rawVal, 64) + if err != nil { + continue + } + + statusStr := normalizeIPMISensorStatus(status) + + switch { + case strings.EqualFold(unit, "RPM"): + if duplicateSensor(seen, "fan", name) { + continue + } + rpm := int(value) + item := schema.HardwareFanSensor{Name: name, RPM: &rpm} + if statusStr != "" { + item.Status = &statusStr + } + result.Fans = append(result.Fans, item) + + case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"): + if duplicateSensor(seen, "temp", name) { + continue + } + item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value} + if len(parts) >= 9 { + if unc := parseIPMIThreshold(parts[7]); unc != nil { + item.ThresholdWarningCelsius = unc + } + if ucr := parseIPMIThreshold(parts[8]); ucr != nil { + item.ThresholdCriticalCelsius = ucr + } + } + if statusStr != "" { + item.Status = &statusStr + } else { + item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius) + } + result.Temperatures = append(result.Temperatures, item) + + case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"): + if duplicateSensor(seen, "power", name) { + continue + } + item := schema.HardwarePowerSensor{Name: name, VoltageV: &value} + if statusStr != "" { + item.Status = &statusStr + } + result.Power = append(result.Power, item) + + case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"): + if duplicateSensor(seen, "power", name) { + continue + } + item := schema.HardwarePowerSensor{Name: name, PowerW: &value} + if statusStr != "" { + item.Status = &statusStr + } + result.Power = append(result.Power, item) + + case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"): + if duplicateSensor(seen, "power", name) { + continue + } + item := schema.HardwarePowerSensor{Name: name, CurrentA: &value} + if statusStr != "" { + item.Status = &statusStr + } + result.Power = append(result.Power, item) + + default: + if duplicateSensor(seen, "other", name) { + continue + } + item := schema.HardwareOtherSensor{Name: name, Value: &value} + if unit != "" { + item.Unit = &unit + } + if statusStr != "" { + item.Status = &statusStr + } + result.Other = append(result.Other, item) + } + } + + if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 { + return nil + } + return result +} + +func parseIPMIThreshold(raw string) *float64 { + s := strings.TrimSpace(raw) + if s == "" || s == "na" || s == "N/A" { + return nil + } + v, err := strconv.ParseFloat(s, 64) + if err != nil { + return nil + } + return &v +} + +func normalizeIPMISensorStatus(s string) string { + switch strings.ToLower(s) { + case "ok": + return statusOK + case "cr", "ucr", "lcr": + return statusCritical + case "nc", "unc", "lnc", "nr", "unr", "lnr": + return statusWarning + case "ns", "na": + return "" + default: + return "" + } +} + +// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present. +func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors { + if ipmi == nil { + return existing + } + if existing == nil { + return ipmi + } + + existingNames := map[string]struct{}{} + for _, s := range existing.Fans { + existingNames["fan\x00"+s.Name] = struct{}{} + } + for _, s := range existing.Temperatures { + existingNames["temp\x00"+s.Name] = struct{}{} + } + for _, s := range existing.Power { + existingNames["power\x00"+s.Name] = struct{}{} + } + for _, s := range existing.Other { + existingNames["other\x00"+s.Name] = struct{}{} + } + + for _, s := range ipmi.Fans { + if _, ok := existingNames["fan\x00"+s.Name]; !ok { + existing.Fans = append(existing.Fans, s) + } + } + for _, s := range ipmi.Temperatures { + if _, ok := existingNames["temp\x00"+s.Name]; !ok { + existing.Temperatures = append(existing.Temperatures, s) + } + } + for _, s := range ipmi.Power { + if _, ok := existingNames["power\x00"+s.Name]; !ok { + existing.Power = append(existing.Power, s) + } + } + for _, s := range ipmi.Other { + if _, ok := existingNames["other\x00"+s.Name]; !ok { + existing.Other = append(existing.Other, s) + } + } + return existing +} diff --git a/audit/internal/platform/techdump.go b/audit/internal/platform/techdump.go index f79f928..7cfe457 100644 --- a/audit/internal/platform/techdump.go +++ b/audit/internal/platform/techdump.go @@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct { {Name: "sensors", Args: []string{"-j"}, File: "sensors.json"}, {Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"}, {Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"}, + {Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"}, + {Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"}, + {Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"}, {Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"}, } diff --git a/audit/internal/webui/saa_dmi.go b/audit/internal/webui/saa_dmi.go index 1947640..e10421f 100644 --- a/audit/internal/webui/saa_dmi.go +++ b/audit/internal/webui/saa_dmi.go @@ -28,7 +28,8 @@ var ( shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`) dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`) // Item Name {SHN} = value // comment - dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`) + // SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields + dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`) dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`) )