feat: Redfish hardware event log collection + MSI ghost GPU filter + inventory improvements
- Collect hardware event logs (last 7 days) from Systems and Managers/SEL LogServices - Parse AMI raw IPMI dump messages into readable descriptions (Sensor_Type: Event_Type) - Filter out audit/journal/non-hardware log services; only SEL from Managers - MSI ghost GPU filter: exclude processor GPU entries with temperature=0 when host is powered on - Reanimator collected_at uses InventoryData/Status.LastModifiedTime (30-day fallback) - Invalidate Redfish inventory CRC groups before host power-on - Log inventory LastModifiedTime age in collection logs - Drop SecureBoot collection (SecureBootMode, SecureBootDatabases) — not hardware inventory - Add build version to UI footer via template - Add MSI Redfish API reference doc to bible-local/docs/ ADL-032–ADL-035 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
392
internal/collector/redfish_logentries.go
Normal file
392
internal/collector/redfish_logentries.go
Normal file
@@ -0,0 +1,392 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// Limits applied while collecting Redfish hardware log entries.
const (
	// redfishLogEntriesWindow is how far back from the collection time
	// log entries are still considered relevant (7 days).
	redfishLogEntriesWindow = time.Hour * 24 * 7

	// redfishLogEntriesMaxPerSvc is the entry limit handed to the pager for
	// a single log service.
	redfishLogEntriesMaxPerSvc = 200

	// redfishLogEntriesMaxTotal bounds the number of entries gathered across
	// all log services of one collection run.
	redfishLogEntriesMaxTotal = 500
)
|
||||
|
||||
// collectRedfishLogEntries fetches hardware event log entries from Systems and Managers LogServices.
|
||||
// Only hardware-relevant entries from the last 7 days are returned.
|
||||
// For Systems: all log services except audit/journal/security/debug.
|
||||
// For Managers: only the IPMI SEL service (Id="SEL") — audit and event logs are excluded.
|
||||
func (c *RedfishConnector) collectRedfishLogEntries(ctx context.Context, client *http.Client, req Request, baseURL string, systemPaths, managerPaths []string) []map[string]interface{} {
|
||||
cutoff := time.Now().UTC().Add(-redfishLogEntriesWindow)
|
||||
seen := make(map[string]struct{})
|
||||
var out []map[string]interface{}
|
||||
|
||||
collectFrom := func(logServicesPath string, filter func(map[string]interface{}) bool) {
|
||||
if len(out) >= redfishLogEntriesMaxTotal {
|
||||
return
|
||||
}
|
||||
services, err := c.getCollectionMembers(ctx, client, req, baseURL, logServicesPath)
|
||||
if err != nil || len(services) == 0 {
|
||||
return
|
||||
}
|
||||
for _, svc := range services {
|
||||
if len(out) >= redfishLogEntriesMaxTotal {
|
||||
break
|
||||
}
|
||||
if !filter(svc) {
|
||||
continue
|
||||
}
|
||||
entriesPath := redfishLogServiceEntriesPath(svc)
|
||||
if entriesPath == "" {
|
||||
continue
|
||||
}
|
||||
entries := c.fetchRedfishLogEntriesWithPaging(ctx, client, req, baseURL, entriesPath, cutoff, seen, redfishLogEntriesMaxPerSvc)
|
||||
out = append(out, entries...)
|
||||
}
|
||||
}
|
||||
|
||||
for _, systemPath := range systemPaths {
|
||||
collectFrom(joinPath(systemPath, "/LogServices"), isHardwareLogService)
|
||||
}
|
||||
// Managers hold the IPMI SEL on AMI/MSI BMCs — include only the "SEL" service.
|
||||
for _, managerPath := range managerPaths {
|
||||
collectFrom(joinPath(managerPath, "/LogServices"), isManagerSELService)
|
||||
}
|
||||
|
||||
if len(out) > 0 {
|
||||
log.Printf("redfish: collected %d hardware log entries (Systems+Managers SEL, window=7d)", len(out))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// fetchRedfishLogEntriesWithPaging fetches entries from a LogEntry collection,
|
||||
// following nextLink pages. Stops early when entries older than cutoff are encountered
|
||||
// (assumes BMC returns entries newest-first, which is typical).
|
||||
func (c *RedfishConnector) fetchRedfishLogEntriesWithPaging(ctx context.Context, client *http.Client, req Request, baseURL, entriesPath string, cutoff time.Time, seen map[string]struct{}, limit int) []map[string]interface{} {
|
||||
var out []map[string]interface{}
|
||||
nextPath := entriesPath
|
||||
|
||||
for nextPath != "" && len(out) < limit {
|
||||
collection, err := c.getJSON(ctx, client, req, baseURL, nextPath)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
// Handle both linked members (@odata.id only) and inline members (full objects).
|
||||
rawMembers, _ := collection["Members"].([]interface{})
|
||||
hitOldEntry := false
|
||||
|
||||
for _, rawMember := range rawMembers {
|
||||
if len(out) >= limit {
|
||||
break
|
||||
}
|
||||
memberMap, ok := rawMember.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
var entry map[string]interface{}
|
||||
if _, hasCreated := memberMap["Created"]; hasCreated {
|
||||
// Inline entry — use directly.
|
||||
entry = memberMap
|
||||
} else {
|
||||
// Linked entry — fetch by path.
|
||||
memberPath := normalizeRedfishPath(asString(memberMap["@odata.id"]))
|
||||
if memberPath == "" {
|
||||
continue
|
||||
}
|
||||
entry, err = c.getJSON(ctx, client, req, baseURL, memberPath)
|
||||
if err != nil || len(entry) == 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Dedup by entry Id or path.
|
||||
entryKey := asString(entry["Id"])
|
||||
if entryKey == "" {
|
||||
entryKey = asString(entry["@odata.id"])
|
||||
}
|
||||
if entryKey != "" {
|
||||
if _, dup := seen[entryKey]; dup {
|
||||
continue
|
||||
}
|
||||
seen[entryKey] = struct{}{}
|
||||
}
|
||||
|
||||
// Time filter.
|
||||
created := parseRedfishEntryTime(asString(entry["Created"]))
|
||||
if !created.IsZero() && created.Before(cutoff) {
|
||||
hitOldEntry = true
|
||||
continue
|
||||
}
|
||||
|
||||
// Hardware relevance filter.
|
||||
if !isHardwareLogEntry(entry) {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, entry)
|
||||
}
|
||||
|
||||
// Stop paging once we've seen entries older than the window.
|
||||
if hitOldEntry {
|
||||
break
|
||||
}
|
||||
nextPath = firstNonEmpty(
|
||||
normalizeRedfishPath(asString(collection["Members@odata.nextLink"])),
|
||||
normalizeRedfishPath(asString(collection["@odata.nextLink"])),
|
||||
)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isManagerSELService returns true only for the IPMI SEL exposed under Managers.
|
||||
// On AMI/MSI BMCs the hardware SEL lives at Managers/{mgr}/LogServices/SEL.
|
||||
// All other Manager log services (AuditLog, EventLog, Journal) are excluded.
|
||||
func isManagerSELService(svc map[string]interface{}) bool {
|
||||
id := strings.ToLower(strings.TrimSpace(asString(svc["Id"])))
|
||||
return id == "sel"
|
||||
}
|
||||
|
||||
// isHardwareLogService returns true if the log service looks like a hardware event log
|
||||
// (SEL, System Event Log) rather than a BMC audit/journal log.
|
||||
func isHardwareLogService(svc map[string]interface{}) bool {
|
||||
id := strings.ToLower(strings.TrimSpace(asString(svc["Id"])))
|
||||
name := strings.ToLower(strings.TrimSpace(asString(svc["Name"])))
|
||||
for _, skip := range []string{"audit", "journal", "bmc", "security", "manager", "debug"} {
|
||||
if strings.Contains(id, skip) || strings.Contains(name, skip) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// redfishLogServiceEntriesPath returns the Entries collection path for a LogService document.
|
||||
func redfishLogServiceEntriesPath(svc map[string]interface{}) string {
|
||||
if entriesLink, ok := svc["Entries"].(map[string]interface{}); ok {
|
||||
if p := normalizeRedfishPath(asString(entriesLink["@odata.id"])); p != "" {
|
||||
return p
|
||||
}
|
||||
}
|
||||
if id := normalizeRedfishPath(asString(svc["@odata.id"])); id != "" {
|
||||
return joinPath(id, "/Entries")
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// isHardwareLogEntry returns true if the log entry is hardware-related.
|
||||
// Audit, authentication, and session events are excluded.
|
||||
func isHardwareLogEntry(entry map[string]interface{}) bool {
|
||||
entryType := strings.TrimSpace(asString(entry["EntryType"]))
|
||||
if strings.EqualFold(entryType, "Oem") {
|
||||
return false
|
||||
}
|
||||
|
||||
msgID := strings.ToLower(strings.TrimSpace(asString(entry["MessageId"])))
|
||||
for _, skip := range []string{
|
||||
"user", "account", "password", "login", "logon", "session",
|
||||
"auth", "certificate", "security", "credential", "privilege",
|
||||
} {
|
||||
if strings.Contains(msgID, skip) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
// Also check the human-readable message for obvious audit patterns.
|
||||
msg := strings.ToLower(strings.TrimSpace(asString(entry["Message"])))
|
||||
for _, skip := range []string{"logged in", "logged out", "log in", "log out", "sign in", "signed in"} {
|
||||
if strings.Contains(msg, skip) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// parseRedfishEntryTime parses a Redfish LogEntry timestamp (ISO 8601 /
// RFC 3339) and returns it converted to UTC.
//
// Surrounding whitespace is ignored. Accepted forms are:
//   - RFC 3339, with or without fractional seconds ("2024-03-01T10:20:30Z",
//     "2024-03-01T13:20:30.5+03:00");
//   - the same with a colon-less offset ("2024-03-01T13:20:30+0300");
//   - no zone designator at all ("2024-03-01T10:20:30"), which is interpreted
//     as UTC.
//
// The zero time.Time is returned for empty or unparseable input; callers use
// IsZero to detect that.
func parseRedfishEntryTime(raw string) time.Time {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return time.Time{}
	}
	// time.Parse accepts a fractional-seconds field after the seconds even
	// when the layout has none, so no separate "Nano" layouts are needed.
	layouts := [...]string{
		time.RFC3339,
		"2006-01-02T15:04:05Z0700",
		"2006-01-02T15:04:05",
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, raw); err == nil {
			return t.UTC()
		}
	}
	return time.Time{}
}
|
||||
|
||||
// parseRedfishLogEntries converts raw log entries stored in RawPayloads into models.Event slice.
|
||||
// Called during Redfish replay for both live and offline (archive) collections.
|
||||
func parseRedfishLogEntries(rawPayloads map[string]any, collectedAt time.Time) []models.Event {
|
||||
raw, ok := rawPayloads["redfish_log_entries"]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
var entries []map[string]interface{}
|
||||
switch v := raw.(type) {
|
||||
case []map[string]interface{}:
|
||||
entries = v
|
||||
case []interface{}:
|
||||
for _, item := range v {
|
||||
if m, ok := item.(map[string]interface{}); ok {
|
||||
entries = append(entries, m)
|
||||
}
|
||||
}
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
out := make([]models.Event, 0, len(entries))
|
||||
for _, entry := range entries {
|
||||
ev := redfishLogEntryToEvent(entry, collectedAt)
|
||||
if ev == nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, *ev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// redfishLogEntryToEvent converts a single Redfish LogEntry document to models.Event.
|
||||
func redfishLogEntryToEvent(entry map[string]interface{}, collectedAt time.Time) *models.Event {
|
||||
// Prefer EventTimestamp (actual hardware event time) over Created (Redfish record creation time).
|
||||
ts := parseRedfishEntryTime(asString(entry["EventTimestamp"]))
|
||||
if ts.IsZero() {
|
||||
ts = parseRedfishEntryTime(asString(entry["Created"]))
|
||||
}
|
||||
if ts.IsZero() {
|
||||
ts = collectedAt
|
||||
}
|
||||
|
||||
severity := redfishLogEntrySeverity(entry)
|
||||
sensorType := strings.TrimSpace(asString(entry["SensorType"]))
|
||||
messageID := strings.TrimSpace(asString(entry["MessageId"]))
|
||||
entryType := strings.TrimSpace(asString(entry["EntryType"]))
|
||||
entryCode := strings.TrimSpace(asString(entry["EntryCode"]))
|
||||
|
||||
// SensorName: prefer "Name", fall back to "SensorNumber" + SensorType.
|
||||
sensorName := strings.TrimSpace(asString(entry["Name"]))
|
||||
if sensorName == "" {
|
||||
num := strings.TrimSpace(asString(entry["SensorNumber"]))
|
||||
if num != "" && sensorType != "" {
|
||||
sensorName = sensorType + " " + num
|
||||
}
|
||||
}
|
||||
|
||||
rawMessage := strings.TrimSpace(asString(entry["Message"]))
|
||||
|
||||
// AMI/MSI BMCs dump raw IPMI record fields into Message instead of human-readable text.
|
||||
// Detect this and build a readable description from structured fields instead.
|
||||
description, rawData := redfishDecodeMessage(rawMessage, sensorType, entryCode, entry)
|
||||
if description == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &models.Event{
|
||||
ID: messageID,
|
||||
Timestamp: ts,
|
||||
Source: "redfish",
|
||||
SensorType: sensorType,
|
||||
SensorName: sensorName,
|
||||
EventType: entryType,
|
||||
Severity: severity,
|
||||
Description: description,
|
||||
RawData: rawData,
|
||||
}
|
||||
}
|
||||
|
||||
// redfishDecodeMessage returns a human-readable description and optional raw data.
|
||||
// AMI/MSI BMCs dump raw IPMI record fields into Message as "Key : Value, Key : Value, ..."
|
||||
// instead of a plain human-readable string. We extract the useful decoded fields from it.
|
||||
func redfishDecodeMessage(message, sensorType, entryCode string, entry map[string]interface{}) (description, rawData string) {
|
||||
if !isRawIPMIDump(message) {
|
||||
description = message
|
||||
return
|
||||
}
|
||||
|
||||
rawData = message
|
||||
kv := parseIPMIDumpKV(message)
|
||||
|
||||
// Sensor_Type inside the dump is more specific than the top-level SensorType field.
|
||||
if v := kv["Sensor_Type"]; v != "" {
|
||||
sensorType = v
|
||||
}
|
||||
eventType := kv["Event_Type"] // human-readable IPMI event type, e.g. "Legacy OFF State"
|
||||
|
||||
var parts []string
|
||||
if sensorType != "" {
|
||||
parts = append(parts, sensorType)
|
||||
}
|
||||
if eventType != "" {
|
||||
parts = append(parts, eventType)
|
||||
} else if entryCode != "" {
|
||||
parts = append(parts, entryCode)
|
||||
}
|
||||
description = strings.Join(parts, ": ")
|
||||
return
|
||||
}
|
||||
|
||||
// isRawIPMIDump reports whether message is an AMI raw IPMI record dump, which
// is recognised by the presence of both the "Event_Data_1 :" and
// "Record_Type :" field markers.
func isRawIPMIDump(message string) bool {
	for _, marker := range [...]string{"Event_Data_1 :", "Record_Type :"} {
		if !strings.Contains(message, marker) {
			return false
		}
	}
	return true
}
|
||||
|
||||
// parseIPMIDumpKV splits an AMI "Key : Value, Key : Value, " dump into a map.
//
// The message is cut at every comma; each piece is trimmed and divided at its
// first " : " separator. Pieces without the separator, or with an empty key
// or value, are ignored. When a key occurs more than once the last value
// wins. The returned map is never nil.
func parseIPMIDumpKV(message string) map[string]string {
	fields := make(map[string]string)
	for _, segment := range strings.Split(message, ",") {
		rawKey, rawValue, found := strings.Cut(strings.TrimSpace(segment), " : ")
		if !found {
			continue
		}
		key, value := strings.TrimSpace(rawKey), strings.TrimSpace(rawValue)
		if key == "" || value == "" {
			continue
		}
		fields[key] = value
	}
	return fields
}
|
||||
|
||||
// redfishLogEntrySeverity maps a Redfish LogEntry to models.Severity.
|
||||
// AMI/MSI BMCs often set Severity="OK" on all SEL records regardless of content,
|
||||
// so we fall back to inferring severity from SensorType when the explicit field is unhelpful.
|
||||
func redfishLogEntrySeverity(entry map[string]interface{}) models.Severity {
|
||||
// Newer Redfish uses MessageSeverity; older uses Severity.
|
||||
raw := strings.ToLower(firstNonEmpty(
|
||||
strings.TrimSpace(asString(entry["MessageSeverity"])),
|
||||
strings.TrimSpace(asString(entry["Severity"])),
|
||||
))
|
||||
switch raw {
|
||||
case "critical":
|
||||
return models.SeverityCritical
|
||||
case "warning":
|
||||
return models.SeverityWarning
|
||||
case "ok", "informational", "":
|
||||
// BMC didn't set a meaningful severity — infer from SensorType.
|
||||
return redfishSeverityFromSensorType(strings.TrimSpace(asString(entry["SensorType"])))
|
||||
default:
|
||||
return models.SeverityInfo
|
||||
}
|
||||
}
|
||||
|
||||
// redfishSeverityFromSensorType infers event severity from the IPMI/Redfish SensorType string.
|
||||
func redfishSeverityFromSensorType(sensorType string) models.Severity {
|
||||
switch strings.ToLower(sensorType) {
|
||||
case "critical interrupt", "processor", "memory", "power unit",
|
||||
"power supply", "drive slot", "system firmware progress":
|
||||
return models.SeverityWarning
|
||||
default:
|
||||
return models.SeverityInfo
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user