feat: Redfish hardware event log collection + MSI ghost GPU filter + inventory improvements

- Collect hardware event logs (last 7 days) from Systems and Managers/SEL LogServices
- Parse AMI raw IPMI dump messages into readable descriptions (Sensor_Type: Event_Type)
- Filter out audit/journal/non-hardware log services; only SEL from Managers
- MSI ghost GPU filter: exclude processor GPU entries with temperature=0 when host is powered on
- Reanimator collected_at uses InventoryData/Status.LastModifiedTime (30-day fallback)
- Invalidate Redfish inventory CRC groups before host power-on
- Log inventory LastModifiedTime age in collection logs
- Drop SecureBoot collection (SecureBootMode, SecureBootDatabases) — not hardware inventory
- Add build version to UI footer via template
- Add MSI Redfish API reference doc to bible-local/docs/

ADL-032–ADL-035

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-18 23:47:22 +03:00
parent 30409eef67
commit 96e65d8f65
15 changed files with 989 additions and 13 deletions

View File

@@ -311,6 +311,8 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
if emit != nil {
emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."})
}
// Collect hardware event logs separately (not part of tree-walk to avoid bloat).
rawLogEntries := c.collectRedfishLogEntries(withRedfishTelemetryPhase(ctx, "log_entries"), snapshotClient, req, baseURL, systemPaths, managerPaths)
rawPayloads := map[string]any{
"redfish_tree": rawTree,
"redfish_profiles": map[string]any{
@@ -413,12 +415,21 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
if len(fetchErrMap) > 0 {
rawPayloads["redfish_fetch_errors"] = redfishFetchErrorMapToList(fetchErrMap)
}
if len(rawLogEntries) > 0 {
rawPayloads["redfish_log_entries"] = rawLogEntries
}
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
result, err := ReplayRedfishFromRawPayloads(rawPayloads, nil)
if err != nil {
return nil, err
}
totalElapsed := time.Since(collectStart).Round(time.Second)
if !result.InventoryLastModifiedAt.IsZero() {
log.Printf("redfish-collect: inventory last modified at %s (age: %s)",
result.InventoryLastModifiedAt.Format(time.RFC3339),
time.Since(result.InventoryLastModifiedAt).Round(time.Minute),
)
}
log.Printf(
"redfish-postprobe-metrics: nvme_candidates=%d nvme_selected=%d nvme_added=%d candidates=%d selected=%d skipped_explicit=%d added=%d dur=%s",
postProbeMetrics.NVMECandidates,
@@ -495,6 +506,11 @@ func (c *RedfishConnector) ensureHostPowerForCollection(ctx context.Context, cli
return false, false
}
// Invalidate all inventory CRC groups before powering on so the BMC accepts
// fresh inventory from the host after boot. Best-effort: failure is logged but
// does not block power-on.
c.invalidateRedfishInventory(ctx, client, req, baseURL, systemPath, emit)
resetTarget := redfishResetActionTarget(systemDoc)
resetType := redfishPickResetType(systemDoc, "On", "ForceOn")
if resetTarget == "" || resetType == "" {
@@ -602,6 +618,32 @@ func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context,
}
}
// invalidateRedfishInventory POSTs to the AMI/MSI InventoryCrc endpoint to zero out
// all known CRC groups before a host power-on. This causes the BMC to accept fresh
// inventory from the host after boot, preventing stale inventory (ghost GPUs, wrong
// BIOS version, etc.) from persisting across hardware changes.
// Best-effort: any error is logged and the call silently returns.
func (c *RedfishConnector) invalidateRedfishInventory(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) {
crcPath := joinPath(systemPath, "/Oem/Ami/Inventory/Crc")
body := map[string]any{
"GroupCrcList": []map[string]any{
{"CPU": 0},
{"DIMM": 0},
{"PCIE": 0},
{"CERTIFICATES": 0},
{"SECUREBOOT": 0},
},
}
if err := c.postJSON(ctx, client, req, baseURL, crcPath, body); err != nil {
log.Printf("redfish: inventory invalidation skipped (not AMI/MSI or endpoint unavailable): %v", err)
return
}
log.Printf("redfish: inventory CRC groups invalidated at %s before host power-on", crcPath)
if emit != nil {
emit(Progress{Status: "running", Progress: 19, Message: "Redfish: инвентарь BMC инвалидирован перед включением host (все CRC группы сброшены)"})
}
}
func (c *RedfishConnector) waitForHostPowerState(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, wantOn bool, timeout time.Duration) bool {
deadline := time.Now().Add(timeout)
for {
@@ -2627,6 +2669,7 @@ func shouldCrawlPath(path string) bool {
"/Bios/Settings",
"/GetServerAllUSBStatus",
"/Oem/Public/KVM",
"/SecureBoot/SecureBootDatabases",
} {
if strings.Contains(normalized, part) {
return false
@@ -5548,7 +5591,7 @@ func storageControllerFromPath(path string) string {
return ""
}
func parseFirmware(system, bios, manager, secureBoot, networkProtocol map[string]interface{}) []models.FirmwareInfo {
func parseFirmware(system, bios, manager, networkProtocol map[string]interface{}) []models.FirmwareInfo {
var out []models.FirmwareInfo
appendFW := func(name, version string) {
@@ -5562,7 +5605,6 @@ func parseFirmware(system, bios, manager, secureBoot, networkProtocol map[string
appendFW("BIOS", asString(system["BiosVersion"]))
appendFW("BIOS", asString(bios["Version"]))
appendFW("BMC", asString(manager["FirmwareVersion"]))
appendFW("SecureBoot", asString(secureBoot["SecureBootMode"]))
return out
}