feat: Redfish hardware event log collection + MSI ghost GPU filter + inventory improvements
- Collect hardware event logs (last 7 days) from Systems and Managers/SEL LogServices
- Parse AMI raw IPMI dump messages into readable descriptions (Sensor_Type: Event_Type)
- Filter out audit/journal/non-hardware log services; only SEL from Managers
- MSI ghost GPU filter: exclude processor GPU entries with temperature=0 when host is powered on
- Reanimator collected_at uses InventoryData/Status.LastModifiedTime (30-day fallback)
- Invalidate Redfish inventory CRC groups before host power-on
- Log inventory LastModifiedTime age in collection logs
- Drop SecureBoot collection (SecureBootMode, SecureBootDatabases) — not hardware inventory
- Add build version to UI footer via template
- Add MSI Redfish API reference doc to bible-local/docs/

ADL-032–ADL-035

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -311,6 +311,8 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
if emit != nil {
|
||||
emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."})
|
||||
}
|
||||
// Collect hardware event logs separately (not part of tree-walk to avoid bloat).
|
||||
rawLogEntries := c.collectRedfishLogEntries(withRedfishTelemetryPhase(ctx, "log_entries"), snapshotClient, req, baseURL, systemPaths, managerPaths)
|
||||
rawPayloads := map[string]any{
|
||||
"redfish_tree": rawTree,
|
||||
"redfish_profiles": map[string]any{
|
||||
@@ -413,12 +415,21 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
if len(fetchErrMap) > 0 {
|
||||
rawPayloads["redfish_fetch_errors"] = redfishFetchErrorMapToList(fetchErrMap)
|
||||
}
|
||||
if len(rawLogEntries) > 0 {
|
||||
rawPayloads["redfish_log_entries"] = rawLogEntries
|
||||
}
|
||||
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
|
||||
result, err := ReplayRedfishFromRawPayloads(rawPayloads, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
totalElapsed := time.Since(collectStart).Round(time.Second)
|
||||
if !result.InventoryLastModifiedAt.IsZero() {
|
||||
log.Printf("redfish-collect: inventory last modified at %s (age: %s)",
|
||||
result.InventoryLastModifiedAt.Format(time.RFC3339),
|
||||
time.Since(result.InventoryLastModifiedAt).Round(time.Minute),
|
||||
)
|
||||
}
|
||||
log.Printf(
|
||||
"redfish-postprobe-metrics: nvme_candidates=%d nvme_selected=%d nvme_added=%d candidates=%d selected=%d skipped_explicit=%d added=%d dur=%s",
|
||||
postProbeMetrics.NVMECandidates,
|
||||
@@ -495,6 +506,11 @@ func (c *RedfishConnector) ensureHostPowerForCollection(ctx context.Context, cli
|
||||
return false, false
|
||||
}
|
||||
|
||||
// Invalidate all inventory CRC groups before powering on so the BMC accepts
|
||||
// fresh inventory from the host after boot. Best-effort: failure is logged but
|
||||
// does not block power-on.
|
||||
c.invalidateRedfishInventory(ctx, client, req, baseURL, systemPath, emit)
|
||||
|
||||
resetTarget := redfishResetActionTarget(systemDoc)
|
||||
resetType := redfishPickResetType(systemDoc, "On", "ForceOn")
|
||||
if resetTarget == "" || resetType == "" {
|
||||
@@ -602,6 +618,32 @@ func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context,
|
||||
}
|
||||
}
|
||||
|
||||
// invalidateRedfishInventory POSTs to the AMI/MSI InventoryCrc endpoint to zero out
|
||||
// all known CRC groups before a host power-on. This causes the BMC to accept fresh
|
||||
// inventory from the host after boot, preventing stale inventory (ghost GPUs, wrong
|
||||
// BIOS version, etc.) from persisting across hardware changes.
|
||||
// Best-effort: any error is logged and the call silently returns.
|
||||
func (c *RedfishConnector) invalidateRedfishInventory(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) {
|
||||
crcPath := joinPath(systemPath, "/Oem/Ami/Inventory/Crc")
|
||||
body := map[string]any{
|
||||
"GroupCrcList": []map[string]any{
|
||||
{"CPU": 0},
|
||||
{"DIMM": 0},
|
||||
{"PCIE": 0},
|
||||
{"CERTIFICATES": 0},
|
||||
{"SECUREBOOT": 0},
|
||||
},
|
||||
}
|
||||
if err := c.postJSON(ctx, client, req, baseURL, crcPath, body); err != nil {
|
||||
log.Printf("redfish: inventory invalidation skipped (not AMI/MSI or endpoint unavailable): %v", err)
|
||||
return
|
||||
}
|
||||
log.Printf("redfish: inventory CRC groups invalidated at %s before host power-on", crcPath)
|
||||
if emit != nil {
|
||||
emit(Progress{Status: "running", Progress: 19, Message: "Redfish: инвентарь BMC инвалидирован перед включением host (все CRC группы сброшены)"})
|
||||
}
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) waitForHostPowerState(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, wantOn bool, timeout time.Duration) bool {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
@@ -2627,6 +2669,7 @@ func shouldCrawlPath(path string) bool {
|
||||
"/Bios/Settings",
|
||||
"/GetServerAllUSBStatus",
|
||||
"/Oem/Public/KVM",
|
||||
"/SecureBoot/SecureBootDatabases",
|
||||
} {
|
||||
if strings.Contains(normalized, part) {
|
||||
return false
|
||||
@@ -5548,7 +5591,7 @@ func storageControllerFromPath(path string) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseFirmware(system, bios, manager, secureBoot, networkProtocol map[string]interface{}) []models.FirmwareInfo {
|
||||
func parseFirmware(system, bios, manager, networkProtocol map[string]interface{}) []models.FirmwareInfo {
|
||||
var out []models.FirmwareInfo
|
||||
|
||||
appendFW := func(name, version string) {
|
||||
@@ -5562,7 +5605,6 @@ func parseFirmware(system, bios, manager, secureBoot, networkProtocol map[string
|
||||
appendFW("BIOS", asString(system["BiosVersion"]))
|
||||
appendFW("BIOS", asString(bios["Version"]))
|
||||
appendFW("BMC", asString(manager["FirmwareVersion"]))
|
||||
appendFW("SecureBoot", asString(secureBoot["SecureBootMode"]))
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user