From 125f77ef6918940611a6d12ac6d47f4e31fc3cff Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 25 Mar 2026 11:19:36 +0300 Subject: [PATCH] feat: adaptive BMC readiness check + ghost NIC dedup fix + empty collection plan-B retry BMC readiness after power-on (waitForStablePoweredOnHost): - After initial 1m stabilization, poll BMC inventory readiness before collecting - Ready if MemorySummary.TotalSystemMemoryGiB > 0 OR PCIeDevices.Members non-empty - On failure: wait +60s, retry; on second failure: wait +120s, retry; then warn and proceed - Configurable via LOGPILE_REDFISH_BMC_READY_WAITS (default: 60s,120s) Empty critical collection plan-B retry (EnableEmptyCriticalCollectionRetry): - Hardware inventory collections that returned Members=[] are now re-probed in plan-B - Covers PCIeDevices, NetworkAdapters, Processors, Drives, Storage, EthernetInterfaces - Enabled by default in generic profile (applies to all vendors) Ghost NIC dedup fix (enrichNICsFromNetworkInterfaces): - NetworkInterface entries (e.g. Id=2) that don't match existing NIC slots are now resolved via Links.NetworkAdapter cross-reference to the real Chassis NIC - Prevents duplicate ghost entries (slot=2 "Network Device View") from appearing alongside real NICs (slot="RISER 5 slot 1 (7)") with the same MAC addresses Co-Authored-By: Claude Sonnet 4.6 --- internal/collector/redfish.go | 160 +++++++++++++++++- .../collector/redfish_replay_inventory.go | 39 +++++ internal/collector/redfish_test.go | 5 + .../redfishprofile/profile_generic.go | 1 + .../redfishprofile/profiles_common.go | 3 + internal/collector/redfishprofile/types.go | 1 + 6 files changed, 207 insertions(+), 2 deletions(-) diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index 8425cd2..5713576 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -569,11 +569,12 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien }) } timer := time.NewTimer(stabilizationDelay) - defer timer.Stop() select { case <-ctx.Done(): + timer.Stop() return false case <-timer.C: + timer.Stop() } } if emit != nil { @@ -583,7 +584,92 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien Message: "Redfish: повторная проверка PowerState после стабилизации host", }) } - return c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second) + if !c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second) { + return false + } + + // After the initial stabilization wait, the BMC may still be populating its + // hardware inventory (PCIeDevices, memory summary). Poll readiness with + // increasing back-off (default: +60s, +120s), then warn and proceed. + readinessWaits := redfishBMCReadinessWaits() + for attempt, extraWait := range readinessWaits { + ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath) + if ready { + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 20, + Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason), + }) + } + return true + } + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 20, + Message: fmt.Sprintf("Redfish: BMC не готов (%s), ожидание %s (попытка %d/%d)", reason, extraWait, attempt+1, len(readinessWaits)), + }) + } + timer := time.NewTimer(extraWait) + select { + case <-ctx.Done(): + timer.Stop() + return false + case <-timer.C: + timer.Stop() + } + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 20, + Message: fmt.Sprintf("Redfish: повторная проверка готовности BMC (%d/%d)...", attempt+1, len(readinessWaits)), + }) + } + } + ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath) + if !ready { + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 20, + Message: fmt.Sprintf("Redfish: WARNING — BMC не подтвердил готовность (%s), сбор может быть неполным", reason), + }) + } + } else if emit != nil { + emit(Progress{ + Status: "running", + Progress: 20, + Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason), + }) + } + return true +} + +// isBMCInventoryReady checks whether the BMC has finished populating its +// hardware inventory after a power-on. Returns (ready, reason). +// It considers the BMC ready if either the system memory summary reports +// a non-zero total or the PCIeDevices collection is non-empty. +func (c *RedfishConnector) isBMCInventoryReady(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string) (bool, string) { + systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath) + if err != nil { + return false, "не удалось прочитать System" + } + if summary, ok := systemDoc["MemorySummary"].(map[string]interface{}); ok { + if asFloat(summary["TotalSystemMemoryGiB"]) > 0 { + return true, "MemorySummary заполнен" + } + } + pcieDoc, err := c.getJSON(ctx, client, req, baseURL, joinPath(systemPath, "/PCIeDevices")) + if err == nil { + if asInt(pcieDoc["Members@odata.count"]) > 0 { + return true, "PCIeDevices не пуст" + } + if members, ok := pcieDoc["Members"].([]interface{}); ok && len(members) > 0 { + return true, "PCIeDevices не пуст" + } + } + return false, "MemorySummary=0, PCIeDevices пуст" } func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) { @@ -2107,6 +2193,25 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool { return false } +// isHardwareInventoryCollectionPath reports whether the path is a hardware +// inventory collection that is expected to have members when the machine is +// powered on and the BMC has finished initializing. +func isHardwareInventoryCollectionPath(p string) bool { + for _, suffix := range []string{ + "/PCIeDevices", + "/NetworkAdapters", + "/Processors", + "/Drives", + "/Storage", + "/EthernetInterfaces", + } { + if strings.HasSuffix(p, suffix) { + return true + } + } + return false +} + func shouldSlowProbeCriticalCollection(p string, tuning redfishprofile.AcquisitionTuning) bool { p = normalizeRedfishPath(p) if !tuning.RecoveryPolicy.EnableCriticalSlowProbe { @@ -2427,6 +2532,25 @@ func redfishPowerOnStabilizationDelay() time.Duration { return 60 * time.Second } +// redfishBMCReadinessWaits returns the extra wait durations used when polling +// BMC inventory readiness after power-on. Defaults: [60s, 120s]. +// Override with LOGPILE_REDFISH_BMC_READY_WAITS (comma-separated durations, +// e.g. "60s,120s"). +func redfishBMCReadinessWaits() []time.Duration { + if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_BMC_READY_WAITS")); v != "" { + var out []time.Duration + for _, part := range strings.Split(v, ",") { + if d, err := time.ParseDuration(strings.TrimSpace(part)); err == nil && d >= 0 { + out = append(out, d) + } + } + if len(out) > 0 { + return out + } + } + return []time.Duration{60 * time.Second, 120 * time.Second} +} + func redfishSnapshotMemoryRequestTimeout() time.Duration { if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" { if d, err := time.ParseDuration(v); err == nil && d > 0 { @@ -3031,6 +3155,38 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, addTarget(memberPath) } } + + // Re-probe critical hardware collections that were successfully fetched but + // returned no members. This happens when the BMC hasn't finished enumerating + // hardware at collection time (e.g. PCIeDevices or NetworkAdapters empty right + // after power-on). Only hardware inventory collection suffixes are retried. + if tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry { + for _, p := range criticalPaths { + p = normalizeRedfishPath(p) + if p == "" { + continue + } + if _, queued := seenTargets[p]; queued { + continue + } + docAny, ok := rawTree[p] + if !ok { + continue + } + doc, ok := docAny.(map[string]interface{}) + if !ok { + continue + } + if redfishCollectionHasExplicitMembers(doc) { + continue + } + if !isHardwareInventoryCollectionPath(p) { + continue + } + addTarget(p) + } + } + if len(targets) == 0 { return 0 } diff --git a/internal/collector/redfish_replay_inventory.go b/internal/collector/redfish_replay_inventory.go index d92d533..42f67c1 100644 --- a/internal/collector/redfish_replay_inventory.go +++ b/internal/collector/redfish_replay_inventory.go @@ -26,6 +26,16 @@ func (r redfishSnapshotReader) enrichNICsFromNetworkInterfaces(nics *[]models.Ne continue } idx, ok := bySlot[strings.ToLower(strings.TrimSpace(slot))] + if !ok { + // The NetworkInterface Id (e.g. "2") may not match the display slot of + // the real NIC that came from Chassis/NetworkAdapters (e.g. "RISER 5 + // slot 1 (7)"). Try to find the real NIC via the Links.NetworkAdapter + // cross-reference before creating a ghost entry. + if linkedIdx := r.findNICIndexByLinkedNetworkAdapter(iface, bySlot); linkedIdx >= 0 { + idx = linkedIdx + ok = true + } + } if !ok { *nics = append(*nics, models.NetworkAdapter{ Slot: slot, @@ -178,6 +188,35 @@ func (r redfishSnapshotReader) collectBMCMAC(managerPaths []string) string { return "" } +// findNICIndexByLinkedNetworkAdapter resolves a NetworkInterface document to an +// existing NIC in bySlot by following Links.NetworkAdapter → the Chassis +// NetworkAdapter doc → its slot label. Returns -1 if no match is found. +func (r redfishSnapshotReader) findNICIndexByLinkedNetworkAdapter(iface map[string]interface{}, bySlot map[string]int) int { + links, ok := iface["Links"].(map[string]interface{}) + if !ok { + return -1 + } + adapterRef, ok := links["NetworkAdapter"].(map[string]interface{}) + if !ok { + return -1 + } + adapterPath := normalizeRedfishPath(asString(adapterRef["@odata.id"])) + if adapterPath == "" { + return -1 + } + adapterDoc, err := r.getJSON(adapterPath) + if err != nil || len(adapterDoc) == 0 { + return -1 + } + adapterNIC := parseNIC(adapterDoc) + if slot := strings.ToLower(strings.TrimSpace(adapterNIC.Slot)); slot != "" { + if idx, ok := bySlot[slot]; ok { + return idx + } + } + return -1 +} + // enrichNICMACsFromNetworkDeviceFunctions reads the NetworkDeviceFunctions // collection linked from a NetworkAdapter document and populates the NIC's // MACAddresses from each function's Ethernet.PermanentMACAddress / MACAddress. diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index 208d4f3..84c01b2 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -272,6 +272,7 @@ func TestRedfishConnectorProbe(t *testing.T) { func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) { t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms") + t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms") powerState := "Off" resetCalls := 0 @@ -282,6 +283,9 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) { _ = json.NewEncoder(w).Encode(map[string]interface{}{ "@odata.id": "/redfish/v1/Systems/1", "PowerState": powerState, + "MemorySummary": map[string]interface{}{ + "TotalSystemMemoryGiB": 128, + }, "Actions": map[string]interface{}{ "#ComputerSystem.Reset": map[string]interface{}{ "target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset", @@ -329,6 +333,7 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) { func TestEnsureHostPowerForCollection_FailsIfHostDoesNotStayOnAfterStabilization(t *testing.T) { t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms") + t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms") powerState := "Off" diff --git a/internal/collector/redfishprofile/profile_generic.go b/internal/collector/redfishprofile/profile_generic.go index 25522ea..d6c4a28 100644 --- a/internal/collector/redfishprofile/profile_generic.go +++ b/internal/collector/redfishprofile/profile_generic.go @@ -102,6 +102,7 @@ func genericProfile() Profile { ensureRecoveryPolicy(plan, AcquisitionRecoveryPolicy{ EnableCriticalCollectionMemberRetry: true, EnableCriticalSlowProbe: true, + EnableEmptyCriticalCollectionRetry: true, }) ensureRatePolicy(plan, AcquisitionRatePolicy{ TargetP95LatencyMS: 900, diff --git a/internal/collector/redfishprofile/profiles_common.go b/internal/collector/redfishprofile/profiles_common.go index cfa13ee..b66bb89 100644 --- a/internal/collector/redfishprofile/profiles_common.go +++ b/internal/collector/redfishprofile/profiles_common.go @@ -205,6 +205,9 @@ func ensureRecoveryPolicy(plan *AcquisitionPlan, policy AcquisitionRecoveryPolic if policy.EnableProfilePlanB { plan.Tuning.RecoveryPolicy.EnableProfilePlanB = true } + if policy.EnableEmptyCriticalCollectionRetry { + plan.Tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry = true + } } func ensureScopedPathPolicy(plan *AcquisitionPlan, policy AcquisitionScopedPathPolicy) { diff --git a/internal/collector/redfishprofile/types.go b/internal/collector/redfishprofile/types.go index ec26c05..e83729e 100644 --- a/internal/collector/redfishprofile/types.go +++ b/internal/collector/redfishprofile/types.go @@ -90,6 +90,7 @@ type AcquisitionRecoveryPolicy struct { EnableCriticalCollectionMemberRetry bool EnableCriticalSlowProbe bool EnableProfilePlanB bool + EnableEmptyCriticalCollectionRetry bool } type AcquisitionPrefetchPolicy struct {