feat: adaptive BMC readiness check + ghost NIC dedup fix + empty collection plan-B retry

BMC readiness after power-on (waitForStablePoweredOnHost):
- After initial 1m stabilization, poll BMC inventory readiness before collecting
- Ready if MemorySummary.TotalSystemMemoryGiB > 0 OR PCIeDevices.Members non-empty
- On failure: wait +60s, retry; on second failure: wait +120s, retry; then warn and proceed
- Configurable via LOGPILE_REDFISH_BMC_READY_WAITS (default: 60s,120s)

Empty critical collection plan-B retry (EnableEmptyCriticalCollectionRetry):
- Hardware inventory collections that returned Members=[] are now re-probed in plan-B
- Covers PCIeDevices, NetworkAdapters, Processors, Drives, Storage, EthernetInterfaces
- Enabled by default in generic profile (applies to all vendors)

Ghost NIC dedup fix (enrichNICsFromNetworkInterfaces):
- NetworkInterface entries (e.g. Id=2) that don't match existing NIC slots are now
  resolved via Links.NetworkAdapter cross-reference to the real Chassis NIC
- Prevents duplicate ghost entries (slot=2 "Network Device View") from appearing
  alongside real NICs (slot="RISER 5 slot 1 (7)") with the same MAC addresses

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-25 11:19:36 +03:00
parent 063b08d5fb
commit 125f77ef69
6 changed files with 207 additions and 2 deletions

View File

@@ -569,11 +569,12 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien
})
}
timer := time.NewTimer(stabilizationDelay)
defer timer.Stop()
select {
case <-ctx.Done():
timer.Stop()
return false
case <-timer.C:
timer.Stop()
}
}
if emit != nil {
@@ -583,7 +584,92 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien
Message: "Redfish: повторная проверка PowerState после стабилизации host",
})
}
return c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second)
if !c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second) {
return false
}
// After the initial stabilization wait, the BMC may still be populating its
// hardware inventory (PCIeDevices, memory summary). Poll readiness with
// increasing back-off (default: +60s, +120s), then warn and proceed.
readinessWaits := redfishBMCReadinessWaits()
for attempt, extraWait := range readinessWaits {
ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath)
if ready {
if emit != nil {
emit(Progress{
Status: "running",
Progress: 20,
Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason),
})
}
return true
}
if emit != nil {
emit(Progress{
Status: "running",
Progress: 20,
Message: fmt.Sprintf("Redfish: BMC не готов (%s), ожидание %s (попытка %d/%d)", reason, extraWait, attempt+1, len(readinessWaits)),
})
}
timer := time.NewTimer(extraWait)
select {
case <-ctx.Done():
timer.Stop()
return false
case <-timer.C:
timer.Stop()
}
if emit != nil {
emit(Progress{
Status: "running",
Progress: 20,
Message: fmt.Sprintf("Redfish: повторная проверка готовности BMC (%d/%d)...", attempt+1, len(readinessWaits)),
})
}
}
ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath)
if !ready {
if emit != nil {
emit(Progress{
Status: "running",
Progress: 20,
Message: fmt.Sprintf("Redfish: WARNING — BMC не подтвердил готовность (%s), сбор может быть неполным", reason),
})
}
} else if emit != nil {
emit(Progress{
Status: "running",
Progress: 20,
Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason),
})
}
return true
}
// isBMCInventoryReady checks whether the BMC has finished populating its
// hardware inventory after a power-on. Returns (ready, reason).
// It considers the BMC ready if either the system memory summary reports
// a non-zero total or the PCIeDevices collection is non-empty.
func (c *RedfishConnector) isBMCInventoryReady(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string) (bool, string) {
systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath)
if err != nil {
return false, "не удалось прочитать System"
}
if summary, ok := systemDoc["MemorySummary"].(map[string]interface{}); ok {
if asFloat(summary["TotalSystemMemoryGiB"]) > 0 {
return true, "MemorySummary заполнен"
}
}
pcieDoc, err := c.getJSON(ctx, client, req, baseURL, joinPath(systemPath, "/PCIeDevices"))
if err == nil {
if asInt(pcieDoc["Members@odata.count"]) > 0 {
return true, "PCIeDevices не пуст"
}
if members, ok := pcieDoc["Members"].([]interface{}); ok && len(members) > 0 {
return true, "PCIeDevices не пуст"
}
}
return false, "MemorySummary=0, PCIeDevices пуст"
}
func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) {
@@ -2107,6 +2193,25 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool {
return false
}
// isHardwareInventoryCollectionPath reports whether the path is a hardware
// inventory collection that is expected to have members when the machine is
// powered on and the BMC has finished initializing.
func isHardwareInventoryCollectionPath(p string) bool {
for _, suffix := range []string{
"/PCIeDevices",
"/NetworkAdapters",
"/Processors",
"/Drives",
"/Storage",
"/EthernetInterfaces",
} {
if strings.HasSuffix(p, suffix) {
return true
}
}
return false
}
func shouldSlowProbeCriticalCollection(p string, tuning redfishprofile.AcquisitionTuning) bool {
p = normalizeRedfishPath(p)
if !tuning.RecoveryPolicy.EnableCriticalSlowProbe {
@@ -2427,6 +2532,25 @@ func redfishPowerOnStabilizationDelay() time.Duration {
return 60 * time.Second
}
// redfishBMCReadinessWaits returns the extra wait durations used when polling
// BMC inventory readiness after power-on. Defaults: [60s, 120s].
// Override with LOGPILE_REDFISH_BMC_READY_WAITS (comma-separated durations,
// e.g. "60s,120s").
func redfishBMCReadinessWaits() []time.Duration {
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_BMC_READY_WAITS")); v != "" {
var out []time.Duration
for _, part := range strings.Split(v, ",") {
if d, err := time.ParseDuration(strings.TrimSpace(part)); err == nil && d >= 0 {
out = append(out, d)
}
}
if len(out) > 0 {
return out
}
}
return []time.Duration{60 * time.Second, 120 * time.Second}
}
func redfishSnapshotMemoryRequestTimeout() time.Duration {
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
if d, err := time.ParseDuration(v); err == nil && d > 0 {
@@ -3031,6 +3155,38 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
addTarget(memberPath)
}
}
// Re-probe critical hardware collections that were successfully fetched but
// returned no members. This happens when the BMC hasn't finished enumerating
// hardware at collection time (e.g. PCIeDevices or NetworkAdapters empty right
// after power-on). Only hardware inventory collection suffixes are retried.
if tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry {
for _, p := range criticalPaths {
p = normalizeRedfishPath(p)
if p == "" {
continue
}
if _, queued := seenTargets[p]; queued {
continue
}
docAny, ok := rawTree[p]
if !ok {
continue
}
doc, ok := docAny.(map[string]interface{})
if !ok {
continue
}
if redfishCollectionHasExplicitMembers(doc) {
continue
}
if !isHardwareInventoryCollectionPath(p) {
continue
}
addTarget(p)
}
}
if len(targets) == 0 {
return 0
}

View File

@@ -26,6 +26,16 @@ func (r redfishSnapshotReader) enrichNICsFromNetworkInterfaces(nics *[]models.Ne
continue
}
idx, ok := bySlot[strings.ToLower(strings.TrimSpace(slot))]
if !ok {
// The NetworkInterface Id (e.g. "2") may not match the display slot of
// the real NIC that came from Chassis/NetworkAdapters (e.g. "RISER 5
// slot 1 (7)"). Try to find the real NIC via the Links.NetworkAdapter
// cross-reference before creating a ghost entry.
if linkedIdx := r.findNICIndexByLinkedNetworkAdapter(iface, bySlot); linkedIdx >= 0 {
idx = linkedIdx
ok = true
}
}
if !ok {
*nics = append(*nics, models.NetworkAdapter{
Slot: slot,
@@ -178,6 +188,35 @@ func (r redfishSnapshotReader) collectBMCMAC(managerPaths []string) string {
return ""
}
// findNICIndexByLinkedNetworkAdapter resolves a NetworkInterface document to an
// existing NIC in bySlot by following Links.NetworkAdapter → the Chassis
// NetworkAdapter doc → its slot label. Returns -1 if no match is found.
func (r redfishSnapshotReader) findNICIndexByLinkedNetworkAdapter(iface map[string]interface{}, bySlot map[string]int) int {
links, ok := iface["Links"].(map[string]interface{})
if !ok {
return -1
}
adapterRef, ok := links["NetworkAdapter"].(map[string]interface{})
if !ok {
return -1
}
adapterPath := normalizeRedfishPath(asString(adapterRef["@odata.id"]))
if adapterPath == "" {
return -1
}
adapterDoc, err := r.getJSON(adapterPath)
if err != nil || len(adapterDoc) == 0 {
return -1
}
adapterNIC := parseNIC(adapterDoc)
if slot := strings.ToLower(strings.TrimSpace(adapterNIC.Slot)); slot != "" {
if idx, ok := bySlot[slot]; ok {
return idx
}
}
return -1
}
// enrichNICMACsFromNetworkDeviceFunctions reads the NetworkDeviceFunctions
// collection linked from a NetworkAdapter document and populates the NIC's
// MACAddresses from each function's Ethernet.PermanentMACAddress / MACAddress.

View File

@@ -272,6 +272,7 @@ func TestRedfishConnectorProbe(t *testing.T) {
func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms")
powerState := "Off"
resetCalls := 0
@@ -282,6 +283,9 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
_ = json.NewEncoder(w).Encode(map[string]interface{}{
"@odata.id": "/redfish/v1/Systems/1",
"PowerState": powerState,
"MemorySummary": map[string]interface{}{
"TotalSystemMemoryGiB": 128,
},
"Actions": map[string]interface{}{
"#ComputerSystem.Reset": map[string]interface{}{
"target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset",
@@ -329,6 +333,7 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
func TestEnsureHostPowerForCollection_FailsIfHostDoesNotStayOnAfterStabilization(t *testing.T) {
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms")
powerState := "Off"

View File

@@ -102,6 +102,7 @@ func genericProfile() Profile {
ensureRecoveryPolicy(plan, AcquisitionRecoveryPolicy{
EnableCriticalCollectionMemberRetry: true,
EnableCriticalSlowProbe: true,
EnableEmptyCriticalCollectionRetry: true,
})
ensureRatePolicy(plan, AcquisitionRatePolicy{
TargetP95LatencyMS: 900,

View File

@@ -205,6 +205,9 @@ func ensureRecoveryPolicy(plan *AcquisitionPlan, policy AcquisitionRecoveryPolic
if policy.EnableProfilePlanB {
plan.Tuning.RecoveryPolicy.EnableProfilePlanB = true
}
if policy.EnableEmptyCriticalCollectionRetry {
plan.Tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry = true
}
}
func ensureScopedPathPolicy(plan *AcquisitionPlan, policy AcquisitionScopedPathPolicy) {

View File

@@ -90,6 +90,7 @@ type AcquisitionRecoveryPolicy struct {
EnableCriticalCollectionMemberRetry bool
EnableCriticalSlowProbe bool
EnableProfilePlanB bool
EnableEmptyCriticalCollectionRetry bool
}
type AcquisitionPrefetchPolicy struct {