feat: adaptive BMC readiness check + ghost NIC dedup fix + empty collection plan-B retry
BMC readiness after power-on (waitForStablePoweredOnHost): - After initial 1m stabilization, poll BMC inventory readiness before collecting - Ready if MemorySummary.TotalSystemMemoryGiB > 0 OR PCIeDevices.Members non-empty - On failure: wait +60s, retry; on second failure: wait +120s, retry; then warn and proceed - Configurable via LOGPILE_REDFISH_BMC_READY_WAITS (default: 60s,120s) Empty critical collection plan-B retry (EnableEmptyCriticalCollectionRetry): - Hardware inventory collections that returned Members=[] are now re-probed in plan-B - Covers PCIeDevices, NetworkAdapters, Processors, Drives, Storage, EthernetInterfaces - Enabled by default in generic profile (applies to all vendors) Ghost NIC dedup fix (enrichNICsFromNetworkInterfaces): - NetworkInterface entries (e.g. Id=2) that don't match existing NIC slots are now resolved via Links.NetworkAdapter cross-reference to the real Chassis NIC - Prevents duplicate ghost entries (slot=2 "Network Device View") from appearing alongside real NICs (slot="RISER 5 slot 1 (7)") with the same MAC addresses Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -569,11 +569,12 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
timer := time.NewTimer(stabilizationDelay)
|
timer := time.NewTimer(stabilizationDelay)
|
||||||
defer timer.Stop()
|
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
timer.Stop()
|
||||||
return false
|
return false
|
||||||
case <-timer.C:
|
case <-timer.C:
|
||||||
|
timer.Stop()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if emit != nil {
|
if emit != nil {
|
||||||
@@ -583,7 +584,92 @@ func (c *RedfishConnector) waitForStablePoweredOnHost(ctx context.Context, clien
|
|||||||
Message: "Redfish: повторная проверка PowerState после стабилизации host",
|
Message: "Redfish: повторная проверка PowerState после стабилизации host",
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second)
|
if !c.waitForHostPowerState(ctx, client, req, baseURL, systemPath, true, 5*time.Second) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// After the initial stabilization wait, the BMC may still be populating its
|
||||||
|
// hardware inventory (PCIeDevices, memory summary). Poll readiness with
|
||||||
|
// increasing back-off (default: +60s, +120s), then warn and proceed.
|
||||||
|
readinessWaits := redfishBMCReadinessWaits()
|
||||||
|
for attempt, extraWait := range readinessWaits {
|
||||||
|
ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath)
|
||||||
|
if ready {
|
||||||
|
if emit != nil {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 20,
|
||||||
|
Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if emit != nil {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 20,
|
||||||
|
Message: fmt.Sprintf("Redfish: BMC не готов (%s), ожидание %s (попытка %d/%d)", reason, extraWait, attempt+1, len(readinessWaits)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
timer := time.NewTimer(extraWait)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
timer.Stop()
|
||||||
|
return false
|
||||||
|
case <-timer.C:
|
||||||
|
timer.Stop()
|
||||||
|
}
|
||||||
|
if emit != nil {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 20,
|
||||||
|
Message: fmt.Sprintf("Redfish: повторная проверка готовности BMC (%d/%d)...", attempt+1, len(readinessWaits)),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ready, reason := c.isBMCInventoryReady(ctx, client, req, baseURL, systemPath)
|
||||||
|
if !ready {
|
||||||
|
if emit != nil {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 20,
|
||||||
|
Message: fmt.Sprintf("Redfish: WARNING — BMC не подтвердил готовность (%s), сбор может быть неполным", reason),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
} else if emit != nil {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 20,
|
||||||
|
Message: fmt.Sprintf("Redfish: BMC готов (%s)", reason),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// isBMCInventoryReady checks whether the BMC has finished populating its
|
||||||
|
// hardware inventory after a power-on. Returns (ready, reason).
|
||||||
|
// It considers the BMC ready if either the system memory summary reports
|
||||||
|
// a non-zero total or the PCIeDevices collection is non-empty.
|
||||||
|
func (c *RedfishConnector) isBMCInventoryReady(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string) (bool, string) {
|
||||||
|
systemDoc, err := c.getJSON(ctx, client, req, baseURL, systemPath)
|
||||||
|
if err != nil {
|
||||||
|
return false, "не удалось прочитать System"
|
||||||
|
}
|
||||||
|
if summary, ok := systemDoc["MemorySummary"].(map[string]interface{}); ok {
|
||||||
|
if asFloat(summary["TotalSystemMemoryGiB"]) > 0 {
|
||||||
|
return true, "MemorySummary заполнен"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pcieDoc, err := c.getJSON(ctx, client, req, baseURL, joinPath(systemPath, "/PCIeDevices"))
|
||||||
|
if err == nil {
|
||||||
|
if asInt(pcieDoc["Members@odata.count"]) > 0 {
|
||||||
|
return true, "PCIeDevices не пуст"
|
||||||
|
}
|
||||||
|
if members, ok := pcieDoc["Members"].([]interface{}); ok && len(members) > 0 {
|
||||||
|
return true, "PCIeDevices не пуст"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, "MemorySummary=0, PCIeDevices пуст"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) {
|
func (c *RedfishConnector) restoreHostPowerAfterCollection(ctx context.Context, client *http.Client, req Request, baseURL, systemPath string, emit ProgressFn) {
|
||||||
@@ -2107,6 +2193,25 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isHardwareInventoryCollectionPath reports whether the path is a hardware
|
||||||
|
// inventory collection that is expected to have members when the machine is
|
||||||
|
// powered on and the BMC has finished initializing.
|
||||||
|
func isHardwareInventoryCollectionPath(p string) bool {
|
||||||
|
for _, suffix := range []string{
|
||||||
|
"/PCIeDevices",
|
||||||
|
"/NetworkAdapters",
|
||||||
|
"/Processors",
|
||||||
|
"/Drives",
|
||||||
|
"/Storage",
|
||||||
|
"/EthernetInterfaces",
|
||||||
|
} {
|
||||||
|
if strings.HasSuffix(p, suffix) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func shouldSlowProbeCriticalCollection(p string, tuning redfishprofile.AcquisitionTuning) bool {
|
func shouldSlowProbeCriticalCollection(p string, tuning redfishprofile.AcquisitionTuning) bool {
|
||||||
p = normalizeRedfishPath(p)
|
p = normalizeRedfishPath(p)
|
||||||
if !tuning.RecoveryPolicy.EnableCriticalSlowProbe {
|
if !tuning.RecoveryPolicy.EnableCriticalSlowProbe {
|
||||||
@@ -2427,6 +2532,25 @@ func redfishPowerOnStabilizationDelay() time.Duration {
|
|||||||
return 60 * time.Second
|
return 60 * time.Second
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// redfishBMCReadinessWaits returns the extra wait durations used when polling
|
||||||
|
// BMC inventory readiness after power-on. Defaults: [60s, 120s].
|
||||||
|
// Override with LOGPILE_REDFISH_BMC_READY_WAITS (comma-separated durations,
|
||||||
|
// e.g. "60s,120s").
|
||||||
|
func redfishBMCReadinessWaits() []time.Duration {
|
||||||
|
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_BMC_READY_WAITS")); v != "" {
|
||||||
|
var out []time.Duration
|
||||||
|
for _, part := range strings.Split(v, ",") {
|
||||||
|
if d, err := time.ParseDuration(strings.TrimSpace(part)); err == nil && d >= 0 {
|
||||||
|
out = append(out, d)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) > 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return []time.Duration{60 * time.Second, 120 * time.Second}
|
||||||
|
}
|
||||||
|
|
||||||
func redfishSnapshotMemoryRequestTimeout() time.Duration {
|
func redfishSnapshotMemoryRequestTimeout() time.Duration {
|
||||||
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
|
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
|
||||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||||
@@ -3031,6 +3155,38 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
|||||||
addTarget(memberPath)
|
addTarget(memberPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Re-probe critical hardware collections that were successfully fetched but
|
||||||
|
// returned no members. This happens when the BMC hasn't finished enumerating
|
||||||
|
// hardware at collection time (e.g. PCIeDevices or NetworkAdapters empty right
|
||||||
|
// after power-on). Only hardware inventory collection suffixes are retried.
|
||||||
|
if tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry {
|
||||||
|
for _, p := range criticalPaths {
|
||||||
|
p = normalizeRedfishPath(p)
|
||||||
|
if p == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, queued := seenTargets[p]; queued {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
docAny, ok := rawTree[p]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
doc, ok := docAny.(map[string]interface{})
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if redfishCollectionHasExplicitMembers(doc) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isHardwareInventoryCollectionPath(p) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
addTarget(p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if len(targets) == 0 {
|
if len(targets) == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,16 @@ func (r redfishSnapshotReader) enrichNICsFromNetworkInterfaces(nics *[]models.Ne
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, ok := bySlot[strings.ToLower(strings.TrimSpace(slot))]
|
idx, ok := bySlot[strings.ToLower(strings.TrimSpace(slot))]
|
||||||
|
if !ok {
|
||||||
|
// The NetworkInterface Id (e.g. "2") may not match the display slot of
|
||||||
|
// the real NIC that came from Chassis/NetworkAdapters (e.g. "RISER 5
|
||||||
|
// slot 1 (7)"). Try to find the real NIC via the Links.NetworkAdapter
|
||||||
|
// cross-reference before creating a ghost entry.
|
||||||
|
if linkedIdx := r.findNICIndexByLinkedNetworkAdapter(iface, bySlot); linkedIdx >= 0 {
|
||||||
|
idx = linkedIdx
|
||||||
|
ok = true
|
||||||
|
}
|
||||||
|
}
|
||||||
if !ok {
|
if !ok {
|
||||||
*nics = append(*nics, models.NetworkAdapter{
|
*nics = append(*nics, models.NetworkAdapter{
|
||||||
Slot: slot,
|
Slot: slot,
|
||||||
@@ -178,6 +188,35 @@ func (r redfishSnapshotReader) collectBMCMAC(managerPaths []string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// findNICIndexByLinkedNetworkAdapter resolves a NetworkInterface document to an
|
||||||
|
// existing NIC in bySlot by following Links.NetworkAdapter → the Chassis
|
||||||
|
// NetworkAdapter doc → its slot label. Returns -1 if no match is found.
|
||||||
|
func (r redfishSnapshotReader) findNICIndexByLinkedNetworkAdapter(iface map[string]interface{}, bySlot map[string]int) int {
|
||||||
|
links, ok := iface["Links"].(map[string]interface{})
|
||||||
|
if !ok {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
adapterRef, ok := links["NetworkAdapter"].(map[string]interface{})
|
||||||
|
if !ok {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
adapterPath := normalizeRedfishPath(asString(adapterRef["@odata.id"]))
|
||||||
|
if adapterPath == "" {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
adapterDoc, err := r.getJSON(adapterPath)
|
||||||
|
if err != nil || len(adapterDoc) == 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
adapterNIC := parseNIC(adapterDoc)
|
||||||
|
if slot := strings.ToLower(strings.TrimSpace(adapterNIC.Slot)); slot != "" {
|
||||||
|
if idx, ok := bySlot[slot]; ok {
|
||||||
|
return idx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
// enrichNICMACsFromNetworkDeviceFunctions reads the NetworkDeviceFunctions
|
// enrichNICMACsFromNetworkDeviceFunctions reads the NetworkDeviceFunctions
|
||||||
// collection linked from a NetworkAdapter document and populates the NIC's
|
// collection linked from a NetworkAdapter document and populates the NIC's
|
||||||
// MACAddresses from each function's Ethernet.PermanentMACAddress / MACAddress.
|
// MACAddresses from each function's Ethernet.PermanentMACAddress / MACAddress.
|
||||||
|
|||||||
@@ -272,6 +272,7 @@ func TestRedfishConnectorProbe(t *testing.T) {
|
|||||||
|
|
||||||
func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
|
func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
|
||||||
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
|
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
|
||||||
|
t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms")
|
||||||
|
|
||||||
powerState := "Off"
|
powerState := "Off"
|
||||||
resetCalls := 0
|
resetCalls := 0
|
||||||
@@ -282,6 +283,9 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
|
|||||||
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
_ = json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
"@odata.id": "/redfish/v1/Systems/1",
|
"@odata.id": "/redfish/v1/Systems/1",
|
||||||
"PowerState": powerState,
|
"PowerState": powerState,
|
||||||
|
"MemorySummary": map[string]interface{}{
|
||||||
|
"TotalSystemMemoryGiB": 128,
|
||||||
|
},
|
||||||
"Actions": map[string]interface{}{
|
"Actions": map[string]interface{}{
|
||||||
"#ComputerSystem.Reset": map[string]interface{}{
|
"#ComputerSystem.Reset": map[string]interface{}{
|
||||||
"target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset",
|
"target": "/redfish/v1/Systems/1/Actions/ComputerSystem.Reset",
|
||||||
@@ -329,6 +333,7 @@ func TestEnsureHostPowerForCollection_WaitsForStablePowerOn(t *testing.T) {
|
|||||||
|
|
||||||
func TestEnsureHostPowerForCollection_FailsIfHostDoesNotStayOnAfterStabilization(t *testing.T) {
|
func TestEnsureHostPowerForCollection_FailsIfHostDoesNotStayOnAfterStabilization(t *testing.T) {
|
||||||
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
|
t.Setenv("LOGPILE_REDFISH_POWERON_STABILIZATION", "1ms")
|
||||||
|
t.Setenv("LOGPILE_REDFISH_BMC_READY_WAITS", "1ms,1ms")
|
||||||
|
|
||||||
powerState := "Off"
|
powerState := "Off"
|
||||||
|
|
||||||
|
|||||||
@@ -102,6 +102,7 @@ func genericProfile() Profile {
|
|||||||
ensureRecoveryPolicy(plan, AcquisitionRecoveryPolicy{
|
ensureRecoveryPolicy(plan, AcquisitionRecoveryPolicy{
|
||||||
EnableCriticalCollectionMemberRetry: true,
|
EnableCriticalCollectionMemberRetry: true,
|
||||||
EnableCriticalSlowProbe: true,
|
EnableCriticalSlowProbe: true,
|
||||||
|
EnableEmptyCriticalCollectionRetry: true,
|
||||||
})
|
})
|
||||||
ensureRatePolicy(plan, AcquisitionRatePolicy{
|
ensureRatePolicy(plan, AcquisitionRatePolicy{
|
||||||
TargetP95LatencyMS: 900,
|
TargetP95LatencyMS: 900,
|
||||||
|
|||||||
@@ -205,6 +205,9 @@ func ensureRecoveryPolicy(plan *AcquisitionPlan, policy AcquisitionRecoveryPolic
|
|||||||
if policy.EnableProfilePlanB {
|
if policy.EnableProfilePlanB {
|
||||||
plan.Tuning.RecoveryPolicy.EnableProfilePlanB = true
|
plan.Tuning.RecoveryPolicy.EnableProfilePlanB = true
|
||||||
}
|
}
|
||||||
|
if policy.EnableEmptyCriticalCollectionRetry {
|
||||||
|
plan.Tuning.RecoveryPolicy.EnableEmptyCriticalCollectionRetry = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureScopedPathPolicy(plan *AcquisitionPlan, policy AcquisitionScopedPathPolicy) {
|
func ensureScopedPathPolicy(plan *AcquisitionPlan, policy AcquisitionScopedPathPolicy) {
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ type AcquisitionRecoveryPolicy struct {
|
|||||||
EnableCriticalCollectionMemberRetry bool
|
EnableCriticalCollectionMemberRetry bool
|
||||||
EnableCriticalSlowProbe bool
|
EnableCriticalSlowProbe bool
|
||||||
EnableProfilePlanB bool
|
EnableProfilePlanB bool
|
||||||
|
EnableEmptyCriticalCollectionRetry bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type AcquisitionPrefetchPolicy struct {
|
type AcquisitionPrefetchPolicy struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user