From 2fa4a1235a218bcc31f36396a6dd7b808bd18199 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sat, 28 Feb 2026 19:05:34 +0300 Subject: [PATCH] collector/redfish: make prefetch/post-probe adaptive with metrics --- internal/collector/redfish.go | 274 +++++++++++++++++++++++++---- internal/collector/redfish_test.go | 83 +++++++++ 2 files changed, 319 insertions(+), 38 deletions(-) diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index 7807e18..f5daea7 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -28,6 +28,27 @@ type RedfishConnector struct { debugSnapshot bool } +type redfishPrefetchMetrics struct { + Enabled bool + Candidates int + Targets int + Docs int + Added int + Duration time.Duration + SkipReason string +} + +type redfishPostProbeMetrics struct { + NVMECandidates int + NVMESelected int + NVMEAdded int + CollectionCandidates int + CollectionSelected int + SkippedExplicit int + Added int + Duration time.Duration +} + func NewRedfishConnector() *RedfishConnector { debug := false if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_DEBUG")); v != "" && v != "0" && !strings.EqualFold(v, "false") { @@ -87,7 +108,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre if emit != nil { emit(Progress{Status: "running", Progress: 30, Message: "Redfish: чтение структуры Redfish..."}) } - prefetchedCritical := c.prefetchCriticalRedfishDocs(ctx, prefetchClient, req, baseURL, criticalPaths, emit) if emit != nil { emit(Progress{Status: "running", Progress: 55, Message: "Redfish: подготовка snapshot..."}) @@ -95,29 +115,31 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot..."}) } c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port) - rawTree, fetchErrors := c.collectRawRedfishTree(ctx, snapshotClient, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit) - if len(prefetchedCritical) > 0 { - reused := 0 - for p, doc := range prefetchedCritical { - if _, exists := rawTree[p]; exists { - continue - } - rawTree[p] = doc - reused++ - } - if emit != nil && reused > 0 { - emit(Progress{ - Status: "running", - Progress: 96, - Message: fmt.Sprintf("Redfish: prefetch использован для %d документов", reused), - }) - } - } + rawTree, fetchErrors, postProbeMetrics := c.collectRawRedfishTree(ctx, snapshotClient, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit) c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree)) fetchErrMap := redfishFetchErrorListToMap(fetchErrors) + + prefetchedCritical, prefetchMetrics := c.prefetchCriticalRedfishDocs(ctx, prefetchClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, emit) + for p, doc := range prefetchedCritical { + if _, exists := rawTree[p]; exists { + continue + } + rawTree[p] = doc + prefetchMetrics.Added++ + } for p := range prefetchedCritical { delete(fetchErrMap, p) } + log.Printf( + "redfish-prefetch-metrics: enabled=%t candidates=%d targets=%d docs=%d added=%d dur=%s skip=%s", + prefetchMetrics.Enabled, + prefetchMetrics.Candidates, + prefetchMetrics.Targets, + prefetchMetrics.Docs, + prefetchMetrics.Added, + prefetchMetrics.Duration.Round(time.Millisecond), + firstNonEmpty(prefetchMetrics.SkipReason, "-"), + ) if recoveredN := c.recoverCriticalRedfishDocsPlanB(ctx, criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, emit); recoveredN > 0 { c.debugSnapshotf("critical plan-b recovered docs=%d", recoveredN) } @@ -142,6 +164,17 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre return nil, err } totalElapsed := time.Since(collectStart).Round(time.Second) + log.Printf( + "redfish-postprobe-metrics: nvme_candidates=%d nvme_selected=%d nvme_added=%d candidates=%d selected=%d skipped_explicit=%d added=%d dur=%s", + postProbeMetrics.NVMECandidates, + postProbeMetrics.NVMESelected, + postProbeMetrics.NVMEAdded, + postProbeMetrics.CollectionCandidates, + postProbeMetrics.CollectionSelected, + postProbeMetrics.SkippedExplicit, + postProbeMetrics.Added, + postProbeMetrics.Duration.Round(time.Millisecond), + ) log.Printf("redfish-collect: completed in %s (docs=%d, fetch_errors=%d)", totalElapsed, len(rawTree), len(fetchErrMap)) if emit != nil { emit(Progress{ @@ -159,24 +192,46 @@ func (c *RedfishConnector) prefetchCriticalRedfishDocs( req Request, baseURL string, criticalPaths []string, + rawTree map[string]interface{}, + fetchErrMap map[string]string, emit ProgressFn, -) map[string]interface{} { - if !redfishPrefetchEnabled() || len(criticalPaths) == 0 { - return nil +) (map[string]interface{}, redfishPrefetchMetrics) { + metrics := redfishPrefetchMetrics{ + Enabled: redfishPrefetchEnabled(), + } + if !metrics.Enabled || len(criticalPaths) == 0 { + metrics.SkipReason = "disabled-or-empty" + return nil, metrics } - targets := redfishPrefetchTargets(criticalPaths) + candidates := redfishPrefetchTargets(criticalPaths) + metrics.Candidates = len(candidates) + if len(candidates) == 0 { + metrics.SkipReason = "no-candidates" + return nil, metrics + } + targets := redfishAdaptivePrefetchTargets(candidates, rawTree, fetchErrMap) + metrics.Targets = len(targets) if len(targets) == 0 { - return nil + metrics.SkipReason = "not-needed" + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 96, + Message: fmt.Sprintf("Redfish: prefetch пропущен (адаптивно, кандидатов=%d)", metrics.Candidates), + }) + } + return nil, metrics } if emit != nil { emit(Progress{ Status: "running", - Progress: 35, - Message: fmt.Sprintf("Redfish: prefetch критичных endpoint (%d)...", len(targets)), + Progress: 96, + Message: fmt.Sprintf("Redfish: prefetch критичных endpoint (адаптивно %d/%d)...", len(targets), len(candidates)), }) } + start := time.Now() out := make(map[string]interface{}, len(targets)) seen := make(map[string]struct{}, len(targets)) var mu sync.Mutex @@ -241,22 +296,78 @@ func (c *RedfishConnector) prefetchCriticalRedfishDocs( case <-ctx.Done(): close(jobs) wg.Wait() - return out + metrics.Docs = len(out) + metrics.Duration = time.Since(start) + metrics.SkipReason = "ctx-cancelled" + return out, metrics } } close(jobs) wg.Wait() + metrics.Docs = len(out) + metrics.Duration = time.Since(start) if emit != nil { emit(Progress{ Status: "running", - Progress: 40, - Message: fmt.Sprintf("Redfish: prefetch завершен (targets=%d, docs=%d)", len(targets), len(out)), + Progress: 96, + Message: fmt.Sprintf("Redfish: prefetch завершен (адаптивно targets=%d, docs=%d)", len(targets), len(out)), }) } + return out, metrics +} + +func redfishAdaptivePrefetchTargets(candidates []string, rawTree map[string]interface{}, fetchErrs map[string]string) []string { + out := make([]string, 0, len(candidates)) + seen := make(map[string]struct{}, len(candidates)) + for _, p := range candidates { + p = normalizeRedfishPath(p) + if p == "" { + continue + } + if _, exists := seen[p]; exists { + continue + } + needsFetch := false + docAny, inTree := rawTree[p] + if !inTree { + needsFetch = true + if msg, hasErr := fetchErrs[p]; hasErr && !isRetryableRedfishFetchError(fmt.Errorf("%s", msg)) { + needsFetch = false + } + } else if doc, ok := docAny.(map[string]interface{}); ok { + needsFetch = redfishCollectionNeedsMemberRecovery(doc, rawTree, fetchErrs) + } + if !needsFetch { + continue + } + seen[p] = struct{}{} + out = append(out, p) + } return out } +func redfishCollectionNeedsMemberRecovery(collectionDoc map[string]interface{}, rawTree map[string]interface{}, fetchErrs map[string]string) bool { + memberPaths := redfishCollectionMemberRefs(collectionDoc) + if len(memberPaths) == 0 { + return false + } + for _, memberPath := range memberPaths { + memberPath = normalizeRedfishPath(memberPath) + if memberPath == "" { + continue + } + if _, exists := rawTree[memberPath]; exists { + continue + } + if msg, hasErr := fetchErrs[memberPath]; hasErr && !isRetryableRedfishFetchError(fmt.Errorf("%s", msg)) { + continue + } + return true + } + return false +} + func (c *RedfishConnector) httpClient(req Request) *http.Client { return c.httpClientWithTimeout(req, c.timeout) } @@ -656,7 +767,7 @@ func (c *RedfishConnector) discoverMemberPaths(ctx context.Context, client *http return nil } -func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) (map[string]interface{}, []map[string]interface{}) { +func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) (map[string]interface{}, []map[string]interface{}, redfishPostProbeMetrics) { maxDocuments := redfishSnapshotMaxDocuments() workers := redfishSnapshotWorkers() const heartbeatInterval = 5 * time.Second @@ -667,6 +778,7 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht branchRetryPause := redfishSnapshotBranchRequeueBackoff() timings := newRedfishPathTimingCollector(4) + postProbeMetrics := redfishPostProbeMetrics{} out := make(map[string]interface{}, maxDocuments) fetchErrors := make(map[string]string) seen := make(map[string]struct{}, maxDocuments) @@ -869,13 +981,22 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht // Some Supermicro BMCs expose NVMe disks at direct Disk.Bay endpoints even when the // Drives collection returns Members: []. Probe those paths so raw export can be replayed. + postProbeTotalStart := time.Now() driveCollections := make([]string, 0) - for path := range out { - if strings.HasSuffix(normalizeRedfishPath(path), "/Drives") { - driveCollections = append(driveCollections, normalizeRedfishPath(path)) + for path, docAny := range out { + normalized := normalizeRedfishPath(path) + if !strings.HasSuffix(normalized, "/Drives") { + continue } + postProbeMetrics.NVMECandidates++ + doc, _ := docAny.(map[string]interface{}) + if !shouldAdaptiveNVMeProbe(doc) { + continue + } + driveCollections = append(driveCollections, normalized) } sort.Strings(driveCollections) + postProbeMetrics.NVMESelected = len(driveCollections) nvmeProbeStart := time.Now() for i, path := range driveCollections { if emit != nil && len(driveCollections) > 0 && (i == 0 || i%4 == 0 || i == len(driveCollections)-1) { @@ -893,19 +1014,35 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht if !looksLikeDrive(doc) { continue } - out[normalizeRedfishPath(bayPath)] = doc + normalizedBayPath := normalizeRedfishPath(bayPath) + if _, exists := out[normalizedBayPath]; exists { + continue + } + out[normalizedBayPath] = doc + postProbeMetrics.NVMEAdded++ c.debugSnapshotf("snapshot nvme bay probe hit path=%s", bayPath) } } // Some BMCs under-report collection Members for sensors/PSU subresources but still serve // direct numeric child endpoints. Probe common collections to maximize raw snapshot fidelity. postProbeCollections := make([]string, 0) - for path := range out { - if shouldPostProbeCollectionPath(path) { - postProbeCollections = append(postProbeCollections, normalizeRedfishPath(path)) + for path, docAny := range out { + normalized := normalizeRedfishPath(path) + if !shouldPostProbeCollectionPath(normalized) { + continue + } + postProbeMetrics.CollectionCandidates++ + doc, _ := docAny.(map[string]interface{}) + if shouldAdaptivePostProbeCollectionPath(normalized, doc) { + postProbeCollections = append(postProbeCollections, normalized) + continue + } + if redfishCollectionHasExplicitMembers(doc) { + postProbeMetrics.SkippedExplicit++ } } sort.Strings(postProbeCollections) + postProbeMetrics.CollectionSelected = len(postProbeCollections) postProbeStart := time.Now() addedPostProbe := 0 for i, path := range postProbeCollections { @@ -924,6 +1061,8 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht addedPostProbe++ } } + postProbeMetrics.Added = addedPostProbe + postProbeMetrics.Duration = time.Since(postProbeTotalStart) if emit != nil && addedPostProbe > 0 { emit(Progress{ Status: "running", @@ -931,6 +1070,13 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht Message: fmt.Sprintf("Redfish snapshot: post-probe добавлено %d документов", addedPostProbe), }) } + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 98, + Message: fmt.Sprintf("Redfish snapshot: post-probe метрики candidates=%d selected=%d skipped_explicit=%d added=%d", postProbeMetrics.CollectionCandidates, postProbeMetrics.CollectionSelected, postProbeMetrics.SkippedExplicit, postProbeMetrics.Added), + }) + } if emit != nil { emit(Progress{ @@ -963,7 +1109,7 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht } } - return out, errorList + return out, errorList, postProbeMetrics } func (c *RedfishConnector) probeSupermicroNVMeDiskBays(ctx context.Context, client *http.Client, req Request, baseURL, backplanePath string) []map[string]interface{} { @@ -1142,6 +1288,58 @@ func shouldPostProbeCollectionPath(path string) bool { } } +func shouldAdaptivePostProbeCollectionPath(path string, collectionDoc map[string]interface{}) bool { + path = normalizeRedfishPath(path) + if !shouldPostProbeCollectionPath(path) { + return false + } + if len(collectionDoc) == 0 { + return true + } + memberRefs := redfishCollectionMemberRefs(collectionDoc) + if len(memberRefs) == 0 { + return true + } + return redfishCollectionHasNumericMemberRefs(memberRefs) +} + +func shouldAdaptiveNVMeProbe(collectionDoc map[string]interface{}) bool { + if len(collectionDoc) == 0 { + return true + } + return !redfishCollectionHasExplicitMembers(collectionDoc) +} + +func redfishCollectionHasNumericMemberRefs(memberRefs []string) bool { + for _, memberPath := range memberRefs { + if redfishPathTailIsNumeric(memberPath) { + return true + } + } + return false +} + +func redfishPathTailIsNumeric(path string) bool { + normalized := normalizeRedfishPath(path) + if normalized == "" { + return false + } + parts := strings.Split(strings.Trim(normalized, "/"), "/") + if len(parts) == 0 { + return false + } + tail := strings.TrimSpace(parts[len(parts)-1]) + if tail == "" { + return false + } + for _, r := range tail { + if r < '0' || r > '9' { + return false + } + } + return true +} + func looksLikeRedfishResource(doc map[string]interface{}) bool { if len(doc) == 0 { return false diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index c622ff1..2937ca8 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -1634,6 +1634,89 @@ func TestShouldPostProbeCollectionPath(t *testing.T) { } } +func TestShouldAdaptivePostProbeCollectionPath(t *testing.T) { + withExplicitNamedMembers := map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1/EthernetInterfaces/NIC-0-0"}, + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1/EthernetInterfaces/NIC-0-1"}, + }, + } + if shouldAdaptivePostProbeCollectionPath("/redfish/v1/Systems/1/EthernetInterfaces", withExplicitNamedMembers) { + t.Fatalf("expected explicit non-numeric members to skip adaptive post-probe") + } + + withNumericMembers := map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/1"}, + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/2"}, + }, + } + if !shouldAdaptivePostProbeCollectionPath("/redfish/v1/Chassis/1/PCIeDevices", withNumericMembers) { + t.Fatalf("expected numeric members to allow adaptive post-probe") + } + + withoutMembers := map[string]interface{}{"Name": "Drives"} + if !shouldAdaptivePostProbeCollectionPath("/redfish/v1/Chassis/1/Drives", withoutMembers) { + t.Fatalf("expected missing members to allow adaptive post-probe") + } +} + +func TestShouldAdaptiveNVMeProbe(t *testing.T) { + withMembers := map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/Drives/OB01"}, + }, + } + if shouldAdaptiveNVMeProbe(withMembers) { + t.Fatalf("expected drives collection with explicit members to skip NVMe probe") + } + + withoutMembers := map[string]interface{}{"Name": "Drives"} + if !shouldAdaptiveNVMeProbe(withoutMembers) { + t.Fatalf("expected drives collection without members to allow NVMe probe") + } +} + +func TestRedfishAdaptivePrefetchTargets(t *testing.T) { + candidates := []string{ + "/redfish/v1/Systems/1/Memory", + "/redfish/v1/Systems/1/Processors", + "/redfish/v1/Systems/1/Storage", + } + rawTree := map[string]interface{}{ + "/redfish/v1/Systems/1/Memory": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1/Memory/DIMM1"}, + }, + }, + "/redfish/v1/Systems/1/Storage": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1/Storage/1"}, + }, + }, + } + fetchErrs := map[string]string{ + "/redfish/v1/Systems/1/Memory/DIMM1": "Get \"https://bmc/redfish/v1/Systems/1/Memory/DIMM1\": context deadline exceeded", + "/redfish/v1/Systems/1/Storage/1": "status 404 from /redfish/v1/Systems/1/Storage/1: not found", + "/redfish/v1/Systems/1/Processors": "Get \"https://bmc/redfish/v1/Systems/1/Processors\": context deadline exceeded", + "/redfish/v1/Systems/1/Storage/Volumes": "status 404 from /redfish/v1/Systems/1/Storage/Volumes: not found", + } + + got := redfishAdaptivePrefetchTargets(candidates, rawTree, fetchErrs) + joined := strings.Join(got, "\n") + for _, wanted := range []string{ + "/redfish/v1/Systems/1/Memory", + "/redfish/v1/Systems/1/Processors", + } { + if !strings.Contains(joined, wanted) { + t.Fatalf("expected adaptive prefetch target %q", wanted) + } + } + if strings.Contains(joined, "/redfish/v1/Systems/1/Storage") { + t.Fatalf("expected storage with only non-retryable missing members to be skipped") + } +} + func TestRedfishSnapshotPrioritySeeds_DefaultSkipsNoisyBranches(t *testing.T) { seeds := redfishSnapshotPrioritySeeds( []string{"/redfish/v1/Systems/1"},