diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index 05cb8e0..882e1ca 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -80,6 +80,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre systemPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Systems", "/redfish/v1/Systems/1") chassisPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Chassis", "/redfish/v1/Chassis/1") managerPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Managers", "/redfish/v1/Managers/1") + criticalPaths := redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths) + criticalClient := c.httpClientWithTimeout(req, redfishCriticalRequestTimeout()) + criticalWarmDocs, criticalWarmErrs := c.collectCriticalRedfishDocsSequential(ctx, criticalClient, req, baseURL, criticalPaths) if emit != nil { emit(Progress{Status: "running", Progress: 30, Message: "Redfish: чтение структуры Redfish..."}) @@ -90,27 +93,48 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port) rawTree, fetchErrors := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit) c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree)) + for p, doc := range criticalWarmDocs { + if _, ok := rawTree[p]; !ok { + rawTree[p] = doc + } + } + fetchErrMap := redfishFetchErrorListToMap(fetchErrors) + for p, msg := range criticalWarmErrs { + if _, ok := rawTree[p]; ok { + continue + } + if _, exists := fetchErrMap[p]; !exists { + fetchErrMap[p] = msg + } + } + if recoveredN := c.recoverCriticalRedfishDocsPlanB(ctx, criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, emit); recoveredN > 0 { + c.debugSnapshotf("critical plan-b recovered docs=%d", recoveredN) + } if emit != nil { emit(Progress{Status: 
"running", Progress: 99, Message: "Redfish: анализ raw snapshot..."}) } rawPayloads := map[string]any{ "redfish_tree": rawTree, } - if len(fetchErrors) > 0 { - rawPayloads["redfish_fetch_errors"] = fetchErrors + if len(fetchErrMap) > 0 { + rawPayloads["redfish_fetch_errors"] = redfishFetchErrorMapToList(fetchErrMap) } // Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree. return ReplayRedfishFromRawPayloads(rawPayloads, nil) } func (c *RedfishConnector) httpClient(req Request) *http.Client { + return c.httpClientWithTimeout(req, c.timeout) +} + +func (c *RedfishConnector) httpClientWithTimeout(req Request, timeout time.Duration) *http.Client { transport := &http.Transport{} if req.TLSMode == "insecure" { transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} //nolint:gosec } return &http.Client{ Transport: transport, - Timeout: c.timeout, + Timeout: timeout, } } @@ -448,7 +472,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http. for _, doc := range memberDocs { functionDocs := c.getLinkedPCIeFunctions(ctx, client, req, baseURL, doc) dev := parsePCIeDevice(doc, functionDocs) - key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) + key := pcieDeviceDedupKey(dev) if key == "" { continue } @@ -468,7 +492,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http. 
} for idx, fn := range functionDocs { dev := parsePCIeFunction(fn, idx+1) - key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) + key := pcieDeviceDedupKey(dev) if key == "" { continue } @@ -775,6 +799,40 @@ func (c *RedfishConnector) probeDirectRedfishCollectionChildren(ctx context.Cont return out } +func (c *RedfishConnector) probeDirectRedfishCollectionChildrenSlow(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string) map[string]map[string]interface{} { + normalized := normalizeRedfishPath(collectionPath) + maxItems, startIndex, missBudget := directNumericProbePlan(normalized) + if maxItems <= 0 { + return nil + } + out := make(map[string]map[string]interface{}) + consecutiveMisses := 0 + for i := startIndex; i <= maxItems; i++ { + if len(out) > 0 || i > startIndex { + select { + case <-time.After(redfishCriticalSlowGap()): + case <-ctx.Done(): + return out + } + } + path := fmt.Sprintf("%s/%d", normalized, i) + doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, path, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff()) + if err != nil { + consecutiveMisses++ + if consecutiveMisses >= missBudget { + break + } + continue + } + consecutiveMisses = 0 + if !looksLikeRedfishResource(doc) { + continue + } + out[normalizeRedfishPath(path)] = doc + } + return out +} + func directNumericProbePlan(collectionPath string) (maxItems, startIndex, missBudget int) { switch { case strings.HasSuffix(collectionPath, "/Systems"): @@ -848,6 +906,169 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool { return false } +func shouldSlowProbeCriticalCollection(p string) bool { + p = normalizeRedfishPath(p) + for _, suffix := range []string{ + "/Processors", + "/Memory", + "/Storage", + "/Drives", + "/Volumes", + "/PCIeDevices", + "/PCIeFunctions", + "/NetworkAdapters", + "/EthernetInterfaces", + "/NetworkInterfaces", + "/Sensors", + "/Fans", + "/Temperatures", + "/Voltages", + } { + if 
strings.HasSuffix(p, suffix) { + return true + } + } + return false +} + +func redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths []string) []string { + var out []string + seen := make(map[string]struct{}) + add := func(p string) { + p = normalizeRedfishPath(p) + if p == "" { + return + } + if _, ok := seen[p]; ok { + return + } + seen[p] = struct{}{} + out = append(out, p) + } + for _, p := range systemPaths { + add(p) + add(joinPath(p, "/Bios")) + add(joinPath(p, "/SecureBoot")) + add(joinPath(p, "/Processors")) + add(joinPath(p, "/Memory")) + add(joinPath(p, "/Storage")) + add(joinPath(p, "/SimpleStorage")) + add(joinPath(p, "/PCIeDevices")) + add(joinPath(p, "/EthernetInterfaces")) + add(joinPath(p, "/NetworkInterfaces")) + } + for _, p := range chassisPaths { + add(p) + add(joinPath(p, "/Power")) + add(joinPath(p, "/Thermal")) + add(joinPath(p, "/Sensors")) + add(joinPath(p, "/NetworkAdapters")) + add(joinPath(p, "/PCIeDevices")) + add(joinPath(p, "/Drives")) + } + for _, p := range managerPaths { + add(p) + add(joinPath(p, "/NetworkProtocol")) + } + add("/redfish/v1/UpdateService") + add("/redfish/v1/UpdateService/FirmwareInventory") + return out +} + +func redfishFetchErrorListToMap(list []map[string]interface{}) map[string]string { + out := make(map[string]string, len(list)) + for _, item := range list { + p := normalizeRedfishPath(asString(item["path"])) + if p == "" { + continue + } + out[p] = asString(item["error"]) + } + return out +} + +func redfishFetchErrorMapToList(m map[string]string) []map[string]interface{} { + if len(m) == 0 { + return nil + } + out := make([]map[string]interface{}, 0, len(m)) + for p, msg := range m { + out = append(out, map[string]interface{}{"path": p, "error": msg}) + } + sort.Slice(out, func(i, j int) bool { + return asString(out[i]["path"]) < asString(out[j]["path"]) + }) + return out +} + +func isRetryableRedfishFetchError(err error) bool { + if err == nil { + return false + } + msg := 
// redfishEnvDuration reads a time.Duration from environment variable name.
// fallback is returned when the variable is unset or blank, cannot be parsed
// by time.ParseDuration, is negative, or is zero while allowZero is false.
func redfishEnvDuration(name string, fallback time.Duration, allowZero bool) time.Duration {
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	d, err := time.ParseDuration(raw)
	if err != nil || d < 0 || (d == 0 && !allowZero) {
		return fallback
	}
	return d
}

// redfishEnvAttempts reads an attempt count from environment variable name.
// Only integers in the range 1..10 are accepted; anything else yields
// fallback.
func redfishEnvAttempts(name string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n < 1 || n > 10 {
		return fallback
	}
	return n
}

// redfishCriticalRequestTimeout returns the HTTP timeout for critical
// endpoint requests. Override with LOGPILE_REDFISH_CRITICAL_TIMEOUT (must be
// strictly positive); the default is 45s.
func redfishCriticalRequestTimeout() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_TIMEOUT", 45*time.Second, false)
}

// redfishCriticalRetryAttempts returns the number of attempts per critical
// endpoint. Override with LOGPILE_REDFISH_CRITICAL_RETRIES (1..10); the
// default is 3.
func redfishCriticalRetryAttempts() int {
	return redfishEnvAttempts("LOGPILE_REDFISH_CRITICAL_RETRIES", 3)
}

// redfishCriticalPlanBAttempts returns the number of attempts per request
// during plan-B recovery. Override with
// LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES (1..10); the default is 3.
func redfishCriticalPlanBAttempts() int {
	return redfishEnvAttempts("LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES", 3)
}

// redfishCriticalRetryBackoff returns the base delay between retry attempts.
// Override with LOGPILE_REDFISH_CRITICAL_BACKOFF (zero allowed); the default
// is 1.5s.
func redfishCriticalRetryBackoff() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_BACKOFF", 1500*time.Millisecond, true)
}

// redfishCriticalCooldown returns the pause taken before plan-B recovery
// starts. Override with LOGPILE_REDFISH_CRITICAL_COOLDOWN (zero allowed); the
// default is 4s.
func redfishCriticalCooldown() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_COOLDOWN", 4*time.Second, true)
}

// redfishCriticalSlowGap returns the pause between consecutive slow-path
// requests. Override with LOGPILE_REDFISH_CRITICAL_SLOW_GAP (zero allowed);
// the default is 1.2s.
func redfishCriticalSlowGap() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_SLOW_GAP", 1200*time.Millisecond, true)
}
doc[topKey].(map[string]interface{}) if !ok { @@ -870,6 +1091,36 @@ func redfishLinkRefs(doc map[string]interface{}, topKey, nestedKey string) []str return out } +func pcieDeviceDedupKey(dev models.PCIeDevice) string { + if bdf := strings.TrimSpace(dev.BDF); looksLikeCanonicalBDF(bdf) { + return strings.ToLower(bdf) + } + if s := strings.TrimSpace(dev.SerialNumber); s != "" { + return s + } + return firstNonEmpty( + strings.TrimSpace(dev.Slot)+"|"+strings.TrimSpace(dev.PartNumber)+"|"+strings.TrimSpace(dev.DeviceClass), + strings.TrimSpace(dev.Slot)+"|"+strings.TrimSpace(dev.DeviceClass), + strings.TrimSpace(dev.PartNumber)+"|"+strings.TrimSpace(dev.DeviceClass), + strings.TrimSpace(dev.Description)+"|"+strings.TrimSpace(dev.DeviceClass), + ) +} + +func looksLikeCanonicalBDF(bdf string) bool { + bdf = strings.TrimSpace(strings.ToLower(bdf)) + if bdf == "" { + return false + } + // Accept common forms: 0000:65:00.0 or 65:00.0 + if strings.Count(bdf, ":") == 2 && strings.Contains(bdf, ".") { + return true + } + if strings.Count(bdf, ":") == 1 && strings.Contains(bdf, ".") { + return true + } + return false +} + func shouldCrawlPath(path string) bool { if path == "" { return false @@ -1013,6 +1264,163 @@ func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req return doc, nil } +func (c *RedfishConnector) getJSONWithRetry(ctx context.Context, client *http.Client, req Request, baseURL, requestPath string, attempts int, backoff time.Duration) (map[string]interface{}, error) { + if attempts < 1 { + attempts = 1 + } + var lastErr error + for i := 0; i < attempts; i++ { + doc, err := c.getJSON(ctx, client, req, baseURL, requestPath) + if err == nil { + return doc, nil + } + lastErr = err + if i == attempts-1 || !isRetryableRedfishFetchError(err) { + break + } + if backoff > 0 { + select { + case <-time.After(backoff * time.Duration(i+1)): + case <-ctx.Done(): + return nil, ctx.Err() + } + } + } + return nil, lastErr +} + +func (c *RedfishConnector) 
collectCriticalRedfishDocsSequential(ctx context.Context, client *http.Client, req Request, baseURL string, paths []string) (map[string]interface{}, map[string]string) { + docs := make(map[string]interface{}) + errs := make(map[string]string) + for _, p := range paths { + doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, p, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff()) + if err != nil { + errs[p] = err.Error() + continue + } + docs[p] = doc + // For critical collections, eagerly fetch members sequentially with the same slow policy. + if members, ok := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, p, doc); ok { + for mp, md := range members { + docs[mp] = md + } + } + } + return docs, errs +} + +func (c *RedfishConnector) collectCriticalCollectionMembersSequential(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string, collectionDoc map[string]interface{}) (map[string]interface{}, bool) { + refs, ok := collectionDoc["Members"].([]interface{}) + if !ok || len(refs) == 0 { + return nil, false + } + out := make(map[string]interface{}) + for _, refAny := range refs { + ref, ok := refAny.(map[string]interface{}) + if !ok { + continue + } + memberPath := normalizeRedfishPath(asString(ref["@odata.id"])) + if memberPath == "" { + continue + } + doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, memberPath, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff()) + if err != nil { + continue + } + out[memberPath] = doc + } + return out, true +} + +func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, client *http.Client, req Request, baseURL string, criticalPaths []string, rawTree map[string]interface{}, fetchErrs map[string]string, emit ProgressFn) int { + var targets []string + for _, p := range criticalPaths { + p = normalizeRedfishPath(p) + if p == "" { + continue + } + if _, ok := rawTree[p]; ok { + continue + } + errMsg, hasErr := fetchErrs[p] + if 
!hasErr || !isRetryableRedfishFetchError(fmt.Errorf("%s", errMsg)) { + continue + } + targets = append(targets, p) + } + if len(targets) == 0 { + return 0 + } + if emit != nil { + emit(Progress{Status: "running", Progress: 97, Message: "Redfish: cooldown перед повторным добором критичных endpoint..."}) + } + select { + case <-time.After(redfishCriticalCooldown()): + case <-ctx.Done(): + return 0 + } + + recovered := 0 + for i, p := range targets { + if emit != nil { + emit(Progress{ + Status: "running", + Progress: 97, + Message: fmt.Sprintf("Redfish: plan-B (%d/%d) %s", i+1, len(targets), compactProgressPath(p)), + }) + } + if i > 0 { + select { + case <-time.After(redfishCriticalSlowGap()): + case <-ctx.Done(): + return recovered + } + } + doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, p, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff()) + if err == nil { + rawTree[p] = doc + delete(fetchErrs, p) + recovered++ + if members, ok := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, p, doc); ok { + for mp, md := range members { + if _, exists := rawTree[mp]; !exists { + rawTree[mp] = md + recovered++ + } + } + } + if shouldSlowProbeCriticalCollection(p) { + if children := c.probeDirectRedfishCollectionChildrenSlow(ctx, client, req, baseURL, p); len(children) > 0 { + for cp, cd := range children { + if _, exists := rawTree[cp]; exists { + continue + } + rawTree[cp] = cd + recovered++ + } + } + } + continue + } + fetchErrs[p] = err.Error() + // If collection endpoint times out, still try direct child probing for common numeric paths. 
+ if shouldSlowProbeCriticalCollection(p) { + if children := c.probeDirectRedfishCollectionChildrenSlow(ctx, client, req, baseURL, p); len(children) > 0 { + for cp, cd := range children { + if _, exists := rawTree[cp]; exists { + continue + } + rawTree[cp] = cd + recovered++ + } + delete(fetchErrs, p) + } + } + } + return recovered +} + func parseBoardInfo(system map[string]interface{}) models.BoardInfo { return models.BoardInfo{ Manufacturer: asString(system["Manufacturer"]), diff --git a/internal/collector/redfish_replay.go b/internal/collector/redfish_replay.go index a3144b2..2040676 100644 --- a/internal/collector/redfish_replay.go +++ b/internal/collector/redfish_replay.go @@ -534,7 +534,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st for _, doc := range memberDocs { functionDocs := r.getLinkedPCIeFunctions(doc) dev := parsePCIeDevice(doc, functionDocs) - key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) + key := pcieDeviceDedupKey(dev) if key == "" { continue } @@ -552,7 +552,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st } for idx, fn := range functionDocs { dev := parsePCIeFunction(fn, idx+1) - key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) + key := pcieDeviceDedupKey(dev) if key == "" { continue }