Harden Redfish collection for slow BMC endpoints

This commit is contained in:
Mikhail Chusavitin
2026-02-25 12:42:43 +03:00
parent b1dde592ae
commit 68592da9f5
2 changed files with 415 additions and 7 deletions

View File

@@ -80,6 +80,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
systemPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Systems", "/redfish/v1/Systems/1")
chassisPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Chassis", "/redfish/v1/Chassis/1")
managerPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Managers", "/redfish/v1/Managers/1")
criticalPaths := redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths)
criticalClient := c.httpClientWithTimeout(req, redfishCriticalRequestTimeout())
criticalWarmDocs, criticalWarmErrs := c.collectCriticalRedfishDocsSequential(ctx, criticalClient, req, baseURL, criticalPaths)
if emit != nil {
emit(Progress{Status: "running", Progress: 30, Message: "Redfish: чтение структуры Redfish..."})
@@ -90,27 +93,48 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port)
rawTree, fetchErrors := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit)
c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree))
for p, doc := range criticalWarmDocs {
if _, ok := rawTree[p]; !ok {
rawTree[p] = doc
}
}
fetchErrMap := redfishFetchErrorListToMap(fetchErrors)
for p, msg := range criticalWarmErrs {
if _, ok := rawTree[p]; ok {
continue
}
if _, exists := fetchErrMap[p]; !exists {
fetchErrMap[p] = msg
}
}
if recoveredN := c.recoverCriticalRedfishDocsPlanB(ctx, criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, emit); recoveredN > 0 {
c.debugSnapshotf("critical plan-b recovered docs=%d", recoveredN)
}
if emit != nil {
emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."})
}
rawPayloads := map[string]any{
"redfish_tree": rawTree,
}
if len(fetchErrors) > 0 {
rawPayloads["redfish_fetch_errors"] = fetchErrors
if len(fetchErrMap) > 0 {
rawPayloads["redfish_fetch_errors"] = redfishFetchErrorMapToList(fetchErrMap)
}
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
return ReplayRedfishFromRawPayloads(rawPayloads, nil)
}
// httpClient returns an HTTP client for req that uses the connector's
// configured c.timeout. It simply delegates to httpClientWithTimeout.
func (c *RedfishConnector) httpClient(req Request) *http.Client {
return c.httpClientWithTimeout(req, c.timeout)
}
// httpClientWithTimeout builds an HTTP client for req whose overall request
// timeout is the supplied value instead of the connector-wide c.timeout.
//
// A fresh http.Transport is created on every call. When req.TLSMode is
// "insecure", certificate verification is disabled on that transport.
func (c *RedfishConnector) httpClientWithTimeout(req Request, timeout time.Duration) *http.Client {
	transport := &http.Transport{}
	if req.TLSMode == "insecure" {
		transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} //nolint:gosec
	}
	return &http.Client{
		Transport: transport,
		// Only the caller-supplied timeout is set; a second Timeout field
		// (the old c.timeout value) would be a duplicate-field compile error
		// and would defeat the purpose of this helper.
		Timeout: timeout,
	}
}
@@ -448,7 +472,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http.
for _, doc := range memberDocs {
functionDocs := c.getLinkedPCIeFunctions(ctx, client, req, baseURL, doc)
dev := parsePCIeDevice(doc, functionDocs)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
key := pcieDeviceDedupKey(dev)
if key == "" {
continue
}
@@ -468,7 +492,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http.
}
for idx, fn := range functionDocs {
dev := parsePCIeFunction(fn, idx+1)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
key := pcieDeviceDedupKey(dev)
if key == "" {
continue
}
@@ -775,6 +799,40 @@ func (c *RedfishConnector) probeDirectRedfishCollectionChildren(ctx context.Cont
return out
}
// probeDirectRedfishCollectionChildrenSlow probes numeric children of a
// collection (<collection>/<n>) one at a time, pausing for
// redfishCriticalSlowGap() between requests.
//
// The index range and the number of tolerated consecutive failures come from
// directNumericProbePlan. Probing stops when that miss budget is exhausted,
// when the range is finished, or when ctx is cancelled during a pause. The
// result maps normalized child paths to their documents; nil is returned when
// the plan allows no items for this collection.
func (c *RedfishConnector) probeDirectRedfishCollectionChildrenSlow(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string) map[string]map[string]interface{} {
	base := normalizeRedfishPath(collectionPath)
	limit, first, allowedMisses := directNumericProbePlan(base)
	if limit <= 0 {
		return nil
	}

	found := make(map[string]map[string]interface{})
	missStreak := 0
	for idx := first; idx <= limit; idx++ {
		// Pause before every request except the very first one.
		if idx > first || len(found) > 0 {
			select {
			case <-time.After(redfishCriticalSlowGap()):
			case <-ctx.Done():
				return found
			}
		}

		childPath := fmt.Sprintf("%s/%d", base, idx)
		doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, childPath, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff())
		if err != nil {
			missStreak++
			if missStreak >= allowedMisses {
				return found
			}
			continue
		}

		// Any successful response resets the streak, even if the payload is
		// later rejected as not looking like a Redfish resource.
		missStreak = 0
		if looksLikeRedfishResource(doc) {
			found[normalizeRedfishPath(childPath)] = doc
		}
	}
	return found
}
func directNumericProbePlan(collectionPath string) (maxItems, startIndex, missBudget int) {
switch {
case strings.HasSuffix(collectionPath, "/Systems"):
@@ -848,6 +906,169 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool {
return false
}
// shouldSlowProbeCriticalCollection reports whether the normalized form of p
// ends with one of the collection suffixes that are eligible for slow direct
// child probing.
func shouldSlowProbeCriticalCollection(p string) bool {
	normalized := normalizeRedfishPath(p)
	slowProbeSuffixes := [...]string{
		"/Processors",
		"/Memory",
		"/Storage",
		"/Drives",
		"/Volumes",
		"/PCIeDevices",
		"/PCIeFunctions",
		"/NetworkAdapters",
		"/EthernetInterfaces",
		"/NetworkInterfaces",
		"/Sensors",
		"/Fans",
		"/Temperatures",
		"/Voltages",
	}
	for i := range slowProbeSuffixes {
		if strings.HasSuffix(normalized, slowProbeSuffixes[i]) {
			return true
		}
	}
	return false
}
// redfishCriticalEndpoints builds the ordered, de-duplicated list of
// endpoints treated as critical: every system, chassis and manager root with
// a fixed set of sub-resources each, followed by the UpdateService and its
// FirmwareInventory. Paths are normalized before de-duplication and paths
// that normalize to the empty string are dropped.
func redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths []string) []string {
	var ordered []string
	known := make(map[string]struct{})

	register := func(raw string) {
		normalized := normalizeRedfishPath(raw)
		if normalized == "" {
			return
		}
		if _, dup := known[normalized]; dup {
			return
		}
		known[normalized] = struct{}{}
		ordered = append(ordered, normalized)
	}
	// registerGroup adds each root followed by its sub-resources, in order.
	registerGroup := func(roots []string, children ...string) {
		for _, root := range roots {
			register(root)
			for _, child := range children {
				register(joinPath(root, child))
			}
		}
	}

	registerGroup(systemPaths,
		"/Bios",
		"/SecureBoot",
		"/Processors",
		"/Memory",
		"/Storage",
		"/SimpleStorage",
		"/PCIeDevices",
		"/EthernetInterfaces",
		"/NetworkInterfaces",
	)
	registerGroup(chassisPaths,
		"/Power",
		"/Thermal",
		"/Sensors",
		"/NetworkAdapters",
		"/PCIeDevices",
		"/Drives",
	)
	registerGroup(managerPaths, "/NetworkProtocol")

	register("/redfish/v1/UpdateService")
	register("/redfish/v1/UpdateService/FirmwareInventory")
	return ordered
}
// redfishFetchErrorListToMap converts a list of {"path", "error"} records
// into a map keyed by normalized path. Records whose path normalizes to the
// empty string are ignored; for repeated paths the last record wins.
func redfishFetchErrorListToMap(list []map[string]interface{}) map[string]string {
	byPath := make(map[string]string, len(list))
	for i := range list {
		entry := list[i]
		key := normalizeRedfishPath(asString(entry["path"]))
		if key != "" {
			byPath[key] = asString(entry["error"])
		}
	}
	return byPath
}
// redfishFetchErrorMapToList converts a path -> message map into a list of
// {"path", "error"} records sorted by path, so the output order does not
// depend on map iteration order. An empty map yields nil.
func redfishFetchErrorMapToList(m map[string]string) []map[string]interface{} {
	if len(m) == 0 {
		return nil
	}
	entries := make([]map[string]interface{}, 0, len(m))
	for path, message := range m {
		record := map[string]interface{}{
			"path":  path,
			"error": message,
		}
		entries = append(entries, record)
	}
	byPath := func(a, b int) bool {
		return asString(entries[a]["path"]) < asString(entries[b]["path"])
	}
	sort.Slice(entries, byPath)
	return entries
}
// isRetryableRedfishFetchError classifies an error as transient by inspecting
// its lower-cased message: timeout, deadline, connection-reset and
// unexpected-EOF texts anywhere in the message, or a message that starts
// with "status 500 ", "status 502 ", "status 503 " or "status 504 ".
// A nil error is never retryable.
func isRetryableRedfishFetchError(err error) bool {
	if err == nil {
		return false
	}
	text := strings.ToLower(err.Error())

	transientMarkers := [...]string{
		"timeout",
		"deadline exceeded",
		"connection reset",
		"unexpected eof",
	}
	for _, marker := range transientMarkers {
		if strings.Contains(text, marker) {
			return true
		}
	}

	// The trailing space keeps e.g. "status 5030 ..." from matching.
	retryableStatusPrefixes := [...]string{
		"status 500 ",
		"status 502 ",
		"status 503 ",
		"status 504 ",
	}
	for _, prefix := range retryableStatusPrefixes {
		if strings.HasPrefix(text, prefix) {
			return true
		}
	}
	return false
}
// redfishEnvDuration reads a duration override from the environment variable
// name. The value is whitespace-trimmed and parsed with time.ParseDuration.
// fallback is returned when the variable is unset or blank, does not parse,
// is negative, or is zero while allowZero is false.
func redfishEnvDuration(name string, fallback time.Duration, allowZero bool) time.Duration {
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	d, err := time.ParseDuration(raw)
	if err != nil || d < 0 || (d == 0 && !allowZero) {
		return fallback
	}
	return d
}

// redfishEnvAttempts reads an attempt-count override from the environment
// variable name. Only integers in the range 1..10 are accepted; anything else
// (unset, blank, unparsable, out of range) yields fallback.
func redfishEnvAttempts(name string, fallback int) int {
	const minAttempts, maxAttempts = 1, 10
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n < minAttempts || n > maxAttempts {
		return fallback
	}
	return n
}

// redfishCriticalRequestTimeout returns the HTTP timeout for critical
// endpoints: LOGPILE_REDFISH_CRITICAL_TIMEOUT when it holds a strictly
// positive duration, otherwise 45s.
func redfishCriticalRequestTimeout() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_TIMEOUT", 45*time.Second, false)
}

// redfishCriticalRetryAttempts returns the attempt count for critical
// endpoints: LOGPILE_REDFISH_CRITICAL_RETRIES when it is within 1..10,
// otherwise 3.
func redfishCriticalRetryAttempts() int {
	return redfishEnvAttempts("LOGPILE_REDFISH_CRITICAL_RETRIES", 3)
}

// redfishCriticalPlanBAttempts returns the attempt count used by the plan-B
// recovery pass: LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES when it is within
// 1..10, otherwise 3.
func redfishCriticalPlanBAttempts() int {
	return redfishEnvAttempts("LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES", 3)
}

// redfishCriticalRetryBackoff returns the base delay between retries:
// LOGPILE_REDFISH_CRITICAL_BACKOFF when it is a non-negative duration (zero
// is allowed), otherwise 1.5s.
func redfishCriticalRetryBackoff() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_BACKOFF", 1500*time.Millisecond, true)
}

// redfishCriticalCooldown returns the pause taken before the plan-B recovery
// pass: LOGPILE_REDFISH_CRITICAL_COOLDOWN when it is a non-negative duration
// (zero is allowed), otherwise 4s.
func redfishCriticalCooldown() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_COOLDOWN", 4*time.Second, true)
}

// redfishCriticalSlowGap returns the pause between consecutive slow requests:
// LOGPILE_REDFISH_CRITICAL_SLOW_GAP when it is a non-negative duration (zero
// is allowed), otherwise 1.2s.
func redfishCriticalSlowGap() time.Duration {
	return redfishEnvDuration("LOGPILE_REDFISH_CRITICAL_SLOW_GAP", 1200*time.Millisecond, true)
}
func redfishLinkRefs(doc map[string]interface{}, topKey, nestedKey string) []string {
top, ok := doc[topKey].(map[string]interface{})
if !ok {
@@ -870,6 +1091,36 @@ func redfishLinkRefs(doc map[string]interface{}, topKey, nestedKey string) []str
return out
}
// pcieDeviceDedupKey derives the key used to de-duplicate PCIe devices.
//
// Preference order:
//  1. the lower-cased BDF, when it passes looksLikeCanonicalBDF;
//  2. the trimmed serial number;
//  3. "slot|part|class" when a slot or a part number is present;
//  4. "description|class" when only a description is present;
//  5. "||class" when only the device class is present.
//
// It returns "" when the device carries no identifying field at all, so the
// callers' `key == ""` check can skip it. Composite keys are chosen by
// looking at the individual fields: a joined candidate such as "||" is never
// an empty string, so testing the joined string would always pick the first
// candidate and would lump every anonymous device under one key.
func pcieDeviceDedupKey(dev models.PCIeDevice) string {
	if bdf := strings.TrimSpace(dev.BDF); looksLikeCanonicalBDF(bdf) {
		return strings.ToLower(bdf)
	}
	if serial := strings.TrimSpace(dev.SerialNumber); serial != "" {
		return serial
	}

	slot := strings.TrimSpace(dev.Slot)
	part := strings.TrimSpace(dev.PartNumber)
	class := strings.TrimSpace(dev.DeviceClass)
	desc := strings.TrimSpace(dev.Description)

	switch {
	case slot != "" || part != "":
		return slot + "|" + part + "|" + class
	case desc != "":
		return desc + "|" + class
	case class != "":
		// Same key shape the three-part form yields for a class-only device.
		return "||" + class
	}
	return ""
}
// looksLikeCanonicalBDF is a loose shape check for a PCI bus/device/function
// string. It accepts the common forms 0000:65:00.0 and 65:00.0 by requiring a
// dot plus exactly one or two colons; it does not validate the digits.
func looksLikeCanonicalBDF(bdf string) bool {
	candidate := strings.ToLower(strings.TrimSpace(bdf))
	if candidate == "" || !strings.Contains(candidate, ".") {
		return false
	}
	switch strings.Count(candidate, ":") {
	case 1, 2:
		return true
	default:
		return false
	}
}
func shouldCrawlPath(path string) bool {
if path == "" {
return false
@@ -1013,6 +1264,163 @@ func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req
return doc, nil
}
// getJSONWithRetry calls getJSON up to attempts times (at least once) and
// returns the first successful document.
//
// A failure is retried only when isRetryableRedfishFetchError accepts it and
// attempts remain. When backoff is positive, the wait before the next try
// grows linearly (backoff, 2*backoff, ...); if ctx is cancelled during that
// wait, ctx.Err() is returned. Otherwise the last error from getJSON is
// returned.
func (c *RedfishConnector) getJSONWithRetry(ctx context.Context, client *http.Client, req Request, baseURL, requestPath string, attempts int, backoff time.Duration) (map[string]interface{}, error) {
	if attempts < 1 {
		attempts = 1
	}

	var lastErr error
	for attempt := 1; attempt <= attempts; attempt++ {
		doc, err := c.getJSON(ctx, client, req, baseURL, requestPath)
		if err == nil {
			return doc, nil
		}
		lastErr = err

		finalAttempt := attempt == attempts
		if finalAttempt || !isRetryableRedfishFetchError(err) {
			return nil, lastErr
		}
		if backoff <= 0 {
			continue
		}
		select {
		case <-time.After(backoff * time.Duration(attempt)):
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}
	return nil, lastErr
}
// collectCriticalRedfishDocsSequential fetches the given paths one after
// another using the critical retry policy.
//
// It returns the fetched documents keyed by path and the error messages of
// the paths that failed. For every document that turns out to be a collection
// with members, the members are fetched eagerly and sequentially as well and
// added to the document map.
func (c *RedfishConnector) collectCriticalRedfishDocsSequential(ctx context.Context, client *http.Client, req Request, baseURL string, paths []string) (map[string]interface{}, map[string]string) {
	fetched := make(map[string]interface{})
	failures := make(map[string]string)

	for _, endpoint := range paths {
		doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, endpoint, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff())
		if err != nil {
			failures[endpoint] = err.Error()
			continue
		}
		fetched[endpoint] = doc

		members, isCollection := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, endpoint, doc)
		if !isCollection {
			continue
		}
		for memberPath, memberDoc := range members {
			fetched[memberPath] = memberDoc
		}
	}
	return fetched, failures
}
// collectCriticalCollectionMembersSequential fetches every entry listed in
// collectionDoc["Members"] sequentially with the critical retry policy.
//
// The boolean is false when the document has no non-empty Members array.
// Otherwise it is true and the map holds the members that could be fetched,
// keyed by normalized path; members that fail to load, or whose reference is
// not an object with a usable "@odata.id", are skipped silently.
// collectionPath is not used by this function.
func (c *RedfishConnector) collectCriticalCollectionMembersSequential(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string, collectionDoc map[string]interface{}) (map[string]interface{}, bool) {
	rawMembers, isList := collectionDoc["Members"].([]interface{})
	if !isList || len(rawMembers) == 0 {
		return nil, false
	}

	memberDocs := make(map[string]interface{})
	for idx := range rawMembers {
		link, isObject := rawMembers[idx].(map[string]interface{})
		if !isObject {
			continue
		}
		target := normalizeRedfishPath(asString(link["@odata.id"]))
		if target == "" {
			continue
		}
		if doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, target, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff()); err == nil {
			memberDocs[target] = doc
		}
	}
	return memberDocs, true
}
// recoverCriticalRedfishDocsPlanB makes a second, slow attempt at critical
// endpoints that are still missing from rawTree and whose recorded error in
// fetchErrs is classified as retryable.
//
// After a cooldown it walks the targets sequentially with a gap between
// requests. For each recovered endpoint it stores the document in rawTree,
// clears its entry in fetchErrs, fetches collection members and, for
// collections accepted by shouldSlowProbeCriticalCollection, also probes
// numeric children directly. When the endpoint itself still fails, the fresh
// error is recorded and the direct child probe is tried anyway; the error is
// cleared if that probe finds any children. rawTree and fetchErrs are
// modified in place.
//
// It returns the number of documents newly added to rawTree. Cancellation of
// ctx during the cooldown returns 0; during a gap it returns the count so far.
func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, client *http.Client, req Request, baseURL string, criticalPaths []string, rawTree map[string]interface{}, fetchErrs map[string]string, emit ProgressFn) int {
	var pending []string
	for _, candidate := range criticalPaths {
		candidate = normalizeRedfishPath(candidate)
		if candidate == "" {
			continue
		}
		if _, present := rawTree[candidate]; present {
			continue
		}
		reason, failed := fetchErrs[candidate]
		if failed && isRetryableRedfishFetchError(fmt.Errorf("%s", reason)) {
			pending = append(pending, candidate)
		}
	}
	if len(pending) == 0 {
		return 0
	}

	planBProgress := func(message string) Progress {
		return Progress{Status: "running", Progress: 97, Message: message}
	}

	if emit != nil {
		emit(planBProgress("Redfish: cooldown перед повторным добором критичных endpoint..."))
	}
	select {
	case <-time.After(redfishCriticalCooldown()):
	case <-ctx.Done():
		return 0
	}

	recovered := 0
	// adopt stores doc under path unless rawTree already has that path.
	adopt := func(path string, doc interface{}) {
		if _, exists := rawTree[path]; exists {
			return
		}
		rawTree[path] = doc
		recovered++
	}
	// probeChildren runs the slow direct child probe for eligible collections
	// and reports whether the probe returned any children.
	probeChildren := func(collection string) bool {
		if !shouldSlowProbeCriticalCollection(collection) {
			return false
		}
		children := c.probeDirectRedfishCollectionChildrenSlow(ctx, client, req, baseURL, collection)
		for childPath, childDoc := range children {
			adopt(childPath, childDoc)
		}
		return len(children) > 0
	}

	for idx, target := range pending {
		if emit != nil {
			emit(planBProgress(fmt.Sprintf("Redfish: plan-B (%d/%d) %s", idx+1, len(pending), compactProgressPath(target))))
		}
		if idx > 0 {
			select {
			case <-time.After(redfishCriticalSlowGap()):
			case <-ctx.Done():
				return recovered
			}
		}

		doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, target, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff())
		if err != nil {
			fetchErrs[target] = err.Error()
			// Even if the collection endpoint keeps failing, its children may
			// still be reachable through direct numeric paths.
			if probeChildren(target) {
				delete(fetchErrs, target)
			}
			continue
		}

		rawTree[target] = doc
		delete(fetchErrs, target)
		recovered++
		if members, ok := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, target, doc); ok {
			for memberPath, memberDoc := range members {
				adopt(memberPath, memberDoc)
			}
		}
		probeChildren(target)
	}
	return recovered
}
func parseBoardInfo(system map[string]interface{}) models.BoardInfo {
return models.BoardInfo{
Manufacturer: asString(system["Manufacturer"]),

View File

@@ -534,7 +534,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st
for _, doc := range memberDocs {
functionDocs := r.getLinkedPCIeFunctions(doc)
dev := parsePCIeDevice(doc, functionDocs)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
key := pcieDeviceDedupKey(dev)
if key == "" {
continue
}
@@ -552,7 +552,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st
}
for idx, fn := range functionDocs {
dev := parsePCIeFunction(fn, idx+1)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
key := pcieDeviceDedupKey(dev)
if key == "" {
continue
}