Harden Redfish collection for slow BMC endpoints
This commit is contained in:
@@ -80,6 +80,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
systemPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Systems", "/redfish/v1/Systems/1")
|
||||
chassisPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Chassis", "/redfish/v1/Chassis/1")
|
||||
managerPaths := c.discoverMemberPaths(ctx, client, req, baseURL, "/redfish/v1/Managers", "/redfish/v1/Managers/1")
|
||||
criticalPaths := redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths)
|
||||
criticalClient := c.httpClientWithTimeout(req, redfishCriticalRequestTimeout())
|
||||
criticalWarmDocs, criticalWarmErrs := c.collectCriticalRedfishDocsSequential(ctx, criticalClient, req, baseURL, criticalPaths)
|
||||
|
||||
if emit != nil {
|
||||
emit(Progress{Status: "running", Progress: 30, Message: "Redfish: чтение структуры Redfish..."})
|
||||
@@ -90,27 +93,48 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port)
|
||||
rawTree, fetchErrors := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit)
|
||||
c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree))
|
||||
for p, doc := range criticalWarmDocs {
|
||||
if _, ok := rawTree[p]; !ok {
|
||||
rawTree[p] = doc
|
||||
}
|
||||
}
|
||||
fetchErrMap := redfishFetchErrorListToMap(fetchErrors)
|
||||
for p, msg := range criticalWarmErrs {
|
||||
if _, ok := rawTree[p]; ok {
|
||||
continue
|
||||
}
|
||||
if _, exists := fetchErrMap[p]; !exists {
|
||||
fetchErrMap[p] = msg
|
||||
}
|
||||
}
|
||||
if recoveredN := c.recoverCriticalRedfishDocsPlanB(ctx, criticalClient, req, baseURL, criticalPaths, rawTree, fetchErrMap, emit); recoveredN > 0 {
|
||||
c.debugSnapshotf("critical plan-b recovered docs=%d", recoveredN)
|
||||
}
|
||||
if emit != nil {
|
||||
emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."})
|
||||
}
|
||||
rawPayloads := map[string]any{
|
||||
"redfish_tree": rawTree,
|
||||
}
|
||||
if len(fetchErrors) > 0 {
|
||||
rawPayloads["redfish_fetch_errors"] = fetchErrors
|
||||
if len(fetchErrMap) > 0 {
|
||||
rawPayloads["redfish_fetch_errors"] = redfishFetchErrorMapToList(fetchErrMap)
|
||||
}
|
||||
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
|
||||
return ReplayRedfishFromRawPayloads(rawPayloads, nil)
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) httpClient(req Request) *http.Client {
|
||||
return c.httpClientWithTimeout(req, c.timeout)
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) httpClientWithTimeout(req Request, timeout time.Duration) *http.Client {
|
||||
transport := &http.Transport{}
|
||||
if req.TLSMode == "insecure" {
|
||||
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} //nolint:gosec
|
||||
}
|
||||
return &http.Client{
|
||||
Transport: transport,
|
||||
Timeout: c.timeout,
|
||||
Timeout: timeout,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -448,7 +472,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http.
|
||||
for _, doc := range memberDocs {
|
||||
functionDocs := c.getLinkedPCIeFunctions(ctx, client, req, baseURL, doc)
|
||||
dev := parsePCIeDevice(doc, functionDocs)
|
||||
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
|
||||
key := pcieDeviceDedupKey(dev)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
@@ -468,7 +492,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http.
|
||||
}
|
||||
for idx, fn := range functionDocs {
|
||||
dev := parsePCIeFunction(fn, idx+1)
|
||||
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
|
||||
key := pcieDeviceDedupKey(dev)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
@@ -775,6 +799,40 @@ func (c *RedfishConnector) probeDirectRedfishCollectionChildren(ctx context.Cont
|
||||
return out
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) probeDirectRedfishCollectionChildrenSlow(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string) map[string]map[string]interface{} {
|
||||
normalized := normalizeRedfishPath(collectionPath)
|
||||
maxItems, startIndex, missBudget := directNumericProbePlan(normalized)
|
||||
if maxItems <= 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]map[string]interface{})
|
||||
consecutiveMisses := 0
|
||||
for i := startIndex; i <= maxItems; i++ {
|
||||
if len(out) > 0 || i > startIndex {
|
||||
select {
|
||||
case <-time.After(redfishCriticalSlowGap()):
|
||||
case <-ctx.Done():
|
||||
return out
|
||||
}
|
||||
}
|
||||
path := fmt.Sprintf("%s/%d", normalized, i)
|
||||
doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, path, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff())
|
||||
if err != nil {
|
||||
consecutiveMisses++
|
||||
if consecutiveMisses >= missBudget {
|
||||
break
|
||||
}
|
||||
continue
|
||||
}
|
||||
consecutiveMisses = 0
|
||||
if !looksLikeRedfishResource(doc) {
|
||||
continue
|
||||
}
|
||||
out[normalizeRedfishPath(path)] = doc
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func directNumericProbePlan(collectionPath string) (maxItems, startIndex, missBudget int) {
|
||||
switch {
|
||||
case strings.HasSuffix(collectionPath, "/Systems"):
|
||||
@@ -848,6 +906,169 @@ func looksLikeRedfishResource(doc map[string]interface{}) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func shouldSlowProbeCriticalCollection(p string) bool {
|
||||
p = normalizeRedfishPath(p)
|
||||
for _, suffix := range []string{
|
||||
"/Processors",
|
||||
"/Memory",
|
||||
"/Storage",
|
||||
"/Drives",
|
||||
"/Volumes",
|
||||
"/PCIeDevices",
|
||||
"/PCIeFunctions",
|
||||
"/NetworkAdapters",
|
||||
"/EthernetInterfaces",
|
||||
"/NetworkInterfaces",
|
||||
"/Sensors",
|
||||
"/Fans",
|
||||
"/Temperatures",
|
||||
"/Voltages",
|
||||
} {
|
||||
if strings.HasSuffix(p, suffix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths []string) []string {
|
||||
var out []string
|
||||
seen := make(map[string]struct{})
|
||||
add := func(p string) {
|
||||
p = normalizeRedfishPath(p)
|
||||
if p == "" {
|
||||
return
|
||||
}
|
||||
if _, ok := seen[p]; ok {
|
||||
return
|
||||
}
|
||||
seen[p] = struct{}{}
|
||||
out = append(out, p)
|
||||
}
|
||||
for _, p := range systemPaths {
|
||||
add(p)
|
||||
add(joinPath(p, "/Bios"))
|
||||
add(joinPath(p, "/SecureBoot"))
|
||||
add(joinPath(p, "/Processors"))
|
||||
add(joinPath(p, "/Memory"))
|
||||
add(joinPath(p, "/Storage"))
|
||||
add(joinPath(p, "/SimpleStorage"))
|
||||
add(joinPath(p, "/PCIeDevices"))
|
||||
add(joinPath(p, "/EthernetInterfaces"))
|
||||
add(joinPath(p, "/NetworkInterfaces"))
|
||||
}
|
||||
for _, p := range chassisPaths {
|
||||
add(p)
|
||||
add(joinPath(p, "/Power"))
|
||||
add(joinPath(p, "/Thermal"))
|
||||
add(joinPath(p, "/Sensors"))
|
||||
add(joinPath(p, "/NetworkAdapters"))
|
||||
add(joinPath(p, "/PCIeDevices"))
|
||||
add(joinPath(p, "/Drives"))
|
||||
}
|
||||
for _, p := range managerPaths {
|
||||
add(p)
|
||||
add(joinPath(p, "/NetworkProtocol"))
|
||||
}
|
||||
add("/redfish/v1/UpdateService")
|
||||
add("/redfish/v1/UpdateService/FirmwareInventory")
|
||||
return out
|
||||
}
|
||||
|
||||
func redfishFetchErrorListToMap(list []map[string]interface{}) map[string]string {
|
||||
out := make(map[string]string, len(list))
|
||||
for _, item := range list {
|
||||
p := normalizeRedfishPath(asString(item["path"]))
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
out[p] = asString(item["error"])
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func redfishFetchErrorMapToList(m map[string]string) []map[string]interface{} {
|
||||
if len(m) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]map[string]interface{}, 0, len(m))
|
||||
for p, msg := range m {
|
||||
out = append(out, map[string]interface{}{"path": p, "error": msg})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return asString(out[i]["path"]) < asString(out[j]["path"])
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
// isRetryableRedfishFetchError classifies a fetch error as transient based on
// its lower-cased message text.
//
// An error is retryable when the message contains a timeout/deadline,
// connection-reset or unexpected-EOF marker anywhere, or when it starts with
// "status 500 ", "status 502 ", "status 503 " or "status 504 " (note the
// trailing space). A nil error is never retryable.
func isRetryableRedfishFetchError(err error) bool {
	if err == nil {
		return false
	}
	text := strings.ToLower(err.Error())

	for _, marker := range []string{"timeout", "deadline exceeded", "connection reset", "unexpected eof"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	for _, prefix := range []string{"status 500 ", "status 502 ", "status 503 ", "status 504 "} {
		if strings.HasPrefix(text, prefix) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// redfishCriticalRequestTimeout returns the HTTP timeout used for critical
// endpoint requests. It reads LOGPILE_REDFISH_CRITICAL_TIMEOUT (a
// time.ParseDuration string) and falls back to 45 seconds when the variable is
// unset, blank, unparsable, or not strictly positive.
func redfishCriticalRequestTimeout() time.Duration {
	const fallback = 45 * time.Second

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_TIMEOUT"))
	if raw == "" {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil || parsed <= 0 {
		return fallback
	}
	return parsed
}
|
||||
|
||||
// redfishCriticalRetryAttempts returns how many attempts the sequential
// critical-endpoint fetch makes per request. It reads
// LOGPILE_REDFISH_CRITICAL_RETRIES and accepts integers in [1, 10]; anything
// else (unset, blank, unparsable, out of range) yields the default of 3.
func redfishCriticalRetryAttempts() int {
	const fallback = 3

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_RETRIES"))
	if raw == "" {
		return fallback
	}
	count, err := strconv.Atoi(raw)
	if err != nil || count < 1 || count > 10 {
		return fallback
	}
	return count
}
|
||||
|
||||
// redfishCriticalPlanBAttempts returns how many attempts each plan-B recovery
// request makes. It reads LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES and accepts
// integers in [1, 10]; anything else (unset, blank, unparsable, out of range)
// yields the default of 3.
func redfishCriticalPlanBAttempts() int {
	const fallback = 3

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_PLANB_RETRIES"))
	if raw == "" {
		return fallback
	}
	count, err := strconv.Atoi(raw)
	if err != nil || count < 1 || count > 10 {
		return fallback
	}
	return count
}
|
||||
|
||||
// redfishCriticalRetryBackoff returns the base delay handed to
// getJSONWithRetry for critical requests. It reads
// LOGPILE_REDFISH_CRITICAL_BACKOFF (a time.ParseDuration string); zero is
// accepted, while negative, blank or unparsable values fall back to 1.5s.
func redfishCriticalRetryBackoff() time.Duration {
	const fallback = 1500 * time.Millisecond

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_BACKOFF"))
	if raw == "" {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil || parsed < 0 {
		return fallback
	}
	return parsed
}
|
||||
|
||||
// redfishCriticalCooldown returns how long plan-B recovery waits before it
// starts re-fetching critical endpoints. It reads
// LOGPILE_REDFISH_CRITICAL_COOLDOWN (a time.ParseDuration string); zero is
// accepted, while negative, blank or unparsable values fall back to 4s.
func redfishCriticalCooldown() time.Duration {
	const fallback = 4 * time.Second

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_COOLDOWN"))
	if raw == "" {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil || parsed < 0 {
		return fallback
	}
	return parsed
}
|
||||
|
||||
// redfishCriticalSlowGap returns the pause inserted between consecutive slow
// sequential requests (plan-B targets and direct child probes). It reads
// LOGPILE_REDFISH_CRITICAL_SLOW_GAP (a time.ParseDuration string); zero is
// accepted, while negative, blank or unparsable values fall back to 1.2s.
func redfishCriticalSlowGap() time.Duration {
	const fallback = 1200 * time.Millisecond

	raw := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_CRITICAL_SLOW_GAP"))
	if raw == "" {
		return fallback
	}
	parsed, err := time.ParseDuration(raw)
	if err != nil || parsed < 0 {
		return fallback
	}
	return parsed
}
|
||||
|
||||
func redfishLinkRefs(doc map[string]interface{}, topKey, nestedKey string) []string {
|
||||
top, ok := doc[topKey].(map[string]interface{})
|
||||
if !ok {
|
||||
@@ -870,6 +1091,36 @@ func redfishLinkRefs(doc map[string]interface{}, topKey, nestedKey string) []str
|
||||
return out
|
||||
}
|
||||
|
||||
func pcieDeviceDedupKey(dev models.PCIeDevice) string {
|
||||
if bdf := strings.TrimSpace(dev.BDF); looksLikeCanonicalBDF(bdf) {
|
||||
return strings.ToLower(bdf)
|
||||
}
|
||||
if s := strings.TrimSpace(dev.SerialNumber); s != "" {
|
||||
return s
|
||||
}
|
||||
return firstNonEmpty(
|
||||
strings.TrimSpace(dev.Slot)+"|"+strings.TrimSpace(dev.PartNumber)+"|"+strings.TrimSpace(dev.DeviceClass),
|
||||
strings.TrimSpace(dev.Slot)+"|"+strings.TrimSpace(dev.DeviceClass),
|
||||
strings.TrimSpace(dev.PartNumber)+"|"+strings.TrimSpace(dev.DeviceClass),
|
||||
strings.TrimSpace(dev.Description)+"|"+strings.TrimSpace(dev.DeviceClass),
|
||||
)
|
||||
}
|
||||
|
||||
// looksLikeCanonicalBDF reports whether bdf is shaped like a PCI
// bus/device/function address, with or without a leading domain:
//
//	0000:65:00.0   (domain:bus:device.function)
//	65:00.0        (bus:device.function)
//
// Matching is case-insensitive and ignores surrounding whitespace. Each field
// must be non-empty hexadecimal: domain up to 8 digits, bus and device up to 2
// digits each, function up to 2 digits. The earlier check only counted ':' and
// looked for a '.', so placeholder strings such as "::." or "slot 3: rev 1.0"
// were accepted as addresses.
func looksLikeCanonicalBDF(bdf string) bool {
	bdf = strings.TrimSpace(strings.ToLower(bdf))
	if bdf == "" {
		return false
	}

	// isHex reports whether s is 1..maxLen lower-case hex digits.
	isHex := func(s string, maxLen int) bool {
		if s == "" || len(s) > maxLen {
			return false
		}
		for i := 0; i < len(s); i++ {
			ch := s[i]
			if (ch < '0' || ch > '9') && (ch < 'a' || ch > 'f') {
				return false
			}
		}
		return true
	}

	location, function, hasDot := strings.Cut(bdf, ".")
	if !hasDot || !isHex(function, 2) {
		return false
	}

	fields := strings.Split(location, ":")
	switch len(fields) {
	case 3: // domain:bus:device
		return isHex(fields[0], 8) && isHex(fields[1], 2) && isHex(fields[2], 2)
	case 2: // bus:device
		return isHex(fields[0], 2) && isHex(fields[1], 2)
	default:
		return false
	}
}
|
||||
|
||||
func shouldCrawlPath(path string) bool {
|
||||
if path == "" {
|
||||
return false
|
||||
@@ -1013,6 +1264,163 @@ func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) getJSONWithRetry(ctx context.Context, client *http.Client, req Request, baseURL, requestPath string, attempts int, backoff time.Duration) (map[string]interface{}, error) {
|
||||
if attempts < 1 {
|
||||
attempts = 1
|
||||
}
|
||||
var lastErr error
|
||||
for i := 0; i < attempts; i++ {
|
||||
doc, err := c.getJSON(ctx, client, req, baseURL, requestPath)
|
||||
if err == nil {
|
||||
return doc, nil
|
||||
}
|
||||
lastErr = err
|
||||
if i == attempts-1 || !isRetryableRedfishFetchError(err) {
|
||||
break
|
||||
}
|
||||
if backoff > 0 {
|
||||
select {
|
||||
case <-time.After(backoff * time.Duration(i+1)):
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) collectCriticalRedfishDocsSequential(ctx context.Context, client *http.Client, req Request, baseURL string, paths []string) (map[string]interface{}, map[string]string) {
|
||||
docs := make(map[string]interface{})
|
||||
errs := make(map[string]string)
|
||||
for _, p := range paths {
|
||||
doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, p, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff())
|
||||
if err != nil {
|
||||
errs[p] = err.Error()
|
||||
continue
|
||||
}
|
||||
docs[p] = doc
|
||||
// For critical collections, eagerly fetch members sequentially with the same slow policy.
|
||||
if members, ok := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, p, doc); ok {
|
||||
for mp, md := range members {
|
||||
docs[mp] = md
|
||||
}
|
||||
}
|
||||
}
|
||||
return docs, errs
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) collectCriticalCollectionMembersSequential(ctx context.Context, client *http.Client, req Request, baseURL, collectionPath string, collectionDoc map[string]interface{}) (map[string]interface{}, bool) {
|
||||
refs, ok := collectionDoc["Members"].([]interface{})
|
||||
if !ok || len(refs) == 0 {
|
||||
return nil, false
|
||||
}
|
||||
out := make(map[string]interface{})
|
||||
for _, refAny := range refs {
|
||||
ref, ok := refAny.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
memberPath := normalizeRedfishPath(asString(ref["@odata.id"]))
|
||||
if memberPath == "" {
|
||||
continue
|
||||
}
|
||||
doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, memberPath, redfishCriticalRetryAttempts(), redfishCriticalRetryBackoff())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[memberPath] = doc
|
||||
}
|
||||
return out, true
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, client *http.Client, req Request, baseURL string, criticalPaths []string, rawTree map[string]interface{}, fetchErrs map[string]string, emit ProgressFn) int {
|
||||
var targets []string
|
||||
for _, p := range criticalPaths {
|
||||
p = normalizeRedfishPath(p)
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := rawTree[p]; ok {
|
||||
continue
|
||||
}
|
||||
errMsg, hasErr := fetchErrs[p]
|
||||
if !hasErr || !isRetryableRedfishFetchError(fmt.Errorf("%s", errMsg)) {
|
||||
continue
|
||||
}
|
||||
targets = append(targets, p)
|
||||
}
|
||||
if len(targets) == 0 {
|
||||
return 0
|
||||
}
|
||||
if emit != nil {
|
||||
emit(Progress{Status: "running", Progress: 97, Message: "Redfish: cooldown перед повторным добором критичных endpoint..."})
|
||||
}
|
||||
select {
|
||||
case <-time.After(redfishCriticalCooldown()):
|
||||
case <-ctx.Done():
|
||||
return 0
|
||||
}
|
||||
|
||||
recovered := 0
|
||||
for i, p := range targets {
|
||||
if emit != nil {
|
||||
emit(Progress{
|
||||
Status: "running",
|
||||
Progress: 97,
|
||||
Message: fmt.Sprintf("Redfish: plan-B (%d/%d) %s", i+1, len(targets), compactProgressPath(p)),
|
||||
})
|
||||
}
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-time.After(redfishCriticalSlowGap()):
|
||||
case <-ctx.Done():
|
||||
return recovered
|
||||
}
|
||||
}
|
||||
doc, err := c.getJSONWithRetry(ctx, client, req, baseURL, p, redfishCriticalPlanBAttempts(), redfishCriticalRetryBackoff())
|
||||
if err == nil {
|
||||
rawTree[p] = doc
|
||||
delete(fetchErrs, p)
|
||||
recovered++
|
||||
if members, ok := c.collectCriticalCollectionMembersSequential(ctx, client, req, baseURL, p, doc); ok {
|
||||
for mp, md := range members {
|
||||
if _, exists := rawTree[mp]; !exists {
|
||||
rawTree[mp] = md
|
||||
recovered++
|
||||
}
|
||||
}
|
||||
}
|
||||
if shouldSlowProbeCriticalCollection(p) {
|
||||
if children := c.probeDirectRedfishCollectionChildrenSlow(ctx, client, req, baseURL, p); len(children) > 0 {
|
||||
for cp, cd := range children {
|
||||
if _, exists := rawTree[cp]; exists {
|
||||
continue
|
||||
}
|
||||
rawTree[cp] = cd
|
||||
recovered++
|
||||
}
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fetchErrs[p] = err.Error()
|
||||
// If collection endpoint times out, still try direct child probing for common numeric paths.
|
||||
if shouldSlowProbeCriticalCollection(p) {
|
||||
if children := c.probeDirectRedfishCollectionChildrenSlow(ctx, client, req, baseURL, p); len(children) > 0 {
|
||||
for cp, cd := range children {
|
||||
if _, exists := rawTree[cp]; exists {
|
||||
continue
|
||||
}
|
||||
rawTree[cp] = cd
|
||||
recovered++
|
||||
}
|
||||
delete(fetchErrs, p)
|
||||
}
|
||||
}
|
||||
}
|
||||
return recovered
|
||||
}
|
||||
|
||||
func parseBoardInfo(system map[string]interface{}) models.BoardInfo {
|
||||
return models.BoardInfo{
|
||||
Manufacturer: asString(system["Manufacturer"]),
|
||||
|
||||
@@ -534,7 +534,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st
|
||||
for _, doc := range memberDocs {
|
||||
functionDocs := r.getLinkedPCIeFunctions(doc)
|
||||
dev := parsePCIeDevice(doc, functionDocs)
|
||||
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
|
||||
key := pcieDeviceDedupKey(dev)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
@@ -552,7 +552,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st
|
||||
}
|
||||
for idx, fn := range functionDocs {
|
||||
dev := parsePCIeFunction(fn, idx+1)
|
||||
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
|
||||
key := pcieDeviceDedupKey(dev)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user