Fix Redfish snapshot crawl deadlock and add debug progress

This commit is contained in:
Mikhail Chusavitin
2026-02-24 16:22:37 +03:00
parent 38cc051f23
commit 5d9e9d73de

View File

@@ -6,8 +6,10 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"log"
"net/http" "net/http"
"net/url" "net/url"
"os"
"path" "path"
"sort" "sort"
"strconv" "strconv"
@@ -22,11 +24,23 @@ import (
type RedfishConnector struct { type RedfishConnector struct {
timeout time.Duration timeout time.Duration
debug bool
debugSnapshot bool
} }
func NewRedfishConnector() *RedfishConnector { func NewRedfishConnector() *RedfishConnector {
debug := false
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_DEBUG")); v != "" && v != "0" && !strings.EqualFold(v, "false") {
debug = true
}
debugSnapshot := false
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_SNAPSHOT_DEBUG")); v != "" && v != "0" && !strings.EqualFold(v, "false") {
debugSnapshot = true
}
return &RedfishConnector{ return &RedfishConnector{
timeout: 10 * time.Second, timeout: 10 * time.Second,
debug: debug,
debugSnapshot: debugSnapshot || debug,
} }
} }
@@ -34,6 +48,20 @@ func (c *RedfishConnector) Protocol() string {
return "redfish" return "redfish"
} }
// debugf writes a general connector debug line to the standard logger,
// prefixed with "redfish-debug: ". It does nothing unless debug logging
// was enabled when the connector was constructed (LOGPILE_REDFISH_DEBUG).
func (c *RedfishConnector) debugf(format string, args ...interface{}) {
	if c.debug {
		log.Printf("redfish-debug: "+format, args...)
	}
}
// debugSnapshotf writes a snapshot-crawl debug line to the standard logger,
// prefixed with "redfish-snapshot-debug: ". It does nothing unless snapshot
// debugging was enabled at construction time (LOGPILE_REDFISH_SNAPSHOT_DEBUG,
// or implied by LOGPILE_REDFISH_DEBUG).
func (c *RedfishConnector) debugSnapshotf(format string, args ...interface{}) {
	if c.debugSnapshot {
		log.Printf("redfish-snapshot-debug: "+format, args...)
	}
}
func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit ProgressFn) (*models.AnalysisResult, error) { func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit ProgressFn) (*models.AnalysisResult, error) {
baseURL, err := c.baseURL(req) baseURL, err := c.baseURL(req)
if err != nil { if err != nil {
@@ -84,7 +112,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
if emit != nil { if emit != nil {
emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot..."}) emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot..."})
} }
c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port)
rawTree := c.collectRawRedfishTree(ctx, client, req, baseURL, emit) rawTree := c.collectRawRedfishTree(ctx, client, req, baseURL, emit)
c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree))
result := &models.AnalysisResult{ result := &models.AnalysisResult{
Events: make([]models.Event, 0), Events: make([]models.Event, 0),
@@ -422,14 +452,19 @@ func (c *RedfishConnector) discoverMemberPaths(ctx context.Context, client *http
func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, emit ProgressFn) map[string]interface{} { func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, emit ProgressFn) map[string]interface{} {
const maxDocuments = 1200 const maxDocuments = 1200
const workers = 6 const workers = 6
const heartbeatInterval = 5 * time.Second
out := make(map[string]interface{}, maxDocuments) out := make(map[string]interface{}, maxDocuments)
seen := make(map[string]struct{}, maxDocuments) seen := make(map[string]struct{}, maxDocuments)
rootCounts := make(map[string]int) rootCounts := make(map[string]int)
var mu sync.Mutex var mu sync.Mutex
var processed int32 var processed int32
var lastPath atomic.Value
jobs := make(chan string, 256) // Workers enqueue newly discovered links into the same queue they consume.
// The queue capacity must be at least the crawl cap to avoid producer/consumer
// deadlock when several workers discover many links at once.
jobs := make(chan string, maxDocuments)
var wg sync.WaitGroup var wg sync.WaitGroup
enqueue := func(path string) { enqueue := func(path string) {
@@ -453,9 +488,49 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
} }
enqueue("/redfish/v1") enqueue("/redfish/v1")
for i := 0; i < workers; i++ { c.debugSnapshotf("snapshot queue initialized workers=%d max_documents=%d", workers, maxDocuments)
stopHeartbeat := make(chan struct{})
if emit != nil {
go func() { go func() {
ticker := time.NewTicker(heartbeatInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
n := atomic.LoadInt32(&processed)
mu.Lock()
countsCopy := make(map[string]int, len(rootCounts))
for k, v := range rootCounts {
countsCopy[k] = v
}
seenN := len(seen)
outN := len(out)
mu.Unlock()
roots := topRoots(countsCopy, 2)
last := "/redfish/v1"
if v := lastPath.Load(); v != nil {
if s, ok := v.(string); ok && s != "" {
last = s
}
}
emit(Progress{
Status: "running",
Progress: 92 + int(minInt32(n/200, 6)),
Message: fmt.Sprintf("Redfish snapshot: heartbeat документов=%d (ok=%d, seen=%d), корни=%s, последний=%s", n, outN, seenN, strings.Join(roots, ", "), compactProgressPath(last)),
})
case <-stopHeartbeat:
return
case <-ctx.Done():
return
}
}
}()
}
for i := 0; i < workers; i++ {
go func(workerID int) {
for current := range jobs { for current := range jobs {
lastPath.Store(current)
c.debugSnapshotf("worker=%d fetch start path=%s queue_len=%d", workerID, current, len(jobs))
doc, err := c.getJSON(ctx, client, req, baseURL, current) doc, err := c.getJSON(ctx, client, req, baseURL, current)
if err == nil { if err == nil {
mu.Lock() mu.Lock()
@@ -467,8 +542,17 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
enqueue(ref) enqueue(ref)
} }
} }
n := atomic.AddInt32(&processed, 1) n := atomic.AddInt32(&processed, 1)
if err != nil {
c.debugSnapshotf("worker=%d fetch error path=%s err=%v", workerID, current, err)
if emit != nil {
emit(Progress{
Status: "running",
Progress: 92 + int(minInt32(n/200, 6)),
Message: fmt.Sprintf("Redfish snapshot: ошибка на %s", compactProgressPath(current)),
})
}
}
if emit != nil && n%40 == 0 { if emit != nil && n%40 == 0 {
mu.Lock() mu.Lock()
countsCopy := make(map[string]int, len(rootCounts)) countsCopy := make(map[string]int, len(rootCounts))
@@ -477,19 +561,33 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
} }
mu.Unlock() mu.Unlock()
roots := topRoots(countsCopy, 2) roots := topRoots(countsCopy, 2)
last := current
if v := lastPath.Load(); v != nil {
if s, ok := v.(string); ok && s != "" {
last = s
}
}
emit(Progress{ emit(Progress{
Status: "running", Status: "running",
Progress: 92 + int(minInt32(n/200, 6)), Progress: 92 + int(minInt32(n/200, 6)),
Message: fmt.Sprintf("Redfish snapshot: документов=%d, корни=%s", n, strings.Join(roots, ", ")), Message: fmt.Sprintf("Redfish snapshot: документов=%d, корни=%s, последний=%s", n, strings.Join(roots, ", "), compactProgressPath(last)),
}) })
} }
if n%20 == 0 || err != nil {
mu.Lock()
seenN := len(seen)
outN := len(out)
mu.Unlock()
c.debugSnapshotf("snapshot progress processed=%d stored=%d seen=%d queue_len=%d", n, outN, seenN, len(jobs))
}
wg.Done() wg.Done()
} }
}() }(i + 1)
} }
wg.Wait() wg.Wait()
close(stopHeartbeat)
close(jobs) close(jobs)
if emit != nil { if emit != nil {
@@ -591,6 +689,7 @@ func (c *RedfishConnector) getCollectionMembers(ctx context.Context, client *htt
} }
func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req Request, baseURL, requestPath string) (map[string]interface{}, error) { func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req Request, baseURL, requestPath string) (map[string]interface{}, error) {
start := time.Now()
rel := requestPath rel := requestPath
if rel == "" { if rel == "" {
rel = "/" rel = "/"
@@ -621,21 +720,26 @@ func (c *RedfishConnector) getJSON(ctx context.Context, client *http.Client, req
resp, err := client.Do(httpReq) resp, err := client.Do(httpReq)
if err != nil { if err != nil {
c.debugf("http get path=%s error=%v dur=%s", requestPath, err, time.Since(start).Round(time.Millisecond))
return nil, err return nil, err
} }
defer resp.Body.Close() defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 { if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512)) body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return nil, fmt.Errorf("status %d from %s: %s", resp.StatusCode, requestPath, strings.TrimSpace(string(body))) err := fmt.Errorf("status %d from %s: %s", resp.StatusCode, requestPath, strings.TrimSpace(string(body)))
c.debugf("http get path=%s status=%d dur=%s", requestPath, resp.StatusCode, time.Since(start).Round(time.Millisecond))
return nil, err
} }
var doc map[string]interface{} var doc map[string]interface{}
dec := json.NewDecoder(resp.Body) dec := json.NewDecoder(resp.Body)
dec.UseNumber() dec.UseNumber()
if err := dec.Decode(&doc); err != nil { if err := dec.Decode(&doc); err != nil {
c.debugf("http get path=%s decode_error=%v dur=%s", requestPath, err, time.Since(start).Round(time.Millisecond))
return nil, err return nil, err
} }
c.debugf("http get path=%s status=%d dur=%s", requestPath, resp.StatusCode, time.Since(start).Round(time.Millisecond))
return doc, nil return doc, nil
} }
@@ -1340,6 +1444,14 @@ func topRoots(counts map[string]int, limit int) []string {
return out return out
} }
// compactProgressPath shortens a Redfish resource path for use in progress
// messages. Paths of at most maxLen bytes pass through unchanged; longer ones
// keep their tail (the most specific segments) behind a "..." marker so the
// result is exactly maxLen bytes long.
func compactProgressPath(p string) string {
	const maxLen = 72
	if len(p) > maxLen {
		// Reserve 3 bytes for the ellipsis marker, keep the rest from the end.
		tail := p[len(p)-(maxLen-3):]
		return "..." + tail
	}
	return p
}
func minInt32(a, b int32) int32 { func minInt32(a, b int32) int32 {
if a < b { if a < b {
return a return a