Improve Redfish raw replay recovery and GUI diagnostics

This commit is contained in:
Mikhail Chusavitin
2026-02-25 12:16:31 +03:00
parent 66fb90233f
commit a4a1a19a94
5 changed files with 231 additions and 39 deletions

View File

@@ -23,8 +23,8 @@ import (
)
type RedfishConnector struct {
timeout time.Duration
debug bool
timeout time.Duration
debug bool
debugSnapshot bool
}
@@ -88,15 +88,19 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot..."})
}
c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port)
rawTree := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit)
rawTree, fetchErrors := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit)
c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree))
if emit != nil {
emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."})
}
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
return ReplayRedfishFromRawPayloads(map[string]any{
rawPayloads := map[string]any{
"redfish_tree": rawTree,
}, nil)
}
if len(fetchErrors) > 0 {
rawPayloads["redfish_fetch_errors"] = fetchErrors
}
// Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree.
return ReplayRedfishFromRawPayloads(rawPayloads, nil)
}
func (c *RedfishConnector) httpClient(req Request) *http.Client {
@@ -444,7 +448,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http.
for _, doc := range memberDocs {
functionDocs := c.getLinkedPCIeFunctions(ctx, client, req, baseURL, doc)
dev := parsePCIeDevice(doc, functionDocs)
key := firstNonEmpty(dev.SerialNumber, dev.BDF, dev.Slot+"|"+dev.DeviceClass)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
if key == "" {
continue
}
@@ -506,12 +510,13 @@ func (c *RedfishConnector) discoverMemberPaths(ctx context.Context, client *http
return nil
}
func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) map[string]interface{} {
func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) (map[string]interface{}, []map[string]interface{}) {
maxDocuments := redfishSnapshotMaxDocuments()
const workers = 6
const heartbeatInterval = 5 * time.Second
out := make(map[string]interface{}, maxDocuments)
fetchErrors := make(map[string]string)
seen := make(map[string]struct{}, maxDocuments)
rootCounts := make(map[string]int)
var mu sync.Mutex
@@ -602,15 +607,20 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
enqueue(ref)
}
}
n := atomic.AddInt32(&processed, 1)
if err != nil {
c.debugSnapshotf("worker=%d fetch error path=%s err=%v", workerID, current, err)
if emit != nil && shouldReportSnapshotFetchError(err) {
emit(Progress{
Status: "running",
Progress: 92 + int(minInt32(n/200, 6)),
Message: fmt.Sprintf("Redfish snapshot: ошибка на %s", compactProgressPath(current)),
})
n := atomic.AddInt32(&processed, 1)
if err != nil {
mu.Lock()
if _, ok := fetchErrors[current]; !ok {
fetchErrors[current] = err.Error()
}
mu.Unlock()
c.debugSnapshotf("worker=%d fetch error path=%s err=%v", workerID, current, err)
if emit != nil && shouldReportSnapshotFetchError(err) {
emit(Progress{
Status: "running",
Progress: 92 + int(minInt32(n/200, 6)),
Message: fmt.Sprintf("Redfish snapshot: ошибка на %s", compactProgressPath(current)),
})
}
}
if emit != nil && n%40 == 0 {
@@ -677,7 +687,18 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
})
}
return out
errorList := make([]map[string]interface{}, 0, len(fetchErrors))
for p, msg := range fetchErrors {
errorList = append(errorList, map[string]interface{}{
"path": p,
"error": msg,
})
}
sort.Slice(errorList, func(i, j int) bool {
return asString(errorList[i]["path"]) < asString(errorList[j]["path"])
})
return out, errorList
}
func (c *RedfishConnector) probeSupermicroNVMeDiskBays(ctx context.Context, client *http.Client, req Request, baseURL, backplanePath string) []map[string]interface{} {

View File

@@ -2,6 +2,8 @@ package collector
import (
"fmt"
"sort"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
@@ -64,12 +66,10 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
networkProtocolDoc, _ := r.getJSON(joinPath(primaryManager, "/NetworkProtocol"))
result := &models.AnalysisResult{
Events: make([]models.Event, 0),
FRU: make([]models.FRUInfo, 0),
Sensors: make([]models.SensorReading, 0),
RawPayloads: map[string]any{
"redfish_tree": tree,
},
Events: make([]models.Event, 0),
FRU: make([]models.FRUInfo, 0),
Sensors: make([]models.SensorReading, 0),
RawPayloads: cloneRawPayloads(rawPayloads),
Hardware: &models.HardwareConfig{
BoardInfo: parseBoardInfo(systemDoc),
CPUs: parseCPUs(processors),
@@ -115,11 +115,11 @@ func (r redfishSnapshotReader) getJSON(requestPath string) (map[string]interface
func (r redfishSnapshotReader) getCollectionMembers(collectionPath string) ([]map[string]interface{}, error) {
collection, err := r.getJSON(collectionPath)
if err != nil {
return nil, err
return r.fallbackCollectionMembers(collectionPath, err)
}
refs, ok := collection["Members"].([]interface{})
if !ok || len(refs) == 0 {
return []map[string]interface{}{}, nil
return r.fallbackCollectionMembers(collectionPath, nil)
}
out := make([]map[string]interface{}, 0, len(refs))
for _, refAny := range refs {
@@ -137,9 +137,61 @@ func (r redfishSnapshotReader) getCollectionMembers(collectionPath string) ([]ma
}
out = append(out, doc)
}
if len(out) == 0 {
return r.fallbackCollectionMembers(collectionPath, nil)
}
return out, nil
}
func (r redfishSnapshotReader) fallbackCollectionMembers(collectionPath string, originalErr error) ([]map[string]interface{}, error) {
prefix := strings.TrimSuffix(normalizeRedfishPath(collectionPath), "/") + "/"
if prefix == "/" {
if originalErr != nil {
return nil, originalErr
}
return []map[string]interface{}{}, nil
}
paths := make([]string, 0)
for key := range r.tree {
p := normalizeRedfishPath(key)
if !strings.HasPrefix(p, prefix) {
continue
}
rest := strings.TrimPrefix(p, prefix)
if rest == "" || strings.Contains(rest, "/") {
continue
}
paths = append(paths, p)
}
if len(paths) == 0 {
if originalErr != nil {
return nil, originalErr
}
return []map[string]interface{}{}, nil
}
sort.Strings(paths)
out := make([]map[string]interface{}, 0, len(paths))
for _, p := range paths {
doc, err := r.getJSON(p)
if err != nil {
continue
}
out = append(out, doc)
}
return out, nil
}
func cloneRawPayloads(src map[string]any) map[string]any {
if len(src) == 0 {
return nil
}
dst := make(map[string]any, len(src))
for k, v := range src {
dst[k] = v
}
return dst
}
func (r redfishSnapshotReader) discoverMemberPaths(collectionPath, fallbackPath string) []string {
collection, err := r.getJSON(collectionPath)
if err == nil {
@@ -482,7 +534,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st
for _, doc := range memberDocs {
functionDocs := r.getLinkedPCIeFunctions(doc)
dev := parsePCIeDevice(doc, functionDocs)
key := firstNonEmpty(dev.SerialNumber, dev.BDF, dev.Slot+"|"+dev.DeviceClass)
key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass)
if key == "" {
continue
}

View File

@@ -44,9 +44,9 @@ func TestRedfishConnectorCollect(t *testing.T) {
},
})
register("/redfish/v1/Systems/1/Memory/DIMM1", map[string]interface{}{
"Name": "DIMM A1",
"CapacityMiB": 32768,
"MemoryDeviceType": "DDR5",
"Name": "DIMM A1",
"CapacityMiB": 32768,
"MemoryDeviceType": "DDR5",
"OperatingSpeedMhz": 4800,
"Status": map[string]interface{}{
"Health": "OK",
@@ -91,14 +91,14 @@ func TestRedfishConnectorCollect(t *testing.T) {
},
})
register("/redfish/v1/Systems/1/PCIeFunctions/GPU1F0", map[string]interface{}{
"FunctionId": "0000:65:00.0",
"VendorId": "0x10DE",
"DeviceId": "0x2331",
"ClassCode": "0x030200",
"CurrentLinkWidth": 16,
"CurrentLinkSpeed": "16.0 GT/s",
"MaxLinkWidth": 16,
"MaxLinkSpeed": "16.0 GT/s",
"FunctionId": "0000:65:00.0",
"VendorId": "0x10DE",
"DeviceId": "0x2331",
"ClassCode": "0x030200",
"CurrentLinkWidth": 16,
"CurrentLinkSpeed": "16.0 GT/s",
"MaxLinkWidth": 16,
"MaxLinkSpeed": "16.0 GT/s",
})
register("/redfish/v1/Chassis/1/NetworkAdapters", map[string]interface{}{
"Members": []map[string]string{
@@ -239,6 +239,68 @@ func TestParsePCIeDeviceSlot_EmptyMapFallsBackToID(t *testing.T) {
}
}
func TestReplayRedfishFromRawPayloads_FallbackCollectionMembersByPrefix(t *testing.T) {
raw := map[string]any{
"redfish_tree": map[string]interface{}{
"/redfish/v1": map[string]interface{}{
"Systems": map[string]interface{}{"@odata.id": "/redfish/v1/Systems"},
"Chassis": map[string]interface{}{"@odata.id": "/redfish/v1/Chassis"},
"Managers": map[string]interface{}{"@odata.id": "/redfish/v1/Managers"},
},
"/redfish/v1/Systems": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1"},
},
},
"/redfish/v1/Systems/1": map[string]interface{}{
"Manufacturer": "Supermicro",
"Model": "SYS-TEST",
"SerialNumber": "SYS123",
},
// Intentionally missing /redfish/v1/Systems/1/Processors collection.
"/redfish/v1/Systems/1/Processors/CPU1": map[string]interface{}{
"Id": "CPU1",
"Model": "Xeon Gold",
"TotalCores": 32,
"TotalThreads": 64,
},
"/redfish/v1/Chassis": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1"},
},
},
"/redfish/v1/Chassis/1": map[string]interface{}{
"Id": "1",
},
"/redfish/v1/Managers": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Managers/1"},
},
},
"/redfish/v1/Managers/1": map[string]interface{}{
"Id": "1",
},
},
"redfish_fetch_errors": []map[string]interface{}{
{"path": "/redfish/v1/Systems/1/Processors", "error": "status 500"},
},
}
got, err := ReplayRedfishFromRawPayloads(raw, nil)
if err != nil {
t.Fatalf("replay failed: %v", err)
}
if got.Hardware == nil {
t.Fatalf("expected hardware")
}
if len(got.Hardware.CPUs) != 1 {
t.Fatalf("expected one CPU via prefix fallback, got %d", len(got.Hardware.CPUs))
}
if _, ok := got.RawPayloads["redfish_fetch_errors"]; !ok {
t.Fatalf("expected raw payloads to preserve redfish_fetch_errors")
}
}
func TestEnrichNICFromPCIeFunctions(t *testing.T) {
nic := parseNIC(map[string]interface{}{
"Id": "1",
@@ -333,7 +395,7 @@ func TestReplayCollectStorage_ProbesSupermicroNVMeDiskBayWhenCollectionEmpty(t *
},
"/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives": map[string]interface{}{
"Members@odata.count": 0,
"Members": []interface{}{},
"Members": []interface{}{},
},
"/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives/Disk.Bay.0": map[string]interface{}{
"Id": "Disk.Bay.0",

View File

@@ -319,6 +319,11 @@ func (s *Server) handleGetConfig(w http.ResponseWriter, r *http.Request) {
"target_host": result.TargetHost,
"collected_at": result.CollectedAt,
}
if result.RawPayloads != nil {
if fetchErrors, ok := result.RawPayloads["redfish_fetch_errors"]; ok {
response["redfish_fetch_errors"] = fetchErrors
}
}
if result.Hardware == nil {
response["hardware"] = map[string]interface{}{}