From a4a1a19a94289972bb8316417df850afb4ca9992 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 25 Feb 2026 12:16:31 +0300 Subject: [PATCH] Improve Redfish raw replay recovery and GUI diagnostics --- internal/collector/redfish.go | 57 ++++++++++++------ internal/collector/redfish_replay.go | 70 +++++++++++++++++++--- internal/collector/redfish_test.go | 86 ++++++++++++++++++++++++---- internal/server/handlers.go | 5 ++ web/static/js/app.js | 52 +++++++++++++++++ 5 files changed, 231 insertions(+), 39 deletions(-) diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index f94a2d9..5f7f7c2 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -23,8 +23,8 @@ import ( ) type RedfishConnector struct { - timeout time.Duration - debug bool + timeout time.Duration + debug bool debugSnapshot bool } @@ -88,15 +88,19 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre emit(Progress{Status: "running", Progress: 90, Message: "Redfish: сбор расширенного snapshot..."}) } c.debugSnapshotf("snapshot crawl start host=%s port=%d", req.Host, req.Port) - rawTree := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit) + rawTree, fetchErrors := c.collectRawRedfishTree(ctx, client, req, baseURL, redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths), emit) c.debugSnapshotf("snapshot crawl done docs=%d", len(rawTree)) if emit != nil { emit(Progress{Status: "running", Progress: 99, Message: "Redfish: анализ raw snapshot..."}) } - // Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree. - return ReplayRedfishFromRawPayloads(map[string]any{ + rawPayloads := map[string]any{ "redfish_tree": rawTree, - }, nil) + } + if len(fetchErrors) > 0 { + rawPayloads["redfish_fetch_errors"] = fetchErrors + } + // Unified tunnel: live collection and raw import go through the same analyzer over redfish_tree. + return ReplayRedfishFromRawPayloads(rawPayloads, nil) } func (c *RedfishConnector) httpClient(req Request) *http.Client { @@ -444,7 +448,7 @@ func (c *RedfishConnector) collectPCIeDevices(ctx context.Context, client *http. for _, doc := range memberDocs { functionDocs := c.getLinkedPCIeFunctions(ctx, client, req, baseURL, doc) dev := parsePCIeDevice(doc, functionDocs) - key := firstNonEmpty(dev.SerialNumber, dev.BDF, dev.Slot+"|"+dev.DeviceClass) + key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) if key == "" { continue } @@ -506,12 +510,13 @@ func (c *RedfishConnector) discoverMemberPaths(ctx context.Context, client *http return nil } -func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) map[string]interface{} { +func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *http.Client, req Request, baseURL string, seedPaths []string, emit ProgressFn) (map[string]interface{}, []map[string]interface{}) { maxDocuments := redfishSnapshotMaxDocuments() const workers = 6 const heartbeatInterval = 5 * time.Second out := make(map[string]interface{}, maxDocuments) + fetchErrors := make(map[string]string) seen := make(map[string]struct{}, maxDocuments) rootCounts := make(map[string]int) var mu sync.Mutex @@ -602,15 +607,20 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht enqueue(ref) } } - n := atomic.AddInt32(&processed, 1) - if err != nil { - c.debugSnapshotf("worker=%d fetch error path=%s err=%v", workerID, current, err) - if emit != nil && shouldReportSnapshotFetchError(err) { - emit(Progress{ - Status: "running", - Progress: 92 + int(minInt32(n/200, 6)), - Message: fmt.Sprintf("Redfish snapshot: ошибка на %s", compactProgressPath(current)), - }) + n := atomic.AddInt32(&processed, 1) + if err != nil { + mu.Lock() + if _, ok := fetchErrors[current]; !ok { + fetchErrors[current] = err.Error() + } + mu.Unlock() + c.debugSnapshotf("worker=%d fetch error path=%s err=%v", workerID, current, err) + if emit != nil && shouldReportSnapshotFetchError(err) { + emit(Progress{ + Status: "running", + Progress: 92 + int(minInt32(n/200, 6)), + Message: fmt.Sprintf("Redfish snapshot: ошибка на %s", compactProgressPath(current)), + }) } } if emit != nil && n%40 == 0 { @@ -677,7 +687,18 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht }) } - return out + errorList := make([]map[string]interface{}, 0, len(fetchErrors)) + for p, msg := range fetchErrors { + errorList = append(errorList, map[string]interface{}{ + "path": p, + "error": msg, + }) + } + sort.Slice(errorList, func(i, j int) bool { + return asString(errorList[i]["path"]) < asString(errorList[j]["path"]) + }) + + return out, errorList } func (c *RedfishConnector) probeSupermicroNVMeDiskBays(ctx context.Context, client *http.Client, req Request, baseURL, backplanePath string) []map[string]interface{} { diff --git a/internal/collector/redfish_replay.go b/internal/collector/redfish_replay.go index b8f7b0e..a3144b2 100644 --- a/internal/collector/redfish_replay.go +++ b/internal/collector/redfish_replay.go @@ -2,6 +2,8 @@ package collector import ( "fmt" + "sort" + "strings" "git.mchus.pro/mchus/logpile/internal/models" ) @@ -64,12 +66,10 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) ( networkProtocolDoc, _ := r.getJSON(joinPath(primaryManager, "/NetworkProtocol")) result := &models.AnalysisResult{ - Events: make([]models.Event, 0), - FRU: make([]models.FRUInfo, 0), - Sensors: make([]models.SensorReading, 0), - RawPayloads: map[string]any{ - "redfish_tree": tree, - }, + Events: make([]models.Event, 0), + FRU: make([]models.FRUInfo, 0), + Sensors: make([]models.SensorReading, 0), + RawPayloads: cloneRawPayloads(rawPayloads), Hardware: &models.HardwareConfig{ BoardInfo: parseBoardInfo(systemDoc), CPUs: parseCPUs(processors), @@ -115,11 +115,11 @@ func (r redfishSnapshotReader) getJSON(requestPath string) (map[string]interface func (r redfishSnapshotReader) getCollectionMembers(collectionPath string) ([]map[string]interface{}, error) { collection, err := r.getJSON(collectionPath) if err != nil { - return nil, err + return r.fallbackCollectionMembers(collectionPath, err) } refs, ok := collection["Members"].([]interface{}) if !ok || len(refs) == 0 { - return []map[string]interface{}{}, nil + return r.fallbackCollectionMembers(collectionPath, nil) } out := make([]map[string]interface{}, 0, len(refs)) for _, refAny := range refs { @@ -137,9 +137,61 @@ func (r redfishSnapshotReader) getCollectionMembers(collectionPath string) ([]ma } out = append(out, doc) } + if len(out) == 0 { + return r.fallbackCollectionMembers(collectionPath, nil) + } return out, nil } +func (r redfishSnapshotReader) fallbackCollectionMembers(collectionPath string, originalErr error) ([]map[string]interface{}, error) { + prefix := strings.TrimSuffix(normalizeRedfishPath(collectionPath), "/") + "/" + if prefix == "/" { + if originalErr != nil { + return nil, originalErr + } + return []map[string]interface{}{}, nil + } + paths := make([]string, 0) + for key := range r.tree { + p := normalizeRedfishPath(key) + if !strings.HasPrefix(p, prefix) { + continue + } + rest := strings.TrimPrefix(p, prefix) + if rest == "" || strings.Contains(rest, "/") { + continue + } + paths = append(paths, p) + } + if len(paths) == 0 { + if originalErr != nil { + return nil, originalErr + } + return []map[string]interface{}{}, nil + } + sort.Strings(paths) + out := make([]map[string]interface{}, 0, len(paths)) + for _, p := range paths { + doc, err := r.getJSON(p) + if err != nil { + continue + } + out = append(out, doc) + } + return out, nil +} + +func cloneRawPayloads(src map[string]any) map[string]any { + if len(src) == 0 { + return nil + } + dst := make(map[string]any, len(src)) + for k, v := range src { + dst[k] = v + } + return dst +} + func (r redfishSnapshotReader) discoverMemberPaths(collectionPath, fallbackPath string) []string { collection, err := r.getJSON(collectionPath) if err == nil { @@ -482,7 +534,7 @@ func (r redfishSnapshotReader) collectPCIeDevices(systemPaths, chassisPaths []st for _, doc := range memberDocs { functionDocs := r.getLinkedPCIeFunctions(doc) dev := parsePCIeDevice(doc, functionDocs) - key := firstNonEmpty(dev.SerialNumber, dev.BDF, dev.Slot+"|"+dev.DeviceClass) + key := firstNonEmpty(dev.BDF, dev.SerialNumber, dev.Slot+"|"+dev.DeviceClass) if key == "" { continue } diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index 7d526ce..dfb858e 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -44,9 +44,9 @@ func TestRedfishConnectorCollect(t *testing.T) { }, }) register("/redfish/v1/Systems/1/Memory/DIMM1", map[string]interface{}{ - "Name": "DIMM A1", - "CapacityMiB": 32768, - "MemoryDeviceType": "DDR5", + "Name": "DIMM A1", + "CapacityMiB": 32768, + "MemoryDeviceType": "DDR5", "OperatingSpeedMhz": 4800, "Status": map[string]interface{}{ "Health": "OK", @@ -91,14 +91,14 @@ func TestRedfishConnectorCollect(t *testing.T) { }, }) register("/redfish/v1/Systems/1/PCIeFunctions/GPU1F0", map[string]interface{}{ - "FunctionId": "0000:65:00.0", - "VendorId": "0x10DE", - "DeviceId": "0x2331", - "ClassCode": "0x030200", - "CurrentLinkWidth": 16, - "CurrentLinkSpeed": "16.0 GT/s", - "MaxLinkWidth": 16, - "MaxLinkSpeed": "16.0 GT/s", + "FunctionId": "0000:65:00.0", + "VendorId": "0x10DE", + "DeviceId": "0x2331", + "ClassCode": "0x030200", + "CurrentLinkWidth": 16, + "CurrentLinkSpeed": "16.0 GT/s", + "MaxLinkWidth": 16, + "MaxLinkSpeed": "16.0 GT/s", }) register("/redfish/v1/Chassis/1/NetworkAdapters", map[string]interface{}{ "Members": []map[string]string{ @@ -239,6 +239,68 @@ func TestParsePCIeDeviceSlot_EmptyMapFallsBackToID(t *testing.T) { } } +func TestReplayRedfishFromRawPayloads_FallbackCollectionMembersByPrefix(t *testing.T) { + raw := map[string]any{ + "redfish_tree": map[string]interface{}{ + "/redfish/v1": map[string]interface{}{ + "Systems": map[string]interface{}{"@odata.id": "/redfish/v1/Systems"}, + "Chassis": map[string]interface{}{"@odata.id": "/redfish/v1/Chassis"}, + "Managers": map[string]interface{}{"@odata.id": "/redfish/v1/Managers"}, + }, + "/redfish/v1/Systems": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/1"}, + }, + }, + "/redfish/v1/Systems/1": map[string]interface{}{ + "Manufacturer": "Supermicro", + "Model": "SYS-TEST", + "SerialNumber": "SYS123", + }, + // Intentionally missing /redfish/v1/Systems/1/Processors collection. + "/redfish/v1/Systems/1/Processors/CPU1": map[string]interface{}{ + "Id": "CPU1", + "Model": "Xeon Gold", + "TotalCores": 32, + "TotalThreads": 64, + }, + "/redfish/v1/Chassis": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1"}, + }, + }, + "/redfish/v1/Chassis/1": map[string]interface{}{ + "Id": "1", + }, + "/redfish/v1/Managers": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Managers/1"}, + }, + }, + "/redfish/v1/Managers/1": map[string]interface{}{ + "Id": "1", + }, + }, + "redfish_fetch_errors": []map[string]interface{}{ + {"path": "/redfish/v1/Systems/1/Processors", "error": "status 500"}, + }, + } + + got, err := ReplayRedfishFromRawPayloads(raw, nil) + if err != nil { + t.Fatalf("replay failed: %v", err) + } + if got.Hardware == nil { + t.Fatalf("expected hardware") + } + if len(got.Hardware.CPUs) != 1 { + t.Fatalf("expected one CPU via prefix fallback, got %d", len(got.Hardware.CPUs)) + } + if _, ok := got.RawPayloads["redfish_fetch_errors"]; !ok { + t.Fatalf("expected raw payloads to preserve redfish_fetch_errors") + } +} + func TestEnrichNICFromPCIeFunctions(t *testing.T) { nic := parseNIC(map[string]interface{}{ "Id": "1", @@ -333,7 +395,7 @@ func TestReplayCollectStorage_ProbesSupermicroNVMeDiskBayWhenCollectionEmpty(t * }, "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives": map[string]interface{}{ "Members@odata.count": 0, - "Members": []interface{}{}, + "Members": []interface{}{}, }, "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives/Disk.Bay.0": map[string]interface{}{ "Id": "Disk.Bay.0", diff --git a/internal/server/handlers.go b/internal/server/handlers.go index c9d04ca..8f2cb00 100644 --- a/internal/server/handlers.go +++ b/internal/server/handlers.go @@ -319,6 +319,11 @@ func (s *Server) handleGetConfig(w http.ResponseWriter, r *http.Request) { "target_host": result.TargetHost, "collected_at": result.CollectedAt, } + if result.RawPayloads != nil { + if fetchErrors, ok := result.RawPayloads["redfish_fetch_errors"]; ok { + response["redfish_fetch_errors"] = fetchErrors + } + } if result.Hardware == nil { response["hardware"] = map[string]interface{}{} diff --git a/web/static/js/app.js b/web/static/js/app.js index 2d327ef..12fa434 100644 --- a/web/static/js/app.js +++ b/web/static/js/app.js @@ -646,6 +646,8 @@ function renderConfig(data) { const config = data.hardware || data; const spec = data.specification; + const redfishFetchErrors = Array.isArray(data.redfish_fetch_errors) ? data.redfish_fetch_errors : []; + const visibleRedfishFetchErrors = filterVisibleRedfishFetchErrors(redfishFetchErrors); const devices = Array.isArray(config.devices) ? config.devices : []; const volumes = Array.isArray(config.volumes) ? config.volumes : []; @@ -688,6 +690,32 @@ function renderConfig(data) { // Specification tab html += '
'; + const partialInventory = detectPartialRedfishInventory({ + cpus, + memory, + redfishFetchErrors + }); + if (partialInventory) { + html += `
+

Частичный инвентарь

+

${escapeHtml(partialInventory)}

+
`; + } + if (visibleRedfishFetchErrors.length > 0) { + html += `
+

Redfish fetch errors (${visibleRedfishFetchErrors.length})

+

Сохранено в raw snapshot для последующего анализа в GUI.

+ `; + visibleRedfishFetchErrors.forEach(item => { + const path = item && typeof item === 'object' ? (item.path || '-') : '-'; + const err = item && typeof item === 'object' ? (item.error || '-') : String(item || '-'); + html += ` + + + `; + }); + html += '
EndpointОшибка
${escapeHtml(String(path))}${escapeHtml(String(err))}
'; + } if (spec && spec.length > 0) { html += '

Спецификация сервера

    '; spec.forEach(item => { @@ -1319,6 +1347,30 @@ function escapeHtml(text) { return div.innerHTML; } +function filterVisibleRedfishFetchErrors(items) { + if (!Array.isArray(items)) return []; + return items.filter(item => { + const message = String(item && typeof item === 'object' ? (item.error || '') : item || '').toLowerCase(); + return !( + message.startsWith('status 404 ') || + message.startsWith('status 405 ') || + message.startsWith('status 410 ') || + message.startsWith('status 501 ') + ); + }); +} + +function detectPartialRedfishInventory({ cpus, memory, redfishFetchErrors }) { + const errors = Array.isArray(redfishFetchErrors) ? redfishFetchErrors : []; + const paths = errors.map(item => String(item && typeof item === 'object' ? (item.path || '') : '')).filter(Boolean); + const cpuMissing = (!Array.isArray(cpus) || cpus.length === 0) && paths.some(p => /\/Systems\/[^/]+\/Processors(\/)?$/i.test(p)); + const memMissing = (!Array.isArray(memory) || memory.length === 0) && paths.some(p => /\/Systems\/[^/]+\/Memory(\/)?$/i.test(p)); + if (!cpuMissing && !memMissing) return ''; + if (cpuMissing && memMissing) return 'Не удалось восстановить CPU и Memory: Redfish endpoint\'ы /Processors и /Memory были недоступны во время сбора.'; + if (cpuMissing) return 'CPU-инвентарь неполный: Redfish endpoint /Processors был недоступен во время сбора.'; + return 'Memory-инвентарь неполный: Redfish endpoint /Memory был недоступен во время сбора.'; +} + function calculateCPUToPCIeBalance(inventoryRows, cpus) { const laneByCPU = new Map(); const cpuIndexes = new Set();