From f19a3454fa56da93a952fc71b2b1124d0c62d2c2 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Mon, 13 Apr 2026 14:45:41 +0300 Subject: [PATCH] fix(redfish): gate hgx diagnostic plan-b by debug toggle --- bible-local/03-api.md | 1 + bible-local/05-collectors.md | 12 +++++ bible-local/10-decisions.md | 17 +++++++ internal/collector/redfish.go | 49 ++++++++++++++++++-- internal/collector/redfish_planb_test.go | 57 ++++++++++++++++++++++++ web/templates/index.html | 2 +- 6 files changed, 133 insertions(+), 5 deletions(-) create mode 100644 internal/collector/redfish_planb_test.go diff --git a/bible-local/03-api.md b/bible-local/03-api.md index 4a6368d..eaaaac2 100644 --- a/bible-local/03-api.md +++ b/bible-local/03-api.md @@ -58,6 +58,7 @@ Responses: Optional request field: - `power_on_if_host_off`: when `true`, Redfish collection may power on the host before collection if preflight found it powered off +- `debug_payloads`: when `true`, collector keeps extra diagnostic payloads and enables extended plan-B retries for slow HGX component inventory branches (`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`) ### `POST /api/collect/probe` diff --git a/bible-local/05-collectors.md b/bible-local/05-collectors.md index 43ef298..97f5997 100644 --- a/bible-local/05-collectors.md +++ b/bible-local/05-collectors.md @@ -27,6 +27,7 @@ Request fields passed from the server: - credential field (`password` or token) - `tls_mode` - optional `power_on_if_host_off` +- optional `debug_payloads` for extended diagnostics ### Core rule @@ -57,6 +58,17 @@ closes `skipCh` → goroutine in `Collect()` → `cancelCollect()`. The skip button is visible during `running` state and hidden once the job reaches a terminal state. +### Extended diagnostics toggle + +The live collect form exposes a user-facing checkbox for extended diagnostics. + +- default collection prioritizes inventory completeness and bounded runtime +- when extended diagnostics is off, heavy HGX component-chassis critical plan-B retries + (`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`) are skipped +- when extended diagnostics is on, those retries are allowed and extra debug payloads are collected + +This toggle is intended for operator-driven deep diagnostics on problematic hosts, not for the default path. + ### Discovery model The collector does not rely on one fixed vendor tree. diff --git a/bible-local/10-decisions.md b/bible-local/10-decisions.md index e6094d1..eae278b 100644 --- a/bible-local/10-decisions.md +++ b/bible-local/10-decisions.md @@ -1120,3 +1120,20 @@ incomplete for UI and Reanimator consumers. - System firmware such as BIOS and iBMC versions survives xFusion file exports. - xFusion archives participate more reliably in canonical device/export flows without special UI cases. + +--- + +## ADL-043 — Extended HGX diagnostic plan-B is opt-in from the live collect form + +**Date:** 2026-04-13 +**Context:** Some Supermicro HGX Redfish targets expose slow or hanging component-chassis inventory +collections during critical plan-B, especially under `Chassis/HGX_*` for `Assembly`, +`Accelerators`, `Drives`, `NetworkAdapters`, and `PCIeDevices`. Default collection should not +block operators on deep diagnostic retries that are useful mainly for troubleshooting. +**Decision:** Keep the normal snapshot/replay path unchanged, but gate those heavy HGX +component-chassis critical plan-B retries behind the existing live-collect `debug_payloads` flag, +presented in the UI as "Сбор расширенных данных для диагностики". +**Consequences:** +- Default live collection skips those heavy diagnostic plan-B retries and reaches replay faster. +- Operators can explicitly opt into the slower diagnostic path when they need deeper collection. +- The same user-facing toggle continues to enable extra debug payload capture for troubleshooting. diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index 5017842..03d438c 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -496,7 +496,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre return result, nil } - // collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis. // Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle. // Enabled only when Request.DebugPayloads is true. @@ -511,7 +510,6 @@ func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *htt return out } - func firstNonEmptyPath(paths []string, fallback string) string { for _, p := range paths { if strings.TrimSpace(p) != "" { @@ -543,7 +541,6 @@ func redfishSystemPowerState(systemDoc map[string]interface{}) string { return "" } - func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error { body, err := json.Marshal(payload) if err != nil { @@ -2299,7 +2296,6 @@ func redfishCriticalSlowGap() time.Duration { return 1200 * time.Millisecond } - func redfishSnapshotMemoryRequestTimeout() time.Duration { if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" { if d, err := time.ParseDuration(v); err == nil && d > 0 { @@ -2878,11 +2874,16 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, timings := newRedfishPathTimingCollector(4) var targets []string seenTargets := make(map[string]struct{}) + skippedDiagnosticTargets := 0 addTarget := func(path string) { path = normalizeRedfishPath(path) if path == "" { return } + if !shouldIncludeCriticalPlanBPath(req, path) { + skippedDiagnosticTargets++ + return + } if _, ok := seenTargets[path]; ok { return } @@ -2968,6 +2969,13 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, return 0 } if emit != nil { + if skippedDiagnosticTargets > 0 { + emit(Progress{ + Status: "running", + Progress: 97, + Message: fmt.Sprintf("Redfish: расширенная диагностика выключена, пропущено %d тяжелых diagnostic endpoint", skippedDiagnosticTargets), + }) + } totalETA := redfishCriticalCooldown() + estimatePlanBETA(len(targets)) emit(Progress{ Status: "running", @@ -3073,6 +3081,39 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context, return recovered } +func shouldIncludeCriticalPlanBPath(req Request, path string) bool { + if req.DebugPayloads { + return true + } + return !isExtendedDiagnosticCriticalPlanBPath(path) +} + +func isExtendedDiagnosticCriticalPlanBPath(path string) bool { + path = normalizeRedfishPath(path) + if path == "" { + return false + } + parts := strings.Split(strings.Trim(path, "/"), "/") + if len(parts) < 5 || parts[0] != "redfish" || parts[1] != "v1" || parts[2] != "Chassis" { + return false + } + if !strings.HasPrefix(parts[3], "HGX_") { + return false + } + for _, suffix := range []string{ + "/Accelerators", + "/Assembly", + "/Drives", + "/NetworkAdapters", + "/PCIeDevices", + } { + if strings.HasSuffix(path, suffix) { + return true + } + } + return false +} + func (c *RedfishConnector) recoverProfilePlanBDocs(ctx context.Context, client *http.Client, req Request, baseURL string, plan redfishprofile.AcquisitionPlan, rawTree map[string]interface{}, emit ProgressFn) int { if len(plan.PlanBPaths) == 0 || plan.Mode == redfishprofile.ModeFallback || !plan.Tuning.RecoveryPolicy.EnableProfilePlanB { return 0 diff --git a/internal/collector/redfish_planb_test.go b/internal/collector/redfish_planb_test.go new file mode 100644 index 0000000..f0a89d8 --- /dev/null +++ b/internal/collector/redfish_planb_test.go @@ -0,0 +1,57 @@ +package collector + +import "testing" + +func TestShouldIncludeCriticalPlanBPath(t *testing.T) { + tests := []struct { + name string + req Request + path string + want bool + }{ + { + name: "skip hgx erot pcie without extended diagnostics", + req: Request{}, + path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices", + want: false, + }, + { + name: "skip hgx chassis assembly without extended diagnostics", + req: Request{}, + path: "/redfish/v1/Chassis/HGX_Chassis_0/Assembly", + want: false, + }, + { + name: "keep standard chassis inventory without extended diagnostics", + req: Request{}, + path: "/redfish/v1/Chassis/1/PCIeDevices", + want: true, + }, + { + name: "keep nvme storage backplane drives without extended diagnostics", + req: Request{}, + path: "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives", + want: true, + }, + { + name: "keep system processors without extended diagnostics", + req: Request{}, + path: "/redfish/v1/Systems/HGX_Baseboard_0/Processors", + want: true, + }, + { + name: "include hgx erot pcie when extended diagnostics enabled", + req: Request{DebugPayloads: true}, + path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices", + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := shouldIncludeCriticalPlanBPath(tt.req, tt.path); got != tt.want { + t.Fatalf("shouldIncludeCriticalPlanBPath(%q) = %v, want %v", tt.path, got, tt.want) + } + }) + } +} diff --git a/web/templates/index.html b/web/templates/index.html index ae9b29d..0b28ca6 100644 --- a/web/templates/index.html +++ b/web/templates/index.html @@ -85,7 +85,7 @@