Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f19a3454fa |
@@ -58,6 +58,7 @@ Responses:
|
|||||||
|
|
||||||
Optional request field:
|
Optional request field:
|
||||||
- `power_on_if_host_off`: when `true`, Redfish collection may power on the host before collection if preflight found it powered off
|
- `power_on_if_host_off`: when `true`, Redfish collection may power on the host before collection if preflight found it powered off
|
||||||
|
- `debug_payloads`: when `true`, the collector keeps extra diagnostic payloads and enables extended plan-B retries for slow HGX component inventory branches (`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`)
|
||||||
|
|
||||||
### `POST /api/collect/probe`
|
### `POST /api/collect/probe`
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ Request fields passed from the server:
|
|||||||
- credential field (`password` or token)
|
- credential field (`password` or token)
|
||||||
- `tls_mode`
|
- `tls_mode`
|
||||||
- optional `power_on_if_host_off`
|
- optional `power_on_if_host_off`
|
||||||
|
- optional `debug_payloads` for extended diagnostics
|
||||||
|
|
||||||
### Core rule
|
### Core rule
|
||||||
|
|
||||||
@@ -57,6 +58,17 @@ closes `skipCh` → goroutine in `Collect()` → `cancelCollect()`.
|
|||||||
|
|
||||||
The skip button is visible during `running` state and hidden once the job reaches a terminal state.
|
The skip button is visible during `running` state and hidden once the job reaches a terminal state.
|
||||||
|
|
||||||
|
### Extended diagnostics toggle
|
||||||
|
|
||||||
|
The live collect form exposes a user-facing checkbox for extended diagnostics.
|
||||||
|
|
||||||
|
- default collection prioritizes inventory completeness and bounded runtime
|
||||||
|
- when extended diagnostics is off, heavy HGX component-chassis critical plan-B retries
|
||||||
|
(`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`) are skipped
|
||||||
|
- when extended diagnostics is on, those retries are allowed and extra debug payloads are collected
|
||||||
|
|
||||||
|
This toggle is intended for operator-driven deep diagnostics on problematic hosts, not for the default path.
|
||||||
|
|
||||||
### Discovery model
|
### Discovery model
|
||||||
|
|
||||||
The collector does not rely on one fixed vendor tree.
|
The collector does not rely on one fixed vendor tree.
|
||||||
|
|||||||
@@ -1120,3 +1120,20 @@ incomplete for UI and Reanimator consumers.
|
|||||||
- System firmware such as BIOS and iBMC versions survives xFusion file exports.
|
- System firmware such as BIOS and iBMC versions survives xFusion file exports.
|
||||||
- xFusion archives participate more reliably in canonical device/export flows without special UI
|
- xFusion archives participate more reliably in canonical device/export flows without special UI
|
||||||
cases.
|
cases.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ADL-043 — Extended HGX diagnostic plan-B is opt-in from the live collect form
|
||||||
|
|
||||||
|
**Date:** 2026-04-13
|
||||||
|
**Context:** Some Supermicro HGX Redfish targets expose slow or hanging component-chassis inventory
|
||||||
|
collections during critical plan-B, especially under `Chassis/HGX_*` for `Assembly`,
|
||||||
|
`Accelerators`, `Drives`, `NetworkAdapters`, and `PCIeDevices`. Default collection should not
|
||||||
|
block operators on deep diagnostic retries that are useful mainly for troubleshooting.
|
||||||
|
**Decision:** Keep the normal snapshot/replay path unchanged, but gate those heavy HGX
|
||||||
|
component-chassis critical plan-B retries behind the existing live-collect `debug_payloads` flag,
|
||||||
|
presented in the UI as "Сбор расширенных данных для диагностики".
|
||||||
|
**Consequences:**
|
||||||
|
- Default live collection skips those heavy diagnostic plan-B retries and reaches replay faster.
|
||||||
|
- Operators can explicitly opt into the slower diagnostic path when they need deeper collection.
|
||||||
|
- The same user-facing toggle continues to enable extra debug payload capture for troubleshooting.
|
||||||
|
|||||||
@@ -496,7 +496,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis.
|
// collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis.
|
||||||
// Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle.
|
// Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle.
|
||||||
// Enabled only when Request.DebugPayloads is true.
|
// Enabled only when Request.DebugPayloads is true.
|
||||||
@@ -511,7 +510,6 @@ func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *htt
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func firstNonEmptyPath(paths []string, fallback string) string {
|
func firstNonEmptyPath(paths []string, fallback string) string {
|
||||||
for _, p := range paths {
|
for _, p := range paths {
|
||||||
if strings.TrimSpace(p) != "" {
|
if strings.TrimSpace(p) != "" {
|
||||||
@@ -543,7 +541,6 @@ func redfishSystemPowerState(systemDoc map[string]interface{}) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error {
|
func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error {
|
||||||
body, err := json.Marshal(payload)
|
body, err := json.Marshal(payload)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -2299,7 +2296,6 @@ func redfishCriticalSlowGap() time.Duration {
|
|||||||
return 1200 * time.Millisecond
|
return 1200 * time.Millisecond
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func redfishSnapshotMemoryRequestTimeout() time.Duration {
|
func redfishSnapshotMemoryRequestTimeout() time.Duration {
|
||||||
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
|
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
|
||||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||||
@@ -2878,11 +2874,16 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
|||||||
timings := newRedfishPathTimingCollector(4)
|
timings := newRedfishPathTimingCollector(4)
|
||||||
var targets []string
|
var targets []string
|
||||||
seenTargets := make(map[string]struct{})
|
seenTargets := make(map[string]struct{})
|
||||||
|
skippedDiagnosticTargets := 0
|
||||||
addTarget := func(path string) {
|
addTarget := func(path string) {
|
||||||
path = normalizeRedfishPath(path)
|
path = normalizeRedfishPath(path)
|
||||||
if path == "" {
|
if path == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
if !shouldIncludeCriticalPlanBPath(req, path) {
|
||||||
|
skippedDiagnosticTargets++
|
||||||
|
return
|
||||||
|
}
|
||||||
if _, ok := seenTargets[path]; ok {
|
if _, ok := seenTargets[path]; ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -2968,6 +2969,13 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
if emit != nil {
|
if emit != nil {
|
||||||
|
if skippedDiagnosticTargets > 0 {
|
||||||
|
emit(Progress{
|
||||||
|
Status: "running",
|
||||||
|
Progress: 97,
|
||||||
|
Message: fmt.Sprintf("Redfish: расширенная диагностика выключена, пропущено %d тяжелых diagnostic endpoint", skippedDiagnosticTargets),
|
||||||
|
})
|
||||||
|
}
|
||||||
totalETA := redfishCriticalCooldown() + estimatePlanBETA(len(targets))
|
totalETA := redfishCriticalCooldown() + estimatePlanBETA(len(targets))
|
||||||
emit(Progress{
|
emit(Progress{
|
||||||
Status: "running",
|
Status: "running",
|
||||||
@@ -3073,6 +3081,39 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
|||||||
return recovered
|
return recovered
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldIncludeCriticalPlanBPath(req Request, path string) bool {
|
||||||
|
if req.DebugPayloads {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return !isExtendedDiagnosticCriticalPlanBPath(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func isExtendedDiagnosticCriticalPlanBPath(path string) bool {
|
||||||
|
path = normalizeRedfishPath(path)
|
||||||
|
if path == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
parts := strings.Split(strings.Trim(path, "/"), "/")
|
||||||
|
if len(parts) < 5 || parts[0] != "redfish" || parts[1] != "v1" || parts[2] != "Chassis" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !strings.HasPrefix(parts[3], "HGX_") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for _, suffix := range []string{
|
||||||
|
"/Accelerators",
|
||||||
|
"/Assembly",
|
||||||
|
"/Drives",
|
||||||
|
"/NetworkAdapters",
|
||||||
|
"/PCIeDevices",
|
||||||
|
} {
|
||||||
|
if strings.HasSuffix(path, suffix) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func (c *RedfishConnector) recoverProfilePlanBDocs(ctx context.Context, client *http.Client, req Request, baseURL string, plan redfishprofile.AcquisitionPlan, rawTree map[string]interface{}, emit ProgressFn) int {
|
func (c *RedfishConnector) recoverProfilePlanBDocs(ctx context.Context, client *http.Client, req Request, baseURL string, plan redfishprofile.AcquisitionPlan, rawTree map[string]interface{}, emit ProgressFn) int {
|
||||||
if len(plan.PlanBPaths) == 0 || plan.Mode == redfishprofile.ModeFallback || !plan.Tuning.RecoveryPolicy.EnableProfilePlanB {
|
if len(plan.PlanBPaths) == 0 || plan.Mode == redfishprofile.ModeFallback || !plan.Tuning.RecoveryPolicy.EnableProfilePlanB {
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
57
internal/collector/redfish_planb_test.go
Normal file
57
internal/collector/redfish_planb_test.go
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestShouldIncludeCriticalPlanBPath(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
req Request
|
||||||
|
path string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "skip hgx erot pcie without extended diagnostics",
|
||||||
|
req: Request{},
|
||||||
|
path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "skip hgx chassis assembly without extended diagnostics",
|
||||||
|
req: Request{},
|
||||||
|
path: "/redfish/v1/Chassis/HGX_Chassis_0/Assembly",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "keep standard chassis inventory without extended diagnostics",
|
||||||
|
req: Request{},
|
||||||
|
path: "/redfish/v1/Chassis/1/PCIeDevices",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "keep nvme storage backplane drives without extended diagnostics",
|
||||||
|
req: Request{},
|
||||||
|
path: "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "keep system processors without extended diagnostics",
|
||||||
|
req: Request{},
|
||||||
|
path: "/redfish/v1/Systems/HGX_Baseboard_0/Processors",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "include hgx erot pcie when extended diagnostics enabled",
|
||||||
|
req: Request{DebugPayloads: true},
|
||||||
|
path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := shouldIncludeCriticalPlanBPath(tt.req, tt.path); got != tt.want {
|
||||||
|
t.Fatalf("shouldIncludeCriticalPlanBPath(%q) = %v, want %v", tt.path, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -85,7 +85,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<label class="api-form-checkbox" for="api-debug-payloads">
|
<label class="api-form-checkbox" for="api-debug-payloads">
|
||||||
<input id="api-debug-payloads" name="debug_payloads" type="checkbox">
|
<input id="api-debug-payloads" name="debug_payloads" type="checkbox">
|
||||||
<span>Сбор расширенных метрик для отладки</span>
|
<span>Сбор расширенных данных для диагностики</span>
|
||||||
</label>
|
</label>
|
||||||
<div class="api-form-actions">
|
<div class="api-form-actions">
|
||||||
<button id="api-collect-btn" type="submit">Собрать</button>
|
<button id="api-collect-btn" type="submit">Собрать</button>
|
||||||
|
|||||||
Reference in New Issue
Block a user