3 Commits

Author SHA1 Message Date
f19a3454fa fix(redfish): gate hgx diagnostic plan-b by debug toggle 2026-04-13 14:45:41 +03:00
Mikhail Chusavitin
becdca1d7e fix(redfish): read PCIeInterface link width for GPU PCIe devices
parseGPUWithSupplementalDocs did not read PCIeInterface from the device
doc, only from function docs. xFusion GPU PCIeCard entries carry link
width/speed in PCIeInterface (LanesInUse/Maxlanes/PCIeType/MaxPCIeType)
so GPU link width was always empty for xFusion servers.

Also apply the xFusion OEM function-level fallback for GPU function docs,
consistent with the NIC and PCIeDevice paths.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 13:35:29 +03:00
Mikhail Chusavitin
e10440ae32 fix(redfish): collect PCIe link width from xFusion servers
xFusion iBMC exposes PCIe link width in two non-standard ways:
- PCIeInterface uses "Maxlanes" (lowercase 'l') instead of "MaxLanes"
- PCIeFunction docs carry width/speed in Oem.xFusion.LinkWidth ("X8"),
  Oem.xFusion.LinkWidthAbility, Oem.xFusion.LinkSpeed, and
  Oem.xFusion.LinkSpeedAbility rather than the standard CurrentLinkWidth int

Add redfishEnrichFromOEMxFusionPCIeLink and parseXFusionLinkWidth helpers,
apply them as fallbacks in NIC and PCIeDevice enrichment paths.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 13:35:29 +03:00
7 changed files with 278 additions and 6 deletions

View File

@@ -58,6 +58,7 @@ Responses:
Optional request field:
- `power_on_if_host_off`: when `true`, Redfish collection may power on the host before collection if preflight found it powered off
- `debug_payloads`: when `true`, collector keeps extra diagnostic payloads and enables extended plan-B retries for slow HGX component inventory branches (`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`)
### `POST /api/collect/probe`

View File

@@ -27,6 +27,7 @@ Request fields passed from the server:
- credential field (`password` or token)
- `tls_mode`
- optional `power_on_if_host_off`
- optional `debug_payloads` for extended diagnostics
### Core rule
@@ -57,6 +58,17 @@ closes `skipCh` → goroutine in `Collect()` → `cancelCollect()`.
The skip button is visible during `running` state and hidden once the job reaches a terminal state.
### Extended diagnostics toggle
The live collect form exposes a user-facing checkbox for extended diagnostics.
- default collection prioritizes inventory completeness and bounded runtime
- when extended diagnostics is off, heavy HGX component-chassis critical plan-B retries
(`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`) are skipped
- when extended diagnostics is on, those retries are allowed and extra debug payloads are collected
This toggle is intended for operator-driven deep diagnostics on problematic hosts, not for the default path.
### Discovery model
The collector does not rely on one fixed vendor tree.

View File

@@ -1120,3 +1120,20 @@ incomplete for UI and Reanimator consumers.
- System firmware such as BIOS and iBMC versions survives xFusion file exports.
- xFusion archives participate more reliably in canonical device/export flows without special UI
cases.
---
## ADL-043 — Extended HGX diagnostic plan-B is opt-in from the live collect form
**Date:** 2026-04-13
**Context:** Some Supermicro HGX Redfish targets expose slow or hanging component-chassis inventory
collections during critical plan-B, especially under `Chassis/HGX_*` for `Assembly`,
`Accelerators`, `Drives`, `NetworkAdapters`, and `PCIeDevices`. Default collection should not
block operators on deep diagnostic retries that are useful mainly for troubleshooting.
**Decision:** Keep the normal snapshot/replay path unchanged, but gate those heavy HGX
component-chassis critical plan-B retries behind the existing live-collect `debug_payloads` flag,
presented in the UI as "Сбор расширенных данных для диагностики".
**Consequences:**
- Default live collection skips those heavy diagnostic plan-B retries and reaches replay faster.
- Operators can explicitly opt into the slower diagnostic path when they need deeper collection.
- The same user-facing toggle continues to enable extra debug payload capture for troubleshooting.

View File

@@ -496,7 +496,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
return result, nil
}
// collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis.
// Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle.
// Enabled only when Request.DebugPayloads is true.
@@ -511,7 +510,6 @@ func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *htt
return out
}
func firstNonEmptyPath(paths []string, fallback string) string {
for _, p := range paths {
if strings.TrimSpace(p) != "" {
@@ -543,7 +541,6 @@ func redfishSystemPowerState(systemDoc map[string]interface{}) string {
return ""
}
func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error {
body, err := json.Marshal(payload)
if err != nil {
@@ -2299,7 +2296,6 @@ func redfishCriticalSlowGap() time.Duration {
return 1200 * time.Millisecond
}
func redfishSnapshotMemoryRequestTimeout() time.Duration {
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
if d, err := time.ParseDuration(v); err == nil && d > 0 {
@@ -2878,11 +2874,16 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
timings := newRedfishPathTimingCollector(4)
var targets []string
seenTargets := make(map[string]struct{})
skippedDiagnosticTargets := 0
addTarget := func(path string) {
path = normalizeRedfishPath(path)
if path == "" {
return
}
if !shouldIncludeCriticalPlanBPath(req, path) {
skippedDiagnosticTargets++
return
}
if _, ok := seenTargets[path]; ok {
return
}
@@ -2968,6 +2969,13 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
return 0
}
if emit != nil {
if skippedDiagnosticTargets > 0 {
emit(Progress{
Status: "running",
Progress: 97,
Message: fmt.Sprintf("Redfish: расширенная диагностика выключена, пропущено %d тяжелых diagnostic endpoint", skippedDiagnosticTargets),
})
}
totalETA := redfishCriticalCooldown() + estimatePlanBETA(len(targets))
emit(Progress{
Status: "running",
@@ -3073,6 +3081,39 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
return recovered
}
// shouldIncludeCriticalPlanBPath reports whether path may be queued as a
// critical plan-B target for req.
//
// When req.DebugPayloads is set every path is accepted. Otherwise a path is
// accepted only if isExtendedDiagnosticCriticalPlanBPath does not classify it
// as an extended-diagnostic endpoint.
func shouldIncludeCriticalPlanBPath(req Request, path string) bool {
	return req.DebugPayloads || !isExtendedDiagnosticCriticalPlanBPath(path)
}
// isExtendedDiagnosticCriticalPlanBPath reports whether path is one of the
// HGX component-chassis collections that are only retried when extended
// diagnostics are enabled.
//
// A path qualifies when, after normalizeRedfishPath, it:
//   - has at least five segments,
//   - starts with redfish/v1/Chassis,
//   - has a chassis segment beginning with "HGX_", and
//   - ends in /Accelerators, /Assembly, /Drives, /NetworkAdapters or
//     /PCIeDevices.
//
// An empty normalized path never qualifies.
func isExtendedDiagnosticCriticalPlanBPath(path string) bool {
	normalized := normalizeRedfishPath(path)
	if normalized == "" {
		return false
	}

	segments := strings.Split(strings.Trim(normalized, "/"), "/")
	if len(segments) < 5 {
		return false
	}
	underChassis := segments[0] == "redfish" && segments[1] == "v1" && segments[2] == "Chassis"
	if !underChassis || !strings.HasPrefix(segments[3], "HGX_") {
		return false
	}

	// The suffix is matched against the whole normalized path, not only the
	// last segment, so the leading slash in each suffix is significant.
	switch {
	case strings.HasSuffix(normalized, "/Accelerators"),
		strings.HasSuffix(normalized, "/Assembly"),
		strings.HasSuffix(normalized, "/Drives"),
		strings.HasSuffix(normalized, "/NetworkAdapters"),
		strings.HasSuffix(normalized, "/PCIeDevices"):
		return true
	}
	return false
}
func (c *RedfishConnector) recoverProfilePlanBDocs(ctx context.Context, client *http.Client, req Request, baseURL string, plan redfishprofile.AcquisitionPlan, rawTree map[string]interface{}, emit ProgressFn) int {
if len(plan.PlanBPaths) == 0 || plan.Mode == redfishprofile.ModeFallback || !plan.Tuning.RecoveryPolicy.EnableProfilePlanB {
return 0
@@ -3592,7 +3633,7 @@ func parseNIC(doc map[string]interface{}) models.NetworkAdapter {
}
if pcieIf, ok := ctrl["PCIeInterface"].(map[string]interface{}); ok && linkWidth == 0 && maxLinkWidth == 0 && linkSpeed == "" && maxLinkSpeed == "" {
linkWidth = asInt(pcieIf["LanesInUse"])
maxLinkWidth = asInt(pcieIf["MaxLanes"])
maxLinkWidth = firstNonZeroInt(asInt(pcieIf["MaxLanes"]), asInt(pcieIf["Maxlanes"]))
linkSpeed = firstNonEmpty(asString(pcieIf["PCIeType"]), asString(pcieIf["CurrentLinkSpeedGTs"]), asString(pcieIf["CurrentLinkSpeed"]))
maxLinkSpeed = firstNonEmpty(asString(pcieIf["MaxPCIeType"]), asString(pcieIf["MaxLinkSpeedGTs"]), asString(pcieIf["MaxLinkSpeed"]))
}
@@ -3705,6 +3746,9 @@ func enrichNICFromPCIe(nic *models.NetworkAdapter, pcieDoc map[string]interface{
if strings.TrimSpace(nic.MaxLinkSpeed) == "" {
nic.MaxLinkSpeed = firstNonEmpty(asString(pcieDoc["MaxLinkSpeedGTs"]), asString(pcieDoc["MaxLinkSpeed"]))
}
if nic.LinkWidth == 0 || nic.MaxLinkWidth == 0 || nic.LinkSpeed == "" || nic.MaxLinkSpeed == "" {
redfishEnrichFromOEMxFusionPCIeLink(pcieDoc, &nic.LinkWidth, &nic.MaxLinkWidth, &nic.LinkSpeed, &nic.MaxLinkSpeed)
}
if normalizeRedfishIdentityField(nic.SerialNumber) == "" {
nic.SerialNumber = findFirstNormalizedStringByKeys(pcieDoc, "SerialNumber")
}
@@ -3736,6 +3780,9 @@ func enrichNICFromPCIe(nic *models.NetworkAdapter, pcieDoc map[string]interface{
if strings.TrimSpace(nic.MaxLinkSpeed) == "" {
nic.MaxLinkSpeed = firstNonEmpty(asString(fn["MaxLinkSpeedGTs"]), asString(fn["MaxLinkSpeed"]))
}
if nic.LinkWidth == 0 || nic.MaxLinkWidth == 0 || nic.LinkSpeed == "" || nic.MaxLinkSpeed == "" {
redfishEnrichFromOEMxFusionPCIeLink(fn, &nic.LinkWidth, &nic.MaxLinkWidth, &nic.LinkSpeed, &nic.MaxLinkSpeed)
}
if normalizeRedfishIdentityField(nic.SerialNumber) == "" {
nic.SerialNumber = findFirstNormalizedStringByKeys(fn, "SerialNumber")
}
@@ -4302,6 +4349,21 @@ func parseGPUWithSupplementalDocs(doc map[string]interface{}, functionDocs []map
gpu.DeviceID = asHexOrInt(doc["DeviceId"])
}
if pcieIf, ok := doc["PCIeInterface"].(map[string]interface{}); ok {
if gpu.CurrentLinkWidth == 0 {
gpu.CurrentLinkWidth = asInt(pcieIf["LanesInUse"])
}
if gpu.MaxLinkWidth == 0 {
gpu.MaxLinkWidth = firstNonZeroInt(asInt(pcieIf["MaxLanes"]), asInt(pcieIf["Maxlanes"]))
}
if gpu.CurrentLinkSpeed == "" {
gpu.CurrentLinkSpeed = firstNonEmpty(asString(pcieIf["PCIeType"]), asString(pcieIf["CurrentLinkSpeedGTs"]), asString(pcieIf["CurrentLinkSpeed"]))
}
if gpu.MaxLinkSpeed == "" {
gpu.MaxLinkSpeed = firstNonEmpty(asString(pcieIf["MaxPCIeType"]), asString(pcieIf["MaxLinkSpeedGTs"]), asString(pcieIf["MaxLinkSpeed"]))
}
}
for _, fn := range functionDocs {
if gpu.BDF == "" {
gpu.BDF = sanitizeRedfishBDF(asString(fn["FunctionId"]))
@@ -4324,6 +4386,9 @@ func parseGPUWithSupplementalDocs(doc map[string]interface{}, functionDocs []map
if gpu.CurrentLinkSpeed == "" {
gpu.CurrentLinkSpeed = firstNonEmpty(asString(fn["CurrentLinkSpeedGTs"]), asString(fn["CurrentLinkSpeed"]))
}
if gpu.CurrentLinkWidth == 0 || gpu.MaxLinkWidth == 0 || gpu.CurrentLinkSpeed == "" || gpu.MaxLinkSpeed == "" {
redfishEnrichFromOEMxFusionPCIeLink(fn, &gpu.CurrentLinkWidth, &gpu.MaxLinkWidth, &gpu.CurrentLinkSpeed, &gpu.MaxLinkSpeed)
}
}
if isMissingOrRawPCIModel(gpu.Model) {
@@ -4384,6 +4449,9 @@ func parsePCIeDeviceWithSupplementalDocs(doc map[string]interface{}, functionDoc
if dev.MaxLinkSpeed == "" {
dev.MaxLinkSpeed = firstNonEmpty(asString(fn["MaxLinkSpeedGTs"]), asString(fn["MaxLinkSpeed"]))
}
if dev.LinkWidth == 0 || dev.MaxLinkWidth == 0 || dev.LinkSpeed == "" || dev.MaxLinkSpeed == "" {
redfishEnrichFromOEMxFusionPCIeLink(fn, &dev.LinkWidth, &dev.MaxLinkWidth, &dev.LinkSpeed, &dev.MaxLinkSpeed)
}
}
if dev.DeviceClass == "" || isGenericPCIeClassLabel(dev.DeviceClass) {
dev.DeviceClass = firstNonEmpty(redfishFirstStringAcrossDocs(supplementalDocs, "DeviceType"), dev.DeviceClass)
@@ -4633,6 +4701,59 @@ func buildBDFfromOemPublic(doc map[string]interface{}) string {
return fmt.Sprintf("%04x:%02x:%02x.%x", segment, bus, dev, fn)
}
// redfishEnrichFromOEMxFusionPCIeLink back-fills PCIe link width and speed
// from the Oem.xFusion object of doc. It only writes to a destination that is
// still unset: zero for the widths, blank after trimming for the speeds.
//
// Fields read from Oem.xFusion:
//   - LinkWidth / LinkWidthAbility: width strings such as "X8", converted
//     with parseXFusionLinkWidth into linkWidth / maxLinkWidth.
//   - LinkSpeed / LinkSpeedAbility: speed strings such as "Gen4 (16.0GT/s)",
//     stored whitespace-trimmed into linkSpeed / maxLinkSpeed.
//
// If doc has no Oem object, or Oem has no xFusion object, nothing is changed.
// All four pointers must be non-nil.
func redfishEnrichFromOEMxFusionPCIeLink(doc map[string]interface{}, linkWidth, maxLinkWidth *int, linkSpeed, maxLinkSpeed *string) {
	oem, ok := doc["Oem"].(map[string]interface{})
	if !ok || oem == nil {
		return
	}
	xf, ok := oem["xFusion"].(map[string]interface{})
	if !ok || xf == nil {
		return
	}

	// fillWidth sets *dst from the width string under key when *dst is zero.
	fillWidth := func(dst *int, key string) {
		if *dst == 0 {
			*dst = parseXFusionLinkWidth(asString(xf[key]))
		}
	}
	// fillSpeed sets *dst from the speed string under key when *dst is blank.
	fillSpeed := func(dst *string, key string) {
		if strings.TrimSpace(*dst) == "" {
			*dst = strings.TrimSpace(asString(xf[key]))
		}
	}

	fillWidth(linkWidth, "LinkWidth")
	fillWidth(maxLinkWidth, "LinkWidthAbility")
	fillSpeed(linkSpeed, "LinkSpeed")
	fillSpeed(maxLinkSpeed, "LinkSpeedAbility")
}
// parseXFusionLinkWidth turns an xFusion lane-count string such as "X8" or
// "x16" into its integer lane count.
//
// Surrounding whitespace is ignored and a single leading "X" (either case) is
// removed before the remainder is passed to asInt. An empty input, or any
// value for which asInt yields zero or a negative number, returns 0.
func parseXFusionLinkWidth(s string) int {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return 0
	}
	// Upper-casing first lets one TrimPrefix handle both "x" and "X".
	digits := strings.TrimPrefix(strings.ToUpper(trimmed), "X")
	if lanes := asInt(digits); lanes > 0 {
		return lanes
	}
	return 0
}
// firstNonZeroInt scans vals left to right and returns the first value that
// is not zero. It returns 0 when vals is empty or every value is zero.
func firstNonZeroInt(vals ...int) int {
	for i := 0; i < len(vals); i++ {
		if candidate := vals[i]; candidate != 0 {
			return candidate
		}
	}
	return 0
}
func normalizeRedfishIdentityField(v string) string {
v = strings.TrimSpace(v)
if v == "" {

View File

@@ -0,0 +1,57 @@
package collector
import "testing"
// TestShouldIncludeCriticalPlanBPath checks the plan-B path filter: HGX
// chassis inventory collections are dropped unless DebugPayloads is set, while
// non-HGX chassis paths and Systems paths are always kept.
func TestShouldIncludeCriticalPlanBPath(t *testing.T) {
	// The two request shapes exercised by the table.
	diagnosticsOff := Request{}
	diagnosticsOn := Request{DebugPayloads: true}

	cases := []struct {
		name string
		req  Request
		path string
		want bool
	}{
		{"skip hgx erot pcie without extended diagnostics", diagnosticsOff, "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices", false},
		{"skip hgx chassis assembly without extended diagnostics", diagnosticsOff, "/redfish/v1/Chassis/HGX_Chassis_0/Assembly", false},
		{"keep standard chassis inventory without extended diagnostics", diagnosticsOff, "/redfish/v1/Chassis/1/PCIeDevices", true},
		{"keep nvme storage backplane drives without extended diagnostics", diagnosticsOff, "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives", true},
		{"keep system processors without extended diagnostics", diagnosticsOff, "/redfish/v1/Systems/HGX_Baseboard_0/Processors", true},
		{"include hgx erot pcie when extended diagnostics enabled", diagnosticsOn, "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices", true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := shouldIncludeCriticalPlanBPath(tc.req, tc.path)
			if got == tc.want {
				return
			}
			t.Fatalf("shouldIncludeCriticalPlanBPath(%q) = %v, want %v", tc.path, got, tc.want)
		})
	}
}

View File

@@ -1341,6 +1341,48 @@ func TestParseNIC_PrefersControllerSlotLabelAndPCIeInterface(t *testing.T) {
}
}
// TestParseNIC_xFusionMaxlanesAndOEMLinkWidth covers the two xFusion-specific
// sources of NIC link data:
//  1. parseNIC reading the max width from PCIeInterface "Maxlanes"
//     (lowercase 'l') instead of the standard "MaxLanes";
//  2. enrichNICFromPCIe reading width and speed from Oem.xFusion on a
//     PCIeFunction doc.
func TestParseNIC_xFusionMaxlanesAndOEMLinkWidth(t *testing.T) {
	// Case 1: controller PCIeInterface with the xFusion "Maxlanes" spelling.
	pcieInterface := map[string]interface{}{
		"LanesInUse":  8,
		"Maxlanes":    8,
		"PCIeType":    "Gen4",
		"MaxPCIeType": "Gen4",
	}
	adapterDoc := map[string]interface{}{
		"Id":    "OCPCard1",
		"Model": "ConnectX-6 Lx",
		"Controllers": []interface{}{
			map[string]interface{}{"PCIeInterface": pcieInterface},
		},
	}
	parsed := parseNIC(adapterDoc)
	if !(parsed.LinkWidth == 8 && parsed.MaxLinkWidth == 8) {
		t.Fatalf("expected link widths 8/8 from xFusion Maxlanes, got current=%d max=%d", parsed.LinkWidth, parsed.MaxLinkWidth)
	}

	// Case 2: an empty adapter enriched only from the function doc's
	// Oem.xFusion link fields.
	xFusionLink := map[string]interface{}{
		"LinkWidth":        "X8",
		"LinkWidthAbility": "X8",
		"LinkSpeed":        "Gen4 (16.0GT/s)",
		"LinkSpeedAbility": "Gen4 (16.0GT/s)",
	}
	functionDoc := map[string]interface{}{
		"Oem": map[string]interface{}{"xFusion": xFusionLink},
	}
	enriched := models.NetworkAdapter{}
	enrichNICFromPCIe(&enriched, map[string]interface{}{}, []map[string]interface{}{functionDoc}, nil)

	if !(enriched.LinkWidth == 8 && enriched.MaxLinkWidth == 8) {
		t.Fatalf("expected link width 8 from xFusion OEM LinkWidth, got current=%d max=%d", enriched.LinkWidth, enriched.MaxLinkWidth)
	}
	if !(enriched.LinkSpeed == "Gen4 (16.0GT/s)" && enriched.MaxLinkSpeed == "Gen4 (16.0GT/s)") {
		t.Fatalf("expected link speed from xFusion OEM LinkSpeed, got current=%q max=%q", enriched.LinkSpeed, enriched.MaxLinkSpeed)
	}
}
func TestParseNIC_DropsUnrealisticPortCount(t *testing.T) {
nic := parseNIC(map[string]interface{}{
"Id": "1",
@@ -2773,6 +2815,28 @@ func TestReplayCollectGPUs_DedupUsesRedfishPathBeforeHeuristics(t *testing.T) {
}
}
// TestParseGPU_xFusionPCIeInterfaceMaxlanes verifies that parseGPU takes link
// width and speed from the device doc's PCIeInterface, including the xFusion
// "Maxlanes" (lowercase 'l') spelling of the max-width key.
func TestParseGPU_xFusionPCIeInterfaceMaxlanes(t *testing.T) {
	// Shaped like an xFusion GPU PCIeDevice entry (PCIeCard1..N).
	pcieInterface := map[string]interface{}{
		"LanesInUse":  16,
		"Maxlanes":    16,
		"PCIeType":    "Gen5",
		"MaxPCIeType": "Gen5",
	}
	deviceDoc := map[string]interface{}{
		"Id":            "PCIeCard1",
		"Model":         "RTX PRO 6000",
		"PCIeInterface": pcieInterface,
	}

	parsed := parseGPU(deviceDoc, nil, 1)

	if !(parsed.CurrentLinkWidth == 16 && parsed.MaxLinkWidth == 16) {
		t.Fatalf("expected link widths 16/16 from PCIeInterface, got current=%d max=%d", parsed.CurrentLinkWidth, parsed.MaxLinkWidth)
	}
	if !(parsed.CurrentLinkSpeed == "Gen5" && parsed.MaxLinkSpeed == "Gen5") {
		t.Fatalf("expected link speeds Gen5/Gen5 from PCIeInterface, got current=%q max=%q", parsed.CurrentLinkSpeed, parsed.MaxLinkSpeed)
	}
}
func TestParseGPU_UsesNestedOemSerialNumber(t *testing.T) {
doc := map[string]interface{}{
"Id": "GPU4",

View File

@@ -85,7 +85,7 @@
</div>
<label class="api-form-checkbox" for="api-debug-payloads">
<input id="api-debug-payloads" name="debug_payloads" type="checkbox">
<span>Сбор расширенных метрик для отладки</span>
<span>Сбор расширенных данных для диагностики</span>
</label>
<div class="api-form-actions">
<button id="api-collect-btn" type="submit">Собрать</button>