Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b04877549a | |||
| 8ca173c99b | |||
| f19a3454fa | |||
|
|
becdca1d7e | ||
|
|
e10440ae32 |
@@ -58,6 +58,7 @@ Responses:
|
||||
|
||||
Optional request field:
|
||||
- `power_on_if_host_off`: when `true`, Redfish collection may power on the host before collection if preflight found it powered off
|
||||
- `debug_payloads`: when `true`, collector keeps extra diagnostic payloads and enables extended plan-B retries for slow HGX component inventory branches (`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`)
|
||||
|
||||
### `POST /api/collect/probe`
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@ Request fields passed from the server:
|
||||
- credential field (`password` or token)
|
||||
- `tls_mode`
|
||||
- optional `power_on_if_host_off`
|
||||
- optional `debug_payloads` for extended diagnostics
|
||||
|
||||
### Core rule
|
||||
|
||||
@@ -57,6 +58,17 @@ closes `skipCh` → goroutine in `Collect()` → `cancelCollect()`.
|
||||
|
||||
The skip button is visible during `running` state and hidden once the job reaches a terminal state.
|
||||
|
||||
### Extended diagnostics toggle
|
||||
|
||||
The live collect form exposes a user-facing checkbox for extended diagnostics.
|
||||
|
||||
- default collection prioritizes inventory completeness and bounded runtime
|
||||
- when extended diagnostics is off, heavy HGX component-chassis critical plan-B retries
|
||||
(`Assembly`, `Accelerators`, `Drives`, `NetworkAdapters`, `PCIeDevices`) are skipped
|
||||
- when extended diagnostics is on, those retries are allowed and extra debug payloads are collected
|
||||
|
||||
This toggle is intended for operator-driven deep diagnostics on problematic hosts, not for the default path.
|
||||
|
||||
### Discovery model
|
||||
|
||||
The collector does not rely on one fixed vendor tree.
|
||||
|
||||
@@ -1120,3 +1120,20 @@ incomplete for UI and Reanimator consumers.
|
||||
- System firmware such as BIOS and iBMC versions survives xFusion file exports.
|
||||
- xFusion archives participate more reliably in canonical device/export flows without special UI
|
||||
cases.
|
||||
|
||||
---
|
||||
|
||||
## ADL-043 — Extended HGX diagnostic plan-B is opt-in from the live collect form
|
||||
|
||||
**Date:** 2026-04-13
|
||||
**Context:** Some Supermicro HGX Redfish targets expose slow or hanging component-chassis inventory
|
||||
collections during critical plan-B, especially under `Chassis/HGX_*` for `Assembly`,
|
||||
`Accelerators`, `Drives`, `NetworkAdapters`, and `PCIeDevices`. Default collection should not
|
||||
block operators on deep diagnostic retries that are useful mainly for troubleshooting.
|
||||
**Decision:** Keep the normal snapshot/replay path unchanged, but gate those heavy HGX
|
||||
component-chassis critical plan-B retries behind the existing live-collect `debug_payloads` flag,
|
||||
presented in the UI as "Сбор расширенных данных для диагностики".
|
||||
**Consequences:**
|
||||
- Default live collection skips those heavy diagnostic plan-B retries and reaches replay faster.
|
||||
- Operators can explicitly opt into the slower diagnostic path when they need deeper collection.
|
||||
- The same user-facing toggle continues to enable extra debug payload capture for troubleshooting.
|
||||
|
||||
@@ -345,8 +345,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
"manager_critical_suffixes": acquisitionPlan.ScopedPaths.ManagerCriticalSuffixes,
|
||||
},
|
||||
"tuning": map[string]any{
|
||||
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
||||
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
||||
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
||||
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
||||
"snapshot_exclude_contains": acquisitionPlan.Tuning.SnapshotExcludeContains,
|
||||
"prefetch_workers": acquisitionPlan.Tuning.PrefetchWorkers,
|
||||
"prefetch_enabled": boolPointerValue(acquisitionPlan.Tuning.PrefetchEnabled),
|
||||
"nvme_post_probe": boolPointerValue(acquisitionPlan.Tuning.NVMePostProbeEnabled),
|
||||
@@ -496,7 +497,6 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
return result, nil
|
||||
}
|
||||
|
||||
|
||||
// collectDebugPayloads fetches vendor-specific diagnostic endpoints on a best-effort basis.
|
||||
// Results are stored in rawPayloads["redfish_debug_payloads"] and exported with the bundle.
|
||||
// Enabled only when Request.DebugPayloads is true.
|
||||
@@ -511,7 +511,6 @@ func (c *RedfishConnector) collectDebugPayloads(ctx context.Context, client *htt
|
||||
return out
|
||||
}
|
||||
|
||||
|
||||
func firstNonEmptyPath(paths []string, fallback string) string {
|
||||
for _, p := range paths {
|
||||
if strings.TrimSpace(p) != "" {
|
||||
@@ -543,7 +542,6 @@ func redfishSystemPowerState(systemDoc map[string]interface{}) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
|
||||
func (c *RedfishConnector) postJSON(ctx context.Context, client *http.Client, req Request, baseURL, resourcePath string, payload map[string]any) error {
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
@@ -1346,6 +1344,11 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
|
||||
if !shouldCrawlPath(path) {
|
||||
return
|
||||
}
|
||||
for _, pattern := range tuning.SnapshotExcludeContains {
|
||||
if pattern != "" && strings.Contains(path, pattern) {
|
||||
return
|
||||
}
|
||||
}
|
||||
mu.Lock()
|
||||
if len(seen) >= maxDocuments {
|
||||
mu.Unlock()
|
||||
@@ -2299,7 +2302,6 @@ func redfishCriticalSlowGap() time.Duration {
|
||||
return 1200 * time.Millisecond
|
||||
}
|
||||
|
||||
|
||||
func redfishSnapshotMemoryRequestTimeout() time.Duration {
|
||||
if v := strings.TrimSpace(os.Getenv("LOGPILE_REDFISH_MEMORY_TIMEOUT")); v != "" {
|
||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
@@ -2878,11 +2880,16 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
||||
timings := newRedfishPathTimingCollector(4)
|
||||
var targets []string
|
||||
seenTargets := make(map[string]struct{})
|
||||
skippedDiagnosticTargets := 0
|
||||
addTarget := func(path string) {
|
||||
path = normalizeRedfishPath(path)
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
if !shouldIncludeCriticalPlanBPath(req, path) {
|
||||
skippedDiagnosticTargets++
|
||||
return
|
||||
}
|
||||
if _, ok := seenTargets[path]; ok {
|
||||
return
|
||||
}
|
||||
@@ -2968,6 +2975,13 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
||||
return 0
|
||||
}
|
||||
if emit != nil {
|
||||
if skippedDiagnosticTargets > 0 {
|
||||
emit(Progress{
|
||||
Status: "running",
|
||||
Progress: 97,
|
||||
Message: fmt.Sprintf("Redfish: расширенная диагностика выключена, пропущено %d тяжелых diagnostic endpoint", skippedDiagnosticTargets),
|
||||
})
|
||||
}
|
||||
totalETA := redfishCriticalCooldown() + estimatePlanBETA(len(targets))
|
||||
emit(Progress{
|
||||
Status: "running",
|
||||
@@ -3073,6 +3087,39 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
|
||||
return recovered
|
||||
}
|
||||
|
||||
func shouldIncludeCriticalPlanBPath(req Request, path string) bool {
|
||||
if req.DebugPayloads {
|
||||
return true
|
||||
}
|
||||
return !isExtendedDiagnosticCriticalPlanBPath(path)
|
||||
}
|
||||
|
||||
func isExtendedDiagnosticCriticalPlanBPath(path string) bool {
|
||||
path = normalizeRedfishPath(path)
|
||||
if path == "" {
|
||||
return false
|
||||
}
|
||||
parts := strings.Split(strings.Trim(path, "/"), "/")
|
||||
if len(parts) < 5 || parts[0] != "redfish" || parts[1] != "v1" || parts[2] != "Chassis" {
|
||||
return false
|
||||
}
|
||||
if !strings.HasPrefix(parts[3], "HGX_") {
|
||||
return false
|
||||
}
|
||||
for _, suffix := range []string{
|
||||
"/Accelerators",
|
||||
"/Assembly",
|
||||
"/Drives",
|
||||
"/NetworkAdapters",
|
||||
"/PCIeDevices",
|
||||
} {
|
||||
if strings.HasSuffix(path, suffix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *RedfishConnector) recoverProfilePlanBDocs(ctx context.Context, client *http.Client, req Request, baseURL string, plan redfishprofile.AcquisitionPlan, rawTree map[string]interface{}, emit ProgressFn) int {
|
||||
if len(plan.PlanBPaths) == 0 || plan.Mode == redfishprofile.ModeFallback || !plan.Tuning.RecoveryPolicy.EnableProfilePlanB {
|
||||
return 0
|
||||
@@ -3592,7 +3639,7 @@ func parseNIC(doc map[string]interface{}) models.NetworkAdapter {
|
||||
}
|
||||
if pcieIf, ok := ctrl["PCIeInterface"].(map[string]interface{}); ok && linkWidth == 0 && maxLinkWidth == 0 && linkSpeed == "" && maxLinkSpeed == "" {
|
||||
linkWidth = asInt(pcieIf["LanesInUse"])
|
||||
maxLinkWidth = asInt(pcieIf["MaxLanes"])
|
||||
maxLinkWidth = firstNonZeroInt(asInt(pcieIf["MaxLanes"]), asInt(pcieIf["Maxlanes"]))
|
||||
linkSpeed = firstNonEmpty(asString(pcieIf["PCIeType"]), asString(pcieIf["CurrentLinkSpeedGTs"]), asString(pcieIf["CurrentLinkSpeed"]))
|
||||
maxLinkSpeed = firstNonEmpty(asString(pcieIf["MaxPCIeType"]), asString(pcieIf["MaxLinkSpeedGTs"]), asString(pcieIf["MaxLinkSpeed"]))
|
||||
}
|
||||
@@ -3705,6 +3752,9 @@ func enrichNICFromPCIe(nic *models.NetworkAdapter, pcieDoc map[string]interface{
|
||||
if strings.TrimSpace(nic.MaxLinkSpeed) == "" {
|
||||
nic.MaxLinkSpeed = firstNonEmpty(asString(pcieDoc["MaxLinkSpeedGTs"]), asString(pcieDoc["MaxLinkSpeed"]))
|
||||
}
|
||||
if nic.LinkWidth == 0 || nic.MaxLinkWidth == 0 || nic.LinkSpeed == "" || nic.MaxLinkSpeed == "" {
|
||||
redfishEnrichFromOEMxFusionPCIeLink(pcieDoc, &nic.LinkWidth, &nic.MaxLinkWidth, &nic.LinkSpeed, &nic.MaxLinkSpeed)
|
||||
}
|
||||
if normalizeRedfishIdentityField(nic.SerialNumber) == "" {
|
||||
nic.SerialNumber = findFirstNormalizedStringByKeys(pcieDoc, "SerialNumber")
|
||||
}
|
||||
@@ -3736,6 +3786,9 @@ func enrichNICFromPCIe(nic *models.NetworkAdapter, pcieDoc map[string]interface{
|
||||
if strings.TrimSpace(nic.MaxLinkSpeed) == "" {
|
||||
nic.MaxLinkSpeed = firstNonEmpty(asString(fn["MaxLinkSpeedGTs"]), asString(fn["MaxLinkSpeed"]))
|
||||
}
|
||||
if nic.LinkWidth == 0 || nic.MaxLinkWidth == 0 || nic.LinkSpeed == "" || nic.MaxLinkSpeed == "" {
|
||||
redfishEnrichFromOEMxFusionPCIeLink(fn, &nic.LinkWidth, &nic.MaxLinkWidth, &nic.LinkSpeed, &nic.MaxLinkSpeed)
|
||||
}
|
||||
if normalizeRedfishIdentityField(nic.SerialNumber) == "" {
|
||||
nic.SerialNumber = findFirstNormalizedStringByKeys(fn, "SerialNumber")
|
||||
}
|
||||
@@ -4302,6 +4355,21 @@ func parseGPUWithSupplementalDocs(doc map[string]interface{}, functionDocs []map
|
||||
gpu.DeviceID = asHexOrInt(doc["DeviceId"])
|
||||
}
|
||||
|
||||
if pcieIf, ok := doc["PCIeInterface"].(map[string]interface{}); ok {
|
||||
if gpu.CurrentLinkWidth == 0 {
|
||||
gpu.CurrentLinkWidth = asInt(pcieIf["LanesInUse"])
|
||||
}
|
||||
if gpu.MaxLinkWidth == 0 {
|
||||
gpu.MaxLinkWidth = firstNonZeroInt(asInt(pcieIf["MaxLanes"]), asInt(pcieIf["Maxlanes"]))
|
||||
}
|
||||
if gpu.CurrentLinkSpeed == "" {
|
||||
gpu.CurrentLinkSpeed = firstNonEmpty(asString(pcieIf["PCIeType"]), asString(pcieIf["CurrentLinkSpeedGTs"]), asString(pcieIf["CurrentLinkSpeed"]))
|
||||
}
|
||||
if gpu.MaxLinkSpeed == "" {
|
||||
gpu.MaxLinkSpeed = firstNonEmpty(asString(pcieIf["MaxPCIeType"]), asString(pcieIf["MaxLinkSpeedGTs"]), asString(pcieIf["MaxLinkSpeed"]))
|
||||
}
|
||||
}
|
||||
|
||||
for _, fn := range functionDocs {
|
||||
if gpu.BDF == "" {
|
||||
gpu.BDF = sanitizeRedfishBDF(asString(fn["FunctionId"]))
|
||||
@@ -4324,6 +4392,9 @@ func parseGPUWithSupplementalDocs(doc map[string]interface{}, functionDocs []map
|
||||
if gpu.CurrentLinkSpeed == "" {
|
||||
gpu.CurrentLinkSpeed = firstNonEmpty(asString(fn["CurrentLinkSpeedGTs"]), asString(fn["CurrentLinkSpeed"]))
|
||||
}
|
||||
if gpu.CurrentLinkWidth == 0 || gpu.MaxLinkWidth == 0 || gpu.CurrentLinkSpeed == "" || gpu.MaxLinkSpeed == "" {
|
||||
redfishEnrichFromOEMxFusionPCIeLink(fn, &gpu.CurrentLinkWidth, &gpu.MaxLinkWidth, &gpu.CurrentLinkSpeed, &gpu.MaxLinkSpeed)
|
||||
}
|
||||
}
|
||||
|
||||
if isMissingOrRawPCIModel(gpu.Model) {
|
||||
@@ -4384,6 +4455,9 @@ func parsePCIeDeviceWithSupplementalDocs(doc map[string]interface{}, functionDoc
|
||||
if dev.MaxLinkSpeed == "" {
|
||||
dev.MaxLinkSpeed = firstNonEmpty(asString(fn["MaxLinkSpeedGTs"]), asString(fn["MaxLinkSpeed"]))
|
||||
}
|
||||
if dev.LinkWidth == 0 || dev.MaxLinkWidth == 0 || dev.LinkSpeed == "" || dev.MaxLinkSpeed == "" {
|
||||
redfishEnrichFromOEMxFusionPCIeLink(fn, &dev.LinkWidth, &dev.MaxLinkWidth, &dev.LinkSpeed, &dev.MaxLinkSpeed)
|
||||
}
|
||||
}
|
||||
if dev.DeviceClass == "" || isGenericPCIeClassLabel(dev.DeviceClass) {
|
||||
dev.DeviceClass = firstNonEmpty(redfishFirstStringAcrossDocs(supplementalDocs, "DeviceType"), dev.DeviceClass)
|
||||
@@ -4633,6 +4707,59 @@ func buildBDFfromOemPublic(doc map[string]interface{}) string {
|
||||
return fmt.Sprintf("%04x:%02x:%02x.%x", segment, bus, dev, fn)
|
||||
}
|
||||
|
||||
// redfishEnrichFromOEMxFusionPCIeLink fills in missing PCIe link width/speed
|
||||
// from the xFusion OEM namespace. xFusion reports link width as a string like
|
||||
// "X8" in Oem.xFusion.LinkWidth / Oem.xFusion.LinkWidthAbility, and link speed
|
||||
// as a string like "Gen4 (16.0GT/s)" in Oem.xFusion.LinkSpeed /
|
||||
// Oem.xFusion.LinkSpeedAbility. These fields appear on PCIeFunction docs.
|
||||
func redfishEnrichFromOEMxFusionPCIeLink(doc map[string]interface{}, linkWidth, maxLinkWidth *int, linkSpeed, maxLinkSpeed *string) {
|
||||
oem, _ := doc["Oem"].(map[string]interface{})
|
||||
if oem == nil {
|
||||
return
|
||||
}
|
||||
xf, _ := oem["xFusion"].(map[string]interface{})
|
||||
if xf == nil {
|
||||
return
|
||||
}
|
||||
if *linkWidth == 0 {
|
||||
*linkWidth = parseXFusionLinkWidth(asString(xf["LinkWidth"]))
|
||||
}
|
||||
if *maxLinkWidth == 0 {
|
||||
*maxLinkWidth = parseXFusionLinkWidth(asString(xf["LinkWidthAbility"]))
|
||||
}
|
||||
if strings.TrimSpace(*linkSpeed) == "" {
|
||||
*linkSpeed = strings.TrimSpace(asString(xf["LinkSpeed"]))
|
||||
}
|
||||
if strings.TrimSpace(*maxLinkSpeed) == "" {
|
||||
*maxLinkSpeed = strings.TrimSpace(asString(xf["LinkSpeedAbility"]))
|
||||
}
|
||||
}
|
||||
|
||||
// parseXFusionLinkWidth converts an xFusion link-width string like "X8" or
|
||||
// "x16" to the integer lane count. Returns 0 for unrecognised values.
|
||||
func parseXFusionLinkWidth(s string) int {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return 0
|
||||
}
|
||||
s = strings.TrimPrefix(strings.ToUpper(s), "X")
|
||||
v := asInt(s)
|
||||
if v <= 0 {
|
||||
return 0
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// firstNonZeroInt returns the first argument that is non-zero.
|
||||
func firstNonZeroInt(vals ...int) int {
|
||||
for _, v := range vals {
|
||||
if v != 0 {
|
||||
return v
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func normalizeRedfishIdentityField(v string) string {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" {
|
||||
|
||||
57
internal/collector/redfish_planb_test.go
Normal file
57
internal/collector/redfish_planb_test.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestShouldIncludeCriticalPlanBPath(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
req Request
|
||||
path string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "skip hgx erot pcie without extended diagnostics",
|
||||
req: Request{},
|
||||
path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices",
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "skip hgx chassis assembly without extended diagnostics",
|
||||
req: Request{},
|
||||
path: "/redfish/v1/Chassis/HGX_Chassis_0/Assembly",
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "keep standard chassis inventory without extended diagnostics",
|
||||
req: Request{},
|
||||
path: "/redfish/v1/Chassis/1/PCIeDevices",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "keep nvme storage backplane drives without extended diagnostics",
|
||||
req: Request{},
|
||||
path: "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "keep system processors without extended diagnostics",
|
||||
req: Request{},
|
||||
path: "/redfish/v1/Systems/HGX_Baseboard_0/Processors",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "include hgx erot pcie when extended diagnostics enabled",
|
||||
req: Request{DebugPayloads: true},
|
||||
path: "/redfish/v1/Chassis/HGX_ERoT_NVSwitch_0/PCIeDevices",
|
||||
want: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := shouldIncludeCriticalPlanBPath(tt.req, tt.path); got != tt.want {
|
||||
t.Fatalf("shouldIncludeCriticalPlanBPath(%q) = %v, want %v", tt.path, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1341,6 +1341,48 @@ func TestParseNIC_PrefersControllerSlotLabelAndPCIeInterface(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNIC_xFusionMaxlanesAndOEMLinkWidth(t *testing.T) {
|
||||
// xFusion uses "Maxlanes" (lowercase 'l') in PCIeInterface, not "MaxLanes".
|
||||
// xFusion also stores per-function link width as Oem.xFusion.LinkWidth = "X8".
|
||||
nic := parseNIC(map[string]interface{}{
|
||||
"Id": "OCPCard1",
|
||||
"Model": "ConnectX-6 Lx",
|
||||
"Controllers": []interface{}{
|
||||
map[string]interface{}{
|
||||
"PCIeInterface": map[string]interface{}{
|
||||
"LanesInUse": 8,
|
||||
"Maxlanes": 8, // xFusion uses lowercase 'l'
|
||||
"PCIeType": "Gen4",
|
||||
"MaxPCIeType": "Gen4",
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
if nic.LinkWidth != 8 || nic.MaxLinkWidth != 8 {
|
||||
t.Fatalf("expected link widths 8/8 from xFusion Maxlanes, got current=%d max=%d", nic.LinkWidth, nic.MaxLinkWidth)
|
||||
}
|
||||
|
||||
// enrichNICFromPCIe: OEM xFusion LinkWidth on a PCIeFunction doc.
|
||||
nic2 := models.NetworkAdapter{}
|
||||
fnDoc := map[string]interface{}{
|
||||
"Oem": map[string]interface{}{
|
||||
"xFusion": map[string]interface{}{
|
||||
"LinkWidth": "X8",
|
||||
"LinkWidthAbility": "X8",
|
||||
"LinkSpeed": "Gen4 (16.0GT/s)",
|
||||
"LinkSpeedAbility": "Gen4 (16.0GT/s)",
|
||||
},
|
||||
},
|
||||
}
|
||||
enrichNICFromPCIe(&nic2, map[string]interface{}{}, []map[string]interface{}{fnDoc}, nil)
|
||||
if nic2.LinkWidth != 8 || nic2.MaxLinkWidth != 8 {
|
||||
t.Fatalf("expected link width 8 from xFusion OEM LinkWidth, got current=%d max=%d", nic2.LinkWidth, nic2.MaxLinkWidth)
|
||||
}
|
||||
if nic2.LinkSpeed != "Gen4 (16.0GT/s)" || nic2.MaxLinkSpeed != "Gen4 (16.0GT/s)" {
|
||||
t.Fatalf("expected link speed from xFusion OEM LinkSpeed, got current=%q max=%q", nic2.LinkSpeed, nic2.MaxLinkSpeed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNIC_DropsUnrealisticPortCount(t *testing.T) {
|
||||
nic := parseNIC(map[string]interface{}{
|
||||
"Id": "1",
|
||||
@@ -2773,6 +2815,28 @@ func TestReplayCollectGPUs_DedupUsesRedfishPathBeforeHeuristics(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseGPU_xFusionPCIeInterfaceMaxlanes(t *testing.T) {
|
||||
// xFusion GPU PCIeDevices (PCIeCard1..N) carry link width in PCIeInterface
|
||||
// with "Maxlanes" (lowercase 'l') rather than "MaxLanes".
|
||||
doc := map[string]interface{}{
|
||||
"Id": "PCIeCard1",
|
||||
"Model": "RTX PRO 6000",
|
||||
"PCIeInterface": map[string]interface{}{
|
||||
"LanesInUse": 16,
|
||||
"Maxlanes": 16,
|
||||
"PCIeType": "Gen5",
|
||||
"MaxPCIeType": "Gen5",
|
||||
},
|
||||
}
|
||||
gpu := parseGPU(doc, nil, 1)
|
||||
if gpu.CurrentLinkWidth != 16 || gpu.MaxLinkWidth != 16 {
|
||||
t.Fatalf("expected link widths 16/16 from PCIeInterface, got current=%d max=%d", gpu.CurrentLinkWidth, gpu.MaxLinkWidth)
|
||||
}
|
||||
if gpu.CurrentLinkSpeed != "Gen5" || gpu.MaxLinkSpeed != "Gen5" {
|
||||
t.Fatalf("expected link speeds Gen5/Gen5 from PCIeInterface, got current=%q max=%q", gpu.CurrentLinkSpeed, gpu.MaxLinkSpeed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseGPU_UsesNestedOemSerialNumber(t *testing.T) {
|
||||
doc := map[string]interface{}{
|
||||
"Id": "GPU4",
|
||||
|
||||
@@ -326,6 +326,47 @@ func TestBuildAnalysisDirectives_SupermicroEnablesStorageRecovery(t *testing.T)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchProfiles_LenovoXCCSelectsMatchedModeAndExcludesSensors(t *testing.T) {
|
||||
match := MatchProfiles(MatchSignals{
|
||||
SystemManufacturer: "Lenovo",
|
||||
ChassisManufacturer: "Lenovo",
|
||||
OEMNamespaces: []string{"Lenovo"},
|
||||
})
|
||||
if match.Mode != ModeMatched {
|
||||
t.Fatalf("expected matched mode, got %q", match.Mode)
|
||||
}
|
||||
found := false
|
||||
for _, profile := range match.Profiles {
|
||||
if profile.Name() == "lenovo" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatal("expected lenovo profile to be selected")
|
||||
}
|
||||
|
||||
// Verify the acquisition plan excludes noisy Lenovo-specific snapshot paths.
|
||||
plan := BuildAcquisitionPlan(MatchSignals{
|
||||
SystemManufacturer: "Lenovo",
|
||||
ChassisManufacturer: "Lenovo",
|
||||
OEMNamespaces: []string{"Lenovo"},
|
||||
})
|
||||
wantExcluded := []string{"/Sensors/", "/Oem/Lenovo/LEDs/", "/Oem/Lenovo/Slots/"}
|
||||
for _, want := range wantExcluded {
|
||||
found := false
|
||||
for _, ex := range plan.Tuning.SnapshotExcludeContains {
|
||||
if ex == want {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("expected SnapshotExcludeContains to include %q, got %v", want, plan.Tuning.SnapshotExcludeContains)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchProfiles_OrderingIsDeterministic(t *testing.T) {
|
||||
signals := MatchSignals{
|
||||
SystemManufacturer: "Micro-Star International Co., Ltd.",
|
||||
|
||||
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package redfishprofile
|
||||
|
||||
func lenovoProfile() Profile {
|
||||
return staticProfile{
|
||||
name: "lenovo",
|
||||
priority: 20,
|
||||
safeForFallback: true,
|
||||
matchFn: func(s MatchSignals) int {
|
||||
score := 0
|
||||
if containsFold(s.SystemManufacturer, "lenovo") ||
|
||||
containsFold(s.ChassisManufacturer, "lenovo") {
|
||||
score += 80
|
||||
}
|
||||
for _, ns := range s.OEMNamespaces {
|
||||
if containsFold(ns, "lenovo") {
|
||||
score += 30
|
||||
break
|
||||
}
|
||||
}
|
||||
// Lenovo XClarity Controller (XCC) is the BMC product line.
|
||||
if containsFold(s.ServiceRootProduct, "xclarity") ||
|
||||
containsFold(s.ServiceRootProduct, "xcc") {
|
||||
score += 30
|
||||
}
|
||||
return min(score, 100)
|
||||
},
|
||||
extendAcquisition: func(plan *AcquisitionPlan, _ MatchSignals) {
|
||||
// Lenovo XCC BMC exposes Chassis/1/Sensors with hundreds of individual
|
||||
// sensor member documents (e.g. Chassis/1/Sensors/101L1). These are
|
||||
// not used by any LOGPile parser — thermal/power data is read from
|
||||
// the aggregate Chassis/*/Thermal and Chassis/*/Power endpoints. On
|
||||
// a real server they largely return errors, wasting many minutes.
|
||||
// Lenovo OEM subtrees under Oem/Lenovo/LEDs and Oem/Lenovo/Slots also
|
||||
// enumerate dozens of individual documents not relevant to inventory.
|
||||
ensureSnapshotExcludeContains(plan,
|
||||
"/Sensors/", // individual sensor docs (Chassis/1/Sensors/NNN)
|
||||
"/Oem/Lenovo/LEDs/", // individual LED status entries (~47 per server)
|
||||
"/Oem/Lenovo/Slots/", // individual slot detail entries (~26 per server)
|
||||
"/Oem/Lenovo/Metrics/", // operational metrics, not inventory
|
||||
"/Oem/Lenovo/History", // historical telemetry
|
||||
"/Oem/Lenovo/ScheduledPower", // power scheduling config
|
||||
"/Oem/Lenovo/BootSettings/BootOrder", // individual boot order lists
|
||||
"/PortForwardingMap/", // network port forwarding config
|
||||
)
|
||||
// Lenovo XCC BMC is typically slow (p95 latency often 3-5s even under
|
||||
// normal load). Set rate thresholds that don't over-throttle on the
|
||||
// first few requests, and give the ETA estimator a realistic baseline.
|
||||
ensureRatePolicy(plan, AcquisitionRatePolicy{
|
||||
TargetP95LatencyMS: 2000,
|
||||
ThrottleP95LatencyMS: 4000,
|
||||
MinSnapshotWorkers: 2,
|
||||
MinPrefetchWorkers: 1,
|
||||
DisablePrefetchOnErrors: true,
|
||||
})
|
||||
ensureETABaseline(plan, AcquisitionETABaseline{
|
||||
DiscoverySeconds: 15,
|
||||
SnapshotSeconds: 120,
|
||||
PrefetchSeconds: 30,
|
||||
CriticalPlanBSeconds: 40,
|
||||
ProfilePlanBSeconds: 20,
|
||||
})
|
||||
addPlanNote(plan, "lenovo xcc acquisition extensions enabled: noisy sensor/oem paths excluded from snapshot")
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -56,6 +56,7 @@ func BuiltinProfiles() []Profile {
|
||||
supermicroProfile(),
|
||||
dellProfile(),
|
||||
hpeProfile(),
|
||||
lenovoProfile(),
|
||||
inspurGroupOEMPlatformsProfile(),
|
||||
hgxProfile(),
|
||||
xfusionProfile(),
|
||||
@@ -226,6 +227,10 @@ func ensurePrefetchPolicy(plan *AcquisitionPlan, policy AcquisitionPrefetchPolic
|
||||
addPlanPaths(&plan.Tuning.PrefetchPolicy.ExcludeContains, policy.ExcludeContains...)
|
||||
}
|
||||
|
||||
func ensureSnapshotExcludeContains(plan *AcquisitionPlan, patterns ...string) {
|
||||
addPlanPaths(&plan.Tuning.SnapshotExcludeContains, patterns...)
|
||||
}
|
||||
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
|
||||
@@ -53,16 +53,17 @@ type AcquisitionScopedPathPolicy struct {
|
||||
}
|
||||
|
||||
type AcquisitionTuning struct {
|
||||
SnapshotMaxDocuments int
|
||||
SnapshotWorkers int
|
||||
PrefetchEnabled *bool
|
||||
PrefetchWorkers int
|
||||
NVMePostProbeEnabled *bool
|
||||
RatePolicy AcquisitionRatePolicy
|
||||
ETABaseline AcquisitionETABaseline
|
||||
PostProbePolicy AcquisitionPostProbePolicy
|
||||
RecoveryPolicy AcquisitionRecoveryPolicy
|
||||
PrefetchPolicy AcquisitionPrefetchPolicy
|
||||
SnapshotMaxDocuments int
|
||||
SnapshotWorkers int
|
||||
SnapshotExcludeContains []string
|
||||
PrefetchEnabled *bool
|
||||
PrefetchWorkers int
|
||||
NVMePostProbeEnabled *bool
|
||||
RatePolicy AcquisitionRatePolicy
|
||||
ETABaseline AcquisitionETABaseline
|
||||
PostProbePolicy AcquisitionPostProbePolicy
|
||||
RecoveryPolicy AcquisitionRecoveryPolicy
|
||||
PrefetchPolicy AcquisitionPrefetchPolicy
|
||||
}
|
||||
|
||||
type AcquisitionRatePolicy struct {
|
||||
|
||||
@@ -1961,7 +1961,10 @@ func pcieDedupKey(item ReanimatorPCIe) string {
|
||||
slot := strings.ToLower(strings.TrimSpace(item.Slot))
|
||||
serial := strings.ToLower(strings.TrimSpace(item.SerialNumber))
|
||||
bdf := strings.ToLower(strings.TrimSpace(item.BDF))
|
||||
if slot != "" {
|
||||
// Generic slot names (e.g. "PCIe Device" from HGX BMC) are not unique
|
||||
// hardware positions — multiple distinct devices share the same name.
|
||||
// Fall through to serial/BDF so they are not incorrectly collapsed.
|
||||
if slot != "" && !isGenericPCIeSlotName(slot) {
|
||||
return "slot:" + slot
|
||||
}
|
||||
if serial != "" {
|
||||
@@ -1970,9 +1973,22 @@ func pcieDedupKey(item ReanimatorPCIe) string {
|
||||
if bdf != "" {
|
||||
return "bdf:" + bdf
|
||||
}
|
||||
if slot != "" {
|
||||
return "slot:" + slot
|
||||
}
|
||||
return strings.ToLower(strings.TrimSpace(item.DeviceClass)) + "|" + strings.ToLower(strings.TrimSpace(item.Model))
|
||||
}
|
||||
|
||||
// isGenericPCIeSlotName reports whether slot is a generic device-type label
|
||||
// rather than a unique hardware position identifier.
|
||||
func isGenericPCIeSlotName(slot string) bool {
|
||||
switch slot {
|
||||
case "pcie device", "pcie slot", "pcie":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func pcieQualityScore(item ReanimatorPCIe) int {
|
||||
score := 0
|
||||
if strings.TrimSpace(item.SerialNumber) != "" {
|
||||
|
||||
@@ -733,6 +733,42 @@ func TestConvertPCIeDevices_SkipsDisplayControllerDuplicates(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestConvertPCIeDevices_PreservesAllGPUsWithGenericSlot(t *testing.T) {
|
||||
// Supermicro HGX BMC reports all GPU PCIe devices with Name "PCIe Device" —
|
||||
// a generic label that is not a unique hardware position. All 8 GPUs must
|
||||
// be preserved; dedup by generic slot name must not collapse them into one.
|
||||
gpus := make([]models.GPU, 8)
|
||||
serials := []string{
|
||||
"1654925165720", "1654925166160", "1654925165942", "1654925165271",
|
||||
"1654925165719", "1654925165252", "1654925165304", "1654925165587",
|
||||
}
|
||||
for i, sn := range serials {
|
||||
gpus[i] = models.GPU{
|
||||
Slot: "PCIe Device",
|
||||
Model: "B200 180GB HBM3e",
|
||||
Manufacturer: "NVIDIA",
|
||||
SerialNumber: sn,
|
||||
PartNumber: "2901-886-A1",
|
||||
Status: "OK",
|
||||
}
|
||||
}
|
||||
hw := &models.HardwareConfig{GPUs: gpus}
|
||||
result := convertPCIeDevices(hw, "2026-04-13T10:00:00Z")
|
||||
if len(result) != 8 {
|
||||
t.Fatalf("expected 8 GPU entries (one per serial), got %d", len(result))
|
||||
}
|
||||
seen := make(map[string]bool)
|
||||
for _, r := range result {
|
||||
if seen[r.SerialNumber] {
|
||||
t.Fatalf("duplicate serial %q in PCIe result", r.SerialNumber)
|
||||
}
|
||||
seen[r.SerialNumber] = true
|
||||
if r.DeviceClass != "VideoController" {
|
||||
t.Fatalf("expected VideoController device class, got %q", r.DeviceClass)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestConvertPCIeDevices_MapsGPUStatusHistory(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
</div>
|
||||
<label class="api-form-checkbox" for="api-debug-payloads">
|
||||
<input id="api-debug-payloads" name="debug_payloads" type="checkbox">
|
||||
<span>Сбор расширенных метрик для отладки</span>
|
||||
<span>Сбор расширенных данных для диагностики</span>
|
||||
</label>
|
||||
<div class="api-form-actions">
|
||||
<button id="api-collect-btn" type="submit">Собрать</button>
|
||||
|
||||
Reference in New Issue
Block a user