feat(collector): add Lenovo XCC profile to skip noisy snapshot paths
Lenovo ThinkSystem SR650 V3 (and similar XCC-based servers) caused
collection runs of 23+ minutes because the BMC exposes two large high-
error-rate subtrees in the snapshot BFS:
- Chassis/1/Sensors: 315 individual sensor members, 282/315 failing,
~3.7s per request → ~19 minutes wasted. These documents are never
read by any LOGPile parser (thermal/power data comes from aggregate
Chassis/*/Thermal and Chassis/*/Power endpoints).
- Chassis/1/Oem/Lenovo: 75 requests (LEDs×47, Slots×26, etc.),
68/75 failing → 8+ minutes wasted on non-inventory data.
Add a Lenovo profile (matched on SystemManufacturer/ChassisManufacturer
"Lenovo", a Lenovo OEM namespace, or an XClarity/XCC ServiceRootProduct)
that sets SnapshotExcludeContains to block individual sensor documents and
non-inventory Lenovo OEM subtrees from the snapshot BFS queue. Also sets
rate policy thresholds and an ETA baseline appropriate for XCC BMC latency
(p95 often 3-5s).
Add SnapshotExcludeContains []string to AcquisitionTuning and check it
in the snapshot enqueue closure in redfish.go.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -345,8 +345,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
|||||||
"manager_critical_suffixes": acquisitionPlan.ScopedPaths.ManagerCriticalSuffixes,
|
"manager_critical_suffixes": acquisitionPlan.ScopedPaths.ManagerCriticalSuffixes,
|
||||||
},
|
},
|
||||||
"tuning": map[string]any{
|
"tuning": map[string]any{
|
||||||
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
||||||
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
||||||
|
"snapshot_exclude_contains": acquisitionPlan.Tuning.SnapshotExcludeContains,
|
||||||
"prefetch_workers": acquisitionPlan.Tuning.PrefetchWorkers,
|
"prefetch_workers": acquisitionPlan.Tuning.PrefetchWorkers,
|
||||||
"prefetch_enabled": boolPointerValue(acquisitionPlan.Tuning.PrefetchEnabled),
|
"prefetch_enabled": boolPointerValue(acquisitionPlan.Tuning.PrefetchEnabled),
|
||||||
"nvme_post_probe": boolPointerValue(acquisitionPlan.Tuning.NVMePostProbeEnabled),
|
"nvme_post_probe": boolPointerValue(acquisitionPlan.Tuning.NVMePostProbeEnabled),
|
||||||
@@ -1343,6 +1344,11 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
|
|||||||
if !shouldCrawlPath(path) {
|
if !shouldCrawlPath(path) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
for _, pattern := range tuning.SnapshotExcludeContains {
|
||||||
|
if pattern != "" && strings.Contains(path, pattern) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
if len(seen) >= maxDocuments {
|
if len(seen) >= maxDocuments {
|
||||||
mu.Unlock()
|
mu.Unlock()
|
||||||
|
|||||||
@@ -326,6 +326,47 @@ func TestBuildAnalysisDirectives_SupermicroEnablesStorageRecovery(t *testing.T)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMatchProfiles_LenovoXCCSelectsMatchedModeAndExcludesSensors(t *testing.T) {
|
||||||
|
match := MatchProfiles(MatchSignals{
|
||||||
|
SystemManufacturer: "Lenovo",
|
||||||
|
ChassisManufacturer: "Lenovo",
|
||||||
|
OEMNamespaces: []string{"Lenovo"},
|
||||||
|
})
|
||||||
|
if match.Mode != ModeMatched {
|
||||||
|
t.Fatalf("expected matched mode, got %q", match.Mode)
|
||||||
|
}
|
||||||
|
found := false
|
||||||
|
for _, profile := range match.Profiles {
|
||||||
|
if profile.Name() == "lenovo" {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatal("expected lenovo profile to be selected")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the acquisition plan excludes noisy Lenovo-specific snapshot paths.
|
||||||
|
plan := BuildAcquisitionPlan(MatchSignals{
|
||||||
|
SystemManufacturer: "Lenovo",
|
||||||
|
ChassisManufacturer: "Lenovo",
|
||||||
|
OEMNamespaces: []string{"Lenovo"},
|
||||||
|
})
|
||||||
|
wantExcluded := []string{"/Sensors/", "/Oem/Lenovo/LEDs/", "/Oem/Lenovo/Slots/"}
|
||||||
|
for _, want := range wantExcluded {
|
||||||
|
found := false
|
||||||
|
for _, ex := range plan.Tuning.SnapshotExcludeContains {
|
||||||
|
if ex == want {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Errorf("expected SnapshotExcludeContains to include %q, got %v", want, plan.Tuning.SnapshotExcludeContains)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestMatchProfiles_OrderingIsDeterministic(t *testing.T) {
|
func TestMatchProfiles_OrderingIsDeterministic(t *testing.T) {
|
||||||
signals := MatchSignals{
|
signals := MatchSignals{
|
||||||
SystemManufacturer: "Micro-Star International Co., Ltd.",
|
SystemManufacturer: "Micro-Star International Co., Ltd.",
|
||||||
|
|||||||
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package redfishprofile
|
||||||
|
|
||||||
|
func lenovoProfile() Profile {
|
||||||
|
return staticProfile{
|
||||||
|
name: "lenovo",
|
||||||
|
priority: 20,
|
||||||
|
safeForFallback: true,
|
||||||
|
matchFn: func(s MatchSignals) int {
|
||||||
|
score := 0
|
||||||
|
if containsFold(s.SystemManufacturer, "lenovo") ||
|
||||||
|
containsFold(s.ChassisManufacturer, "lenovo") {
|
||||||
|
score += 80
|
||||||
|
}
|
||||||
|
for _, ns := range s.OEMNamespaces {
|
||||||
|
if containsFold(ns, "lenovo") {
|
||||||
|
score += 30
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Lenovo XClarity Controller (XCC) is the BMC product line.
|
||||||
|
if containsFold(s.ServiceRootProduct, "xclarity") ||
|
||||||
|
containsFold(s.ServiceRootProduct, "xcc") {
|
||||||
|
score += 30
|
||||||
|
}
|
||||||
|
return min(score, 100)
|
||||||
|
},
|
||||||
|
extendAcquisition: func(plan *AcquisitionPlan, _ MatchSignals) {
|
||||||
|
// Lenovo XCC BMC exposes Chassis/1/Sensors with hundreds of individual
|
||||||
|
// sensor member documents (e.g. Chassis/1/Sensors/101L1). These are
|
||||||
|
// not used by any LOGPile parser — thermal/power data is read from
|
||||||
|
// the aggregate Chassis/*/Thermal and Chassis/*/Power endpoints. On
|
||||||
|
// a real server they largely return errors, wasting many minutes.
|
||||||
|
// Lenovo OEM subtrees under Oem/Lenovo/LEDs and Oem/Lenovo/Slots also
|
||||||
|
// enumerate dozens of individual documents not relevant to inventory.
|
||||||
|
ensureSnapshotExcludeContains(plan,
|
||||||
|
"/Sensors/", // individual sensor docs (Chassis/1/Sensors/NNN)
|
||||||
|
"/Oem/Lenovo/LEDs/", // individual LED status entries (~47 per server)
|
||||||
|
"/Oem/Lenovo/Slots/", // individual slot detail entries (~26 per server)
|
||||||
|
"/Oem/Lenovo/Metrics/", // operational metrics, not inventory
|
||||||
|
"/Oem/Lenovo/History", // historical telemetry
|
||||||
|
"/Oem/Lenovo/ScheduledPower", // power scheduling config
|
||||||
|
"/Oem/Lenovo/BootSettings/BootOrder", // individual boot order lists
|
||||||
|
"/PortForwardingMap/", // network port forwarding config
|
||||||
|
)
|
||||||
|
// Lenovo XCC BMC is typically slow (p95 latency often 3-5s even under
|
||||||
|
// normal load). Set rate thresholds that don't over-throttle on the
|
||||||
|
// first few requests, and give the ETA estimator a realistic baseline.
|
||||||
|
ensureRatePolicy(plan, AcquisitionRatePolicy{
|
||||||
|
TargetP95LatencyMS: 2000,
|
||||||
|
ThrottleP95LatencyMS: 4000,
|
||||||
|
MinSnapshotWorkers: 2,
|
||||||
|
MinPrefetchWorkers: 1,
|
||||||
|
DisablePrefetchOnErrors: true,
|
||||||
|
})
|
||||||
|
ensureETABaseline(plan, AcquisitionETABaseline{
|
||||||
|
DiscoverySeconds: 15,
|
||||||
|
SnapshotSeconds: 120,
|
||||||
|
PrefetchSeconds: 30,
|
||||||
|
CriticalPlanBSeconds: 40,
|
||||||
|
ProfilePlanBSeconds: 20,
|
||||||
|
})
|
||||||
|
addPlanNote(plan, "lenovo xcc acquisition extensions enabled: noisy sensor/oem paths excluded from snapshot")
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -56,6 +56,7 @@ func BuiltinProfiles() []Profile {
|
|||||||
supermicroProfile(),
|
supermicroProfile(),
|
||||||
dellProfile(),
|
dellProfile(),
|
||||||
hpeProfile(),
|
hpeProfile(),
|
||||||
|
lenovoProfile(),
|
||||||
inspurGroupOEMPlatformsProfile(),
|
inspurGroupOEMPlatformsProfile(),
|
||||||
hgxProfile(),
|
hgxProfile(),
|
||||||
xfusionProfile(),
|
xfusionProfile(),
|
||||||
@@ -226,6 +227,10 @@ func ensurePrefetchPolicy(plan *AcquisitionPlan, policy AcquisitionPrefetchPolic
|
|||||||
addPlanPaths(&plan.Tuning.PrefetchPolicy.ExcludeContains, policy.ExcludeContains...)
|
addPlanPaths(&plan.Tuning.PrefetchPolicy.ExcludeContains, policy.ExcludeContains...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ensureSnapshotExcludeContains(plan *AcquisitionPlan, patterns ...string) {
|
||||||
|
addPlanPaths(&plan.Tuning.SnapshotExcludeContains, patterns...)
|
||||||
|
}
|
||||||
|
|
||||||
func min(a, b int) int {
|
func min(a, b int) int {
|
||||||
if a < b {
|
if a < b {
|
||||||
return a
|
return a
|
||||||
|
|||||||
@@ -53,16 +53,17 @@ type AcquisitionScopedPathPolicy struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type AcquisitionTuning struct {
|
type AcquisitionTuning struct {
|
||||||
SnapshotMaxDocuments int
|
SnapshotMaxDocuments int
|
||||||
SnapshotWorkers int
|
SnapshotWorkers int
|
||||||
PrefetchEnabled *bool
|
SnapshotExcludeContains []string
|
||||||
PrefetchWorkers int
|
PrefetchEnabled *bool
|
||||||
NVMePostProbeEnabled *bool
|
PrefetchWorkers int
|
||||||
RatePolicy AcquisitionRatePolicy
|
NVMePostProbeEnabled *bool
|
||||||
ETABaseline AcquisitionETABaseline
|
RatePolicy AcquisitionRatePolicy
|
||||||
PostProbePolicy AcquisitionPostProbePolicy
|
ETABaseline AcquisitionETABaseline
|
||||||
RecoveryPolicy AcquisitionRecoveryPolicy
|
PostProbePolicy AcquisitionPostProbePolicy
|
||||||
PrefetchPolicy AcquisitionPrefetchPolicy
|
RecoveryPolicy AcquisitionRecoveryPolicy
|
||||||
|
PrefetchPolicy AcquisitionPrefetchPolicy
|
||||||
}
|
}
|
||||||
|
|
||||||
type AcquisitionRatePolicy struct {
|
type AcquisitionRatePolicy struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user