Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| b04877549a | |||
| 8ca173c99b |
@@ -345,8 +345,9 @@ func (c *RedfishConnector) Collect(ctx context.Context, req Request, emit Progre
|
||||
"manager_critical_suffixes": acquisitionPlan.ScopedPaths.ManagerCriticalSuffixes,
|
||||
},
|
||||
"tuning": map[string]any{
|
||||
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
||||
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
||||
"snapshot_max_documents": acquisitionPlan.Tuning.SnapshotMaxDocuments,
|
||||
"snapshot_workers": acquisitionPlan.Tuning.SnapshotWorkers,
|
||||
"snapshot_exclude_contains": acquisitionPlan.Tuning.SnapshotExcludeContains,
|
||||
"prefetch_workers": acquisitionPlan.Tuning.PrefetchWorkers,
|
||||
"prefetch_enabled": boolPointerValue(acquisitionPlan.Tuning.PrefetchEnabled),
|
||||
"nvme_post_probe": boolPointerValue(acquisitionPlan.Tuning.NVMePostProbeEnabled),
|
||||
@@ -1343,6 +1344,11 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
|
||||
if !shouldCrawlPath(path) {
|
||||
return
|
||||
}
|
||||
for _, pattern := range tuning.SnapshotExcludeContains {
|
||||
if pattern != "" && strings.Contains(path, pattern) {
|
||||
return
|
||||
}
|
||||
}
|
||||
mu.Lock()
|
||||
if len(seen) >= maxDocuments {
|
||||
mu.Unlock()
|
||||
|
||||
@@ -326,6 +326,47 @@ func TestBuildAnalysisDirectives_SupermicroEnablesStorageRecovery(t *testing.T)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchProfiles_LenovoXCCSelectsMatchedModeAndExcludesSensors(t *testing.T) {
|
||||
match := MatchProfiles(MatchSignals{
|
||||
SystemManufacturer: "Lenovo",
|
||||
ChassisManufacturer: "Lenovo",
|
||||
OEMNamespaces: []string{"Lenovo"},
|
||||
})
|
||||
if match.Mode != ModeMatched {
|
||||
t.Fatalf("expected matched mode, got %q", match.Mode)
|
||||
}
|
||||
found := false
|
||||
for _, profile := range match.Profiles {
|
||||
if profile.Name() == "lenovo" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatal("expected lenovo profile to be selected")
|
||||
}
|
||||
|
||||
// Verify the acquisition plan excludes noisy Lenovo-specific snapshot paths.
|
||||
plan := BuildAcquisitionPlan(MatchSignals{
|
||||
SystemManufacturer: "Lenovo",
|
||||
ChassisManufacturer: "Lenovo",
|
||||
OEMNamespaces: []string{"Lenovo"},
|
||||
})
|
||||
wantExcluded := []string{"/Sensors/", "/Oem/Lenovo/LEDs/", "/Oem/Lenovo/Slots/"}
|
||||
for _, want := range wantExcluded {
|
||||
found := false
|
||||
for _, ex := range plan.Tuning.SnapshotExcludeContains {
|
||||
if ex == want {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("expected SnapshotExcludeContains to include %q, got %v", want, plan.Tuning.SnapshotExcludeContains)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchProfiles_OrderingIsDeterministic(t *testing.T) {
|
||||
signals := MatchSignals{
|
||||
SystemManufacturer: "Micro-Star International Co., Ltd.",
|
||||
|
||||
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
65
internal/collector/redfishprofile/profile_lenovo.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package redfishprofile
|
||||
|
||||
func lenovoProfile() Profile {
|
||||
return staticProfile{
|
||||
name: "lenovo",
|
||||
priority: 20,
|
||||
safeForFallback: true,
|
||||
matchFn: func(s MatchSignals) int {
|
||||
score := 0
|
||||
if containsFold(s.SystemManufacturer, "lenovo") ||
|
||||
containsFold(s.ChassisManufacturer, "lenovo") {
|
||||
score += 80
|
||||
}
|
||||
for _, ns := range s.OEMNamespaces {
|
||||
if containsFold(ns, "lenovo") {
|
||||
score += 30
|
||||
break
|
||||
}
|
||||
}
|
||||
// Lenovo XClarity Controller (XCC) is the BMC product line.
|
||||
if containsFold(s.ServiceRootProduct, "xclarity") ||
|
||||
containsFold(s.ServiceRootProduct, "xcc") {
|
||||
score += 30
|
||||
}
|
||||
return min(score, 100)
|
||||
},
|
||||
extendAcquisition: func(plan *AcquisitionPlan, _ MatchSignals) {
|
||||
// Lenovo XCC BMC exposes Chassis/1/Sensors with hundreds of individual
|
||||
// sensor member documents (e.g. Chassis/1/Sensors/101L1). These are
|
||||
// not used by any LOGPile parser — thermal/power data is read from
|
||||
// the aggregate Chassis/*/Thermal and Chassis/*/Power endpoints. On
|
||||
// a real server they largely return errors, wasting many minutes.
|
||||
// Lenovo OEM subtrees under Oem/Lenovo/LEDs and Oem/Lenovo/Slots also
|
||||
// enumerate dozens of individual documents not relevant to inventory.
|
||||
ensureSnapshotExcludeContains(plan,
|
||||
"/Sensors/", // individual sensor docs (Chassis/1/Sensors/NNN)
|
||||
"/Oem/Lenovo/LEDs/", // individual LED status entries (~47 per server)
|
||||
"/Oem/Lenovo/Slots/", // individual slot detail entries (~26 per server)
|
||||
"/Oem/Lenovo/Metrics/", // operational metrics, not inventory
|
||||
"/Oem/Lenovo/History", // historical telemetry
|
||||
"/Oem/Lenovo/ScheduledPower", // power scheduling config
|
||||
"/Oem/Lenovo/BootSettings/BootOrder", // individual boot order lists
|
||||
"/PortForwardingMap/", // network port forwarding config
|
||||
)
|
||||
// Lenovo XCC BMC is typically slow (p95 latency often 3-5s even under
|
||||
// normal load). Set rate thresholds that don't over-throttle on the
|
||||
// first few requests, and give the ETA estimator a realistic baseline.
|
||||
ensureRatePolicy(plan, AcquisitionRatePolicy{
|
||||
TargetP95LatencyMS: 2000,
|
||||
ThrottleP95LatencyMS: 4000,
|
||||
MinSnapshotWorkers: 2,
|
||||
MinPrefetchWorkers: 1,
|
||||
DisablePrefetchOnErrors: true,
|
||||
})
|
||||
ensureETABaseline(plan, AcquisitionETABaseline{
|
||||
DiscoverySeconds: 15,
|
||||
SnapshotSeconds: 120,
|
||||
PrefetchSeconds: 30,
|
||||
CriticalPlanBSeconds: 40,
|
||||
ProfilePlanBSeconds: 20,
|
||||
})
|
||||
addPlanNote(plan, "lenovo xcc acquisition extensions enabled: noisy sensor/oem paths excluded from snapshot")
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -56,6 +56,7 @@ func BuiltinProfiles() []Profile {
|
||||
supermicroProfile(),
|
||||
dellProfile(),
|
||||
hpeProfile(),
|
||||
lenovoProfile(),
|
||||
inspurGroupOEMPlatformsProfile(),
|
||||
hgxProfile(),
|
||||
xfusionProfile(),
|
||||
@@ -226,6 +227,10 @@ func ensurePrefetchPolicy(plan *AcquisitionPlan, policy AcquisitionPrefetchPolic
|
||||
addPlanPaths(&plan.Tuning.PrefetchPolicy.ExcludeContains, policy.ExcludeContains...)
|
||||
}
|
||||
|
||||
func ensureSnapshotExcludeContains(plan *AcquisitionPlan, patterns ...string) {
|
||||
addPlanPaths(&plan.Tuning.SnapshotExcludeContains, patterns...)
|
||||
}
|
||||
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
|
||||
@@ -53,16 +53,17 @@ type AcquisitionScopedPathPolicy struct {
|
||||
}
|
||||
|
||||
type AcquisitionTuning struct {
|
||||
SnapshotMaxDocuments int
|
||||
SnapshotWorkers int
|
||||
PrefetchEnabled *bool
|
||||
PrefetchWorkers int
|
||||
NVMePostProbeEnabled *bool
|
||||
RatePolicy AcquisitionRatePolicy
|
||||
ETABaseline AcquisitionETABaseline
|
||||
PostProbePolicy AcquisitionPostProbePolicy
|
||||
RecoveryPolicy AcquisitionRecoveryPolicy
|
||||
PrefetchPolicy AcquisitionPrefetchPolicy
|
||||
SnapshotMaxDocuments int
|
||||
SnapshotWorkers int
|
||||
SnapshotExcludeContains []string
|
||||
PrefetchEnabled *bool
|
||||
PrefetchWorkers int
|
||||
NVMePostProbeEnabled *bool
|
||||
RatePolicy AcquisitionRatePolicy
|
||||
ETABaseline AcquisitionETABaseline
|
||||
PostProbePolicy AcquisitionPostProbePolicy
|
||||
RecoveryPolicy AcquisitionRecoveryPolicy
|
||||
PrefetchPolicy AcquisitionPrefetchPolicy
|
||||
}
|
||||
|
||||
type AcquisitionRatePolicy struct {
|
||||
|
||||
@@ -1961,7 +1961,10 @@ func pcieDedupKey(item ReanimatorPCIe) string {
|
||||
slot := strings.ToLower(strings.TrimSpace(item.Slot))
|
||||
serial := strings.ToLower(strings.TrimSpace(item.SerialNumber))
|
||||
bdf := strings.ToLower(strings.TrimSpace(item.BDF))
|
||||
if slot != "" {
|
||||
// Generic slot names (e.g. "PCIe Device" from HGX BMC) are not unique
|
||||
// hardware positions — multiple distinct devices share the same name.
|
||||
// Fall through to serial/BDF so they are not incorrectly collapsed.
|
||||
if slot != "" && !isGenericPCIeSlotName(slot) {
|
||||
return "slot:" + slot
|
||||
}
|
||||
if serial != "" {
|
||||
@@ -1970,9 +1973,22 @@ func pcieDedupKey(item ReanimatorPCIe) string {
|
||||
if bdf != "" {
|
||||
return "bdf:" + bdf
|
||||
}
|
||||
if slot != "" {
|
||||
return "slot:" + slot
|
||||
}
|
||||
return strings.ToLower(strings.TrimSpace(item.DeviceClass)) + "|" + strings.ToLower(strings.TrimSpace(item.Model))
|
||||
}
|
||||
|
||||
// isGenericPCIeSlotName reports whether slot is a generic device-type label
// (for example "PCIe Device") rather than a unique hardware position
// identifier.
//
// The comparison ignores letter case and surrounding whitespace, so both raw
// values and already-normalized ones are accepted. Labels that merely start
// with a generic name (such as "pcie slot 3") are not considered generic.
func isGenericPCIeSlotName(slot string) bool {
	switch strings.ToLower(strings.TrimSpace(slot)) {
	case "pcie device", "pcie slot", "pcie":
		return true
	}
	return false
}
|
||||
|
||||
func pcieQualityScore(item ReanimatorPCIe) int {
|
||||
score := 0
|
||||
if strings.TrimSpace(item.SerialNumber) != "" {
|
||||
|
||||
@@ -733,6 +733,42 @@ func TestConvertPCIeDevices_SkipsDisplayControllerDuplicates(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestConvertPCIeDevices_PreservesAllGPUsWithGenericSlot(t *testing.T) {
|
||||
// Supermicro HGX BMC reports all GPU PCIe devices with Name "PCIe Device" —
|
||||
// a generic label that is not a unique hardware position. All 8 GPUs must
|
||||
// be preserved; dedup by generic slot name must not collapse them into one.
|
||||
gpus := make([]models.GPU, 8)
|
||||
serials := []string{
|
||||
"1654925165720", "1654925166160", "1654925165942", "1654925165271",
|
||||
"1654925165719", "1654925165252", "1654925165304", "1654925165587",
|
||||
}
|
||||
for i, sn := range serials {
|
||||
gpus[i] = models.GPU{
|
||||
Slot: "PCIe Device",
|
||||
Model: "B200 180GB HBM3e",
|
||||
Manufacturer: "NVIDIA",
|
||||
SerialNumber: sn,
|
||||
PartNumber: "2901-886-A1",
|
||||
Status: "OK",
|
||||
}
|
||||
}
|
||||
hw := &models.HardwareConfig{GPUs: gpus}
|
||||
result := convertPCIeDevices(hw, "2026-04-13T10:00:00Z")
|
||||
if len(result) != 8 {
|
||||
t.Fatalf("expected 8 GPU entries (one per serial), got %d", len(result))
|
||||
}
|
||||
seen := make(map[string]bool)
|
||||
for _, r := range result {
|
||||
if seen[r.SerialNumber] {
|
||||
t.Fatalf("duplicate serial %q in PCIe result", r.SerialNumber)
|
||||
}
|
||||
seen[r.SerialNumber] = true
|
||||
if r.DeviceClass != "VideoController" {
|
||||
t.Fatalf("expected VideoController device class, got %q", r.DeviceClass)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestConvertPCIeDevices_MapsGPUStatusHistory(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
|
||||
Reference in New Issue
Block a user