redfish: skip NVMe bay probe for non-storage chassis types (Module/Component/Zone)
On Supermicro HGX systems (SYS-A21GE-NBRT) ~35 sub-chassis (GPU, NVSwitch, PCIeRetimer, ERoT/IRoT, BMC, FPGA) all carry ChassisType=Module/Component/Zone and expose empty /Drives collections. shouldAdaptiveNVMeProbe returned true for all of them, triggering 35 × 384 = 13 440 HTTP requests → ~22 min wasted per collection (more than half of total 35 min collection time).

Fix: chassisTypeCanHaveNVMe returns false for Module, Component, Zone. The candidate selection loop in collectRawRedfishTree now checks the parent chassis doc before adding a /Drives path to the probe list. Enclosure (NVMe backplane), RackMount, and unknown types are unaffected.

Tests:
- TestChassisTypeCanHaveNVMe: table-driven, covers excluded and storage-capable types
- TestNVMePostProbeSkipsNonStorageChassis: topology integration, GPU chassis + backplane with empty /Drives → exactly 1 candidate selected (backplane only)

Docs:
- ADL-018 in bible-local/10-decisions.md
- Candidate-selection test matrix in bible-local/09-testing.md
- SYS-A21GE-NBRT baseline row in docs/test_server_collection_memory.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -999,6 +999,17 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht
|
||||
if !shouldAdaptiveNVMeProbe(doc) {
|
||||
continue
|
||||
}
|
||||
// Skip chassis types that cannot contain NVMe storage (e.g. GPU modules,
|
||||
// RoT components, NVSwitch zones on HGX systems) to avoid probing hundreds
|
||||
// of Disk.Bay.N URLs against chassis that will never have drives.
|
||||
chassisPath := strings.TrimSuffix(normalized, "/Drives")
|
||||
if chassisDocAny, ok := out[chassisPath]; ok {
|
||||
if chassisDoc, ok := chassisDocAny.(map[string]interface{}); ok {
|
||||
if !chassisTypeCanHaveNVMe(asString(chassisDoc["ChassisType"])) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
driveCollections = append(driveCollections, normalized)
|
||||
}
|
||||
sort.Strings(driveCollections)
|
||||
@@ -1316,6 +1327,21 @@ func shouldAdaptiveNVMeProbe(collectionDoc map[string]interface{}) bool {
|
||||
return !redfishCollectionHasExplicitMembers(collectionDoc)
|
||||
}
|
||||
|
||||
// chassisTypeCanHaveNVMe reports whether a chassis with the given Redfish
// ChassisType value should be considered for Disk.Bay.N NVMe probing.
//
// The value is compared after trimming surrounding whitespace and lowercasing.
// Three types are rejected:
//
//   - "module"    (GPU SXM, NVLinkManagementNIC, PCIeRetimer)
//   - "component" (ERoT, IRoT, BMC, FPGA sub-chassis)
//   - "zone"      (HGX_Chassis_0 fabric zone)
//
// Every other value, including the empty string, is accepted so that chassis
// of unknown type keep being probed.
func chassisTypeCanHaveNVMe(chassisType string) bool {
	kind := strings.ToLower(strings.TrimSpace(chassisType))
	nonStorage := kind == "module" || kind == "component" || kind == "zone"
	return !nonStorage
}
|
||||
|
||||
func redfishCollectionHasNumericMemberRefs(memberRefs []string) bool {
|
||||
for _, memberPath := range memberRefs {
|
||||
if redfishPathTailIsNumeric(memberPath) {
|
||||
|
||||
@@ -2034,3 +2034,101 @@ func TestRedfishPrefetchTargets_FilterNoisyBranches(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestChassisTypeCanHaveNVMe verifies that non-storage chassis types (GPU modules,
|
||||
// RoT components, fabric zones) are excluded from NVMe bay probing, while storage
|
||||
// and unclassified chassis types are kept.
|
||||
//
|
||||
// Regression guard: on Supermicro HGX (SYS-A21GE-NBRT) all 35 sub-chassis (GPUs,
|
||||
// NVSwitches, PCIeRetimers, ERoT/IRoT, BMC, FPGA) have ChassisType=Module/Component/Zone
|
||||
// and expose empty /Drives collections. Without this filter each chassis triggered
|
||||
// 384 HTTP requests → ~22 minutes wasted per collection. (2026-03-12)
|
||||
func TestChassisTypeCanHaveNVMe(t *testing.T) {
|
||||
cases := []struct {
|
||||
chassisType string
|
||||
want bool
|
||||
}{
|
||||
// Non-storage sub-module types — must return false
|
||||
{"Module", false}, // GPU SXM, PCIeRetimer, NVLinkManagementNIC
|
||||
{"module", false}, // case-insensitive
|
||||
{"Component", false}, // ERoT, IRoT, BMC, FPGA sub-chassis
|
||||
{"component", false},
|
||||
{"Zone", false}, // HGX_Chassis_0 fabric zone
|
||||
{"zone", false},
|
||||
// Storage-capable and generic types — must return true
|
||||
{"Enclosure", true}, // NVMe StorageBackplane
|
||||
{"RackMount", true}, // main server chassis
|
||||
{"Blade", true}, // blade server chassis
|
||||
{"StandAlone", true}, // standalone server
|
||||
{"", true}, // unknown type — probe to be safe
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := chassisTypeCanHaveNVMe(tc.chassisType)
|
||||
if got != tc.want {
|
||||
t.Errorf("chassisTypeCanHaveNVMe(%q) = %v, want %v", tc.chassisType, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNVMePostProbeSkipsNonStorageChassis verifies that the NVMe bay probe candidate
|
||||
// selection skips chassis whose ChassisType indicates they cannot hold NVMe drives.
|
||||
//
|
||||
// Simulates an HGX topology: one GPU chassis (Module) and one NVMe backplane
|
||||
// (Enclosure), both with empty /Drives collections. Only the backplane must be
|
||||
// selected as a probe candidate.
|
||||
func TestNVMePostProbeSkipsNonStorageChassis(t *testing.T) {
|
||||
// Build the out map as collectRawRedfishTree would produce it
|
||||
out := map[string]interface{}{
|
||||
// GPU chassis — Module type, empty Drives: should be skipped
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1",
|
||||
"ChassisType": "Module",
|
||||
"Name": "HGX_GPU_SXM_1",
|
||||
},
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1/Drives": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/Drives",
|
||||
"Members": []interface{}{},
|
||||
"Members@odata.count": 0,
|
||||
},
|
||||
// NVMe backplane — Enclosure type, empty Drives: must be selected
|
||||
"/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane",
|
||||
"ChassisType": "Enclosure",
|
||||
"Name": "Backplane",
|
||||
},
|
||||
"/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives",
|
||||
"Members": []interface{}{},
|
||||
"Members@odata.count": 0,
|
||||
},
|
||||
}
|
||||
|
||||
// Replicate the candidate selection logic from collectRawRedfishTree
|
||||
var selected []string
|
||||
for path, docAny := range out {
|
||||
normalized := normalizeRedfishPath(path)
|
||||
if !strings.HasSuffix(normalized, "/Drives") {
|
||||
continue
|
||||
}
|
||||
doc, _ := docAny.(map[string]interface{})
|
||||
if !shouldAdaptiveNVMeProbe(doc) {
|
||||
continue
|
||||
}
|
||||
chassisPath := strings.TrimSuffix(normalized, "/Drives")
|
||||
if chassisDocAny, ok := out[chassisPath]; ok {
|
||||
if chassisDoc, ok := chassisDocAny.(map[string]interface{}); ok {
|
||||
if !chassisTypeCanHaveNVMe(asString(chassisDoc["ChassisType"])) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
selected = append(selected, normalized)
|
||||
}
|
||||
|
||||
if len(selected) != 1 {
|
||||
t.Fatalf("expected 1 NVMe probe candidate (backplane), got %d: %v", len(selected), selected)
|
||||
}
|
||||
if !strings.Contains(selected[0], "StorageBackplane") {
|
||||
t.Fatalf("expected StorageBackplane to be selected, got %q", selected[0])
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user