diff --git a/bible-local/09-testing.md b/bible-local/09-testing.md index 5afc89a..7e5d03b 100644 --- a/bible-local/09-testing.md +++ b/bible-local/09-testing.md @@ -79,3 +79,34 @@ When you write a new filter/dedup/classify function, ask: 3. If I change the dedup key logic, do existing tests still exercise the old correct behavior? If any answer is "no" — add the missing test before committing. + +## Collector candidate-selection functions — mandatory coverage + +Any function that selects paths for an expensive operation (probing, crawling, plan-B retry) +**must** have tests covering: + +| Axis | What to test | Why | +|------|-------------|-----| +| **Positive** | Paths that should be selected ARE selected | Proves the feature works | +| **Negative** | Paths that should be excluded ARE excluded | Prevents runaway I/O | +| **Topology integration** | Given a realistic `out` map, the count of selected paths matches expectations | Catches implicit coupling between the selector and the surrounding data shape | + +### Worked example — NVMe post-probe regression (2026-03-12) + +`shouldAdaptiveNVMeProbe` was added in `2fa4a12` for Supermicro NVMe backplanes that return +`Members: []` but serve disks at `Disk.Bay.N` paths. No topology-level test was added. + +When SYS-A21GE-NBRT (HGX B200) arrived, its 35 sub-chassis (GPU, NVSwitch, PCIeRetimer, +ERoT, IRoT, BMC, FPGA) all have `ChassisType=Module/Component/Zone` and empty `/Drives` → +all 35 passed the filter → 35 × 384 = 13 440 HTTP requests → 22 min extra per collection. + +A topology integration test (`TestNVMePostProbeSkipsNonStorageChassis`) would have caught +this at commit time: given GPU chassis + backplane, exactly 1 candidate must be selected. + +**Required test matrix for any path-selection function:** + +``` +TestXxx_SelectsTargetPath — the path that motivated the code IS selected +TestXxx_SkipsIrrelevantPath — a path that must never be selected IS skipped +TestXxx_TopologyCount — given a realistic multi-chassis map, selected count = N +``` diff --git a/bible-local/10-decisions.md b/bible-local/10-decisions.md index fa8242a..4953c7a 100644 --- a/bible-local/10-decisions.md +++ b/bible-local/10-decisions.md @@ -253,4 +253,25 @@ at parse time before storing in any model struct. Use the regex --- +## ADL-018 — NVMe bay probe must be restricted to storage-capable chassis types + +**Date:** 2026-03-12 +**Context:** `shouldAdaptiveNVMeProbe` was introduced in `2fa4a12` to recover NVMe drives on +Supermicro BMCs that expose empty `Drives` collections but serve disks at direct `Disk.Bay.N` +paths. The function returns `true` for any chassis with an empty `Members` array. On +Supermicro HGX systems (SYS-A21GE-NBRT and similar) ~35 sub-chassis (GPU, NVSwitch, +PCIeRetimer, ERoT, IRoT, BMC, FPGA) all carry `ChassisType=Module/Component/Zone` and +expose empty `/Drives` collections. Without filtering, each triggered 384 HTTP requests → +13 440 requests ≈ 22 minutes of pure I/O waste per collection. +**Decision:** Before probing `Disk.Bay.N` candidates for a chassis, check its `ChassisType` +via `chassisTypeCanHaveNVMe`. Skip if type is `Module`, `Component`, or `Zone`. Keep probing +for `Enclosure`, `RackMount`, and any unrecognised type (fail-safe). +**Consequences:** +- On HGX systems post-probe NVMe goes from ~22 min to effectively zero. +- NVMe backplane recovery (`Enclosure` type) is unaffected. +- Any new chassis type that hosts NVMe storage is covered by the default `true` path. +- `chassisTypeCanHaveNVMe` and the candidate-selection loop must have unit tests covering + both the excluded types and the storage-capable types (see `TestChassisTypeCanHaveNVMe` + and `TestNVMePostProbeSkipsNonStorageChassis`). + diff --git a/docs/test_server_collection_memory.md b/docs/test_server_collection_memory.md index 7569b4f..8f5cfa1 100644 --- a/docs/test_server_collection_memory.md +++ b/docs/test_server_collection_memory.md @@ -19,6 +19,17 @@ Definition: | 2026-02-28 | `v1.7.0` (`ddab93a`) | n/a | 193 | n/a | 61 | 2026-02-28 (NULL) - 23E100051.zip | | 2026-02-28 | `v1.7.0` (`ddab93a`) | n/a | 291 | n/a | 61 | 2026-02-28 (NULL) - 23E100206.zip | +## Server Model: `SYS-A21GE-NBRT` (Supermicro HGX B200) + +> **HGX note:** this model has ~40 sub-chassis (GPU, NVSwitch, PCIeRetimer, ERoT/IRoT, BMC, FPGA) +> all exposing empty `/Drives` collections. Post-probe NVMe must skip `ChassisType=Module/Component/Zone` +> or it probes 35 × 384 = 13 440 URLs → ~22 min wasted. Fixed in the commit that added +> `chassisTypeCanHaveNVMe` (2026-03-12). Expected post-probe NVMe time after fix: <5s. + +| Date (UTC) | App Version | Collection Time | Documents | Speed | Metrics Collected | Notes | +|---|---|---:|---:|---:|---:|---| +| 2026-03-12 | `v1.8.0-6-ga9f58b3` (`a9f58b3`) | 35m28s (2128s) — **before fix** | 3197 | 1.50 docs/s | 140 | 2026-03-12 (SYS-A21GE-NBRT) - A936564X5C17287.zip | + ## Server Model: `KR1280-X2-A0-R0-00` | Date (UTC) | App Version | Collection Time | Documents | Speed | Metrics Collected | Notes | diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index 120d515..0ecb591 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -999,6 +999,17 @@ func (c *RedfishConnector) collectRawRedfishTree(ctx context.Context, client *ht if !shouldAdaptiveNVMeProbe(doc) { continue } + // Skip chassis types that cannot contain NVMe storage (e.g. GPU modules, + // RoT components, NVSwitch zones on HGX systems) to avoid probing hundreds + // of Disk.Bay.N URLs against chassis that will never have drives. + chassisPath := strings.TrimSuffix(normalized, "/Drives") + if chassisDocAny, ok := out[chassisPath]; ok { + if chassisDoc, ok := chassisDocAny.(map[string]interface{}); ok { + if !chassisTypeCanHaveNVMe(asString(chassisDoc["ChassisType"])) { + continue + } + } + } driveCollections = append(driveCollections, normalized) } sort.Strings(driveCollections) @@ -1316,6 +1327,21 @@ func shouldAdaptiveNVMeProbe(collectionDoc map[string]interface{}) bool { return !redfishCollectionHasExplicitMembers(collectionDoc) } +// chassisTypeCanHaveNVMe returns false for Redfish ChassisType values that +// represent compute/network/management sub-modules with no storage capability. +// Used to skip expensive Disk.Bay.N probing on HGX GPU, NVSwitch, PCIeRetimer, +// RoT and similar component chassis that expose an empty /Drives collection. +func chassisTypeCanHaveNVMe(chassisType string) bool { + switch strings.ToLower(strings.TrimSpace(chassisType)) { + case "module", // GPU SXM, NVLinkManagementNIC, PCIeRetimer + "component", // ERoT, IRoT, BMC, FPGA sub-chassis + "zone": // HGX_Chassis_0 fabric zone + return false + default: + return true + } +} + func redfishCollectionHasNumericMemberRefs(memberRefs []string) bool { for _, memberPath := range memberRefs { if redfishPathTailIsNumeric(memberPath) { diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index 392553e..2ced8fb 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -2034,3 +2034,101 @@ func TestRedfishPrefetchTargets_FilterNoisyBranches(t *testing.T) { } } } + +// TestChassisTypeCanHaveNVMe verifies that non-storage chassis types (GPU modules, +// RoT components, fabric zones) are excluded from NVMe bay probing, while storage +// and unclassified chassis types are kept. +// +// Regression guard: on Supermicro HGX (SYS-A21GE-NBRT) all 35 sub-chassis (GPUs, +// NVSwitches, PCIeRetimers, ERoT/IRoT, BMC, FPGA) have ChassisType=Module/Component/Zone +// and expose empty /Drives collections. Without this filter each chassis triggered +// 384 HTTP requests → ~22 minutes wasted per collection. (2026-03-12) +func TestChassisTypeCanHaveNVMe(t *testing.T) { + cases := []struct { + chassisType string + want bool + }{ + // Non-storage sub-module types — must return false + {"Module", false}, // GPU SXM, PCIeRetimer, NVLinkManagementNIC + {"module", false}, // case-insensitive + {"Component", false}, // ERoT, IRoT, BMC, FPGA sub-chassis + {"component", false}, + {"Zone", false}, // HGX_Chassis_0 fabric zone + {"zone", false}, + // Storage-capable and generic types — must return true + {"Enclosure", true}, // NVMe StorageBackplane + {"RackMount", true}, // main server chassis + {"Blade", true}, // blade server chassis + {"StandAlone", true}, // standalone server + {"", true}, // unknown type — probe to be safe + } + for _, tc := range cases { + got := chassisTypeCanHaveNVMe(tc.chassisType) + if got != tc.want { + t.Errorf("chassisTypeCanHaveNVMe(%q) = %v, want %v", tc.chassisType, got, tc.want) + } + } +} + +// TestNVMePostProbeSkipsNonStorageChassis verifies that the NVMe bay probe candidate +// selection skips chassis whose ChassisType indicates they cannot hold NVMe drives. +// +// Simulates an HGX topology: one GPU chassis (Module) and one NVMe backplane +// (Enclosure), both with empty /Drives collections. Only the backplane must be +// selected as a probe candidate. +func TestNVMePostProbeSkipsNonStorageChassis(t *testing.T) { + // Build the out map as collectRawRedfishTree would produce it + out := map[string]interface{}{ + // GPU chassis — Module type, empty Drives: should be skipped + "/redfish/v1/Chassis/HGX_GPU_SXM_1": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1", + "ChassisType": "Module", + "Name": "HGX_GPU_SXM_1", + }, + "/redfish/v1/Chassis/HGX_GPU_SXM_1/Drives": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/Drives", + "Members": []interface{}{}, + "Members@odata.count": 0, + }, + // NVMe backplane — Enclosure type, empty Drives: must be selected + "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane", + "ChassisType": "Enclosure", + "Name": "Backplane", + }, + "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/NVMeSSD.0.Group.0.StorageBackplane/Drives", + "Members": []interface{}{}, + "Members@odata.count": 0, + }, + } + + // Replicate the candidate selection logic from collectRawRedfishTree + var selected []string + for path, docAny := range out { + normalized := normalizeRedfishPath(path) + if !strings.HasSuffix(normalized, "/Drives") { + continue + } + doc, _ := docAny.(map[string]interface{}) + if !shouldAdaptiveNVMeProbe(doc) { + continue + } + chassisPath := strings.TrimSuffix(normalized, "/Drives") + if chassisDocAny, ok := out[chassisPath]; ok { + if chassisDoc, ok := chassisDocAny.(map[string]interface{}); ok { + if !chassisTypeCanHaveNVMe(asString(chassisDoc["ChassisType"])) { + continue + } + } + } + selected = append(selected, normalized) + } + + if len(selected) != 1 { + t.Fatalf("expected 1 NVMe probe candidate (backplane), got %d: %v", len(selected), selected) + } + if !strings.Contains(selected[0], "StorageBackplane") { + t.Fatalf("expected StorageBackplane to be selected, got %q", selected[0]) + } +}