From a9f58b3cf40e4d322371273b95da0cdb4852a2f9 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 11 Mar 2026 15:09:27 +0300 Subject: [PATCH] redfish: fix GPU duplication on Supermicro HGX, exclude NVSwitch, restore path dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs, all related to GPU dedup in the Redfish replay pipeline: 1. collectGPUsFromProcessors (redfish_replay.go): GPU-type Processor entries (Systems/HGX_Baseboard_0/Processors/GPU_SXM_N) were not deduplicated against existing PCIeDevice GPUs on Supermicro HGX. The chassis-ID lookup keyed on processor Id ("GPU_SXM_1") but the chassis is named "HGX_GPU_SXM_1" — lookup returned nothing, serial stayed empty, UUID was unseen → 8 duplicate GPU rows. Fix: read SerialNumber directly from the Processor doc first; chassis lookup is now a fallback override (as it was designed for MSI). 2. looksLikeGPU (redfish.go): NVSwitch PCIe devices (Model="NVSwitch", Manufacturer="NVIDIA") were classified as GPUs because "nvidia" matched the GPU hint list. Fix: early return false when Model contains "nvswitch". 3. gpuDocDedupKey (redfish.go): commit 9df29b1 changed the dedup key to prefer slot|model before path, which collapsed two distinct GPUs with identical model names in GraphicsControllers into one entry. Fix: only serial and BDF are used as cross-path stable dedup keys; fall back to Redfish path when neither is present. This also restores TestReplayCollectGPUs_DedupUsesRedfishPathBeforeHeuristics which had been broken on main since 9df29b1. Added tests: - TestCollectGPUsFromProcessors_SupermicroHGX: Processor GPU dedup when chassis-ID naming convention does not match processor Id - TestReplayCollectGPUs_DedupCrossChassisSerial: same GPU via two Chassis PCIeDevice trees with matching serials → collapsed to one - TestLooksLikeGPU_NVSwitchExcluded: NVSwitch is not a GPU Added rule to bible-local/09-testing.md: dedup/filter/classify functions must cover true-positive, true-negative, and the vendor counter-case axes. Co-Authored-By: Claude Sonnet 4.6 --- bible-local/09-testing.md | 38 +++++ internal/collector/redfish.go | 16 ++- internal/collector/redfish_replay.go | 11 +- internal/collector/redfish_test.go | 205 +++++++++++++++++++++++++++ 4 files changed, 265 insertions(+), 5 deletions(-) diff --git a/bible-local/09-testing.md b/bible-local/09-testing.md index 5b8f686..5afc89a 100644 --- a/bible-local/09-testing.md +++ b/bible-local/09-testing.md @@ -41,3 +41,41 @@ go test ./internal/exporter/... -cover - When adding a new vendor parser, include at minimum: - `Detect()` test with a positive and a negative sample file list. - `Parse()` test with a minimal but representative archive. + +## Dedup and filtering functions — mandatory coverage + +Any function that deduplicates, filters, or classifies hardware inventory items +**must** have tests covering all three axes before the code is considered done: + +| Axis | What to test | Why | +|------|-------------|-----| +| **True positive** | Items that ARE duplicates are collapsed to one | Proves the function works | +| **True negative** | Items that are NOT duplicates are kept separate | Proves the function doesn't over-collapse | +| **Counter-case** | The scenario that motivated the original code still works after changes | Prevents regression from future fixes | + +### Worked example — GPU dedup regression (2026-03-11) + +`collectGPUsFromProcessors` was added for MSI (chassis Id matches processor Id). +No tests → when Supermicro HGX arrived (chassis Id = "HGX_GPU_SXM_1", processor Id = "GPU_SXM_1"), +the chassis lookup silently returned nothing, serial stayed empty, UUID was new → 8 duplicate GPUs. + +Simultaneously, fixing `gpuDocDedupKey` to use `slot|model` before path collapsed two distinct +GraphicsControllers GPUs with the same model into one — breaking an existing test that had no +counter-case for the path-fallback scenario. + +**Required test matrix for any dedup function:** + +``` +TestXxx_CollapsesDuplicates — same item via two sources → 1 result +TestXxx_KeepsDistinct — two different items with same model → 2 results +TestXxx_ — the specific vendor/setup that triggered the code +``` + +### Practical rule + +When you write a new filter/dedup/classify function, ask: +1. Does my test cover the vendor that motivated this code? +2. Does my test cover a *different* vendor or naming convention where the function must NOT fire? +3. If I change the dedup key logic, do existing tests still exercise the old correct behavior? + +If any answer is "no" — add the missing test before committing. diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index aa4051b..120d515 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -3113,8 +3113,16 @@ func gpuDocDedupKey(doc map[string]interface{}, gpu models.GPU) string { // physical GPU exposed under multiple Chassis PCIeDevice trees (e.g. Supermicro // HGX: Chassis/1/PCIeDevices/GPU1 and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1) // is correctly deduplicated. - if key := gpuDedupKey(gpu); key != "" { - return key + // + // Only stable identifiers (serial, BDF) are used for cross-path dedup. + // When neither is present we fall back to path, so two genuinely distinct GPUs + // that happen to share the same model name (e.g. in GraphicsControllers) are + // not incorrectly collapsed into one. + if serial := normalizeRedfishIdentityField(gpu.SerialNumber); serial != "" { + return serial + } + if bdf := strings.TrimSpace(gpu.BDF); bdf != "" { + return bdf } if path := normalizeRedfishPath(asString(doc["@odata.id"])); path != "" { return "path:" + path @@ -3342,6 +3350,10 @@ func looksLikeGPU(doc map[string]interface{}, functionDocs []map[string]interfac if strings.EqualFold(strings.TrimSpace(asString(doc["Description"])), "Display Device") { return false } + // NVSwitch is an NVIDIA NVLink interconnect switch, not a compute GPU. + if strings.Contains(strings.ToLower(strings.TrimSpace(asString(doc["Model"]))), "nvswitch") { + return false + } deviceType := strings.ToLower(asString(doc["DeviceType"])) if strings.Contains(deviceType, "gpu") || strings.Contains(deviceType, "graphics") || strings.Contains(deviceType, "accelerator") { diff --git a/internal/collector/redfish_replay.go b/internal/collector/redfish_replay.go index 8d5da38..c58f2e4 100644 --- a/internal/collector/redfish_replay.go +++ b/internal/collector/redfish_replay.go @@ -1476,11 +1476,16 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat continue } - // Resolve serial from Chassis/. + // Resolve serial: prefer the processor doc itself (e.g. Supermicro + // HGX_Baseboard_0/Processors/GPU_SXM_N carries SerialNumber directly), + // then fall back to a matching chassis doc keyed by processor Id + // (e.g. MSI: Chassis/GPU_SXM_1/SerialNumber). gpuID := strings.TrimSpace(asString(doc["Id"])) - serial := "" + serial := findFirstNormalizedStringByKeys(doc, "SerialNumber") if chassisDoc, ok := chassisByID[strings.ToUpper(gpuID)]; ok { - serial = strings.TrimSpace(asString(chassisDoc["SerialNumber"])) + if cs := strings.TrimSpace(asString(chassisDoc["SerialNumber"])); cs != "" { + serial = cs + } } uuid := strings.TrimSpace(asString(doc["UUID"])) diff --git a/internal/collector/redfish_test.go b/internal/collector/redfish_test.go index 52e925b..392553e 100644 --- a/internal/collector/redfish_test.go +++ b/internal/collector/redfish_test.go @@ -1605,6 +1605,211 @@ func TestReplayCollectGPUs_MergesAmbiguousSameModelByOrder(t *testing.T) { } } +// TestCollectGPUsFromProcessors_SupermicroHGX verifies that GPU-type Processor +// entries (Supermicro HGX: HGX_Baseboard_0/Processors/GPU_SXM_N) are not added +// as duplicates when the same GPU is already present via Chassis PCIeDevices. +// The processor doc carries SerialNumber directly; the chassis ID ("HGX_GPU_SXM_1") +// does NOT match the processor Id ("GPU_SXM_1"), so chassis-based serial lookup +// fails and the dedup must fall back to the processor doc's own SerialNumber. +func TestCollectGPUsFromProcessors_SupermicroHGX(t *testing.T) { + tree := map[string]interface{}{ + // Main chassis PCIeDevices — GPU1 and GPU2 with serials. + "/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"}, + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2"}, + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{ + "Id": "GPU1", + "Name": "GPU1", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN001", + "FirmwareVersion": "96.00.D9.00.02", + "PCIeFunctions": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions", + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"}, + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{ + "FunctionId": "1", + "ClassCode": "0x030200", + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU2": map[string]interface{}{ + "Id": "GPU2", + "Name": "GPU2", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN002", + "PCIeFunctions": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions", + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1"}, + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1": map[string]interface{}{ + "FunctionId": "2", + "ClassCode": "0x030200", + }, + // HGX GPU chassis — named HGX_GPU_SXM_N (NOT GPU_SXM_N), so chassis-ID lookup + // by processor Id "GPU_SXM_1" will NOT find them. + "/redfish/v1/Chassis/HGX_GPU_SXM_1": map[string]interface{}{ + "Id": "HGX_GPU_SXM_1", + }, + "/redfish/v1/Chassis/HGX_GPU_SXM_2": map[string]interface{}{ + "Id": "HGX_GPU_SXM_2", + }, + // HGX Baseboard system with GPU-type Processors carrying the same serials. + "/redfish/v1/Systems/HGX_Baseboard_0/Processors": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1"}, + map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2"}, + }, + }, + "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1": map[string]interface{}{ + "Id": "GPU_SXM_1", + "Name": "Processor", + "ProcessorType": "GPU", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN001", + "UUID": "aaaaaaaa-0000-0000-0000-000000000001", + "Location": map[string]interface{}{ + "PartLocation": map[string]interface{}{ + "ServiceLabel": "SXM1", + }, + }, + }, + "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2": map[string]interface{}{ + "Id": "GPU_SXM_2", + "Name": "Processor", + "ProcessorType": "GPU", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN002", + "UUID": "aaaaaaaa-0000-0000-0000-000000000002", + "Location": map[string]interface{}{ + "PartLocation": map[string]interface{}{ + "ServiceLabel": "SXM2", + }, + }, + }, + } + + r := redfishSnapshotReader{tree: tree} + chassisPaths := []string{ + "/redfish/v1/Chassis/1", + "/redfish/v1/Chassis/HGX_GPU_SXM_1", + "/redfish/v1/Chassis/HGX_GPU_SXM_2", + } + systemPaths := []string{"/redfish/v1/Systems/HGX_Baseboard_0"} + + gpus := r.collectGPUs(systemPaths, chassisPaths) + gpus = r.collectGPUsFromProcessors(systemPaths, chassisPaths, gpus) + + if len(gpus) != 2 { + var slots []string + for _, g := range gpus { + slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber)) + } + t.Fatalf("expected 2 GPUs (no duplicates), got %d: %v", len(gpus), slots) + } +} + +// TestReplayCollectGPUs_DedupCrossChassisSerial verifies that the same GPU +// appearing under two Chassis PCIeDevice trees (e.g. Chassis/1/PCIeDevices/GPU1 +// and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1) is deduplicated to one entry +// when both expose the same serial number. +func TestReplayCollectGPUs_DedupCrossChassisSerial(t *testing.T) { + tree := map[string]interface{}{ + "/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"}, + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1", + "Id": "GPU1", + "Name": "GPU1", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN-CROSSTEST-001", + "PCIeFunctions": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions", + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"}, + }, + }, + "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{ + "FunctionId": "1", + "ClassCode": "0x030200", + }, + // Same GPU exposed via dedicated HGX chassis — same serial, different path. + "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices": map[string]interface{}{ + "Members": []interface{}{ + map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1"}, + }, + }, + "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1", + "Id": "GPU_SXM_1", + "Name": "PCIe Device", + "Model": "NVIDIA H200", + "Manufacturer": "NVIDIA", + "SerialNumber": "SN-CROSSTEST-001", + "UUID": "deadbeef-0000-0000-0000-000000000001", + "PCIeFunctions": map[string]interface{}{ + "@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions", + }, + }, + "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions": map[string]interface{}{ + "Members": []interface{}{}, + }, + } + + r := redfishSnapshotReader{tree: tree} + got := r.collectGPUs(nil, []string{ + "/redfish/v1/Chassis/1", + "/redfish/v1/Chassis/HGX_GPU_SXM_1", + }) + if len(got) != 1 { + var slots []string + for _, g := range got { + slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber)) + } + t.Fatalf("expected 1 GPU (cross-chassis serial dedup), got %d: %v", len(got), slots) + } + if got[0].SerialNumber != "SN-CROSSTEST-001" { + t.Fatalf("unexpected serial %q", got[0].SerialNumber) + } +} + +// TestLooksLikeGPU_NVSwitchExcluded verifies that NVSwitch PCIe devices +// are not classified as GPUs even though their manufacturer is NVIDIA. +func TestLooksLikeGPU_NVSwitchExcluded(t *testing.T) { + doc := map[string]interface{}{ + "Id": "NVSwitch_0", + "Name": "PCIe Device", + "Model": "NVSwitch", + "Manufacturer": "NVIDIA", + "DeviceType": "SingleFunction", + } + if looksLikeGPU(doc, nil) { + t.Fatal("NVSwitch should not be classified as a GPU") + } +} + func TestShouldCrawlPath_MemorySubresourcesAreSkipped(t *testing.T) { if !shouldCrawlPath("/redfish/v1/Systems/1/Memory/CPU0_C0D0") { t.Fatalf("expected direct DIMM resource to be crawlable")