redfish: fix GPU duplication on Supermicro HGX, exclude NVSwitch, restore path dedup

Three bugs, all related to GPU dedup in the Redfish replay pipeline:

1. collectGPUsFromProcessors (redfish_replay.go): GPU-type Processor entries
   (Systems/HGX_Baseboard_0/Processors/GPU_SXM_N) were not deduplicated against
   existing PCIeDevice GPUs on Supermicro HGX. The chassis-ID lookup keyed on
   processor Id ("GPU_SXM_1") but the chassis is named "HGX_GPU_SXM_1" — lookup
   returned nothing, serial stayed empty, UUID was unseen → 8 duplicate GPU rows.
   Fix: read SerialNumber directly from the Processor doc first; chassis lookup
   is now a fallback override (as it was designed for MSI).

2. looksLikeGPU (redfish.go): NVSwitch PCIe devices (Model="NVSwitch",
   Manufacturer="NVIDIA") were classified as GPUs because "nvidia" matched the
   GPU hint list. Fix: early return false when Model contains "nvswitch".

3. gpuDocDedupKey (redfish.go): commit 9df29b1 changed the dedup key to prefer
   slot|model before path, which collapsed two distinct GPUs with identical model
   names in GraphicsControllers into one entry. Fix: only serial and BDF are used
   as cross-path stable dedup keys; fall back to Redfish path when neither is
   present. This also restores TestReplayCollectGPUs_DedupUsesRedfishPathBeforeHeuristics
   which had been broken on main since 9df29b1.

Added tests:
- TestCollectGPUsFromProcessors_SupermicroHGX: Processor GPU dedup when
  chassis-ID naming convention does not match processor Id
- TestReplayCollectGPUs_DedupCrossChassisSerial: same GPU via two Chassis
  PCIeDevice trees with matching serials → collapsed to one
- TestLooksLikeGPU_NVSwitchExcluded: NVSwitch is not a GPU

Added rule to bible-local/09-testing.md: dedup/filter/classify functions must
cover true-positive, true-negative, and the vendor counter-case axes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-11 15:09:27 +03:00
parent d8ffe3d3a5
commit a9f58b3cf4
4 changed files with 265 additions and 5 deletions

View File

@@ -3113,8 +3113,16 @@ func gpuDocDedupKey(doc map[string]interface{}, gpu models.GPU) string {
// physical GPU exposed under multiple Chassis PCIeDevice trees (e.g. Supermicro
// HGX: Chassis/1/PCIeDevices/GPU1 and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1)
// is correctly deduplicated.
if key := gpuDedupKey(gpu); key != "" {
return key
//
// Only stable identifiers (serial, BDF) are used for cross-path dedup.
// When neither is present we fall back to path, so two genuinely distinct GPUs
// that happen to share the same model name (e.g. in GraphicsControllers) are
// not incorrectly collapsed into one.
if serial := normalizeRedfishIdentityField(gpu.SerialNumber); serial != "" {
return serial
}
if bdf := strings.TrimSpace(gpu.BDF); bdf != "" {
return bdf
}
if path := normalizeRedfishPath(asString(doc["@odata.id"])); path != "" {
return "path:" + path
@@ -3342,6 +3350,10 @@ func looksLikeGPU(doc map[string]interface{}, functionDocs []map[string]interfac
if strings.EqualFold(strings.TrimSpace(asString(doc["Description"])), "Display Device") {
return false
}
// NVSwitch is an NVIDIA NVLink interconnect switch, not a compute GPU.
if strings.Contains(strings.ToLower(strings.TrimSpace(asString(doc["Model"]))), "nvswitch") {
return false
}
deviceType := strings.ToLower(asString(doc["DeviceType"]))
if strings.Contains(deviceType, "gpu") || strings.Contains(deviceType, "graphics") || strings.Contains(deviceType, "accelerator") {

View File

@@ -1476,11 +1476,16 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat
continue
}
// Resolve serial from Chassis/<Id>.
// Resolve serial: prefer the processor doc itself (e.g. Supermicro
// HGX_Baseboard_0/Processors/GPU_SXM_N carries SerialNumber directly),
// then fall back to a matching chassis doc keyed by processor Id
// (e.g. MSI: Chassis/GPU_SXM_1/SerialNumber).
gpuID := strings.TrimSpace(asString(doc["Id"]))
serial := ""
serial := findFirstNormalizedStringByKeys(doc, "SerialNumber")
if chassisDoc, ok := chassisByID[strings.ToUpper(gpuID)]; ok {
serial = strings.TrimSpace(asString(chassisDoc["SerialNumber"]))
if cs := strings.TrimSpace(asString(chassisDoc["SerialNumber"])); cs != "" {
serial = cs
}
}
uuid := strings.TrimSpace(asString(doc["UUID"]))

View File

@@ -1605,6 +1605,211 @@ func TestReplayCollectGPUs_MergesAmbiguousSameModelByOrder(t *testing.T) {
}
}
// TestCollectGPUsFromProcessors_SupermicroHGX verifies that GPU-type Processor
// entries (Supermicro HGX: HGX_Baseboard_0/Processors/GPU_SXM_N) are not added
// as duplicates when the same GPU is already present via Chassis PCIeDevices.
// The processor doc carries SerialNumber directly; the chassis ID ("HGX_GPU_SXM_1")
// does NOT match the processor Id ("GPU_SXM_1"), so chassis-based serial lookup
// fails and the dedup must fall back to the processor doc's own SerialNumber.
func TestCollectGPUsFromProcessors_SupermicroHGX(t *testing.T) {
tree := map[string]interface{}{
// Main chassis PCIeDevices — GPU1 and GPU2 with serials.
"/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"},
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2"},
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{
"Id": "GPU1",
"Name": "GPU1",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN001",
"FirmwareVersion": "96.00.D9.00.02",
"PCIeFunctions": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions",
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"},
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{
"FunctionId": "1",
"ClassCode": "0x030200",
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU2": map[string]interface{}{
"Id": "GPU2",
"Name": "GPU2",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN002",
"PCIeFunctions": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions",
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1"},
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1": map[string]interface{}{
"FunctionId": "2",
"ClassCode": "0x030200",
},
// HGX GPU chassis — named HGX_GPU_SXM_N (NOT GPU_SXM_N), so chassis-ID lookup
// by processor Id "GPU_SXM_1" will NOT find them.
"/redfish/v1/Chassis/HGX_GPU_SXM_1": map[string]interface{}{
"Id": "HGX_GPU_SXM_1",
},
"/redfish/v1/Chassis/HGX_GPU_SXM_2": map[string]interface{}{
"Id": "HGX_GPU_SXM_2",
},
// HGX Baseboard system with GPU-type Processors carrying the same serials.
"/redfish/v1/Systems/HGX_Baseboard_0/Processors": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1"},
map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2"},
},
},
"/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1": map[string]interface{}{
"Id": "GPU_SXM_1",
"Name": "Processor",
"ProcessorType": "GPU",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN001",
"UUID": "aaaaaaaa-0000-0000-0000-000000000001",
"Location": map[string]interface{}{
"PartLocation": map[string]interface{}{
"ServiceLabel": "SXM1",
},
},
},
"/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2": map[string]interface{}{
"Id": "GPU_SXM_2",
"Name": "Processor",
"ProcessorType": "GPU",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN002",
"UUID": "aaaaaaaa-0000-0000-0000-000000000002",
"Location": map[string]interface{}{
"PartLocation": map[string]interface{}{
"ServiceLabel": "SXM2",
},
},
},
}
r := redfishSnapshotReader{tree: tree}
chassisPaths := []string{
"/redfish/v1/Chassis/1",
"/redfish/v1/Chassis/HGX_GPU_SXM_1",
"/redfish/v1/Chassis/HGX_GPU_SXM_2",
}
systemPaths := []string{"/redfish/v1/Systems/HGX_Baseboard_0"}
gpus := r.collectGPUs(systemPaths, chassisPaths)
gpus = r.collectGPUsFromProcessors(systemPaths, chassisPaths, gpus)
if len(gpus) != 2 {
var slots []string
for _, g := range gpus {
slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber))
}
t.Fatalf("expected 2 GPUs (no duplicates), got %d: %v", len(gpus), slots)
}
}
// TestReplayCollectGPUs_DedupCrossChassisSerial verifies that the same GPU
// appearing under two Chassis PCIeDevice trees (e.g. Chassis/1/PCIeDevices/GPU1
// and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1) is deduplicated to one entry
// when both expose the same serial number.
func TestReplayCollectGPUs_DedupCrossChassisSerial(t *testing.T) {
tree := map[string]interface{}{
"/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"},
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1",
"Id": "GPU1",
"Name": "GPU1",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN-CROSSTEST-001",
"PCIeFunctions": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions",
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"},
},
},
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{
"FunctionId": "1",
"ClassCode": "0x030200",
},
// Same GPU exposed via dedicated HGX chassis — same serial, different path.
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices": map[string]interface{}{
"Members": []interface{}{
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1"},
},
},
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1",
"Id": "GPU_SXM_1",
"Name": "PCIe Device",
"Model": "NVIDIA H200",
"Manufacturer": "NVIDIA",
"SerialNumber": "SN-CROSSTEST-001",
"UUID": "deadbeef-0000-0000-0000-000000000001",
"PCIeFunctions": map[string]interface{}{
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions",
},
},
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions": map[string]interface{}{
"Members": []interface{}{},
},
}
r := redfishSnapshotReader{tree: tree}
got := r.collectGPUs(nil, []string{
"/redfish/v1/Chassis/1",
"/redfish/v1/Chassis/HGX_GPU_SXM_1",
})
if len(got) != 1 {
var slots []string
for _, g := range got {
slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber))
}
t.Fatalf("expected 1 GPU (cross-chassis serial dedup), got %d: %v", len(got), slots)
}
if got[0].SerialNumber != "SN-CROSSTEST-001" {
t.Fatalf("unexpected serial %q", got[0].SerialNumber)
}
}
// TestLooksLikeGPU_NVSwitchExcluded verifies that NVSwitch PCIe devices
// are not classified as GPUs even though their manufacturer is NVIDIA.
func TestLooksLikeGPU_NVSwitchExcluded(t *testing.T) {
doc := map[string]interface{}{
"Id": "NVSwitch_0",
"Name": "PCIe Device",
"Model": "NVSwitch",
"Manufacturer": "NVIDIA",
"DeviceType": "SingleFunction",
}
if looksLikeGPU(doc, nil) {
t.Fatal("NVSwitch should not be classified as a GPU")
}
}
func TestShouldCrawlPath_MemorySubresourcesAreSkipped(t *testing.T) {
if !shouldCrawlPath("/redfish/v1/Systems/1/Memory/CPU0_C0D0") {
t.Fatalf("expected direct DIMM resource to be crawlable")