redfish: fix GPU duplication on Supermicro HGX, exclude NVSwitch, restore path dedup
Three bugs, all related to GPU dedup in the Redfish replay pipeline:
1. collectGPUsFromProcessors (redfish_replay.go): GPU-type Processor entries
(Systems/HGX_Baseboard_0/Processors/GPU_SXM_N) were not deduplicated against
existing PCIeDevice GPUs on Supermicro HGX. The chassis-ID lookup keyed on
processor Id ("GPU_SXM_1") but the chassis is named "HGX_GPU_SXM_1" — lookup
returned nothing, serial stayed empty, UUID was unseen → 8 duplicate GPU rows.
Fix: read SerialNumber directly from the Processor doc first; chassis lookup
is now a fallback override (as it was designed for MSI).
2. looksLikeGPU (redfish.go): NVSwitch PCIe devices (Model="NVSwitch",
Manufacturer="NVIDIA") were classified as GPUs because "nvidia" matched the
GPU hint list. Fix: early return false when Model contains "nvswitch".
3. gpuDocDedupKey (redfish.go): commit 9df29b1 changed the dedup key to prefer
slot|model before path, which collapsed two distinct GPUs with identical model
names in GraphicsControllers into one entry. Fix: only serial and BDF are used
as cross-path stable dedup keys; fall back to Redfish path when neither is
present. This also restores TestReplayCollectGPUs_DedupUsesRedfishPathBeforeHeuristics
which had been broken on main since 9df29b1.
Added tests:
- TestCollectGPUsFromProcessors_SupermicroHGX: Processor GPU dedup when
chassis-ID naming convention does not match processor Id
- TestReplayCollectGPUs_DedupCrossChassisSerial: same GPU via two Chassis
PCIeDevice trees with matching serials → collapsed to one
- TestLooksLikeGPU_NVSwitchExcluded: NVSwitch is not a GPU
Added rule to bible-local/09-testing.md: dedup/filter/classify functions must
cover true-positive, true-negative, and the vendor counter-case axes.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3113,8 +3113,16 @@ func gpuDocDedupKey(doc map[string]interface{}, gpu models.GPU) string {
|
||||
// physical GPU exposed under multiple Chassis PCIeDevice trees (e.g. Supermicro
|
||||
// HGX: Chassis/1/PCIeDevices/GPU1 and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1)
|
||||
// is correctly deduplicated.
|
||||
if key := gpuDedupKey(gpu); key != "" {
|
||||
return key
|
||||
//
|
||||
// Only stable identifiers (serial, BDF) are used for cross-path dedup.
|
||||
// When neither is present we fall back to path, so two genuinely distinct GPUs
|
||||
// that happen to share the same model name (e.g. in GraphicsControllers) are
|
||||
// not incorrectly collapsed into one.
|
||||
if serial := normalizeRedfishIdentityField(gpu.SerialNumber); serial != "" {
|
||||
return serial
|
||||
}
|
||||
if bdf := strings.TrimSpace(gpu.BDF); bdf != "" {
|
||||
return bdf
|
||||
}
|
||||
if path := normalizeRedfishPath(asString(doc["@odata.id"])); path != "" {
|
||||
return "path:" + path
|
||||
@@ -3342,6 +3350,10 @@ func looksLikeGPU(doc map[string]interface{}, functionDocs []map[string]interfac
|
||||
if strings.EqualFold(strings.TrimSpace(asString(doc["Description"])), "Display Device") {
|
||||
return false
|
||||
}
|
||||
// NVSwitch is an NVIDIA NVLink interconnect switch, not a compute GPU.
|
||||
if strings.Contains(strings.ToLower(strings.TrimSpace(asString(doc["Model"]))), "nvswitch") {
|
||||
return false
|
||||
}
|
||||
|
||||
deviceType := strings.ToLower(asString(doc["DeviceType"]))
|
||||
if strings.Contains(deviceType, "gpu") || strings.Contains(deviceType, "graphics") || strings.Contains(deviceType, "accelerator") {
|
||||
|
||||
@@ -1476,11 +1476,16 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat
|
||||
continue
|
||||
}
|
||||
|
||||
// Resolve serial from Chassis/<Id>.
|
||||
// Resolve serial: prefer the processor doc itself (e.g. Supermicro
|
||||
// HGX_Baseboard_0/Processors/GPU_SXM_N carries SerialNumber directly),
|
||||
// then fall back to a matching chassis doc keyed by processor Id
|
||||
// (e.g. MSI: Chassis/GPU_SXM_1/SerialNumber).
|
||||
gpuID := strings.TrimSpace(asString(doc["Id"]))
|
||||
serial := ""
|
||||
serial := findFirstNormalizedStringByKeys(doc, "SerialNumber")
|
||||
if chassisDoc, ok := chassisByID[strings.ToUpper(gpuID)]; ok {
|
||||
serial = strings.TrimSpace(asString(chassisDoc["SerialNumber"]))
|
||||
if cs := strings.TrimSpace(asString(chassisDoc["SerialNumber"])); cs != "" {
|
||||
serial = cs
|
||||
}
|
||||
}
|
||||
|
||||
uuid := strings.TrimSpace(asString(doc["UUID"]))
|
||||
|
||||
@@ -1605,6 +1605,211 @@ func TestReplayCollectGPUs_MergesAmbiguousSameModelByOrder(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestCollectGPUsFromProcessors_SupermicroHGX verifies that GPU-type Processor
|
||||
// entries (Supermicro HGX: HGX_Baseboard_0/Processors/GPU_SXM_N) are not added
|
||||
// as duplicates when the same GPU is already present via Chassis PCIeDevices.
|
||||
// The processor doc carries SerialNumber directly; the chassis ID ("HGX_GPU_SXM_1")
|
||||
// does NOT match the processor Id ("GPU_SXM_1"), so chassis-based serial lookup
|
||||
// fails and the dedup must fall back to the processor doc's own SerialNumber.
|
||||
func TestCollectGPUsFromProcessors_SupermicroHGX(t *testing.T) {
|
||||
tree := map[string]interface{}{
|
||||
// Main chassis PCIeDevices — GPU1 and GPU2 with serials.
|
||||
"/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"},
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{
|
||||
"Id": "GPU1",
|
||||
"Name": "GPU1",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN001",
|
||||
"FirmwareVersion": "96.00.D9.00.02",
|
||||
"PCIeFunctions": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions",
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{
|
||||
"FunctionId": "1",
|
||||
"ClassCode": "0x030200",
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU2": map[string]interface{}{
|
||||
"Id": "GPU2",
|
||||
"Name": "GPU2",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN002",
|
||||
"PCIeFunctions": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions",
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU2/PCIeFunctions/1": map[string]interface{}{
|
||||
"FunctionId": "2",
|
||||
"ClassCode": "0x030200",
|
||||
},
|
||||
// HGX GPU chassis — named HGX_GPU_SXM_N (NOT GPU_SXM_N), so chassis-ID lookup
|
||||
// by processor Id "GPU_SXM_1" will NOT find them.
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1": map[string]interface{}{
|
||||
"Id": "HGX_GPU_SXM_1",
|
||||
},
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_2": map[string]interface{}{
|
||||
"Id": "HGX_GPU_SXM_2",
|
||||
},
|
||||
// HGX Baseboard system with GPU-type Processors carrying the same serials.
|
||||
"/redfish/v1/Systems/HGX_Baseboard_0/Processors": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1"},
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_1": map[string]interface{}{
|
||||
"Id": "GPU_SXM_1",
|
||||
"Name": "Processor",
|
||||
"ProcessorType": "GPU",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN001",
|
||||
"UUID": "aaaaaaaa-0000-0000-0000-000000000001",
|
||||
"Location": map[string]interface{}{
|
||||
"PartLocation": map[string]interface{}{
|
||||
"ServiceLabel": "SXM1",
|
||||
},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Systems/HGX_Baseboard_0/Processors/GPU_SXM_2": map[string]interface{}{
|
||||
"Id": "GPU_SXM_2",
|
||||
"Name": "Processor",
|
||||
"ProcessorType": "GPU",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN002",
|
||||
"UUID": "aaaaaaaa-0000-0000-0000-000000000002",
|
||||
"Location": map[string]interface{}{
|
||||
"PartLocation": map[string]interface{}{
|
||||
"ServiceLabel": "SXM2",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
r := redfishSnapshotReader{tree: tree}
|
||||
chassisPaths := []string{
|
||||
"/redfish/v1/Chassis/1",
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1",
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_2",
|
||||
}
|
||||
systemPaths := []string{"/redfish/v1/Systems/HGX_Baseboard_0"}
|
||||
|
||||
gpus := r.collectGPUs(systemPaths, chassisPaths)
|
||||
gpus = r.collectGPUsFromProcessors(systemPaths, chassisPaths, gpus)
|
||||
|
||||
if len(gpus) != 2 {
|
||||
var slots []string
|
||||
for _, g := range gpus {
|
||||
slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber))
|
||||
}
|
||||
t.Fatalf("expected 2 GPUs (no duplicates), got %d: %v", len(gpus), slots)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReplayCollectGPUs_DedupCrossChassisSerial verifies that the same GPU
|
||||
// appearing under two Chassis PCIeDevice trees (e.g. Chassis/1/PCIeDevices/GPU1
|
||||
// and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1) is deduplicated to one entry
|
||||
// when both expose the same serial number.
|
||||
func TestReplayCollectGPUs_DedupCrossChassisSerial(t *testing.T) {
|
||||
tree := map[string]interface{}{
|
||||
"/redfish/v1/Chassis/1/PCIeDevices": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1",
|
||||
"Id": "GPU1",
|
||||
"Name": "GPU1",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN-CROSSTEST-001",
|
||||
"PCIeFunctions": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions",
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/1/PCIeDevices/GPU1/PCIeFunctions/1": map[string]interface{}{
|
||||
"FunctionId": "1",
|
||||
"ClassCode": "0x030200",
|
||||
},
|
||||
// Same GPU exposed via dedicated HGX chassis — same serial, different path.
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices": map[string]interface{}{
|
||||
"Members": []interface{}{
|
||||
map[string]interface{}{"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1"},
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1",
|
||||
"Id": "GPU_SXM_1",
|
||||
"Name": "PCIe Device",
|
||||
"Model": "NVIDIA H200",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"SerialNumber": "SN-CROSSTEST-001",
|
||||
"UUID": "deadbeef-0000-0000-0000-000000000001",
|
||||
"PCIeFunctions": map[string]interface{}{
|
||||
"@odata.id": "/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions",
|
||||
},
|
||||
},
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1/PCIeFunctions": map[string]interface{}{
|
||||
"Members": []interface{}{},
|
||||
},
|
||||
}
|
||||
|
||||
r := redfishSnapshotReader{tree: tree}
|
||||
got := r.collectGPUs(nil, []string{
|
||||
"/redfish/v1/Chassis/1",
|
||||
"/redfish/v1/Chassis/HGX_GPU_SXM_1",
|
||||
})
|
||||
if len(got) != 1 {
|
||||
var slots []string
|
||||
for _, g := range got {
|
||||
slots = append(slots, fmt.Sprintf("%s(sn=%s)", g.Slot, g.SerialNumber))
|
||||
}
|
||||
t.Fatalf("expected 1 GPU (cross-chassis serial dedup), got %d: %v", len(got), slots)
|
||||
}
|
||||
if got[0].SerialNumber != "SN-CROSSTEST-001" {
|
||||
t.Fatalf("unexpected serial %q", got[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLooksLikeGPU_NVSwitchExcluded verifies that NVSwitch PCIe devices
|
||||
// are not classified as GPUs even though their manufacturer is NVIDIA.
|
||||
func TestLooksLikeGPU_NVSwitchExcluded(t *testing.T) {
|
||||
doc := map[string]interface{}{
|
||||
"Id": "NVSwitch_0",
|
||||
"Name": "PCIe Device",
|
||||
"Model": "NVSwitch",
|
||||
"Manufacturer": "NVIDIA",
|
||||
"DeviceType": "SingleFunction",
|
||||
}
|
||||
if looksLikeGPU(doc, nil) {
|
||||
t.Fatal("NVSwitch should not be classified as a GPU")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldCrawlPath_MemorySubresourcesAreSkipped(t *testing.T) {
|
||||
if !shouldCrawlPath("/redfish/v1/Systems/1/Memory/CPU0_C0D0") {
|
||||
t.Fatalf("expected direct DIMM resource to be crawlable")
|
||||
|
||||
Reference in New Issue
Block a user