From 8ca173c99b8d7266e95c2c930166e1d2ea39a54d Mon Sep 17 00:00:00 2001
From: Michael Chus
Date: Mon, 13 Apr 2026 16:05:49 +0300
Subject: [PATCH] fix(exporter): preserve all HGX GPUs with generic PCIe slot
 name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Supermicro HGX BMC reports all 8 B200 GPU PCIe devices with Name
"PCIe Device" — a generic label shared by every GPU, not a unique
hardware position. pcieDedupKey used slot as the primary key, so all
8 GPUs collapsed to one entry in the UI (the first, serial
1654925165720).

Add isGenericPCIeSlotName to detect non-positional slot labels and
fall through to serial/BDF for dedup instead, preserving each GPU
separately. Positional slots (#GPU0, SLOT-NIC1, etc.) continue to
use slot-first dedup.

Co-Authored-By: Claude Sonnet 4.6
---
 internal/exporter/reanimator_converter.go | 18 +++++++++-
 .../exporter/reanimator_converter_test.go | 36 +++++++++++++++++++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/internal/exporter/reanimator_converter.go b/internal/exporter/reanimator_converter.go
index 4f4def8..60da7a4 100644
--- a/internal/exporter/reanimator_converter.go
+++ b/internal/exporter/reanimator_converter.go
@@ -1961,7 +1961,10 @@ func pcieDedupKey(item ReanimatorPCIe) string {
 	slot := strings.ToLower(strings.TrimSpace(item.Slot))
 	serial := strings.ToLower(strings.TrimSpace(item.SerialNumber))
 	bdf := strings.ToLower(strings.TrimSpace(item.BDF))
-	if slot != "" {
+	// Generic slot names (e.g. "PCIe Device" from HGX BMC) are not unique
+	// hardware positions — multiple distinct devices share the same name.
+	// Fall through to serial/BDF so they are not incorrectly collapsed.
+	if slot != "" && !isGenericPCIeSlotName(slot) {
 		return "slot:" + slot
 	}
 	if serial != "" {
@@ -1970,9 +1973,22 @@ func pcieDedupKey(item ReanimatorPCIe) string {
 	if bdf != "" {
 		return "bdf:" + bdf
 	}
+	if slot != "" { // generic slot name with no serial or BDF: last-resort key
+		return "slot:" + slot
+	}
 	return strings.ToLower(strings.TrimSpace(item.DeviceClass)) + "|" + strings.ToLower(strings.TrimSpace(item.Model))
 }
 
+// isGenericPCIeSlotName reports whether slot, which must already be trimmed and
+// lower-cased, is a generic device-type label rather than a unique position.
+func isGenericPCIeSlotName(slot string) bool {
+	switch slot {
+	case "pcie device", "pcie slot", "pcie":
+		return true
+	}
+	return false
+}
+
 func pcieQualityScore(item ReanimatorPCIe) int {
 	score := 0
 	if strings.TrimSpace(item.SerialNumber) != "" {
diff --git a/internal/exporter/reanimator_converter_test.go b/internal/exporter/reanimator_converter_test.go
index e8096fb..7d7fc2c 100644
--- a/internal/exporter/reanimator_converter_test.go
+++ b/internal/exporter/reanimator_converter_test.go
@@ -733,6 +733,42 @@ func TestConvertPCIeDevices_SkipsDisplayControllerDuplicates(t *testing.T) {
 	}
 }
 
+func TestConvertPCIeDevices_PreservesAllGPUsWithGenericSlot(t *testing.T) {
+	// Supermicro HGX BMC reports all GPU PCIe devices with Name "PCIe Device" —
+	// a generic label that is not a unique hardware position. All 8 GPUs must
+	// be preserved; dedup by generic slot name must not collapse them into one.
+	gpus := make([]models.GPU, 8)
+	serials := []string{
+		"1654925165720", "1654925166160", "1654925165942", "1654925165271",
+		"1654925165719", "1654925165252", "1654925165304", "1654925165587",
+	}
+	for i, sn := range serials {
+		gpus[i] = models.GPU{
+			Slot:         "PCIe Device",
+			Model:        "B200 180GB HBM3e",
+			Manufacturer: "NVIDIA",
+			SerialNumber: sn,
+			PartNumber:   "2901-886-A1",
+			Status:       "OK",
+		}
+	}
+	hw := &models.HardwareConfig{GPUs: gpus}
+	result := convertPCIeDevices(hw, "2026-04-13T10:00:00Z")
+	if len(result) != 8 {
+		t.Fatalf("expected 8 GPU entries (one per serial), got %d", len(result))
+	}
+	seen := make(map[string]bool)
+	for _, r := range result {
+		if seen[r.SerialNumber] {
+			t.Fatalf("duplicate serial %q in PCIe result", r.SerialNumber)
+		}
+		seen[r.SerialNumber] = true
+		if r.DeviceClass != "VideoController" {
+			t.Fatalf("expected VideoController device class, got %q", r.DeviceClass)
+		}
+	}
+}
+
 func TestConvertPCIeDevices_MapsGPUStatusHistory(t *testing.T) {
 	hw := &models.HardwareConfig{
 		GPUs: []models.GPU{