fix(exporter): preserve all HGX GPUs with generic PCIe slot name

Supermicro HGX BMC reports all 8 B200 GPU PCIe devices with Name
"PCIe Device" — a generic label shared by every GPU, not a unique
hardware position. pcieDedupKey used slot as the primary key, so all
8 GPUs collapsed to one entry in the UI (the first, serial 1654925165720).

Add isGenericPCIeSlotName to detect non-positional slot labels and fall
through to serial/BDF for dedup instead, preserving each GPU separately.
Positional slots (#GPU0, SLOT-NIC1, etc.) continue to use slot-first dedup.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 16:05:49 +03:00
parent f19a3454fa
commit 8ca173c99b
2 changed files with 53 additions and 1 deletions

View File

@@ -1961,7 +1961,10 @@ func pcieDedupKey(item ReanimatorPCIe) string {
slot := strings.ToLower(strings.TrimSpace(item.Slot))
serial := strings.ToLower(strings.TrimSpace(item.SerialNumber))
bdf := strings.ToLower(strings.TrimSpace(item.BDF))
if slot != "" {
// Generic slot names (e.g. "PCIe Device" from HGX BMC) are not unique
// hardware positions — multiple distinct devices share the same name.
// Fall through to serial/BDF so they are not incorrectly collapsed.
if slot != "" && !isGenericPCIeSlotName(slot) {
return "slot:" + slot
}
if serial != "" {
@@ -1970,9 +1973,22 @@ func pcieDedupKey(item ReanimatorPCIe) string {
if bdf != "" {
return "bdf:" + bdf
}
if slot != "" {
return "slot:" + slot
}
return strings.ToLower(strings.TrimSpace(item.DeviceClass)) + "|" + strings.ToLower(strings.TrimSpace(item.Model))
}
// isGenericPCIeSlotName reports whether slot is a generic device-type label
// rather than a unique hardware position identifier.
func isGenericPCIeSlotName(slot string) bool {
switch slot {
case "pcie device", "pcie slot", "pcie":
return true
}
return false
}
func pcieQualityScore(item ReanimatorPCIe) int {
score := 0
if strings.TrimSpace(item.SerialNumber) != "" {