From 9df29b1be9f49c4c7cbd3bbcc9b4f2609511bccf Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Fri, 6 Mar 2026 14:44:36 +0300 Subject: [PATCH] fix: dedup GPUs across multiple chassis PCIeDevice trees in Redfish collector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supermicro HGX exposes each GPU under both Chassis/1/PCIeDevices and a dedicated Chassis/HGX_GPU_SXM_N/PCIeDevices. gpuDocDedupKey was keying by @odata.id path, so identical GPUs with the same serial were not deduplicated across sources. Now stable identifiers (serial → BDF → slot+model) take priority over path. Also includes Inspur parser improvements: NVMe model/serial enrichment from devicefrusdr.log and audit.log, RAID drive slot normalization to BP notation, PSU slot normalization, BMC/CPLD/VR firmware from RESTful version info section, and parser version bump to 1.8. Co-Authored-By: Claude Sonnet 4.6 --- internal/collector/redfish.go | 54 ++++++++++- internal/parser/vendors/inspur/asset.go | 25 ++++- .../vendors/inspur/asset_gpu_model_test.go | 2 +- internal/parser/vendors/inspur/audit.go | 94 +++++++++++++++++++ internal/parser/vendors/inspur/component.go | 57 +++++++++++ internal/parser/vendors/inspur/parser.go | 39 +++++++- internal/parser/vendors/inspur/pcie.go | 79 ++++++++++++++++ .../vendors/inspur/storage_serial_fallback.go | 18 ++++ .../inspur/storage_serial_fallback_test.go | 2 +- 9 files changed, 362 insertions(+), 8 deletions(-) create mode 100644 internal/parser/vendors/inspur/audit.go diff --git a/internal/collector/redfish.go b/internal/collector/redfish.go index f1e123e..5ae1faf 100644 --- a/internal/collector/redfish.go +++ b/internal/collector/redfish.go @@ -2606,8 +2606,9 @@ func parseDrive(doc map[string]interface{}) models.Storage { storageType := classifyStorageType(doc) + slot := normalizeRAIDDriveSlot(firstNonEmpty(asString(doc["Id"]), asString(doc["Name"]))) return models.Storage{ - Slot: firstNonEmpty(asString(doc["Id"]), asString(doc["Name"])), + Slot: slot, Type: storageType, Model: firstNonEmpty(asString(doc["Model"]), asString(doc["Name"])), SizeGB: sizeGB, @@ -2619,6 +2620,43 @@ func parseDrive(doc map[string]interface{}) models.Storage { } } +// isNumericString returns true if s is a non-empty string of only ASCII digits. +func isNumericString(s string) bool { + if s == "" { + return false + } + for _, c := range s { + if c < '0' || c > '9' { + return false + } + } + return true +} + +// normalizeRAIDDriveSlot converts Inspur-style RAID drive IDs to canonical BP notation. +// Example: "PCIe8_RAID_Disk_1:0" → "BP0:0" (enclosure_id - 1 = backplane_index) +// Other slot names are returned unchanged. +func normalizeRAIDDriveSlot(slot string) string { + // Pattern: {anything}_RAID_Disk_{enclosure}:{slot} + const marker = "_RAID_Disk_" + idx := strings.Index(slot, marker) + if idx < 0 { + return slot + } + rest := slot[idx+len(marker):] // e.g. "1:0" + colonIdx := strings.Index(rest, ":") + if colonIdx < 0 { + return slot + } + encStr := rest[:colonIdx] + slotStr := rest[colonIdx+1:] + enc, err := strconv.Atoi(encStr) + if err != nil || enc < 1 { + return slot + } + return fmt.Sprintf("BP%d:%s", enc-1, slotStr) +} + func parseStorageVolume(doc map[string]interface{}, controller string) models.StorageVolume { sizeGB := 0 capBytes := asInt64(doc["CapacityBytes"]) @@ -2767,6 +2805,11 @@ func parsePSU(doc map[string]interface{}, idx int) models.PSU { if slot == "" { slot = fmt.Sprintf("PSU%d", idx) } + // Normalize numeric-only slots ("0", "1") to "PSU0", "PSU1" for consistency + // with BMC log parsers (Inspur, Dell etc.) that use the PSU prefix. + if isNumericString(slot) { + slot = "PSU" + slot + } return models.PSU{ Slot: slot, @@ -3065,10 +3108,17 @@ func gpuDedupKey(gpu models.GPU) string { } func gpuDocDedupKey(doc map[string]interface{}, gpu models.GPU) string { + // Prefer stable GPU identifiers (serial, BDF) over path so that the same + // physical GPU exposed under multiple Chassis PCIeDevice trees (e.g. Supermicro + // HGX: Chassis/1/PCIeDevices/GPU1 and Chassis/HGX_GPU_SXM_1/PCIeDevices/GPU_SXM_1) + // is correctly deduplicated. + if key := gpuDedupKey(gpu); key != "" { + return key + } if path := normalizeRedfishPath(asString(doc["@odata.id"])); path != "" { return "path:" + path } - return gpuDedupKey(gpu) + return "" } func shouldSkipGenericGPUDuplicate(existing []models.GPU, candidate models.GPU) bool { diff --git a/internal/parser/vendors/inspur/asset.go b/internal/parser/vendors/inspur/asset.go index 2910ec0..77969d2 100644 --- a/internal/parser/vendors/inspur/asset.go +++ b/internal/parser/vendors/inspur/asset.go @@ -94,8 +94,12 @@ type AssetJSON struct { } `json:"PcieInfo"` } -// ParseAssetJSON parses Inspur asset.json content -func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) { +// ParseAssetJSON parses Inspur asset.json content. +// - pcieSlotDeviceNames: optional map from integer PCIe slot ID to device name string, +// sourced from devicefrusdr.log PCIe REST section. Fills missing NVMe model names. +// - pcieSlotSerials: optional map from integer PCIe slot ID to serial number string, +// sourced from audit.log SN-changed events. Fills missing NVMe serial numbers. +func ParseAssetJSON(content []byte, pcieSlotDeviceNames map[int]string, pcieSlotSerials map[int]string) (*models.HardwareConfig, error) { var asset AssetJSON if err := json.Unmarshal(content, &asset); err != nil { return nil, err @@ -175,6 +179,23 @@ func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) { continue } + // Enrich model name from PCIe device name (supplied from devicefrusdr.log). + // BMC does not populate HddInfo.ModelName for NVMe drives, but the PCIe REST + // section in devicefrusdr.log carries the drive model as device_name. + if modelName == "" && hdd.PcieSlot > 0 && len(pcieSlotDeviceNames) > 0 { + if devName, ok := pcieSlotDeviceNames[hdd.PcieSlot]; ok && devName != "" { + modelName = devName + } + } + + // Enrich serial number from audit.log SN-changed events (supplied via pcieSlotSerials). + // BMC asset.json does not carry NVMe serial numbers; audit.log logs every SN change. + if serial == "" && hdd.PcieSlot > 0 && len(pcieSlotSerials) > 0 { + if sn, ok := pcieSlotSerials[hdd.PcieSlot]; ok && sn != "" { + serial = sn + } + } + storageType := "HDD" if hdd.DiskInterfaceType == 5 { storageType = "NVMe" diff --git a/internal/parser/vendors/inspur/asset_gpu_model_test.go b/internal/parser/vendors/inspur/asset_gpu_model_test.go index 93d9879..9215e2d 100644 --- a/internal/parser/vendors/inspur/asset_gpu_model_test.go +++ b/internal/parser/vendors/inspur/asset_gpu_model_test.go @@ -28,7 +28,7 @@ func TestParseAssetJSON_NVIDIAGPUModelFromPCIIDs(t *testing.T) { }] }`) - hw, err := ParseAssetJSON(raw) + hw, err := ParseAssetJSON(raw, nil, nil) if err != nil { t.Fatalf("ParseAssetJSON failed: %v", err) } diff --git a/internal/parser/vendors/inspur/audit.go b/internal/parser/vendors/inspur/audit.go new file mode 100644 index 0000000..04c7714 --- /dev/null +++ b/internal/parser/vendors/inspur/audit.go @@ -0,0 +1,94 @@ +package inspur + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +// auditSNChangedNVMeRegex matches: +// "Front Back Plane N NVMe DiskM SN changed from X to Y" +// Captures: disk_num, new_serial +var auditSNChangedNVMeRegex = regexp.MustCompile(`NVMe Disk(\d+)\s+SN changed from \S+\s+to\s+(\S+)`) + +// auditSNChangedRAIDRegex matches: +// "Raid(Pcie Slot:N) HDD(enclosure id:E slot:S) SN changed from X to Y" +// Captures: pcie_slot, enclosure_id, slot_num, new_serial +var auditSNChangedRAIDRegex = regexp.MustCompile(`Raid\(Pcie Slot:(\d+)\) HDD\(enclosure id:(\d+) slot:(\d+)\)\s+SN changed from \S+\s+to\s+(\S+)`) + +// ParseAuditLogNVMeSerials parses audit.log and returns the final (latest) serial number +// per NVMe disk number. The disk number matches the numeric suffix in PCIe location +// strings like "#NVME0", "#NVME2", etc. from devicefrusdr.log. +// Entries where the serial changed to "NULL" are excluded. +func ParseAuditLogNVMeSerials(content []byte) map[int]string { + serials := make(map[int]string) + + for _, line := range strings.Split(string(content), "\n") { + m := auditSNChangedNVMeRegex.FindStringSubmatch(line) + if m == nil { + continue + } + diskNum, err := strconv.Atoi(m[1]) + if err != nil { + continue + } + serial := strings.TrimSpace(m[2]) + if strings.EqualFold(serial, "NULL") || serial == "" { + delete(serials, diskNum) + } else { + serials[diskNum] = serial + } + } + if len(serials) == 0 { + return nil + } + return serials +} + +// ParseAuditLogRAIDSerials parses audit.log and returns the final (latest) serial number +// per RAID backplane disk. Key format is "BP{enclosure_id-1}:{slot_num}" (e.g. "BP0:0"). +// +// Each disk slot is claimed by a specific RAID controller (Pcie Slot:N). NULL events from +// an old controller do not clear serials assigned by a newer controller, preventing stale +// deletions when disks are migrated between RAID arrays. +func ParseAuditLogRAIDSerials(content []byte) map[string]string { + // owner tracks which PCIe RAID controller slot last assigned a serial to a disk key. + serials := make(map[string]string) + owner := make(map[string]int) + + for _, line := range strings.Split(string(content), "\n") { + m := auditSNChangedRAIDRegex.FindStringSubmatch(line) + if m == nil { + continue + } + pcieSlot, err := strconv.Atoi(m[1]) + if err != nil { + continue + } + enclosureID, err := strconv.Atoi(m[2]) + if err != nil { + continue + } + slotNum, err := strconv.Atoi(m[3]) + if err != nil { + continue + } + serial := strings.TrimSpace(m[4]) + key := fmt.Sprintf("BP%d:%d", enclosureID-1, slotNum) + if strings.EqualFold(serial, "NULL") || serial == "" { + // Only clear if this controller was the last to set the serial. + if owner[key] == pcieSlot { + delete(serials, key) + delete(owner, key) + } + } else { + serials[key] = serial + owner[key] = pcieSlot + } + } + if len(serials) == 0 { + return nil + } + return serials +} diff --git a/internal/parser/vendors/inspur/component.go b/internal/parser/vendors/inspur/component.go index 9c293eb..649c8ff 100644 --- a/internal/parser/vendors/inspur/component.go +++ b/internal/parser/vendors/inspur/component.go @@ -713,6 +713,63 @@ func extractComponentFirmware(text string, hw *models.HardwareConfig) { } } } + + // Extract BMC, CPLD and VR firmware from RESTful version info section. + // The JSON is a flat array: [{"id":N,"dev_name":"...","dev_version":"..."}, ...] + reVer := regexp.MustCompile(`RESTful version info:\s*(\[[\s\S]*?\])\s*RESTful`) + if match := reVer.FindStringSubmatch(text); match != nil { + type verEntry struct { + DevName string `json:"dev_name"` + DevVersion string `json:"dev_version"` + } + var entries []verEntry + if err := json.Unmarshal([]byte(match[1]), &entries); err == nil { + for _, e := range entries { + name := normalizeVersionInfoName(e.DevName) + if name == "" { + continue + } + version := strings.TrimSpace(e.DevVersion) + if version == "" { + continue + } + if existingFW[name] { + continue + } + hw.Firmware = append(hw.Firmware, models.FirmwareInfo{ + DeviceName: name, + Version: version, + }) + existingFW[name] = true + } + } + } +} + +// normalizeVersionInfoName converts RESTful version info dev_name to a clean label. +// Returns "" for entries that should be skipped (inactive BMC, PSU slots). +func normalizeVersionInfoName(name string) string { + name = strings.TrimSpace(name) + if name == "" { + return "" + } + // Skip PSU_N entries — firmware already extracted from PSU info section. + if regexp.MustCompile(`(?i)^PSU_\d+$`).MatchString(name) { + return "" + } + // Skip the inactive BMC partition. + if strings.HasPrefix(strings.ToLower(name), "inactivate(") { + return "" + } + // Active BMC: "Activate(BMC1)" → "BMC" + if strings.HasPrefix(strings.ToLower(name), "activate(") { + return "BMC" + } + // Strip trailing "Version" suffix (case-insensitive), e.g. "MainBoard0CPLDVersion" → "MainBoard0CPLD" + if strings.HasSuffix(strings.ToLower(name), "version") { + name = name[:len(name)-len("version")] + } + return strings.TrimSpace(name) } // DiskBackplaneRESTInfo represents the RESTful diskbackplane info structure diff --git a/internal/parser/vendors/inspur/parser.go b/internal/parser/vendors/inspur/parser.go index 1611557..c38ef74 100644 --- a/internal/parser/vendors/inspur/parser.go +++ b/internal/parser/vendors/inspur/parser.go @@ -16,7 +16,7 @@ import ( // parserVersion - version of this parser module // IMPORTANT: Increment this version when making changes to parser logic! -const parserVersion = "1.5" +const parserVersion = "1.8" func init() { parser.Register(&Parser{}) @@ -95,9 +95,41 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er Sensors: make([]models.SensorReading, 0), } + // Pre-parse enrichment maps from devicefrusdr.log for use inside ParseAssetJSON. + // BMC does not populate HddInfo.ModelName or SerialNumber for NVMe drives. + var pcieSlotDeviceNames map[int]string + var nvmeLocToSlot map[int]int + if f := parser.FindFileByName(files, "devicefrusdr.log"); f != nil { + pcieSlotDeviceNames = ParsePCIeSlotDeviceNames(f.Content) + nvmeLocToSlot = ParsePCIeNVMeLocToSlot(f.Content) + } + + // Parse NVMe serial numbers from audit.log: every disk SN change is logged there. + // Combine with the NVMe loc→slot mapping to build pcieSlot→serial map. + // Also parse RAID disk serials by backplane slot key (e.g. "BP0:0"). + var pcieSlotSerials map[int]string + var raidSlotSerials map[string]string + if f := parser.FindFileByName(files, "audit.log"); f != nil { + if len(nvmeLocToSlot) > 0 { + nvmeDiskSerials := ParseAuditLogNVMeSerials(f.Content) + if len(nvmeDiskSerials) > 0 { + pcieSlotSerials = make(map[int]string, len(nvmeDiskSerials)) + for diskNum, serial := range nvmeDiskSerials { + if slot, ok := nvmeLocToSlot[diskNum]; ok { + pcieSlotSerials[slot] = serial + } + } + if len(pcieSlotSerials) == 0 { + pcieSlotSerials = nil + } + } + } + raidSlotSerials = ParseAuditLogRAIDSerials(f.Content) + } + // Parse asset.json first (base hardware info) if f := parser.FindFileByName(files, "asset.json"); f != nil { - if hw, err := ParseAssetJSON(f.Content); err == nil { + if hw, err := ParseAssetJSON(f.Content, pcieSlotDeviceNames, pcieSlotSerials); err == nil { result.Hardware = hw } } @@ -182,6 +214,9 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er if result.Hardware != nil { applyGPUStatusFromEvents(result.Hardware, result.Events) enrichStorageFromSerialFallbackFiles(files, result.Hardware) + // Apply RAID disk serials from audit.log (authoritative: last non-NULL SN change). + // These override redis/component.log serials which may be stale after disk replacement. + applyRAIDSlotSerials(result.Hardware, raidSlotSerials) } return result, nil diff --git a/internal/parser/vendors/inspur/pcie.go b/internal/parser/vendors/inspur/pcie.go index 49c3f41..a143436 100644 --- a/internal/parser/vendors/inspur/pcie.go +++ b/internal/parser/vendors/inspur/pcie.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "regexp" + "strconv" "strings" "git.mchus.pro/mchus/logpile/internal/models" @@ -37,6 +38,84 @@ type PCIeRESTInfo []struct { FwVer string `json:"fw_ver"` } +// ParsePCIeSlotDeviceNames parses devicefrusdr.log and returns a map from integer PCIe slot ID +// to device name string. Used to enrich HddInfo entries in asset.json that lack model names. +func ParsePCIeSlotDeviceNames(content []byte) map[int]string { + info, ok := parsePCIeRESTJSON(content) + if !ok { + return nil + } + result := make(map[int]string, len(info)) + for _, entry := range info { + if entry.Slot <= 0 { + continue + } + name := sanitizePCIeDeviceName(entry.DeviceName) + if name != "" { + result[entry.Slot] = name + } + } + if len(result) == 0 { + return nil + } + return result +} + +// parsePCIeRESTJSON parses the RESTful PCIE Device info JSON from devicefrusdr.log content. +func parsePCIeRESTJSON(content []byte) (PCIeRESTInfo, bool) { + text := string(content) + startMarker := "RESTful PCIE Device info:" + endMarker := "BMC sdr Info:" + + startIdx := strings.Index(text, startMarker) + if startIdx == -1 { + return nil, false + } + endIdx := strings.Index(text[startIdx:], endMarker) + if endIdx == -1 { + endIdx = len(text) - startIdx + } + jsonText := strings.TrimSpace(text[startIdx+len(startMarker) : startIdx+endIdx]) + + var info PCIeRESTInfo + if err := json.Unmarshal([]byte(jsonText), &info); err != nil { + return nil, false + } + return info, true +} + +// ParsePCIeNVMeLocToSlot parses devicefrusdr.log and returns a map from NVMe location number +// (the numeric suffix in "#NVME0", "#NVME2", etc.) to the integer PCIe slot ID. +// This is used to correlate audit.log NVMe disk numbers with HddInfo PcieSlot values. +func ParsePCIeNVMeLocToSlot(content []byte) map[int]int { + info, ok := parsePCIeRESTJSON(content) + if !ok { + return nil + } + + nvmeLocRegex := regexp.MustCompile(`(?i)^#NVME(\d+)$`) + result := make(map[int]int) + for _, entry := range info { + if entry.Slot <= 0 { + continue + } + loc := strings.TrimSpace(entry.Location) + m := nvmeLocRegex.FindStringSubmatch(loc) + if m == nil { + continue + } + locNum, err := strconv.Atoi(m[1]) + if err != nil { + continue + } + result[locNum] = entry.Slot + } + if len(result) == 0 { + return nil + } + return result +} + // ParsePCIeDevices parses RESTful PCIE Device info from devicefrusdr.log func ParsePCIeDevices(content []byte) []models.PCIeDevice { text := string(content) diff --git a/internal/parser/vendors/inspur/storage_serial_fallback.go b/internal/parser/vendors/inspur/storage_serial_fallback.go index d1abc10..4edb64a 100644 --- a/internal/parser/vendors/inspur/storage_serial_fallback.go +++ b/internal/parser/vendors/inspur/storage_serial_fallback.go @@ -73,6 +73,24 @@ func looksLikeStorageSerial(v string) bool { return hasLetter && hasDigit } +// applyRAIDSlotSerials updates storage serial numbers using the slot→serial map +// derived from audit.log RAID SN change events. Overwrites existing serials since +// audit.log represents the authoritative current state after all disk replacements. +func applyRAIDSlotSerials(hw *models.HardwareConfig, serials map[string]string) { + if hw == nil || len(serials) == 0 { + return + } + for i := range hw.Storage { + slot := strings.TrimSpace(hw.Storage[i].Slot) + if slot == "" { + continue + } + if sn, ok := serials[slot]; ok && sn != "" { + hw.Storage[i].SerialNumber = sn + } + } +} + func applyStorageSerialFallback(hw *models.HardwareConfig, serials []string) { if hw == nil || len(hw.Storage) == 0 || len(serials) == 0 { return diff --git a/internal/parser/vendors/inspur/storage_serial_fallback_test.go b/internal/parser/vendors/inspur/storage_serial_fallback_test.go index af556ac..40f9fa6 100644 --- a/internal/parser/vendors/inspur/storage_serial_fallback_test.go +++ b/internal/parser/vendors/inspur/storage_serial_fallback_test.go @@ -26,7 +26,7 @@ func TestParseAssetJSON_HddSlotFallbackAndPresence(t *testing.T) { ] }`) - hw, err := ParseAssetJSON(content) + hw, err := ParseAssetJSON(content, nil, nil) if err != nil { t.Fatalf("ParseAssetJSON failed: %v", err) }