fix: dedup GPUs across multiple chassis PCIeDevice trees in Redfish collector

Supermicro HGX exposes each GPU under both Chassis/1/PCIeDevices and a
dedicated Chassis/HGX_GPU_SXM_N/PCIeDevices. gpuDocDedupKey was keying
by @odata.id path, so identical GPUs with the same serial were not
deduplicated across sources. Now stable identifiers (serial → BDF →
slot+model) take priority over path.

Also includes Inspur parser improvements: NVMe model/serial enrichment
from devicefrusdr.log and audit.log, RAID drive slot normalization to
BP notation, PSU slot normalization, BMC/CPLD/VR firmware from RESTful
version info section, and parser version bump to 1.8.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-03-06 14:44:36 +03:00
parent 62d6ad6f66
commit 9df29b1be9
9 changed files with 362 additions and 8 deletions

View File

@@ -94,8 +94,12 @@ type AssetJSON struct {
} `json:"PcieInfo"`
}
// ParseAssetJSON parses Inspur asset.json content
func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) {
// ParseAssetJSON parses Inspur asset.json content.
// - pcieSlotDeviceNames: optional map from integer PCIe slot ID to device name string,
// sourced from devicefrusdr.log PCIe REST section. Fills missing NVMe model names.
// - pcieSlotSerials: optional map from integer PCIe slot ID to serial number string,
// sourced from audit.log SN-changed events. Fills missing NVMe serial numbers.
func ParseAssetJSON(content []byte, pcieSlotDeviceNames map[int]string, pcieSlotSerials map[int]string) (*models.HardwareConfig, error) {
var asset AssetJSON
if err := json.Unmarshal(content, &asset); err != nil {
return nil, err
@@ -175,6 +179,23 @@ func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) {
continue
}
// Enrich model name from PCIe device name (supplied from devicefrusdr.log).
// BMC does not populate HddInfo.ModelName for NVMe drives, but the PCIe REST
// section in devicefrusdr.log carries the drive model as device_name.
if modelName == "" && hdd.PcieSlot > 0 && len(pcieSlotDeviceNames) > 0 {
if devName, ok := pcieSlotDeviceNames[hdd.PcieSlot]; ok && devName != "" {
modelName = devName
}
}
// Enrich serial number from audit.log SN-changed events (supplied via pcieSlotSerials).
// BMC asset.json does not carry NVMe serial numbers; audit.log logs every SN change.
if serial == "" && hdd.PcieSlot > 0 && len(pcieSlotSerials) > 0 {
if sn, ok := pcieSlotSerials[hdd.PcieSlot]; ok && sn != "" {
serial = sn
}
}
storageType := "HDD"
if hdd.DiskInterfaceType == 5 {
storageType = "NVMe"

View File

@@ -28,7 +28,7 @@ func TestParseAssetJSON_NVIDIAGPUModelFromPCIIDs(t *testing.T) {
}]
}`)
hw, err := ParseAssetJSON(raw)
hw, err := ParseAssetJSON(raw, nil, nil)
if err != nil {
t.Fatalf("ParseAssetJSON failed: %v", err)
}

94
internal/parser/vendors/inspur/audit.go vendored Normal file
View File

@@ -0,0 +1,94 @@
package inspur
import (
"fmt"
"regexp"
"strconv"
"strings"
)
// auditSNChangedNVMeRegex matches:
// "Front Back Plane N NVMe DiskM SN changed from X to Y"
// Captures: disk_num, new_serial
var auditSNChangedNVMeRegex = regexp.MustCompile(`NVMe Disk(\d+)\s+SN changed from \S+\s+to\s+(\S+)`)
// auditSNChangedRAIDRegex matches:
// "Raid(Pcie Slot:N) HDD(enclosure id:E slot:S) SN changed from X to Y"
// Captures: pcie_slot, enclosure_id, slot_num, new_serial
var auditSNChangedRAIDRegex = regexp.MustCompile(`Raid\(Pcie Slot:(\d+)\) HDD\(enclosure id:(\d+) slot:(\d+)\)\s+SN changed from \S+\s+to\s+(\S+)`)
// ParseAuditLogNVMeSerials parses audit.log and returns the final (latest) serial number
// per NVMe disk number. The disk number matches the numeric suffix in PCIe location
// strings like "#NVME0", "#NVME2", etc. from devicefrusdr.log.
// Entries where the serial changed to "NULL" are excluded.
func ParseAuditLogNVMeSerials(content []byte) map[int]string {
serials := make(map[int]string)
for _, line := range strings.Split(string(content), "\n") {
m := auditSNChangedNVMeRegex.FindStringSubmatch(line)
if m == nil {
continue
}
diskNum, err := strconv.Atoi(m[1])
if err != nil {
continue
}
serial := strings.TrimSpace(m[2])
if strings.EqualFold(serial, "NULL") || serial == "" {
delete(serials, diskNum)
} else {
serials[diskNum] = serial
}
}
if len(serials) == 0 {
return nil
}
return serials
}
// ParseAuditLogRAIDSerials parses audit.log and returns the final (latest) serial number
// per RAID backplane disk. Key format is "BP{enclosure_id-1}:{slot_num}" (e.g. "BP0:0").
//
// Each disk slot is claimed by a specific RAID controller (Pcie Slot:N). NULL events from
// an old controller do not clear serials assigned by a newer controller, preventing stale
// deletions when disks are migrated between RAID arrays.
func ParseAuditLogRAIDSerials(content []byte) map[string]string {
// owner tracks which PCIe RAID controller slot last assigned a serial to a disk key.
serials := make(map[string]string)
owner := make(map[string]int)
for _, line := range strings.Split(string(content), "\n") {
m := auditSNChangedRAIDRegex.FindStringSubmatch(line)
if m == nil {
continue
}
pcieSlot, err := strconv.Atoi(m[1])
if err != nil {
continue
}
enclosureID, err := strconv.Atoi(m[2])
if err != nil {
continue
}
slotNum, err := strconv.Atoi(m[3])
if err != nil {
continue
}
serial := strings.TrimSpace(m[4])
key := fmt.Sprintf("BP%d:%d", enclosureID-1, slotNum)
if strings.EqualFold(serial, "NULL") || serial == "" {
// Only clear if this controller was the last to set the serial.
if owner[key] == pcieSlot {
delete(serials, key)
delete(owner, key)
}
} else {
serials[key] = serial
owner[key] = pcieSlot
}
}
if len(serials) == 0 {
return nil
}
return serials
}

View File

@@ -713,6 +713,63 @@ func extractComponentFirmware(text string, hw *models.HardwareConfig) {
}
}
}
// Extract BMC, CPLD and VR firmware from RESTful version info section.
// The JSON is a flat array: [{"id":N,"dev_name":"...","dev_version":"..."}, ...]
reVer := regexp.MustCompile(`RESTful version info:\s*(\[[\s\S]*?\])\s*RESTful`)
if match := reVer.FindStringSubmatch(text); match != nil {
type verEntry struct {
DevName string `json:"dev_name"`
DevVersion string `json:"dev_version"`
}
var entries []verEntry
if err := json.Unmarshal([]byte(match[1]), &entries); err == nil {
for _, e := range entries {
name := normalizeVersionInfoName(e.DevName)
if name == "" {
continue
}
version := strings.TrimSpace(e.DevVersion)
if version == "" {
continue
}
if existingFW[name] {
continue
}
hw.Firmware = append(hw.Firmware, models.FirmwareInfo{
DeviceName: name,
Version: version,
})
existingFW[name] = true
}
}
}
}
// normalizeVersionInfoName converts RESTful version info dev_name to a clean label.
// Returns "" for entries that should be skipped (inactive BMC, PSU slots).
func normalizeVersionInfoName(name string) string {
name = strings.TrimSpace(name)
if name == "" {
return ""
}
// Skip PSU_N entries — firmware already extracted from PSU info section.
if regexp.MustCompile(`(?i)^PSU_\d+$`).MatchString(name) {
return ""
}
// Skip the inactive BMC partition.
if strings.HasPrefix(strings.ToLower(name), "inactivate(") {
return ""
}
// Active BMC: "Activate(BMC1)" → "BMC"
if strings.HasPrefix(strings.ToLower(name), "activate(") {
return "BMC"
}
// Strip trailing "Version" suffix (case-insensitive), e.g. "MainBoard0CPLDVersion" → "MainBoard0CPLD"
if strings.HasSuffix(strings.ToLower(name), "version") {
name = name[:len(name)-len("version")]
}
return strings.TrimSpace(name)
}
// DiskBackplaneRESTInfo represents the RESTful diskbackplane info structure

View File

@@ -16,7 +16,7 @@ import (
// parserVersion - version of this parser module
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.5"
const parserVersion = "1.8"
func init() {
parser.Register(&Parser{})
@@ -95,9 +95,41 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
Sensors: make([]models.SensorReading, 0),
}
// Pre-parse enrichment maps from devicefrusdr.log for use inside ParseAssetJSON.
// BMC does not populate HddInfo.ModelName or SerialNumber for NVMe drives.
var pcieSlotDeviceNames map[int]string
var nvmeLocToSlot map[int]int
if f := parser.FindFileByName(files, "devicefrusdr.log"); f != nil {
pcieSlotDeviceNames = ParsePCIeSlotDeviceNames(f.Content)
nvmeLocToSlot = ParsePCIeNVMeLocToSlot(f.Content)
}
// Parse NVMe serial numbers from audit.log: every disk SN change is logged there.
// Combine with the NVMe loc→slot mapping to build pcieSlot→serial map.
// Also parse RAID disk serials by backplane slot key (e.g. "BP0:0").
var pcieSlotSerials map[int]string
var raidSlotSerials map[string]string
if f := parser.FindFileByName(files, "audit.log"); f != nil {
if len(nvmeLocToSlot) > 0 {
nvmeDiskSerials := ParseAuditLogNVMeSerials(f.Content)
if len(nvmeDiskSerials) > 0 {
pcieSlotSerials = make(map[int]string, len(nvmeDiskSerials))
for diskNum, serial := range nvmeDiskSerials {
if slot, ok := nvmeLocToSlot[diskNum]; ok {
pcieSlotSerials[slot] = serial
}
}
if len(pcieSlotSerials) == 0 {
pcieSlotSerials = nil
}
}
}
raidSlotSerials = ParseAuditLogRAIDSerials(f.Content)
}
// Parse asset.json first (base hardware info)
if f := parser.FindFileByName(files, "asset.json"); f != nil {
if hw, err := ParseAssetJSON(f.Content); err == nil {
if hw, err := ParseAssetJSON(f.Content, pcieSlotDeviceNames, pcieSlotSerials); err == nil {
result.Hardware = hw
}
}
@@ -182,6 +214,9 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
if result.Hardware != nil {
applyGPUStatusFromEvents(result.Hardware, result.Events)
enrichStorageFromSerialFallbackFiles(files, result.Hardware)
// Apply RAID disk serials from audit.log (authoritative: last non-NULL SN change).
// These override redis/component.log serials which may be stale after disk replacement.
applyRAIDSlotSerials(result.Hardware, raidSlotSerials)
}
return result, nil

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
@@ -37,6 +38,84 @@ type PCIeRESTInfo []struct {
FwVer string `json:"fw_ver"`
}
// ParsePCIeSlotDeviceNames parses devicefrusdr.log and returns a map from integer PCIe slot ID
// to device name string. Used to enrich HddInfo entries in asset.json that lack model names.
func ParsePCIeSlotDeviceNames(content []byte) map[int]string {
info, ok := parsePCIeRESTJSON(content)
if !ok {
return nil
}
result := make(map[int]string, len(info))
for _, entry := range info {
if entry.Slot <= 0 {
continue
}
name := sanitizePCIeDeviceName(entry.DeviceName)
if name != "" {
result[entry.Slot] = name
}
}
if len(result) == 0 {
return nil
}
return result
}
// parsePCIeRESTJSON parses the RESTful PCIE Device info JSON from devicefrusdr.log content.
func parsePCIeRESTJSON(content []byte) (PCIeRESTInfo, bool) {
text := string(content)
startMarker := "RESTful PCIE Device info:"
endMarker := "BMC sdr Info:"
startIdx := strings.Index(text, startMarker)
if startIdx == -1 {
return nil, false
}
endIdx := strings.Index(text[startIdx:], endMarker)
if endIdx == -1 {
endIdx = len(text) - startIdx
}
jsonText := strings.TrimSpace(text[startIdx+len(startMarker) : startIdx+endIdx])
var info PCIeRESTInfo
if err := json.Unmarshal([]byte(jsonText), &info); err != nil {
return nil, false
}
return info, true
}
// ParsePCIeNVMeLocToSlot parses devicefrusdr.log and returns a map from NVMe location number
// (the numeric suffix in "#NVME0", "#NVME2", etc.) to the integer PCIe slot ID.
// This is used to correlate audit.log NVMe disk numbers with HddInfo PcieSlot values.
func ParsePCIeNVMeLocToSlot(content []byte) map[int]int {
info, ok := parsePCIeRESTJSON(content)
if !ok {
return nil
}
nvmeLocRegex := regexp.MustCompile(`(?i)^#NVME(\d+)$`)
result := make(map[int]int)
for _, entry := range info {
if entry.Slot <= 0 {
continue
}
loc := strings.TrimSpace(entry.Location)
m := nvmeLocRegex.FindStringSubmatch(loc)
if m == nil {
continue
}
locNum, err := strconv.Atoi(m[1])
if err != nil {
continue
}
result[locNum] = entry.Slot
}
if len(result) == 0 {
return nil
}
return result
}
// ParsePCIeDevices parses RESTful PCIE Device info from devicefrusdr.log
func ParsePCIeDevices(content []byte) []models.PCIeDevice {
text := string(content)

View File

@@ -73,6 +73,24 @@ func looksLikeStorageSerial(v string) bool {
return hasLetter && hasDigit
}
// applyRAIDSlotSerials updates storage serial numbers using the slot→serial map
// derived from audit.log RAID SN change events. Overwrites existing serials since
// audit.log represents the authoritative current state after all disk replacements.
func applyRAIDSlotSerials(hw *models.HardwareConfig, serials map[string]string) {
if hw == nil || len(serials) == 0 {
return
}
for i := range hw.Storage {
slot := strings.TrimSpace(hw.Storage[i].Slot)
if slot == "" {
continue
}
if sn, ok := serials[slot]; ok && sn != "" {
hw.Storage[i].SerialNumber = sn
}
}
}
func applyStorageSerialFallback(hw *models.HardwareConfig, serials []string) {
if hw == nil || len(hw.Storage) == 0 || len(serials) == 0 {
return

View File

@@ -26,7 +26,7 @@ func TestParseAssetJSON_HddSlotFallbackAndPresence(t *testing.T) {
]
}`)
hw, err := ParseAssetJSON(content)
hw, err := ParseAssetJSON(content, nil, nil)
if err != nil {
t.Fatalf("ParseAssetJSON failed: %v", err)
}