fix: dedup GPUs across multiple chassis PCIeDevice trees in Redfish collector
Supermicro HGX exposes each GPU under both Chassis/1/PCIeDevices and a dedicated Chassis/HGX_GPU_SXM_N/PCIeDevices. gpuDocDedupKey was keying by @odata.id path, so identical GPUs with the same serial were not deduplicated across sources. Now stable identifiers (serial → BDF → slot+model) take priority over path. Also includes Inspur parser improvements: NVMe model/serial enrichment from devicefrusdr.log and audit.log, RAID drive slot normalization to BP notation, PSU slot normalization, BMC/CPLD/VR firmware from RESTful version info section, and parser version bump to 1.8. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
25
internal/parser/vendors/inspur/asset.go
vendored
25
internal/parser/vendors/inspur/asset.go
vendored
@@ -94,8 +94,12 @@ type AssetJSON struct {
|
||||
} `json:"PcieInfo"`
|
||||
}
|
||||
|
||||
// ParseAssetJSON parses Inspur asset.json content
|
||||
func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) {
|
||||
// ParseAssetJSON parses Inspur asset.json content.
|
||||
// - pcieSlotDeviceNames: optional map from integer PCIe slot ID to device name string,
|
||||
// sourced from devicefrusdr.log PCIe REST section. Fills missing NVMe model names.
|
||||
// - pcieSlotSerials: optional map from integer PCIe slot ID to serial number string,
|
||||
// sourced from audit.log SN-changed events. Fills missing NVMe serial numbers.
|
||||
func ParseAssetJSON(content []byte, pcieSlotDeviceNames map[int]string, pcieSlotSerials map[int]string) (*models.HardwareConfig, error) {
|
||||
var asset AssetJSON
|
||||
if err := json.Unmarshal(content, &asset); err != nil {
|
||||
return nil, err
|
||||
@@ -175,6 +179,23 @@ func ParseAssetJSON(content []byte) (*models.HardwareConfig, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Enrich model name from PCIe device name (supplied from devicefrusdr.log).
|
||||
// BMC does not populate HddInfo.ModelName for NVMe drives, but the PCIe REST
|
||||
// section in devicefrusdr.log carries the drive model as device_name.
|
||||
if modelName == "" && hdd.PcieSlot > 0 && len(pcieSlotDeviceNames) > 0 {
|
||||
if devName, ok := pcieSlotDeviceNames[hdd.PcieSlot]; ok && devName != "" {
|
||||
modelName = devName
|
||||
}
|
||||
}
|
||||
|
||||
// Enrich serial number from audit.log SN-changed events (supplied via pcieSlotSerials).
|
||||
// BMC asset.json does not carry NVMe serial numbers; audit.log logs every SN change.
|
||||
if serial == "" && hdd.PcieSlot > 0 && len(pcieSlotSerials) > 0 {
|
||||
if sn, ok := pcieSlotSerials[hdd.PcieSlot]; ok && sn != "" {
|
||||
serial = sn
|
||||
}
|
||||
}
|
||||
|
||||
storageType := "HDD"
|
||||
if hdd.DiskInterfaceType == 5 {
|
||||
storageType = "NVMe"
|
||||
|
||||
@@ -28,7 +28,7 @@ func TestParseAssetJSON_NVIDIAGPUModelFromPCIIDs(t *testing.T) {
|
||||
}]
|
||||
}`)
|
||||
|
||||
hw, err := ParseAssetJSON(raw)
|
||||
hw, err := ParseAssetJSON(raw, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseAssetJSON failed: %v", err)
|
||||
}
|
||||
|
||||
94
internal/parser/vendors/inspur/audit.go
vendored
Normal file
94
internal/parser/vendors/inspur/audit.go
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
package inspur
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Patterns for the "SN changed" events that are scanned for in audit.log.
var (
	// auditSNChangedNVMeRegex recognizes NVMe events shaped like
	// "Front Back Plane N NVMe DiskM SN changed from X to Y".
	// Submatch 1 is the disk number M, submatch 2 is the new serial Y.
	auditSNChangedNVMeRegex = regexp.MustCompile(`NVMe Disk(\d+)\s+SN changed from \S+\s+to\s+(\S+)`)

	// auditSNChangedRAIDRegex recognizes RAID drive events shaped like
	// "Raid(Pcie Slot:N) HDD(enclosure id:E slot:S) SN changed from X to Y".
	// Submatches, in order: PCIe slot N, enclosure id E, drive slot S, new serial Y.
	auditSNChangedRAIDRegex = regexp.MustCompile(`Raid\(Pcie Slot:(\d+)\) HDD\(enclosure id:(\d+) slot:(\d+)\)\s+SN changed from \S+\s+to\s+(\S+)`)
)
|
||||
|
||||
// ParseAuditLogNVMeSerials parses audit.log and returns the final (latest) serial number
|
||||
// per NVMe disk number. The disk number matches the numeric suffix in PCIe location
|
||||
// strings like "#NVME0", "#NVME2", etc. from devicefrusdr.log.
|
||||
// Entries where the serial changed to "NULL" are excluded.
|
||||
func ParseAuditLogNVMeSerials(content []byte) map[int]string {
|
||||
serials := make(map[int]string)
|
||||
|
||||
for _, line := range strings.Split(string(content), "\n") {
|
||||
m := auditSNChangedNVMeRegex.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
diskNum, err := strconv.Atoi(m[1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
serial := strings.TrimSpace(m[2])
|
||||
if strings.EqualFold(serial, "NULL") || serial == "" {
|
||||
delete(serials, diskNum)
|
||||
} else {
|
||||
serials[diskNum] = serial
|
||||
}
|
||||
}
|
||||
if len(serials) == 0 {
|
||||
return nil
|
||||
}
|
||||
return serials
|
||||
}
|
||||
|
||||
// ParseAuditLogRAIDSerials parses audit.log and returns the final (latest) serial number
|
||||
// per RAID backplane disk. Key format is "BP{enclosure_id-1}:{slot_num}" (e.g. "BP0:0").
|
||||
//
|
||||
// Each disk slot is claimed by a specific RAID controller (Pcie Slot:N). NULL events from
|
||||
// an old controller do not clear serials assigned by a newer controller, preventing stale
|
||||
// deletions when disks are migrated between RAID arrays.
|
||||
func ParseAuditLogRAIDSerials(content []byte) map[string]string {
|
||||
// owner tracks which PCIe RAID controller slot last assigned a serial to a disk key.
|
||||
serials := make(map[string]string)
|
||||
owner := make(map[string]int)
|
||||
|
||||
for _, line := range strings.Split(string(content), "\n") {
|
||||
m := auditSNChangedRAIDRegex.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
pcieSlot, err := strconv.Atoi(m[1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
enclosureID, err := strconv.Atoi(m[2])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
slotNum, err := strconv.Atoi(m[3])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
serial := strings.TrimSpace(m[4])
|
||||
key := fmt.Sprintf("BP%d:%d", enclosureID-1, slotNum)
|
||||
if strings.EqualFold(serial, "NULL") || serial == "" {
|
||||
// Only clear if this controller was the last to set the serial.
|
||||
if owner[key] == pcieSlot {
|
||||
delete(serials, key)
|
||||
delete(owner, key)
|
||||
}
|
||||
} else {
|
||||
serials[key] = serial
|
||||
owner[key] = pcieSlot
|
||||
}
|
||||
}
|
||||
if len(serials) == 0 {
|
||||
return nil
|
||||
}
|
||||
return serials
|
||||
}
|
||||
57
internal/parser/vendors/inspur/component.go
vendored
57
internal/parser/vendors/inspur/component.go
vendored
@@ -713,6 +713,63 @@ func extractComponentFirmware(text string, hw *models.HardwareConfig) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Extract BMC, CPLD and VR firmware from RESTful version info section.
|
||||
// The JSON is a flat array: [{"id":N,"dev_name":"...","dev_version":"..."}, ...]
|
||||
reVer := regexp.MustCompile(`RESTful version info:\s*(\[[\s\S]*?\])\s*RESTful`)
|
||||
if match := reVer.FindStringSubmatch(text); match != nil {
|
||||
type verEntry struct {
|
||||
DevName string `json:"dev_name"`
|
||||
DevVersion string `json:"dev_version"`
|
||||
}
|
||||
var entries []verEntry
|
||||
if err := json.Unmarshal([]byte(match[1]), &entries); err == nil {
|
||||
for _, e := range entries {
|
||||
name := normalizeVersionInfoName(e.DevName)
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
version := strings.TrimSpace(e.DevVersion)
|
||||
if version == "" {
|
||||
continue
|
||||
}
|
||||
if existingFW[name] {
|
||||
continue
|
||||
}
|
||||
hw.Firmware = append(hw.Firmware, models.FirmwareInfo{
|
||||
DeviceName: name,
|
||||
Version: version,
|
||||
})
|
||||
existingFW[name] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// versionInfoPSUNameRegex matches PSU entries ("PSU_0", "psu_1", ...) in the
// RESTful version info list. It is compiled once at package scope because
// normalizeVersionInfoName is invoked for every entry of that list.
var versionInfoPSUNameRegex = regexp.MustCompile(`(?i)^PSU_\d+$`)

// normalizeVersionInfoName converts a RESTful version info dev_name into a
// clean firmware label.
//
// Rules, applied in order to the whitespace-trimmed name:
//   - empty name             -> "" (skip)
//   - "PSU_<digits>"         -> "" (skip; PSU firmware comes from the PSU info section)
//   - prefix "inactivate("   -> "" (skip; inactive BMC partition)
//   - prefix "activate("     -> "BMC"
//   - trailing "Version"     -> removed, e.g. "MainBoard0CPLDVersion" -> "MainBoard0CPLD"
//
// All comparisons are case-insensitive. The returned label is trimmed again,
// so a name consisting only of "Version" yields "".
func normalizeVersionInfoName(name string) string {
	name = strings.TrimSpace(name)
	if name == "" {
		return ""
	}
	if versionInfoPSUNameRegex.MatchString(name) {
		return ""
	}

	lower := strings.ToLower(name)
	switch {
	case strings.HasPrefix(lower, "inactivate("):
		return ""
	case strings.HasPrefix(lower, "activate("):
		return "BMC"
	}

	// Compare the trailing bytes of the original string rather than measuring
	// the suffix on a lowercased copy: lowercasing can change byte length for
	// non-ASCII input, which would make the slice cut at the wrong offset.
	const suffix = "version"
	if cut := len(name) - len(suffix); cut >= 0 && strings.EqualFold(name[cut:], suffix) {
		name = name[:cut]
	}
	return strings.TrimSpace(name)
}
|
||||
|
||||
// DiskBackplaneRESTInfo represents the RESTful diskbackplane info structure
|
||||
|
||||
39
internal/parser/vendors/inspur/parser.go
vendored
39
internal/parser/vendors/inspur/parser.go
vendored
@@ -16,7 +16,7 @@ import (
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
// IMPORTANT: Increment this version when making changes to parser logic!
|
||||
const parserVersion = "1.5"
|
||||
const parserVersion = "1.8"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
@@ -95,9 +95,41 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
|
||||
Sensors: make([]models.SensorReading, 0),
|
||||
}
|
||||
|
||||
// Pre-parse enrichment maps from devicefrusdr.log for use inside ParseAssetJSON.
|
||||
// BMC does not populate HddInfo.ModelName or SerialNumber for NVMe drives.
|
||||
var pcieSlotDeviceNames map[int]string
|
||||
var nvmeLocToSlot map[int]int
|
||||
if f := parser.FindFileByName(files, "devicefrusdr.log"); f != nil {
|
||||
pcieSlotDeviceNames = ParsePCIeSlotDeviceNames(f.Content)
|
||||
nvmeLocToSlot = ParsePCIeNVMeLocToSlot(f.Content)
|
||||
}
|
||||
|
||||
// Parse NVMe serial numbers from audit.log: every disk SN change is logged there.
|
||||
// Combine with the NVMe loc→slot mapping to build pcieSlot→serial map.
|
||||
// Also parse RAID disk serials by backplane slot key (e.g. "BP0:0").
|
||||
var pcieSlotSerials map[int]string
|
||||
var raidSlotSerials map[string]string
|
||||
if f := parser.FindFileByName(files, "audit.log"); f != nil {
|
||||
if len(nvmeLocToSlot) > 0 {
|
||||
nvmeDiskSerials := ParseAuditLogNVMeSerials(f.Content)
|
||||
if len(nvmeDiskSerials) > 0 {
|
||||
pcieSlotSerials = make(map[int]string, len(nvmeDiskSerials))
|
||||
for diskNum, serial := range nvmeDiskSerials {
|
||||
if slot, ok := nvmeLocToSlot[diskNum]; ok {
|
||||
pcieSlotSerials[slot] = serial
|
||||
}
|
||||
}
|
||||
if len(pcieSlotSerials) == 0 {
|
||||
pcieSlotSerials = nil
|
||||
}
|
||||
}
|
||||
}
|
||||
raidSlotSerials = ParseAuditLogRAIDSerials(f.Content)
|
||||
}
|
||||
|
||||
// Parse asset.json first (base hardware info)
|
||||
if f := parser.FindFileByName(files, "asset.json"); f != nil {
|
||||
if hw, err := ParseAssetJSON(f.Content); err == nil {
|
||||
if hw, err := ParseAssetJSON(f.Content, pcieSlotDeviceNames, pcieSlotSerials); err == nil {
|
||||
result.Hardware = hw
|
||||
}
|
||||
}
|
||||
@@ -182,6 +214,9 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
|
||||
if result.Hardware != nil {
|
||||
applyGPUStatusFromEvents(result.Hardware, result.Events)
|
||||
enrichStorageFromSerialFallbackFiles(files, result.Hardware)
|
||||
// Apply RAID disk serials from audit.log (authoritative: last non-NULL SN change).
|
||||
// These override redis/component.log serials which may be stale after disk replacement.
|
||||
applyRAIDSlotSerials(result.Hardware, raidSlotSerials)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
|
||||
79
internal/parser/vendors/inspur/pcie.go
vendored
79
internal/parser/vendors/inspur/pcie.go
vendored
@@ -4,6 +4,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
@@ -37,6 +38,84 @@ type PCIeRESTInfo []struct {
|
||||
FwVer string `json:"fw_ver"`
|
||||
}
|
||||
|
||||
// ParsePCIeSlotDeviceNames parses devicefrusdr.log and returns a map from integer PCIe slot ID
|
||||
// to device name string. Used to enrich HddInfo entries in asset.json that lack model names.
|
||||
func ParsePCIeSlotDeviceNames(content []byte) map[int]string {
|
||||
info, ok := parsePCIeRESTJSON(content)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
result := make(map[int]string, len(info))
|
||||
for _, entry := range info {
|
||||
if entry.Slot <= 0 {
|
||||
continue
|
||||
}
|
||||
name := sanitizePCIeDeviceName(entry.DeviceName)
|
||||
if name != "" {
|
||||
result[entry.Slot] = name
|
||||
}
|
||||
}
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// parsePCIeRESTJSON parses the RESTful PCIE Device info JSON from devicefrusdr.log content.
|
||||
func parsePCIeRESTJSON(content []byte) (PCIeRESTInfo, bool) {
|
||||
text := string(content)
|
||||
startMarker := "RESTful PCIE Device info:"
|
||||
endMarker := "BMC sdr Info:"
|
||||
|
||||
startIdx := strings.Index(text, startMarker)
|
||||
if startIdx == -1 {
|
||||
return nil, false
|
||||
}
|
||||
endIdx := strings.Index(text[startIdx:], endMarker)
|
||||
if endIdx == -1 {
|
||||
endIdx = len(text) - startIdx
|
||||
}
|
||||
jsonText := strings.TrimSpace(text[startIdx+len(startMarker) : startIdx+endIdx])
|
||||
|
||||
var info PCIeRESTInfo
|
||||
if err := json.Unmarshal([]byte(jsonText), &info); err != nil {
|
||||
return nil, false
|
||||
}
|
||||
return info, true
|
||||
}
|
||||
|
||||
// ParsePCIeNVMeLocToSlot parses devicefrusdr.log and returns a map from NVMe location number
|
||||
// (the numeric suffix in "#NVME0", "#NVME2", etc.) to the integer PCIe slot ID.
|
||||
// This is used to correlate audit.log NVMe disk numbers with HddInfo PcieSlot values.
|
||||
func ParsePCIeNVMeLocToSlot(content []byte) map[int]int {
|
||||
info, ok := parsePCIeRESTJSON(content)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
nvmeLocRegex := regexp.MustCompile(`(?i)^#NVME(\d+)$`)
|
||||
result := make(map[int]int)
|
||||
for _, entry := range info {
|
||||
if entry.Slot <= 0 {
|
||||
continue
|
||||
}
|
||||
loc := strings.TrimSpace(entry.Location)
|
||||
m := nvmeLocRegex.FindStringSubmatch(loc)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
locNum, err := strconv.Atoi(m[1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
result[locNum] = entry.Slot
|
||||
}
|
||||
if len(result) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ParsePCIeDevices parses RESTful PCIE Device info from devicefrusdr.log
|
||||
func ParsePCIeDevices(content []byte) []models.PCIeDevice {
|
||||
text := string(content)
|
||||
|
||||
@@ -73,6 +73,24 @@ func looksLikeStorageSerial(v string) bool {
|
||||
return hasLetter && hasDigit
|
||||
}
|
||||
|
||||
// applyRAIDSlotSerials updates storage serial numbers using the slot→serial map
|
||||
// derived from audit.log RAID SN change events. Overwrites existing serials since
|
||||
// audit.log represents the authoritative current state after all disk replacements.
|
||||
func applyRAIDSlotSerials(hw *models.HardwareConfig, serials map[string]string) {
|
||||
if hw == nil || len(serials) == 0 {
|
||||
return
|
||||
}
|
||||
for i := range hw.Storage {
|
||||
slot := strings.TrimSpace(hw.Storage[i].Slot)
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
if sn, ok := serials[slot]; ok && sn != "" {
|
||||
hw.Storage[i].SerialNumber = sn
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageSerialFallback(hw *models.HardwareConfig, serials []string) {
|
||||
if hw == nil || len(hw.Storage) == 0 || len(serials) == 0 {
|
||||
return
|
||||
|
||||
@@ -26,7 +26,7 @@ func TestParseAssetJSON_HddSlotFallbackAndPresence(t *testing.T) {
|
||||
]
|
||||
}`)
|
||||
|
||||
hw, err := ParseAssetJSON(content)
|
||||
hw, err := ParseAssetJSON(content, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseAssetJSON failed: %v", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user