From 440959483e28081ad640696e23b1849e5d58008a Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 28 May 2026 03:38:04 +0300 Subject: [PATCH] fix(inspur): correctly handle PCIe Assert/Deassert GPU fault events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes for IDL event processing: 1. idl.go: include EventType in dedup key so Deassert events are no longer silently dropped as duplicates of their Assert counterparts. 2. gpu_status.go: treat Deassert events as clearing all GPU faults — previously the code re-applied the same faulty GPU set from the description, leaving GPUs stuck in Critical even after alarm cleared. 3. reanimator_models/converter: add bmc_event_summary section to the Reanimator export — a deduplicated Critical/Warning event table with Active/Resolved status derived from Assert/Deassert pairs. Co-Authored-By: Claude Sonnet 4.6 --- internal/exporter/reanimator_converter.go | 80 ++++++++++++++++++- internal/exporter/reanimator_models.go | 32 +++++--- internal/parser/vendors/inspur/gpu_status.go | 4 +- .../vendors/inspur/hgx_gpu_status_test.go | 34 ++++++++ internal/parser/vendors/inspur/idl.go | 2 +- 5 files changed, 137 insertions(+), 15 deletions(-) diff --git a/internal/exporter/reanimator_converter.go b/internal/exporter/reanimator_converter.go index a6c25de..d6b2f37 100644 --- a/internal/exporter/reanimator_converter.go +++ b/internal/exporter/reanimator_converter.go @@ -49,9 +49,10 @@ func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, erro Memory: dedupeMemory(convertMemoryFromDevices(devices, collectedAt)), Storage: dedupeStorage(convertStorageFromDevices(devices, collectedAt)), PCIeDevices: dedupePCIe(convertPCIeFromDevices(devices, collectedAt)), - PowerSupplies: dedupePSUs(convertPSUsFromDevices(devices, collectedAt)), - Sensors: convertSensors(result.Sensors), - EventLogs: convertEventLogs(result.Events, collectedAt), + PowerSupplies: 
dedupePSUs(convertPSUsFromDevices(devices, collectedAt)), + Sensors: convertSensors(result.Sensors), + BMCEventSummary: buildBMCEventSummary(result.Events, collectedAt), + EventLogs: convertEventLogs(result.Events, collectedAt), }, } @@ -2457,3 +2458,76 @@ func inferTargetHost(targetHost, filename string) string { return "" } + +// buildBMCEventSummary produces a summary table of Critical/Warning BMC events +// with their resolution status derived from Assert/Deassert pairs. +func buildBMCEventSummary(events []models.Event, collectedAt string) []ReanimatorBMCEventRow { + type assertKey struct { + id string + desc string + } + type eventPair struct { + assertEvent *models.Event + deassertEvent *models.Event + } + + pairs := make(map[assertKey]*eventPair) + order := make([]assertKey, 0) + + for i := range events { + e := &events[i] + if e.Severity != models.SeverityCritical && e.Severity != models.SeverityWarning { + continue + } + key := assertKey{id: e.ID, desc: e.Description} + p, exists := pairs[key] + if !exists { + p = &eventPair{} + pairs[key] = p + order = append(order, key) + } + switch strings.ToLower(e.EventType) { + case "deassert": + if p.deassertEvent == nil || e.Timestamp.After(p.deassertEvent.Timestamp) { + p.deassertEvent = e + } + default: + if p.assertEvent == nil || e.Timestamp.Before(p.assertEvent.Timestamp) { + p.assertEvent = e + } + } + } + + rows := make([]ReanimatorBMCEventRow, 0, len(order)) + for _, key := range order { + p := pairs[key] + ref := p.assertEvent + if ref == nil { + ref = p.deassertEvent + } + if ref == nil { + continue + } + + status := "Active" + resolvedAt := "" + if p.deassertEvent != nil { + status = "Resolved" + resolvedAt = formatEventLogTime(p.deassertEvent.Timestamp, collectedAt) + } + + rows = append(rows, ReanimatorBMCEventRow{ + Severity: normalizeEventLogSeverity(ref.Severity), + Component: strings.ToUpper(strings.TrimSpace(ref.SensorType)), + MessageID: strings.TrimSpace(ref.ID), + Timestamp: 
formatEventLogTime(ref.Timestamp, collectedAt), + Description: strings.TrimSpace(ref.Description), + Status: status, + ResolvedAt: resolvedAt, + }) + } + if len(rows) == 0 { + return nil + } + return rows +} diff --git a/internal/exporter/reanimator_models.go b/internal/exporter/reanimator_models.go index aed2cff..2c11899 100644 --- a/internal/exporter/reanimator_models.go +++ b/internal/exporter/reanimator_models.go @@ -12,16 +12,28 @@ type ReanimatorExport struct { // ReanimatorHardware contains all hardware components type ReanimatorHardware struct { - Board ReanimatorBoard `json:"board"` - Firmware []ReanimatorFirmware `json:"firmware,omitempty"` - CPUs []ReanimatorCPU `json:"cpus,omitempty"` - Memory []ReanimatorMemory `json:"memory,omitempty"` - Storage []ReanimatorStorage `json:"storage,omitempty"` - PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"` - PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"` - Sensors *ReanimatorSensors `json:"sensors,omitempty"` - EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"` - PlatformConfig map[string]any `json:"platform_config,omitempty"` + Board ReanimatorBoard `json:"board"` + Firmware []ReanimatorFirmware `json:"firmware,omitempty"` + CPUs []ReanimatorCPU `json:"cpus,omitempty"` + Memory []ReanimatorMemory `json:"memory,omitempty"` + Storage []ReanimatorStorage `json:"storage,omitempty"` + PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"` + PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"` + Sensors *ReanimatorSensors `json:"sensors,omitempty"` + BMCEventSummary []ReanimatorBMCEventRow `json:"bmc_event_summary,omitempty"` + EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"` + PlatformConfig map[string]any `json:"platform_config,omitempty"` +} + +// ReanimatorBMCEventRow is one row in the BMC critical/warning event summary table. 
// ReanimatorBMCEventRow describes a single entry of the BMC event summary
// table that is exported under the "bmc_event_summary" key.
type ReanimatorBMCEventRow struct {
	// Severity is the normalized severity label of the event.
	Severity string `json:"severity"`
	// Component is the upper-cased sensor type the event was reported for.
	Component string `json:"component"`
	// MessageID is the identifier of the event.
	MessageID string `json:"message_id"`
	// Timestamp is the formatted time of the event the row is based on.
	Timestamp string `json:"timestamp"`
	// Description is the trimmed event description.
	Description string `json:"description"`
	// Status is "Active" or "Resolved".
	Status string `json:"status"`
	// ResolvedAt is the formatted time of the clearing (Deassert) event.
	// It is left empty, and omitted from the JSON output, for active rows.
	ResolvedAt string `json:"resolved_at,omitempty"`
}
"17FFB002", + EventType: "Deassert", + Timestamp: time.Date(2026, 5, 27, 13, 15, 56, 0, time.FixedZone("UTC+8", 8*3600)), + Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6", + }, + } + + applyGPUStatusFromEvents(hw, events) + + for _, gpu := range hw.GPUs { + if gpu.Status != "OK" { + t.Fatalf("expected %s to recover to OK after Deassert, got %q", gpu.Slot, gpu.Status) + } + } +} + func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) { content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`) diff --git a/internal/parser/vendors/inspur/idl.go b/internal/parser/vendors/inspur/idl.go index bfa7e0c..96214ca 100644 --- a/internal/parser/vendors/inspur/idl.go +++ b/internal/parser/vendors/inspur/idl.go @@ -48,7 +48,7 @@ func ParseIDLLog(content []byte) []models.Event { description = cleanDescription(description) // Create unique key for deduplication - eventKey := eventID + "|" + description + eventKey := eventID + "|" + eventType + "|" + description if seenEvents[eventKey] { continue }