fix(inspur): correctly handle PCIe Assert/Deassert GPU fault events
Three related fixes for IDL event processing:

1. idl.go: include EventType in the dedup key so that Deassert events are no longer silently dropped as duplicates of their Assert counterparts.
2. gpu_status.go: treat a Deassert event as clearing all GPU faults. Previously the code re-applied the same faulty GPU set parsed from the description, leaving GPUs stuck in Critical even after the alarm had cleared.
3. reanimator_models/converter: add a bmc_event_summary section to the Reanimator export — a deduplicated Critical/Warning event table whose Active/Resolved status is derived from Assert/Deassert pairs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -49,9 +49,10 @@ func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, erro
|
||||
Memory: dedupeMemory(convertMemoryFromDevices(devices, collectedAt)),
|
||||
Storage: dedupeStorage(convertStorageFromDevices(devices, collectedAt)),
|
||||
PCIeDevices: dedupePCIe(convertPCIeFromDevices(devices, collectedAt)),
|
||||
PowerSupplies: dedupePSUs(convertPSUsFromDevices(devices, collectedAt)),
|
||||
Sensors: convertSensors(result.Sensors),
|
||||
EventLogs: convertEventLogs(result.Events, collectedAt),
|
||||
PowerSupplies: dedupePSUs(convertPSUsFromDevices(devices, collectedAt)),
|
||||
Sensors: convertSensors(result.Sensors),
|
||||
BMCEventSummary: buildBMCEventSummary(result.Events, collectedAt),
|
||||
EventLogs: convertEventLogs(result.Events, collectedAt),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -2457,3 +2458,76 @@ func inferTargetHost(targetHost, filename string) string {
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// buildBMCEventSummary produces a summary table of Critical/Warning BMC events
|
||||
// with their resolution status derived from Assert/Deassert pairs.
|
||||
func buildBMCEventSummary(events []models.Event, collectedAt string) []ReanimatorBMCEventRow {
|
||||
type assertKey struct {
|
||||
id string
|
||||
desc string
|
||||
}
|
||||
type eventPair struct {
|
||||
assertEvent *models.Event
|
||||
deassertEvent *models.Event
|
||||
}
|
||||
|
||||
pairs := make(map[assertKey]*eventPair)
|
||||
order := make([]assertKey, 0)
|
||||
|
||||
for i := range events {
|
||||
e := &events[i]
|
||||
if e.Severity != models.SeverityCritical && e.Severity != models.SeverityWarning {
|
||||
continue
|
||||
}
|
||||
key := assertKey{id: e.ID, desc: e.Description}
|
||||
p, exists := pairs[key]
|
||||
if !exists {
|
||||
p = &eventPair{}
|
||||
pairs[key] = p
|
||||
order = append(order, key)
|
||||
}
|
||||
switch strings.ToLower(e.EventType) {
|
||||
case "deassert":
|
||||
if p.deassertEvent == nil || e.Timestamp.After(p.deassertEvent.Timestamp) {
|
||||
p.deassertEvent = e
|
||||
}
|
||||
default:
|
||||
if p.assertEvent == nil || e.Timestamp.Before(p.assertEvent.Timestamp) {
|
||||
p.assertEvent = e
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rows := make([]ReanimatorBMCEventRow, 0, len(order))
|
||||
for _, key := range order {
|
||||
p := pairs[key]
|
||||
ref := p.assertEvent
|
||||
if ref == nil {
|
||||
ref = p.deassertEvent
|
||||
}
|
||||
if ref == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
status := "Active"
|
||||
resolvedAt := ""
|
||||
if p.deassertEvent != nil {
|
||||
status = "Resolved"
|
||||
resolvedAt = formatEventLogTime(p.deassertEvent.Timestamp, collectedAt)
|
||||
}
|
||||
|
||||
rows = append(rows, ReanimatorBMCEventRow{
|
||||
Severity: normalizeEventLogSeverity(ref.Severity),
|
||||
Component: strings.ToUpper(strings.TrimSpace(ref.SensorType)),
|
||||
MessageID: strings.TrimSpace(ref.ID),
|
||||
Timestamp: formatEventLogTime(ref.Timestamp, collectedAt),
|
||||
Description: strings.TrimSpace(ref.Description),
|
||||
Status: status,
|
||||
ResolvedAt: resolvedAt,
|
||||
})
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return nil
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
@@ -12,16 +12,28 @@ type ReanimatorExport struct {
|
||||
|
||||
// ReanimatorHardware contains all hardware components
|
||||
type ReanimatorHardware struct {
|
||||
Board ReanimatorBoard `json:"board"`
|
||||
Firmware []ReanimatorFirmware `json:"firmware,omitempty"`
|
||||
CPUs []ReanimatorCPU `json:"cpus,omitempty"`
|
||||
Memory []ReanimatorMemory `json:"memory,omitempty"`
|
||||
Storage []ReanimatorStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"`
|
||||
Sensors *ReanimatorSensors `json:"sensors,omitempty"`
|
||||
EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig map[string]any `json:"platform_config,omitempty"`
|
||||
Board ReanimatorBoard `json:"board"`
|
||||
Firmware []ReanimatorFirmware `json:"firmware,omitempty"`
|
||||
CPUs []ReanimatorCPU `json:"cpus,omitempty"`
|
||||
Memory []ReanimatorMemory `json:"memory,omitempty"`
|
||||
Storage []ReanimatorStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"`
|
||||
Sensors *ReanimatorSensors `json:"sensors,omitempty"`
|
||||
BMCEventSummary []ReanimatorBMCEventRow `json:"bmc_event_summary,omitempty"`
|
||||
EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig map[string]any `json:"platform_config,omitempty"`
|
||||
}
|
||||
|
||||
// ReanimatorBMCEventRow is one row in the BMC critical/warning event summary table.
// Rows are produced by buildBMCEventSummary, one per distinct event ID and
// description, and exported under the "bmc_event_summary" JSON key.
|
||||
type ReanimatorBMCEventRow struct {
|
||||
// Severity is the event severity after normalizeEventLogSeverity.
Severity string `json:"severity"`
|
||||
// Component is the event's SensorType, trimmed and upper-cased.
Component string `json:"component"`
|
||||
// MessageID is the trimmed event ID.
MessageID string `json:"message_id"`
|
||||
// Timestamp is the formatted time of the event the row is based on.
Timestamp string `json:"timestamp"`
|
||||
// Description is the trimmed event description.
Description string `json:"description"`
|
||||
// Status is "Active" or "Resolved".
Status string `json:"status"`
|
||||
// ResolvedAt is the formatted Deassert time; it is empty, and omitted from
// the JSON, while the row is still active.
ResolvedAt string `json:"resolved_at,omitempty"`
|
||||
}
|
||||
|
||||
// ReanimatorBoard represents motherboard/server information
|
||||
|
||||
4
internal/parser/vendors/inspur/gpu_status.go
vendored
4
internal/parser/vendors/inspur/gpu_status.go
vendored
@@ -56,10 +56,12 @@ func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event)
|
||||
}
|
||||
|
||||
for _, e := range relevantEvents {
|
||||
// Deassert means the alarm was cleared: all GPUs return to OK.
|
||||
isDeassert := strings.EqualFold(strings.TrimSpace(e.EventType), "Deassert")
|
||||
faultySet := extractFaultyGPUSet(e.Description)
|
||||
for idx, gpu := range gpuByIndex {
|
||||
newStatus := "OK"
|
||||
if faultySet[idx] {
|
||||
if !isDeassert && faultySet[idx] {
|
||||
newStatus = "Critical"
|
||||
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
|
||||
}
|
||||
|
||||
@@ -155,6 +155,40 @@ func TestApplyGPUStatusFromEvents_UsesLatestEventAsCurrentStatusAndKeepsHistory(
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyGPUStatusFromEvents_DeassertClearsAllGPUs(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
{Slot: "#GPU1"},
|
||||
{Slot: "#GPU3"},
|
||||
{Slot: "#GPU5"},
|
||||
{Slot: "#GPU6"},
|
||||
},
|
||||
}
|
||||
|
||||
events := []models.Event{
|
||||
{
|
||||
ID: "17FFB002",
|
||||
EventType: "Assert",
|
||||
Timestamp: time.Date(2026, 5, 27, 13, 6, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
||||
},
|
||||
{
|
||||
ID: "17FFB002",
|
||||
EventType: "Deassert",
|
||||
Timestamp: time.Date(2026, 5, 27, 13, 15, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
||||
},
|
||||
}
|
||||
|
||||
applyGPUStatusFromEvents(hw, events)
|
||||
|
||||
for _, gpu := range hw.GPUs {
|
||||
if gpu.Status != "OK" {
|
||||
t.Fatalf("expected %s to recover to OK after Deassert, got %q", gpu.Slot, gpu.Status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) {
|
||||
content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`)
|
||||
|
||||
|
||||
2
internal/parser/vendors/inspur/idl.go
vendored
2
internal/parser/vendors/inspur/idl.go
vendored
@@ -48,7 +48,7 @@ func ParseIDLLog(content []byte) []models.Event {
|
||||
description = cleanDescription(description)
|
||||
|
||||
// Create unique key for deduplication
|
||||
eventKey := eventID + "|" + description
|
||||
eventKey := eventID + "|" + eventType + "|" + description
|
||||
if seenEvents[eventKey] {
|
||||
continue
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user