fix(inspur): correctly handle PCIe Assert/Deassert GPU fault events

Three related fixes for IDL event processing:

1. idl.go: include EventType in dedup key so Deassert events are no
   longer silently dropped as duplicates of their Assert counterparts.

2. gpu_status.go: treat Deassert events as clearing all GPU faults —
   previously the code re-applied the same faulty GPU set from the
   description, leaving GPUs stuck in Critical even after the alarm cleared.

3. reanimator_models/converter: add bmc_event_summary section to the
   Reanimator export — a deduplicated Critical/Warning event table with
   Active/Resolved status derived from Assert/Deassert pairs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 03:38:04 +03:00
parent f3836a34cc
commit 440959483e
5 changed files with 137 additions and 15 deletions

View File

@@ -56,10 +56,12 @@ func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event)
}
for _, e := range relevantEvents {
// Deassert means the alarm was cleared: all GPUs return to OK.
isDeassert := strings.EqualFold(strings.TrimSpace(e.EventType), "Deassert")
faultySet := extractFaultyGPUSet(e.Description)
for idx, gpu := range gpuByIndex {
newStatus := "OK"
if faultySet[idx] {
if !isDeassert && faultySet[idx] {
newStatus = "Critical"
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
}

View File

@@ -155,6 +155,40 @@ func TestApplyGPUStatusFromEvents_UsesLatestEventAsCurrentStatusAndKeepsHistory(
}
}
// TestApplyGPUStatusFromEvents_DeassertClearsAllGPUs passes an Assert event
// and a later Deassert event that share the same ID and description, and
// expects every GPU in the hardware config to end with status "OK".
func TestApplyGPUStatusFromEvents_DeassertClearsAllGPUs(t *testing.T) {
	const (
		eventID = "17FFB002"
		desc    = "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6"
	)
	zone := time.FixedZone("UTC+8", 8*3600)

	// The alarm is raised first and cleared nine minutes later.
	raised := models.Event{
		ID:          eventID,
		EventType:   "Assert",
		Timestamp:   time.Date(2026, 5, 27, 13, 6, 56, 0, zone),
		Description: desc,
	}
	cleared := models.Event{
		ID:          eventID,
		EventType:   "Deassert",
		Timestamp:   time.Date(2026, 5, 27, 13, 15, 56, 0, zone),
		Description: desc,
	}

	hw := &models.HardwareConfig{
		GPUs: []models.GPU{
			{Slot: "#GPU1"},
			{Slot: "#GPU3"},
			{Slot: "#GPU5"},
			{Slot: "#GPU6"},
		},
	}

	applyGPUStatusFromEvents(hw, []models.Event{raised, cleared})

	// Every GPU named in the description must be reported as OK.
	for i := range hw.GPUs {
		if got := hw.GPUs[i].Status; got != "OK" {
			t.Fatalf("expected %s to recover to OK after Deassert, got %q", hw.GPUs[i].Slot, got)
		}
	}
}
func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) {
content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`)

View File

@@ -48,7 +48,7 @@ func ParseIDLLog(content []byte) []models.Event {
description = cleanDescription(description)
// Create unique key for deduplication
eventKey := eventID + "|" + description
eventKey := eventID + "|" + eventType + "|" + description
if seenEvents[eventKey] {
continue
}