fix(inspur): correctly handle PCIe Assert/Deassert GPU fault events
Three related fixes for IDL event processing:
1. idl.go: include EventType in the dedup key so Deassert events are no longer silently dropped as duplicates of their Assert counterparts.
2. gpu_status.go: treat Deassert events as clearing all GPU faults — previously the code re-applied the same faulty GPU set from the description, leaving GPUs stuck in Critical even after the alarm cleared.
3. reanimator_models/converter: add a bmc_event_summary section to the Reanimator export — a deduplicated Critical/Warning event table with Active/Resolved status derived from Assert/Deassert pairs.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -155,6 +155,40 @@ func TestApplyGPUStatusFromEvents_UsesLatestEventAsCurrentStatusAndKeepsHistory(
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyGPUStatusFromEvents_DeassertClearsAllGPUs(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
{Slot: "#GPU1"},
|
||||
{Slot: "#GPU3"},
|
||||
{Slot: "#GPU5"},
|
||||
{Slot: "#GPU6"},
|
||||
},
|
||||
}
|
||||
|
||||
events := []models.Event{
|
||||
{
|
||||
ID: "17FFB002",
|
||||
EventType: "Assert",
|
||||
Timestamp: time.Date(2026, 5, 27, 13, 6, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
||||
},
|
||||
{
|
||||
ID: "17FFB002",
|
||||
EventType: "Deassert",
|
||||
Timestamp: time.Date(2026, 5, 27, 13, 15, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
||||
},
|
||||
}
|
||||
|
||||
applyGPUStatusFromEvents(hw, events)
|
||||
|
||||
for _, gpu := range hw.GPUs {
|
||||
if gpu.Status != "OK" {
|
||||
t.Fatalf("expected %s to recover to OK after Deassert, got %q", gpu.Slot, gpu.Status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) {
|
||||
content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user