Three related fixes for IDL event processing: 1. idl.go: include EventType in dedup key so Deassert events are no longer silently dropped as duplicates of their Assert counterparts. 2. gpu_status.go: treat Deassert events as clearing all GPU faults — previously the code re-applied the same faulty GPU set from the description, leaving GPUs stuck in Critical even after alarm cleared. 3. reanimator_models/converter: add bmc_event_summary section to the Reanimator export — a deduplicated Critical/Warning event table with Active/Resolved status derived from Assert/Deassert pairs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
209 lines
5.9 KiB
Go
209 lines
5.9 KiB
Go
package inspur
|
|
|
|
import (
|
|
"testing"
|
|
"time"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
)
|
|
|
|
func TestEnrichGPUsFromHGXHWInfo_UsesHGXLogicalMapping(t *testing.T) {
|
|
hw := &models.HardwareConfig{
|
|
GPUs: []models.GPU{
|
|
{Slot: "#GPU6"},
|
|
{Slot: "#GPU7"},
|
|
{Slot: "#GPU0"},
|
|
{Slot: "#CPU0_PE1_E_BMC", Model: "AST2500 VGA"},
|
|
},
|
|
}
|
|
|
|
content := []byte(`
|
|
# curl -X GET http://127.0.0.1/redfish/v1/Chassis/HGX_GPU_SXM_1/Assembly
|
|
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN1","SerialNumber":"SXM1SN"}
|
|
# curl -X GET http://127.0.0.1/redfish/v1/Chassis/HGX_GPU_SXM_3/Assembly
|
|
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN3","SerialNumber":"SXM3SN"}
|
|
# curl -X GET http://127.0.0.1/redfish/v1/Chassis/HGX_GPU_SXM_5/Assembly
|
|
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN5","SerialNumber":"SXM5SN"}
|
|
{"Id":"HGX_FW_GPU_SXM_1","Version":"FW1"}
|
|
{"Id":"HGX_FW_GPU_SXM_3","Version":"FW3"}
|
|
{"Id":"HGX_FW_GPU_SXM_5","Version":"FW5"}
|
|
{"Id":"HGX_InfoROM_GPU_SXM_3","Version":"IR3"}
|
|
`)
|
|
|
|
enrichGPUsFromHGXHWInfo(content, hw)
|
|
|
|
if hw.GPUs[0].SerialNumber != "SXM3SN" {
|
|
t.Fatalf("expected #GPU6 to map to SXM3 serial, got %q", hw.GPUs[0].SerialNumber)
|
|
}
|
|
if hw.GPUs[1].SerialNumber != "SXM1SN" {
|
|
t.Fatalf("expected #GPU7 to map to SXM1 serial, got %q", hw.GPUs[1].SerialNumber)
|
|
}
|
|
if hw.GPUs[2].SerialNumber != "SXM5SN" {
|
|
t.Fatalf("expected #GPU0 to map to SXM5 serial, got %q", hw.GPUs[2].SerialNumber)
|
|
}
|
|
if hw.GPUs[0].Firmware != "FW3" {
|
|
t.Fatalf("expected #GPU6 firmware FW3, got %q", hw.GPUs[0].Firmware)
|
|
}
|
|
if hw.GPUs[0].VideoBIOS != "IR3" {
|
|
t.Fatalf("expected #GPU6 InfoROM in VideoBIOS IR3, got %q", hw.GPUs[0].VideoBIOS)
|
|
}
|
|
if hw.GPUs[2].Firmware != "FW5" {
|
|
t.Fatalf("expected #GPU0 firmware FW5, got %q", hw.GPUs[2].Firmware)
|
|
}
|
|
for _, g := range hw.GPUs {
|
|
if g.Slot == "#CPU0_PE1_E_BMC" {
|
|
t.Fatalf("expected non-HGX BMC VGA entry to be filtered out")
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestEnrichGPUsFromHGXHWInfo_AddsMissingLogicalGPU(t *testing.T) {
|
|
hw := &models.HardwareConfig{
|
|
GPUs: []models.GPU{
|
|
{Slot: "#GPU0"},
|
|
{Slot: "#GPU1"},
|
|
{Slot: "#GPU2"},
|
|
{Slot: "#GPU3"},
|
|
{Slot: "#GPU4"},
|
|
{Slot: "#GPU5"},
|
|
{Slot: "#GPU7"},
|
|
},
|
|
}
|
|
|
|
content := []byte(`
|
|
# curl -X GET http://127.0.0.1/redfish/v1/Chassis/HGX_GPU_SXM_3/Assembly
|
|
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN3","SerialNumber":"SXM3SN"}
|
|
`)
|
|
|
|
enrichGPUsFromHGXHWInfo(content, hw)
|
|
|
|
found := false
|
|
for _, g := range hw.GPUs {
|
|
if g.Slot == "#GPU6" {
|
|
found = true
|
|
if g.SerialNumber != "SXM3SN" {
|
|
t.Fatalf("expected synthesized #GPU6 serial SXM3SN, got %q", g.SerialNumber)
|
|
}
|
|
}
|
|
}
|
|
if !found {
|
|
t.Fatalf("expected synthesized #GPU6 entry")
|
|
}
|
|
}
|
|
|
|
func TestApplyGPUStatusFromEvents_MarksFaultedGPU(t *testing.T) {
|
|
hw := &models.HardwareConfig{
|
|
GPUs: []models.GPU{
|
|
{Slot: "#GPU6"},
|
|
{Slot: "#GPU5"},
|
|
},
|
|
}
|
|
|
|
events := []models.Event{
|
|
{
|
|
ID: "17FFB002",
|
|
Timestamp: time.Now(),
|
|
Description: "PCIe Present mismatch BIOS miss F_GPU6",
|
|
},
|
|
}
|
|
|
|
applyGPUStatusFromEvents(hw, events)
|
|
|
|
if hw.GPUs[0].Status != "Critical" {
|
|
t.Fatalf("expected #GPU6 status Critical, got %q", hw.GPUs[0].Status)
|
|
}
|
|
if hw.GPUs[1].Status != "OK" {
|
|
t.Fatalf("expected healthy GPU status OK, got %q", hw.GPUs[1].Status)
|
|
}
|
|
}
|
|
|
|
func TestApplyGPUStatusFromEvents_UsesLatestEventAsCurrentStatusAndKeepsHistory(t *testing.T) {
|
|
hw := &models.HardwareConfig{
|
|
GPUs: []models.GPU{
|
|
{Slot: "#GPU1"},
|
|
{Slot: "#GPU3"},
|
|
{Slot: "#GPU6"},
|
|
},
|
|
}
|
|
|
|
events := []models.Event{
|
|
{
|
|
ID: "17FFB002",
|
|
Timestamp: time.Date(2026, 1, 12, 22, 51, 16, 0, time.FixedZone("UTC+8", 8*3600)),
|
|
Description: "PCIe Present mismatch BIOS miss F_GPU1 F_GPU3 F_GPU6",
|
|
},
|
|
{
|
|
ID: "17FFB002",
|
|
Timestamp: time.Date(2026, 1, 12, 23, 5, 18, 0, time.FixedZone("UTC+8", 8*3600)),
|
|
Description: "PCIe Present mismatch BIOS miss F_GPU6",
|
|
},
|
|
}
|
|
|
|
applyGPUStatusFromEvents(hw, events)
|
|
|
|
if hw.GPUs[0].Status != "OK" {
|
|
t.Fatalf("expected #GPU1 to recover to OK on latest event, got %q", hw.GPUs[0].Status)
|
|
}
|
|
if hw.GPUs[1].Status != "OK" {
|
|
t.Fatalf("expected #GPU3 to recover to OK on latest event, got %q", hw.GPUs[1].Status)
|
|
}
|
|
if hw.GPUs[2].Status != "Critical" {
|
|
t.Fatalf("expected #GPU6 to remain Critical, got %q", hw.GPUs[2].Status)
|
|
}
|
|
if len(hw.GPUs[0].StatusHistory) == 0 {
|
|
t.Fatalf("expected #GPU1 status history to be populated")
|
|
}
|
|
}
|
|
|
|
func TestApplyGPUStatusFromEvents_DeassertClearsAllGPUs(t *testing.T) {
|
|
hw := &models.HardwareConfig{
|
|
GPUs: []models.GPU{
|
|
{Slot: "#GPU1"},
|
|
{Slot: "#GPU3"},
|
|
{Slot: "#GPU5"},
|
|
{Slot: "#GPU6"},
|
|
},
|
|
}
|
|
|
|
events := []models.Event{
|
|
{
|
|
ID: "17FFB002",
|
|
EventType: "Assert",
|
|
Timestamp: time.Date(2026, 5, 27, 13, 6, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
|
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
|
},
|
|
{
|
|
ID: "17FFB002",
|
|
EventType: "Deassert",
|
|
Timestamp: time.Date(2026, 5, 27, 13, 15, 56, 0, time.FixedZone("UTC+8", 8*3600)),
|
|
Description: "PCIe Present mismatch BIOS Scan, BIOS miss F_GPU1 F_GPU3 F_GPU5 F_GPU6",
|
|
},
|
|
}
|
|
|
|
applyGPUStatusFromEvents(hw, events)
|
|
|
|
for _, gpu := range hw.GPUs {
|
|
if gpu.Status != "OK" {
|
|
t.Fatalf("expected %s to recover to OK after Deassert, got %q", gpu.Slot, gpu.Status)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) {
|
|
content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`)
|
|
|
|
events := ParseIDLLog(content)
|
|
if len(events) != 1 {
|
|
t.Fatalf("expected 1 event from JSON line, got %d", len(events))
|
|
}
|
|
if events[0].ID != "17FFB002" {
|
|
t.Fatalf("expected event ID 17FFB002, got %q", events[0].ID)
|
|
}
|
|
if events[0].Source != "BMC" {
|
|
t.Fatalf("expected BMC source for IDL event, got %q", events[0].Source)
|
|
}
|
|
if events[0].SensorType != "pcie" {
|
|
t.Fatalf("expected component type pcie, got %#v", events[0])
|
|
}
|
|
}
|