Files
logpile/internal/parser/vendors/inspur/gpu_status.go
Michael Chus 440959483e fix(inspur): correctly handle PCIe Assert/Deassert GPU fault events
Three related fixes for IDL event processing:

1. idl.go: include EventType in dedup key so Deassert events are no
   longer silently dropped as duplicates of their Assert counterparts.

2. gpu_status.go: treat Deassert events as clearing all GPU faults —
   previously the code re-applied the same faulty GPU set from the
   description, leaving GPUs stuck in Critical even after alarm cleared.

3. reanimator_models/converter: add bmc_event_summary section to the
   Reanimator export — a deduplicated Critical/Warning event table with
   Active/Resolved status derived from Assert/Deassert pairs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 03:38:04 +03:00

120 lines
2.8 KiB
Go

package inspur
import (
"regexp"
"sort"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
var reFaultGPU = regexp.MustCompile(`\bF_GPU(\d+)\b`)
func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event) {
if hw == nil || len(hw.GPUs) == 0 {
return
}
gpuByIndex := make(map[int]*models.GPU)
for i := range hw.GPUs {
gpu := &hw.GPUs[i]
idx, ok := extractLogicalGPUIndex(gpu.Slot)
if !ok {
continue
}
gpuByIndex[idx] = gpu
gpu.StatusHistory = nil
gpu.ErrorDescription = ""
}
relevantEvents := make([]models.Event, 0)
for _, e := range events {
if !isGPUFaultEvent(e) || len(extractFaultyGPUSet(e.Description)) == 0 {
continue
}
relevantEvents = append(relevantEvents, e)
}
if len(relevantEvents) == 0 {
for _, gpu := range gpuByIndex {
if strings.TrimSpace(gpu.Status) == "" {
gpu.Status = "OK"
}
}
return
}
sort.Slice(relevantEvents, func(i, j int) bool {
return relevantEvents[i].Timestamp.Before(relevantEvents[j].Timestamp)
})
currentStatus := make(map[int]string, len(gpuByIndex))
lastCriticalDetails := make(map[int]string, len(gpuByIndex))
for idx := range gpuByIndex {
currentStatus[idx] = "OK"
}
for _, e := range relevantEvents {
// Deassert means the alarm was cleared: all GPUs return to OK.
isDeassert := strings.EqualFold(strings.TrimSpace(e.EventType), "Deassert")
faultySet := extractFaultyGPUSet(e.Description)
for idx, gpu := range gpuByIndex {
newStatus := "OK"
if !isDeassert && faultySet[idx] {
newStatus = "Critical"
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
}
if currentStatus[idx] != newStatus {
gpu.StatusHistory = append(gpu.StatusHistory, models.StatusHistoryEntry{
Status: newStatus,
ChangedAt: e.Timestamp,
Details: strings.TrimSpace(e.Description),
})
ts := e.Timestamp
gpu.StatusChangedAt = &ts
currentStatus[idx] = newStatus
}
ts := e.Timestamp
gpu.StatusCheckedAt = &ts
}
}
for idx, gpu := range gpuByIndex {
gpu.Status = currentStatus[idx]
if gpu.Status == "Critical" {
gpu.ErrorDescription = lastCriticalDetails[idx]
} else {
gpu.ErrorDescription = ""
}
if gpu.StatusCheckedAt == nil && strings.TrimSpace(gpu.Status) == "" {
gpu.Status = "OK"
}
}
}
func extractFaultyGPUSet(description string) map[int]bool {
faulty := make(map[int]bool)
matches := reFaultGPU.FindAllStringSubmatch(description, -1)
for _, m := range matches {
if len(m) < 2 {
continue
}
idx, err := strconv.Atoi(m[1])
if err == nil && idx >= 0 {
faulty[idx] = true
}
}
return faulty
}
func isGPUFaultEvent(e models.Event) bool {
desc := strings.ToLower(e.Description)
if strings.Contains(desc, "bios miss f_gpu") {
return true
}
return strings.EqualFold(strings.TrimSpace(e.ID), "17FFB002")
}