Three related fixes for IDL event processing: 1. idl.go: include EventType in dedup key so Deassert events are no longer silently dropped as duplicates of their Assert counterparts. 2. gpu_status.go: treat Deassert events as clearing all GPU faults — previously the code re-applied the same faulty GPU set from the description, leaving GPUs stuck in Critical even after alarm cleared. 3. reanimator_models/converter: add bmc_event_summary section to the Reanimator export — a deduplicated Critical/Warning event table with Active/Resolved status derived from Assert/Deassert pairs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
120 lines
2.8 KiB
Go
120 lines
2.8 KiB
Go
package inspur
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
)
|
|
|
|
// reFaultGPU matches a whole-word "F_GPU<n>" token and captures the decimal
// GPU index <n> in its first submatch group.
var reFaultGPU = regexp.MustCompile(`\bF_GPU(\d+)\b`)
|
|
|
|
func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event) {
|
|
if hw == nil || len(hw.GPUs) == 0 {
|
|
return
|
|
}
|
|
|
|
gpuByIndex := make(map[int]*models.GPU)
|
|
for i := range hw.GPUs {
|
|
gpu := &hw.GPUs[i]
|
|
idx, ok := extractLogicalGPUIndex(gpu.Slot)
|
|
if !ok {
|
|
continue
|
|
}
|
|
gpuByIndex[idx] = gpu
|
|
gpu.StatusHistory = nil
|
|
gpu.ErrorDescription = ""
|
|
}
|
|
|
|
relevantEvents := make([]models.Event, 0)
|
|
for _, e := range events {
|
|
if !isGPUFaultEvent(e) || len(extractFaultyGPUSet(e.Description)) == 0 {
|
|
continue
|
|
}
|
|
relevantEvents = append(relevantEvents, e)
|
|
}
|
|
|
|
if len(relevantEvents) == 0 {
|
|
for _, gpu := range gpuByIndex {
|
|
if strings.TrimSpace(gpu.Status) == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
sort.Slice(relevantEvents, func(i, j int) bool {
|
|
return relevantEvents[i].Timestamp.Before(relevantEvents[j].Timestamp)
|
|
})
|
|
|
|
currentStatus := make(map[int]string, len(gpuByIndex))
|
|
lastCriticalDetails := make(map[int]string, len(gpuByIndex))
|
|
for idx := range gpuByIndex {
|
|
currentStatus[idx] = "OK"
|
|
}
|
|
|
|
for _, e := range relevantEvents {
|
|
// Deassert means the alarm was cleared: all GPUs return to OK.
|
|
isDeassert := strings.EqualFold(strings.TrimSpace(e.EventType), "Deassert")
|
|
faultySet := extractFaultyGPUSet(e.Description)
|
|
for idx, gpu := range gpuByIndex {
|
|
newStatus := "OK"
|
|
if !isDeassert && faultySet[idx] {
|
|
newStatus = "Critical"
|
|
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
|
|
}
|
|
|
|
if currentStatus[idx] != newStatus {
|
|
gpu.StatusHistory = append(gpu.StatusHistory, models.StatusHistoryEntry{
|
|
Status: newStatus,
|
|
ChangedAt: e.Timestamp,
|
|
Details: strings.TrimSpace(e.Description),
|
|
})
|
|
ts := e.Timestamp
|
|
gpu.StatusChangedAt = &ts
|
|
currentStatus[idx] = newStatus
|
|
}
|
|
|
|
ts := e.Timestamp
|
|
gpu.StatusCheckedAt = &ts
|
|
}
|
|
}
|
|
|
|
for idx, gpu := range gpuByIndex {
|
|
gpu.Status = currentStatus[idx]
|
|
if gpu.Status == "Critical" {
|
|
gpu.ErrorDescription = lastCriticalDetails[idx]
|
|
} else {
|
|
gpu.ErrorDescription = ""
|
|
}
|
|
if gpu.StatusCheckedAt == nil && strings.TrimSpace(gpu.Status) == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
}
|
|
}
|
|
|
|
func extractFaultyGPUSet(description string) map[int]bool {
|
|
faulty := make(map[int]bool)
|
|
matches := reFaultGPU.FindAllStringSubmatch(description, -1)
|
|
for _, m := range matches {
|
|
if len(m) < 2 {
|
|
continue
|
|
}
|
|
idx, err := strconv.Atoi(m[1])
|
|
if err == nil && idx >= 0 {
|
|
faulty[idx] = true
|
|
}
|
|
}
|
|
return faulty
|
|
}
|
|
|
|
func isGPUFaultEvent(e models.Event) bool {
|
|
desc := strings.ToLower(e.Description)
|
|
if strings.Contains(desc, "bios miss f_gpu") {
|
|
return true
|
|
}
|
|
return strings.EqualFold(strings.TrimSpace(e.ID), "17FFB002")
|
|
}
|