Update Inspur parsing and align release docs
This commit is contained in:
101
internal/parser/vendors/inspur/gpu_status.go
vendored
101
internal/parser/vendors/inspur/gpu_status.go
vendored
@@ -2,6 +2,7 @@ package inspur
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -15,38 +16,96 @@ func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event)
|
||||
return
|
||||
}
|
||||
|
||||
faulty := make(map[int]bool)
|
||||
for _, e := range events {
|
||||
if !isGPUFaultEvent(e) {
|
||||
continue
|
||||
}
|
||||
|
||||
matches := reFaultGPU.FindAllStringSubmatch(e.Description, -1)
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(m[1])
|
||||
if err == nil && idx >= 0 {
|
||||
faulty[idx] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpuByIndex := make(map[int]*models.GPU)
|
||||
for i := range hw.GPUs {
|
||||
gpu := &hw.GPUs[i]
|
||||
idx, ok := extractLogicalGPUIndex(gpu.Slot)
|
||||
if ok && faulty[idx] {
|
||||
gpu.Status = "Critical"
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
gpuByIndex[idx] = gpu
|
||||
gpu.StatusHistory = nil
|
||||
gpu.ErrorDescription = ""
|
||||
}
|
||||
|
||||
if strings.TrimSpace(gpu.Status) == "" {
|
||||
relevantEvents := make([]models.Event, 0)
|
||||
for _, e := range events {
|
||||
if !isGPUFaultEvent(e) || len(extractFaultyGPUSet(e.Description)) == 0 {
|
||||
continue
|
||||
}
|
||||
relevantEvents = append(relevantEvents, e)
|
||||
}
|
||||
|
||||
if len(relevantEvents) == 0 {
|
||||
for _, gpu := range gpuByIndex {
|
||||
if strings.TrimSpace(gpu.Status) == "" {
|
||||
gpu.Status = "OK"
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
sort.Slice(relevantEvents, func(i, j int) bool {
|
||||
return relevantEvents[i].Timestamp.Before(relevantEvents[j].Timestamp)
|
||||
})
|
||||
|
||||
currentStatus := make(map[int]string, len(gpuByIndex))
|
||||
lastCriticalDetails := make(map[int]string, len(gpuByIndex))
|
||||
for idx := range gpuByIndex {
|
||||
currentStatus[idx] = "OK"
|
||||
}
|
||||
|
||||
for _, e := range relevantEvents {
|
||||
faultySet := extractFaultyGPUSet(e.Description)
|
||||
for idx, gpu := range gpuByIndex {
|
||||
newStatus := "OK"
|
||||
if faultySet[idx] {
|
||||
newStatus = "Critical"
|
||||
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
|
||||
}
|
||||
|
||||
if currentStatus[idx] != newStatus {
|
||||
gpu.StatusHistory = append(gpu.StatusHistory, models.StatusHistoryEntry{
|
||||
Status: newStatus,
|
||||
ChangedAt: e.Timestamp,
|
||||
Details: strings.TrimSpace(e.Description),
|
||||
})
|
||||
gpu.StatusChangedAt = e.Timestamp
|
||||
currentStatus[idx] = newStatus
|
||||
}
|
||||
|
||||
gpu.StatusCheckedAt = e.Timestamp
|
||||
}
|
||||
}
|
||||
|
||||
for idx, gpu := range gpuByIndex {
|
||||
gpu.Status = currentStatus[idx]
|
||||
if gpu.Status == "Critical" {
|
||||
gpu.ErrorDescription = lastCriticalDetails[idx]
|
||||
} else {
|
||||
gpu.ErrorDescription = ""
|
||||
}
|
||||
if gpu.StatusCheckedAt.IsZero() && strings.TrimSpace(gpu.Status) == "" {
|
||||
gpu.Status = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func extractFaultyGPUSet(description string) map[int]bool {
|
||||
faulty := make(map[int]bool)
|
||||
matches := reFaultGPU.FindAllStringSubmatch(description, -1)
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(m[1])
|
||||
if err == nil && idx >= 0 {
|
||||
faulty[idx] = true
|
||||
}
|
||||
}
|
||||
return faulty
|
||||
}
|
||||
|
||||
func isGPUFaultEvent(e models.Event) bool {
|
||||
desc := strings.ToLower(e.Description)
|
||||
if strings.Contains(desc, "bios miss f_gpu") {
|
||||
|
||||
Reference in New Issue
Block a user