- Add MSI CG480-S5063 (H100 SXM5) support:
- collectGPUsFromProcessors: find GPUs via Processors/ProcessorType=GPU,
resolve serials from Chassis/<GpuId>
- looksLikeGPU: skip Description="Display Device" PCIe sidecars
- isVirtualStorageDrive: filter AMI virtual USB drives (0-byte)
- enrichNICMACsFromNetworkDeviceFunctions: pull MACs for MSI NICs
- parseCPUs: filter by ProcessorType, parse Socket, L1/L2/L3 from ProcessorMemory
- parseMemory: Location.PartLocation.ServiceLabel slot fallback
- shouldCrawlPath: block /SubProcessors subtrees
- Fix status_checked_at/status_changed_at serializing as 0001-01-01:
change all StatusCheckedAt/StatusChangedAt fields to *time.Time
- Redfish crawler cleanup:
- Block non-inventory branches: AccountService, CertificateService,
EventService, Registries, SessionService, TaskService, manager config paths,
OperatingConfigs, BootOptions, HostPostCode, Bios/Settings, OEM KVM paths
- Add Assembly to critical endpoints (FRU data)
- Remove BootOptions from priority seeds
- collectBMCMAC: read BMC MAC from Managers/*/EthernetInterfaces
- collectAssemblyFRU: extract FRU serial/part from Chassis/*/Assembly
- Firmware: remove NetworkProtocol noise, fix SecureBoot field,
filter BMCImageN redundant backup slots
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
118 lines
2.6 KiB
Go
118 lines
2.6 KiB
Go
package inspur
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
)
|
|
|
|
// reFaultGPU matches a standalone "F_GPU<digits>" token (delimited by word
// boundaries) and captures the digits as submatch 1.
var reFaultGPU = regexp.MustCompile(`\bF_GPU(\d+)\b`)
|
|
|
|
func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event) {
|
|
if hw == nil || len(hw.GPUs) == 0 {
|
|
return
|
|
}
|
|
|
|
gpuByIndex := make(map[int]*models.GPU)
|
|
for i := range hw.GPUs {
|
|
gpu := &hw.GPUs[i]
|
|
idx, ok := extractLogicalGPUIndex(gpu.Slot)
|
|
if !ok {
|
|
continue
|
|
}
|
|
gpuByIndex[idx] = gpu
|
|
gpu.StatusHistory = nil
|
|
gpu.ErrorDescription = ""
|
|
}
|
|
|
|
relevantEvents := make([]models.Event, 0)
|
|
for _, e := range events {
|
|
if !isGPUFaultEvent(e) || len(extractFaultyGPUSet(e.Description)) == 0 {
|
|
continue
|
|
}
|
|
relevantEvents = append(relevantEvents, e)
|
|
}
|
|
|
|
if len(relevantEvents) == 0 {
|
|
for _, gpu := range gpuByIndex {
|
|
if strings.TrimSpace(gpu.Status) == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
sort.Slice(relevantEvents, func(i, j int) bool {
|
|
return relevantEvents[i].Timestamp.Before(relevantEvents[j].Timestamp)
|
|
})
|
|
|
|
currentStatus := make(map[int]string, len(gpuByIndex))
|
|
lastCriticalDetails := make(map[int]string, len(gpuByIndex))
|
|
for idx := range gpuByIndex {
|
|
currentStatus[idx] = "OK"
|
|
}
|
|
|
|
for _, e := range relevantEvents {
|
|
faultySet := extractFaultyGPUSet(e.Description)
|
|
for idx, gpu := range gpuByIndex {
|
|
newStatus := "OK"
|
|
if faultySet[idx] {
|
|
newStatus = "Critical"
|
|
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
|
|
}
|
|
|
|
if currentStatus[idx] != newStatus {
|
|
gpu.StatusHistory = append(gpu.StatusHistory, models.StatusHistoryEntry{
|
|
Status: newStatus,
|
|
ChangedAt: e.Timestamp,
|
|
Details: strings.TrimSpace(e.Description),
|
|
})
|
|
ts := e.Timestamp
|
|
gpu.StatusChangedAt = &ts
|
|
currentStatus[idx] = newStatus
|
|
}
|
|
|
|
ts := e.Timestamp
|
|
gpu.StatusCheckedAt = &ts
|
|
}
|
|
}
|
|
|
|
for idx, gpu := range gpuByIndex {
|
|
gpu.Status = currentStatus[idx]
|
|
if gpu.Status == "Critical" {
|
|
gpu.ErrorDescription = lastCriticalDetails[idx]
|
|
} else {
|
|
gpu.ErrorDescription = ""
|
|
}
|
|
if gpu.StatusCheckedAt == nil && strings.TrimSpace(gpu.Status) == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
}
|
|
}
|
|
|
|
func extractFaultyGPUSet(description string) map[int]bool {
|
|
faulty := make(map[int]bool)
|
|
matches := reFaultGPU.FindAllStringSubmatch(description, -1)
|
|
for _, m := range matches {
|
|
if len(m) < 2 {
|
|
continue
|
|
}
|
|
idx, err := strconv.Atoi(m[1])
|
|
if err == nil && idx >= 0 {
|
|
faulty[idx] = true
|
|
}
|
|
}
|
|
return faulty
|
|
}
|
|
|
|
func isGPUFaultEvent(e models.Event) bool {
|
|
desc := strings.ToLower(e.Description)
|
|
if strings.Contains(desc, "bios miss f_gpu") {
|
|
return true
|
|
}
|
|
return strings.EqualFold(strings.TrimSpace(e.ID), "17FFB002")
|
|
}
|