- Collect hardware event logs (last 7 days) from Systems and Managers/SEL LogServices - Parse AMI raw IPMI dump messages into readable descriptions (Sensor_Type: Event_Type) - Filter out audit/journal/non-hardware log services; only SEL from Managers - MSI ghost GPU filter: exclude processor GPU entries with temperature=0 when host is powered on - Reanimator collected_at uses InventoryData/Status.LastModifiedTime (30-day fallback) - Invalidate Redfish inventory CRC groups before host power-on - Log inventory LastModifiedTime age in collection logs - Drop SecureBoot collection (SecureBootMode, SecureBootDatabases) — not hardware inventory - Add build version to UI footer via template - Add MSI Redfish API reference doc to bible-local/docs/ ADL-032–ADL-035 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
199 lines
6.1 KiB
Go
199 lines
6.1 KiB
Go
package collector
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/collector/redfishprofile"
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
)
|
|
|
|
func (r redfishSnapshotReader) collectGPUs(systemPaths, chassisPaths []string, plan redfishprofile.ResolvedAnalysisPlan) []models.GPU {
|
|
collections := make([]string, 0, len(systemPaths)*3+len(chassisPaths)*2)
|
|
for _, systemPath := range systemPaths {
|
|
collections = append(collections, joinPath(systemPath, "/PCIeDevices"))
|
|
collections = append(collections, joinPath(systemPath, "/Accelerators"))
|
|
collections = append(collections, joinPath(systemPath, "/GraphicsControllers"))
|
|
}
|
|
for _, chassisPath := range chassisPaths {
|
|
collections = append(collections, joinPath(chassisPath, "/PCIeDevices"))
|
|
collections = append(collections, joinPath(chassisPath, "/Accelerators"))
|
|
}
|
|
var out []models.GPU
|
|
seen := make(map[string]struct{})
|
|
idx := 1
|
|
for _, collectionPath := range collections {
|
|
memberDocs, err := r.getCollectionMembers(collectionPath)
|
|
if err != nil || len(memberDocs) == 0 {
|
|
continue
|
|
}
|
|
for _, doc := range memberDocs {
|
|
functionDocs := r.getLinkedPCIeFunctions(doc)
|
|
if !looksLikeGPU(doc, functionDocs) {
|
|
continue
|
|
}
|
|
supplementalDocs := r.getLinkedSupplementalDocs(doc, "EnvironmentMetrics", "Metrics")
|
|
for _, fn := range functionDocs {
|
|
supplementalDocs = append(supplementalDocs, r.getLinkedSupplementalDocs(fn, "EnvironmentMetrics", "Metrics")...)
|
|
}
|
|
gpu := parseGPUWithSupplementalDocs(doc, functionDocs, supplementalDocs, idx)
|
|
idx++
|
|
if plan.Directives.EnableGenericGraphicsControllerDedup && shouldSkipGenericGPUDuplicate(out, gpu) {
|
|
continue
|
|
}
|
|
key := gpuDocDedupKey(doc, gpu)
|
|
if key == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[key]; ok {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
out = append(out, gpu)
|
|
}
|
|
}
|
|
if plan.Directives.EnableGenericGraphicsControllerDedup {
|
|
return dropModelOnlyGPUPlaceholders(out)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// msiGhostGPUFilter returns true when the GPU chassis for gpuID shows a temperature
|
|
// of 0 on a powered-on host, which is the reliable MSI/AMI signal that the GPU is
|
|
// no longer physically installed (stale BMC inventory cache).
|
|
// It only filters when the system PowerState is "On" — when the host is off, all
|
|
// temperature readings are 0 and we cannot distinguish absent from idle.
|
|
func (r redfishSnapshotReader) msiGhostGPUFilter(systemPaths []string, gpuID, chassisPath string) bool {
|
|
// Require host powered on.
|
|
for _, sp := range systemPaths {
|
|
doc, err := r.getJSON(sp)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if !strings.EqualFold(strings.TrimSpace(asString(doc["PowerState"])), "on") {
|
|
return false
|
|
}
|
|
break
|
|
}
|
|
// Read the temperature sensor for this GPU chassis.
|
|
sensorPath := joinPath(chassisPath, "/Sensors/"+gpuID+"_Temperature")
|
|
sensorDoc, err := r.getJSON(sensorPath)
|
|
if err != nil || len(sensorDoc) == 0 {
|
|
return false
|
|
}
|
|
reading, ok := sensorDoc["Reading"]
|
|
if !ok {
|
|
return false
|
|
}
|
|
switch v := reading.(type) {
|
|
case float64:
|
|
return v == 0
|
|
case int:
|
|
return v == 0
|
|
case int64:
|
|
return v == 0
|
|
}
|
|
return false
|
|
}
|
|
|
|
// collectGPUsFromProcessors finds GPUs that some BMCs (e.g. MSI) expose as
|
|
// Processor entries with ProcessorType=GPU rather than as PCIe devices.
|
|
// It supplements the existing gpus slice (already found via PCIe path),
|
|
// skipping entries already present by UUID or SerialNumber.
|
|
// Serial numbers are looked up from Chassis members named after each GPU Id.
|
|
func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPaths []string, existing []models.GPU, plan redfishprofile.ResolvedAnalysisPlan) []models.GPU {
|
|
if !plan.Directives.EnableProcessorGPUFallback {
|
|
return append([]models.GPU{}, existing...)
|
|
}
|
|
chassisByID := make(map[string]map[string]interface{})
|
|
chassisPathByID := make(map[string]string)
|
|
for _, cp := range chassisPaths {
|
|
doc, err := r.getJSON(cp)
|
|
if err != nil || len(doc) == 0 {
|
|
continue
|
|
}
|
|
id := strings.TrimSpace(asString(doc["Id"]))
|
|
if id != "" {
|
|
chassisByID[strings.ToUpper(id)] = doc
|
|
chassisPathByID[strings.ToUpper(id)] = cp
|
|
}
|
|
}
|
|
|
|
seenUUID := make(map[string]struct{})
|
|
seenSerial := make(map[string]struct{})
|
|
for _, g := range existing {
|
|
if u := strings.ToUpper(strings.TrimSpace(g.UUID)); u != "" {
|
|
seenUUID[u] = struct{}{}
|
|
}
|
|
if s := strings.ToUpper(strings.TrimSpace(g.SerialNumber)); s != "" {
|
|
seenSerial[s] = struct{}{}
|
|
}
|
|
}
|
|
|
|
out := append([]models.GPU{}, existing...)
|
|
idx := len(existing) + 1
|
|
for _, systemPath := range systemPaths {
|
|
procDocs, err := r.getCollectionMembers(joinPath(systemPath, "/Processors"))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, doc := range procDocs {
|
|
if !strings.EqualFold(strings.TrimSpace(asString(doc["ProcessorType"])), "GPU") {
|
|
continue
|
|
}
|
|
|
|
gpuID := strings.TrimSpace(asString(doc["Id"]))
|
|
serial := findFirstNormalizedStringByKeys(doc, "SerialNumber")
|
|
if serial == "" {
|
|
serial = resolveProcessorGPUChassisSerial(chassisByID, gpuID, plan)
|
|
}
|
|
|
|
if plan.Directives.EnableMSIGhostGPUFilter {
|
|
chassisPath := resolveProcessorGPUChassisPath(chassisPathByID, gpuID, plan)
|
|
if chassisPath != "" && r.msiGhostGPUFilter(systemPaths, gpuID, chassisPath) {
|
|
continue
|
|
}
|
|
}
|
|
|
|
uuid := strings.TrimSpace(asString(doc["UUID"]))
|
|
uuidKey := strings.ToUpper(uuid)
|
|
serialKey := strings.ToUpper(serial)
|
|
|
|
if uuidKey != "" {
|
|
if _, dup := seenUUID[uuidKey]; dup {
|
|
continue
|
|
}
|
|
seenUUID[uuidKey] = struct{}{}
|
|
}
|
|
if serialKey != "" {
|
|
if _, dup := seenSerial[serialKey]; dup {
|
|
continue
|
|
}
|
|
seenSerial[serialKey] = struct{}{}
|
|
}
|
|
|
|
slotLabel := firstNonEmpty(
|
|
redfishLocationLabel(doc["Location"]),
|
|
redfishLocationLabel(doc["PhysicalLocation"]),
|
|
)
|
|
if slotLabel == "" && gpuID != "" {
|
|
slotLabel = gpuID
|
|
}
|
|
if slotLabel == "" {
|
|
slotLabel = fmt.Sprintf("GPU%d", idx)
|
|
}
|
|
out = append(out, models.GPU{
|
|
Slot: slotLabel,
|
|
Model: firstNonEmpty(asString(doc["Model"]), asString(doc["Name"])),
|
|
Manufacturer: asString(doc["Manufacturer"]),
|
|
PartNumber: asString(doc["PartNumber"]),
|
|
SerialNumber: serial,
|
|
UUID: uuid,
|
|
Status: mapStatus(doc["Status"]),
|
|
})
|
|
idx++
|
|
}
|
|
}
|
|
return out
|
|
}
|