feat: Redfish hardware event log collection + MSI ghost GPU filter + inventory improvements
- Collect hardware event logs (last 7 days) from Systems and Managers/SEL LogServices
- Parse AMI raw IPMI dump messages into readable descriptions (Sensor_Type: Event_Type)
- Filter out audit/journal/non-hardware log services; only SEL from Managers
- MSI ghost GPU filter: exclude processor GPU entries with temperature=0 when host is powered on
- Reanimator collected_at uses InventoryData/Status.LastModifiedTime (30-day fallback)
- Invalidate Redfish inventory CRC groups before host power-on
- Log inventory LastModifiedTime age in collection logs
- Drop SecureBoot collection (SecureBootMode, SecureBootDatabases) — not hardware inventory
- Add build version to UI footer via template
- Add MSI Redfish API reference doc to bible-local/docs/

ADL-032–ADL-035

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -58,6 +58,44 @@ func (r redfishSnapshotReader) collectGPUs(systemPaths, chassisPaths []string, p
|
||||
return out
|
||||
}
|
||||
|
||||
// msiGhostGPUFilter returns true when the GPU chassis for gpuID shows a temperature
|
||||
// of 0 on a powered-on host, which is the reliable MSI/AMI signal that the GPU is
|
||||
// no longer physically installed (stale BMC inventory cache).
|
||||
// It only filters when the system PowerState is "On" — when the host is off, all
|
||||
// temperature readings are 0 and we cannot distinguish absent from idle.
|
||||
func (r redfishSnapshotReader) msiGhostGPUFilter(systemPaths []string, gpuID, chassisPath string) bool {
|
||||
// Require host powered on.
|
||||
for _, sp := range systemPaths {
|
||||
doc, err := r.getJSON(sp)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !strings.EqualFold(strings.TrimSpace(asString(doc["PowerState"])), "on") {
|
||||
return false
|
||||
}
|
||||
break
|
||||
}
|
||||
// Read the temperature sensor for this GPU chassis.
|
||||
sensorPath := joinPath(chassisPath, "/Sensors/"+gpuID+"_Temperature")
|
||||
sensorDoc, err := r.getJSON(sensorPath)
|
||||
if err != nil || len(sensorDoc) == 0 {
|
||||
return false
|
||||
}
|
||||
reading, ok := sensorDoc["Reading"]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
switch v := reading.(type) {
|
||||
case float64:
|
||||
return v == 0
|
||||
case int:
|
||||
return v == 0
|
||||
case int64:
|
||||
return v == 0
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// collectGPUsFromProcessors finds GPUs that some BMCs (e.g. MSI) expose as
|
||||
// Processor entries with ProcessorType=GPU rather than as PCIe devices.
|
||||
// It supplements the existing gpus slice (already found via PCIe path),
|
||||
@@ -68,6 +106,7 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat
|
||||
return append([]models.GPU{}, existing...)
|
||||
}
|
||||
chassisByID := make(map[string]map[string]interface{})
|
||||
chassisPathByID := make(map[string]string)
|
||||
for _, cp := range chassisPaths {
|
||||
doc, err := r.getJSON(cp)
|
||||
if err != nil || len(doc) == 0 {
|
||||
@@ -76,6 +115,7 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat
|
||||
id := strings.TrimSpace(asString(doc["Id"]))
|
||||
if id != "" {
|
||||
chassisByID[strings.ToUpper(id)] = doc
|
||||
chassisPathByID[strings.ToUpper(id)] = cp
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,6 +148,13 @@ func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPat
|
||||
serial = resolveProcessorGPUChassisSerial(chassisByID, gpuID, plan)
|
||||
}
|
||||
|
||||
if plan.Directives.EnableMSIGhostGPUFilter {
|
||||
chassisPath := resolveProcessorGPUChassisPath(chassisPathByID, gpuID, plan)
|
||||
if chassisPath != "" && r.msiGhostGPUFilter(systemPaths, gpuID, chassisPath) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
uuid := strings.TrimSpace(asString(doc["UUID"]))
|
||||
uuidKey := strings.ToUpper(uuid)
|
||||
serialKey := strings.ToUpper(serial)
|
||||
|
||||
Reference in New Issue
Block a user