collector/redfish: improve GPU SN/model fallback and warnings

This commit is contained in:
2026-02-28 12:52:22 +03:00
parent ddab93a5ee
commit 9aadf2f1e9
3 changed files with 436 additions and 11 deletions

View File

@@ -416,13 +416,15 @@ func redfishLinkedPath(doc map[string]interface{}, key string) string {
}
func (c *RedfishConnector) collectGPUs(ctx context.Context, client *http.Client, req Request, baseURL string, systemPaths, chassisPaths []string) []models.GPU {
collections := make([]string, 0, len(systemPaths)*2+len(chassisPaths))
collections := make([]string, 0, len(systemPaths)*3+len(chassisPaths)*2)
for _, systemPath := range systemPaths {
collections = append(collections, joinPath(systemPath, "/PCIeDevices"))
collections = append(collections, joinPath(systemPath, "/Accelerators"))
collections = append(collections, joinPath(systemPath, "/GraphicsControllers"))
}
for _, chassisPath := range chassisPaths {
collections = append(collections, joinPath(chassisPath, "/PCIeDevices"))
collections = append(collections, joinPath(chassisPath, "/Accelerators"))
}
var out []models.GPU
@@ -443,7 +445,7 @@ func (c *RedfishConnector) collectGPUs(ctx context.Context, client *http.Client,
gpu := parseGPU(doc, functionDocs, idx)
idx++
key := firstNonEmpty(gpu.SerialNumber, gpu.BDF, gpu.Slot+"|"+gpu.Model)
key := gpuDedupKey(gpu)
if key == "" {
continue
}
@@ -955,21 +957,26 @@ func redfishCriticalEndpoints(systemPaths, chassisPaths, managerPaths []string)
add(p)
add(joinPath(p, "/Bios"))
add(joinPath(p, "/SecureBoot"))
add(joinPath(p, "/Oem/Public/FRU"))
add(joinPath(p, "/Processors"))
add(joinPath(p, "/Memory"))
add(joinPath(p, "/Storage"))
add(joinPath(p, "/SimpleStorage"))
add(joinPath(p, "/PCIeDevices"))
add(joinPath(p, "/Accelerators"))
add(joinPath(p, "/GraphicsControllers"))
add(joinPath(p, "/EthernetInterfaces"))
add(joinPath(p, "/NetworkInterfaces"))
}
for _, p := range chassisPaths {
add(p)
add(joinPath(p, "/Oem/Public/FRU"))
add(joinPath(p, "/Power"))
add(joinPath(p, "/Thermal"))
add(joinPath(p, "/Sensors"))
add(joinPath(p, "/NetworkAdapters"))
add(joinPath(p, "/PCIeDevices"))
add(joinPath(p, "/Accelerators"))
add(joinPath(p, "/Drives"))
}
for _, p := range managerPaths {
@@ -1132,6 +1139,7 @@ func shouldCrawlPath(path string) bool {
return false
}
heavyParts := []string{
"/JsonSchemas",
"/LogServices/",
"/Entries/",
"/TelemetryService/",
@@ -1429,14 +1437,98 @@ func (c *RedfishConnector) recoverCriticalRedfishDocsPlanB(ctx context.Context,
func parseBoardInfo(system map[string]interface{}) models.BoardInfo {
return models.BoardInfo{
Manufacturer: asString(system["Manufacturer"]),
ProductName: firstNonEmpty(asString(system["Model"]), asString(system["Name"])),
SerialNumber: asString(system["SerialNumber"]),
PartNumber: asString(system["PartNumber"]),
UUID: asString(system["UUID"]),
Manufacturer: normalizeRedfishIdentityField(asString(system["Manufacturer"])),
ProductName: normalizeRedfishIdentityField(firstNonEmpty(
asString(system["Model"]),
asString(system["ProductName"]),
asString(system["Name"]),
)),
SerialNumber: normalizeRedfishIdentityField(asString(system["SerialNumber"])),
PartNumber: normalizeRedfishIdentityField(asString(system["PartNumber"])),
UUID: normalizeRedfishIdentityField(asString(system["UUID"])),
}
}
// parseBoardInfoWithFallback builds the board identity from the system
// document and backfills every field that is still empty, preferring the
// chassis document over the FRU document. UUID is backfilled from the chassis
// document only.
func parseBoardInfoWithFallback(system, chassis, fru map[string]interface{}) models.BoardInfo {
	primary := parseBoardInfo(system)
	fromChassis := parseBoardInfo(chassis)
	fromFRU := parseBoardInfoFromFRUDoc(fru)

	// backfill writes the first non-empty candidate into *dst, but only when
	// the system document left that field empty.
	backfill := func(dst *string, chassisValue, fruValue string) {
		if *dst != "" {
			return
		}
		*dst = firstNonEmpty(chassisValue, fruValue)
	}

	backfill(&primary.Manufacturer, fromChassis.Manufacturer, fromFRU.Manufacturer)
	backfill(&primary.ProductName, fromChassis.ProductName, fromFRU.ProductName)
	backfill(&primary.SerialNumber, fromChassis.SerialNumber, fromFRU.SerialNumber)
	backfill(&primary.PartNumber, fromChassis.PartNumber, fromFRU.PartNumber)

	if primary.UUID == "" {
		primary.UUID = fromChassis.UUID
	}
	return primary
}
// parseBoardInfoFromFRUDoc extracts board identity fields from a FRU document
// by looking up a few alternative field names for each attribute. An empty or
// nil document yields a zero BoardInfo; UUID is never populated here.
func parseBoardInfoFromFRUDoc(doc map[string]interface{}) models.BoardInfo {
	var board models.BoardInfo
	if len(doc) == 0 {
		return board
	}

	board.Manufacturer = findFirstNormalizedStringByKeys(doc, "Manufacturer", "BoardManufacturer", "Vendor")
	board.ProductName = findFirstNormalizedStringByKeys(doc, "ProductName", "BoardName", "Model")
	board.SerialNumber = findFirstNormalizedStringByKeys(doc, "SerialNumber", "BoardSerialNumber")
	board.PartNumber = findFirstNormalizedStringByKeys(doc, "PartNumber", "BoardPartNumber", "ProductPartNumber")
	return board
}
// findFirstNormalizedStringByKeys searches doc, including nested objects and
// arrays, for a string field whose name matches one of keys. Names are
// compared case-insensitively with surrounding whitespace ignored. The value
// is returned after normalizeRedfishIdentityField; non-string values and
// values that normalize to "" are skipped. It returns "" when nothing matches.
//
// The lookup is deterministic (ranging over a Go map directly is not):
//   - objects are visited breadth-first, so the shallowest match wins;
//   - within one object, keys are tried in the order the caller listed them;
//   - sibling objects are visited in ascending field-name order and array
//     elements in index order.
func findFirstNormalizedStringByKeys(doc map[string]interface{}, keys ...string) string {
	if len(doc) == 0 || len(keys) == 0 {
		return ""
	}

	// wanted keeps the caller's priority order, folded for comparison.
	wanted := make([]string, 0, len(keys))
	for _, key := range keys {
		if k := strings.ToLower(strings.TrimSpace(key)); k != "" {
			wanted = append(wanted, k)
		}
	}
	if len(wanted) == 0 {
		return ""
	}

	queue := []any{doc}
	for len(queue) > 0 {
		node := queue[0]
		queue = queue[1:]

		switch v := node.(type) {
		case map[string]interface{}:
			names := sortedRedfishDocKeys(v)
			folded := make([]string, len(names))
			for i, name := range names {
				folded[i] = strings.ToLower(strings.TrimSpace(name))
			}

			// Direct fields of this object, highest-priority key first.
			for _, want := range wanted {
				for i, name := range names {
					if folded[i] != want {
						continue
					}
					s, ok := v[name].(string)
					if !ok {
						continue
					}
					if normalized := normalizeRedfishIdentityField(s); normalized != "" {
						return normalized
					}
				}
			}

			// No usable direct field: descend into containers later.
			for _, name := range names {
				switch nested := v[name].(type) {
				case map[string]interface{}, []interface{}:
					queue = append(queue, nested)
				}
			}
		case []interface{}:
			for _, item := range v {
				switch nested := item.(type) {
				case map[string]interface{}, []interface{}:
					queue = append(queue, nested)
				}
			}
		}
	}
	return ""
}

// sortedRedfishDocKeys returns the field names of m in ascending order.
// The sort is hand-rolled (insertion sort; objects are small) so this block
// needs no import beyond "strings"; switch to sort.Strings if the file
// imports "sort".
func sortedRedfishDocKeys(m map[string]interface{}) []string {
	names := make([]string, 0, len(m))
	for name := range m {
		names = append(names, name)
	}
	for i := 1; i < len(names); i++ {
		for j := i; j > 0 && names[j] < names[j-1]; j-- {
			names[j], names[j-1] = names[j-1], names[j]
		}
	}
	return names
}
func parseCPUs(docs []map[string]interface{}) []models.CPU {
cpus := make([]models.CPU, 0, len(docs))
for idx, doc := range docs {
@@ -1695,7 +1787,7 @@ func parseGPU(doc map[string]interface{}, functionDocs []map[string]interface{},
Location: firstNonEmpty(redfishLocationLabel(doc["Location"]), redfishLocationLabel(doc["PhysicalLocation"])),
Model: firstNonEmpty(asString(doc["Model"]), asString(doc["Name"])),
Manufacturer: asString(doc["Manufacturer"]),
SerialNumber: asString(doc["SerialNumber"]),
SerialNumber: strings.TrimSpace(asString(doc["SerialNumber"])),
PartNumber: asString(doc["PartNumber"]),
Firmware: asString(doc["FirmwareVersion"]),
Status: mapStatus(doc["Status"]),
@@ -1877,11 +1969,45 @@ func isGenericPCIeClassLabel(v string) bool {
}
}
// normalizeRedfishIdentityField trims surrounding whitespace from v and
// returns "" when what remains is empty or is one of the placeholder values
// "n/a", "na", "none", "null", "unknown" or "0" (compared case-insensitively).
// Any other value is returned trimmed, with its original letter case.
func normalizeRedfishIdentityField(v string) string {
	trimmed := strings.TrimSpace(v)
	folded := strings.ToLower(trimmed)
	for _, placeholder := range [...]string{"n/a", "na", "none", "null", "unknown", "0"} {
		if folded == placeholder {
			return ""
		}
	}
	// An empty input falls through here and is returned as "".
	return trimmed
}
// gpuDedupKey returns the key used to recognise the same GPU reported by
// several Redfish collections. Candidates, in order of preference:
//
//  1. the serial number, unless it is a placeholder such as "N/A";
//  2. the PCI bus/device/function address;
//  3. "slot|model", or the slot alone when the model is unknown.
//
// It returns "" when the device has none of these, so callers can apply their
// empty-key handling. Previously the joined slot/model string always
// contained "|", so the slot-only fallback was unreachable and every
// unidentifiable device shared the key "|".
func gpuDedupKey(gpu models.GPU) string {
	if serial := normalizeRedfishIdentityField(gpu.SerialNumber); serial != "" {
		return serial
	}
	if bdf := strings.TrimSpace(gpu.BDF); bdf != "" {
		return bdf
	}

	slot := strings.TrimSpace(gpu.Slot)
	model := strings.TrimSpace(gpu.Model)
	switch {
	case slot == "" && model == "":
		return ""
	case model == "":
		return slot
	default:
		// A model without a slot keeps the historical "|model" form.
		return slot + "|" + model
	}
}
func looksLikeGPU(doc map[string]interface{}, functionDocs []map[string]interface{}) bool {
deviceType := strings.ToLower(asString(doc["DeviceType"]))
if strings.Contains(deviceType, "gpu") || strings.Contains(deviceType, "graphics") || strings.Contains(deviceType, "accelerator") {
return true
}
if strings.Contains(deviceType, "network") {
return false
}
if oem, ok := doc["Oem"].(map[string]interface{}); ok {
if public, ok := oem["Public"].(map[string]interface{}); ok {
if dc := strings.ToLower(asString(public["DeviceClass"])); strings.Contains(dc, "network") {
return false
}
}
}
modelText := strings.ToLower(strings.Join([]string{
asString(doc["Name"]),
@@ -2378,6 +2504,7 @@ func redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths []stri
add(p)
add(joinPath(p, "/Bios"))
add(joinPath(p, "/SecureBoot"))
add(joinPath(p, "/Oem/Public/FRU"))
add(joinPath(p, "/Processors"))
add(joinPath(p, "/Memory"))
add(joinPath(p, "/EthernetInterfaces"))
@@ -2395,6 +2522,7 @@ func redfishSnapshotPrioritySeeds(systemPaths, chassisPaths, managerPaths []stri
}
for _, p := range chassisPaths {
add(p)
add(joinPath(p, "/Oem/Public/FRU"))
add(joinPath(p, "/Sensors"))
add(joinPath(p, "/Thermal"))
add(joinPath(p, "/EnvironmentMetrics"))

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"sort"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
)
@@ -35,6 +36,7 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
chassisPaths := r.discoverMemberPaths("/redfish/v1/Chassis", "/redfish/v1/Chassis/1")
managerPaths := r.discoverMemberPaths("/redfish/v1/Managers", "/redfish/v1/Managers/1")
primarySystem := firstPathOrDefault(systemPaths, "/redfish/v1/Systems/1")
primaryChassis := firstPathOrDefault(chassisPaths, "/redfish/v1/Chassis/1")
primaryManager := firstPathOrDefault(managerPaths, "/redfish/v1/Managers/1")
if emit != nil {
@@ -44,8 +46,15 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
if err != nil {
return nil, fmt.Errorf("system info: %w", err)
}
chassisDoc, _ := r.getJSON(primaryChassis)
biosDoc, _ := r.getJSON(joinPath(primarySystem, "/Bios"))
secureBootDoc, _ := r.getJSON(joinPath(primarySystem, "/SecureBoot"))
systemFRUDoc, _ := r.getJSON(joinPath(primarySystem, "/Oem/Public/FRU"))
chassisFRUDoc, _ := r.getJSON(joinPath(primaryChassis, "/Oem/Public/FRU"))
fruDoc := systemFRUDoc
if len(fruDoc) == 0 {
fruDoc = chassisFRUDoc
}
if emit != nil {
emit(Progress{Status: "running", Progress: 55, Message: "Redfish snapshot: replay CPU/RAM/Storage..."})
@@ -71,7 +80,7 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
Sensors: make([]models.SensorReading, 0),
RawPayloads: cloneRawPayloads(rawPayloads),
Hardware: &models.HardwareConfig{
BoardInfo: parseBoardInfo(systemDoc),
BoardInfo: parseBoardInfoWithFallback(systemDoc, chassisDoc, fruDoc),
CPUs: parseCPUs(processors),
Memory: parseMemory(memory),
Storage: storageDevices,
@@ -83,9 +92,72 @@ func ReplayRedfishFromRawPayloads(rawPayloads map[string]any, emit ProgressFn) (
Firmware: parseFirmware(systemDoc, biosDoc, managerDoc, secureBootDoc, networkProtocolDoc),
},
}
appendMissingServerModelWarning(result, systemDoc, joinPath(primarySystem, "/Oem/Public/FRU"), joinPath(primaryChassis, "/Oem/Public/FRU"))
return result, nil
}
// appendMissingServerModelWarning adds a "Collection Warning" event to result
// when the resolved board ProductName is blank. The event's RawData lists
// every reason that could be determined: a placeholder Model in systemDoc
// and/or recorded fetch errors for the two FRU paths. A generic reason is
// used when none of those apply. It does nothing when result or its Hardware
// is nil, or when a product name is present.
func appendMissingServerModelWarning(result *models.AnalysisResult, systemDoc map[string]interface{}, systemFRUPath, chassisFRUPath string) {
	if result == nil || result.Hardware == nil {
		return
	}
	if strings.TrimSpace(result.Hardware.BoardInfo.ProductName) != "" {
		return
	}

	var reasons []string

	// A non-empty Model that normalizes to "" is a vendor placeholder.
	if rawModel := strings.TrimSpace(asString(systemDoc["Model"])); rawModel != "" && normalizeRedfishIdentityField(rawModel) == "" {
		reasons = append(reasons, fmt.Sprintf("system model is placeholder: %q", rawModel))
	}

	fetchErrs := redfishFetchErrorsFromRawPayloads(result.RawPayloads)
	for _, fruPath := range []string{systemFRUPath, chassisFRUPath} {
		msg := fetchErrs[normalizeRedfishPath(fruPath)]
		if strings.TrimSpace(msg) == "" {
			continue
		}
		reasons = append(reasons, fmt.Sprintf("%s unavailable: %s", fruPath, msg))
	}

	if len(reasons) == 0 {
		reasons = []string{"no non-placeholder ProductName/Model found in collected Redfish documents"}
	}

	warning := models.Event{
		Timestamp:   time.Now(),
		Source:      "Redfish",
		EventType:   "Collection Warning",
		Severity:    models.SeverityWarning,
		Description: "Server model is missing in collected Redfish data",
		RawData:     strings.Join(reasons, "; "),
	}
	result.Events = append(result.Events, warning)
}
// redfishFetchErrorsFromRawPayloads reads the "redfish_fetch_errors" entry of
// rawPayloads and converts it with redfishFetchErrorListToMap. The entry may
// be a []map[string]interface{} or a []interface{}; in the latter case
// elements that are not objects are dropped. A missing entry or any other
// type yields an empty, non-nil map.
func redfishFetchErrorsFromRawPayloads(rawPayloads map[string]any) map[string]string {
	// Indexing a nil or empty map is safe and simply reports "absent".
	raw, present := rawPayloads["redfish_fetch_errors"]
	if !present {
		return map[string]string{}
	}

	var entries []map[string]interface{}
	switch list := raw.(type) {
	case []map[string]interface{}:
		entries = list
	case []interface{}:
		entries = make([]map[string]interface{}, 0, len(list))
		for _, item := range list {
			if entry, ok := item.(map[string]interface{}); ok {
				entries = append(entries, entry)
			}
		}
	default:
		return map[string]string{}
	}
	return redfishFetchErrorListToMap(entries)
}
// redfishSnapshotReader reads Redfish documents from a previously captured
// snapshot held in memory; its collect* methods parse hardware inventory from
// that snapshot.
type redfishSnapshotReader struct {
// tree maps a Redfish resource path (for example
// "/redfish/v1/Chassis/1/PCIeDevices") to its decoded JSON document.
// NOTE(review): key format taken from the tests in this change — confirm
// whether paths are normalized before lookup.
tree map[string]interface{}
}
@@ -479,13 +551,15 @@ func (r redfishSnapshotReader) collectPSUs(chassisPaths []string) []models.PSU {
}
func (r redfishSnapshotReader) collectGPUs(systemPaths, chassisPaths []string) []models.GPU {
collections := make([]string, 0, len(systemPaths)*2+len(chassisPaths))
collections := make([]string, 0, len(systemPaths)*3+len(chassisPaths)*2)
for _, systemPath := range systemPaths {
collections = append(collections, joinPath(systemPath, "/PCIeDevices"))
collections = append(collections, joinPath(systemPath, "/Accelerators"))
collections = append(collections, joinPath(systemPath, "/GraphicsControllers"))
}
for _, chassisPath := range chassisPaths {
collections = append(collections, joinPath(chassisPath, "/PCIeDevices"))
collections = append(collections, joinPath(chassisPath, "/Accelerators"))
}
var out []models.GPU
seen := make(map[string]struct{})
@@ -502,7 +576,7 @@ func (r redfishSnapshotReader) collectGPUs(systemPaths, chassisPaths []string) [
}
gpu := parseGPU(doc, functionDocs, idx)
idx++
key := firstNonEmpty(gpu.SerialNumber, gpu.BDF, gpu.Slot+"|"+gpu.Model)
key := gpuDedupKey(gpu)
if key == "" {
continue
}

View File

@@ -421,3 +421,226 @@ func TestReplayCollectStorage_ProbesSupermicroNVMeDiskBayWhenCollectionEmpty(t *
t.Fatalf("expected size to be parsed from CapacityBytes")
}
}
// TestReplayCollectGPUs_DoesNotCollapseOnPlaceholderSerialAndSkipsNIC checks
// that two GPUs sharing the placeholder serial "N/A" are both reported, and
// that a device whose OEM DeviceClass is NetworkController is not treated as
// a GPU.
func TestReplayCollectGPUs_DoesNotCollapseOnPlaceholderSerialAndSkipsNIC(t *testing.T) {
	const collection = "/redfish/v1/Chassis/1/PCIeDevices"

	// card builds one PCIe device document; only the varying fields are
	// parameters.
	card := func(id, model, serial, deviceClass string) map[string]interface{} {
		return map[string]interface{}{
			"Id":           id,
			"Name":         "PCIeCard" + id,
			"Model":        model,
			"Manufacturer": "NVIDIA",
			"SerialNumber": serial,
			"Oem": map[string]interface{}{
				"Public": map[string]interface{}{
					"DeviceClass": deviceClass,
				},
			},
		}
	}

	tree := map[string]interface{}{
		collection: map[string]interface{}{
			"Members": []interface{}{
				map[string]interface{}{"@odata.id": collection + "/3"},
				map[string]interface{}{"@odata.id": collection + "/9"},
				map[string]interface{}{"@odata.id": collection + "/7"},
			},
		},
		collection + "/3": card("3", "H200-SXM5-141G", "N/A", "DisplayController"),
		collection + "/9": card("9", "H200-SXM5-141G", "N/A", "DisplayController"),
		collection + "/7": card("7", "MCX631102AN-ADAT", "MT2538J00CZE", "NetworkController"),
	}
	reader := redfishSnapshotReader{tree: tree}

	gpus := reader.collectGPUs(nil, []string{"/redfish/v1/Chassis/1"})
	if count := len(gpus); count != 2 {
		t.Fatalf("expected 2 GPUs (two H200 cards), got %d", count)
	}
	for i := range gpus {
		if gpus[i].Model == "MCX631102AN-ADAT" {
			t.Fatalf("network adapter should not be classified as GPU")
		}
	}
}
// TestParseBoardInfo_NormalizesNullPlaceholders checks that placeholder
// values ("NULL", "0 ") are reported as empty while a real serial number is
// passed through.
func TestParseBoardInfo_NormalizesNullPlaceholders(t *testing.T) {
	systemDoc := map[string]interface{}{
		"Manufacturer": "NULL",
		"Model":        "NULL",
		"SerialNumber": "23E100051",
		"PartNumber":   "0 ",
		"UUID":         "fa403f6f-2ee9-11f0-bab9-346f1104085a",
	}
	board := parseBoardInfo(systemDoc)

	// Cases are evaluated top to bottom, so the first violated expectation
	// is the one reported.
	switch {
	case board.Manufacturer != "":
		t.Fatalf("expected empty manufacturer, got %q", board.Manufacturer)
	case board.ProductName != "":
		t.Fatalf("expected empty product name, got %q", board.ProductName)
	case board.PartNumber != "":
		t.Fatalf("expected empty part number, got %q", board.PartNumber)
	case board.SerialNumber != "23E100051":
		t.Fatalf("unexpected serial number: %q", board.SerialNumber)
	}
}
// TestShouldCrawlPath_SkipsJsonSchemas checks that the JsonSchemas collection
// and its members are excluded from crawling while an ordinary system path is
// still crawled.
func TestShouldCrawlPath_SkipsJsonSchemas(t *testing.T) {
	expectations := []struct {
		path    string
		crawl   bool
		failure string
	}{
		{"/redfish/v1/JsonSchemas", false, "expected /JsonSchemas to be skipped"},
		{"/redfish/v1/JsonSchemas/ComputerSystem.v1_8_0", false, "expected JsonSchemas members to be skipped"},
		{"/redfish/v1/Systems/1", true, "expected normal hardware path to be crawled"},
	}
	for _, e := range expectations {
		if shouldCrawlPath(e.path) != e.crawl {
			t.Fatal(e.failure)
		}
	}
}
// TestReplayCollectGPUs_FromGraphicsControllers checks that GPUs listed only
// under a system's GraphicsControllers collection are collected together with
// their serial numbers.
func TestReplayCollectGPUs_FromGraphicsControllers(t *testing.T) {
	const collection = "/redfish/v1/Systems/1/GraphicsControllers"

	// controller builds one GraphicsControllers member document.
	controller := func(id, serial string) map[string]interface{} {
		return map[string]interface{}{
			"Id":           id,
			"Name":         id,
			"Model":        "H200-SXM5-141G",
			"Manufacturer": "NVIDIA",
			"SerialNumber": serial,
			"Status":       map[string]interface{}{"State": "Enabled", "Health": "OK"},
		}
	}

	reader := redfishSnapshotReader{tree: map[string]interface{}{
		collection: map[string]interface{}{
			"Members": []interface{}{
				map[string]interface{}{"@odata.id": collection + "/GPU0"},
				map[string]interface{}{"@odata.id": collection + "/GPU1"},
			},
		},
		collection + "/GPU0": controller("GPU0", "1654225094493"),
		collection + "/GPU1": controller("GPU1", "1654425002635"),
	}}

	gpus := reader.collectGPUs([]string{"/redfish/v1/Systems/1"}, nil)
	if count := len(gpus); count != 2 {
		t.Fatalf("expected 2 GPUs from GraphicsControllers, got %d", count)
	}
	for i := 0; i < 2; i++ {
		if gpus[i].SerialNumber == "" {
			t.Fatalf("expected GPU serial numbers from GraphicsControllers")
		}
	}
}
// TestParseBoardInfoWithFallback_UsesFRU checks that placeholder manufacturer
// and model values in the system and chassis documents are replaced by the
// nested FRU values, while the system's real serial number is kept.
func TestParseBoardInfoWithFallback_UsesFRU(t *testing.T) {
	systemDoc := map[string]interface{}{
		"Manufacturer": "NULL",
		"Model":        "NULL",
		"SerialNumber": "23E100051",
		"PartNumber":   "0",
	}
	chassisDoc := map[string]interface{}{
		"Manufacturer": "NULL",
		"Model":        "NULL",
	}
	// The useful values sit two levels deep to exercise the nested lookup.
	boardSection := map[string]interface{}{
		"Manufacturer": "Kaytus",
		"ProductName":  "KR4268X2",
	}
	fruDoc := map[string]interface{}{
		"FRUInfo": map[string]interface{}{"Board": boardSection},
	}

	board := parseBoardInfoWithFallback(systemDoc, chassisDoc, fruDoc)

	switch {
	case board.ProductName != "KR4268X2":
		t.Fatalf("expected product from FRU, got %q", board.ProductName)
	case board.Manufacturer != "Kaytus":
		t.Fatalf("expected manufacturer from FRU, got %q", board.Manufacturer)
	case board.SerialNumber != "23E100051":
		t.Fatalf("expected serial from system, got %q", board.SerialNumber)
	}
}
// TestReplayRedfishFromRawPayloads_AddsMissingServerModelWarning replays a
// snapshot whose system and chassis documents carry only placeholder model
// data, plus a recorded fetch error for the system FRU path. It expects the
// product name to stay empty and a Redfish "Collection Warning" event to be
// emitted.
func TestReplayRedfishFromRawPayloads_AddsMissingServerModelWarning(t *testing.T) {
	// link builds an {"@odata.id": path} reference object.
	link := func(path string) map[string]interface{} {
		return map[string]interface{}{"@odata.id": path}
	}
	// singleMember builds a collection document with exactly one member.
	singleMember := func(path string) map[string]interface{} {
		return map[string]interface{}{"Members": []interface{}{link(path)}}
	}

	tree := map[string]interface{}{
		"/redfish/v1": map[string]interface{}{
			"Systems":  link("/redfish/v1/Systems"),
			"Chassis":  link("/redfish/v1/Chassis"),
			"Managers": link("/redfish/v1/Managers"),
		},
		"/redfish/v1/Systems": singleMember("/redfish/v1/Systems/1"),
		"/redfish/v1/Systems/1": map[string]interface{}{
			"Manufacturer": "NULL",
			"Model":        "NULL",
			"SerialNumber": "23E100051",
		},
		"/redfish/v1/Chassis": singleMember("/redfish/v1/Chassis/1"),
		"/redfish/v1/Chassis/1": map[string]interface{}{
			"Id":           "1",
			"Manufacturer": "NULL",
			"Model":        "NULL",
		},
		"/redfish/v1/Managers": singleMember("/redfish/v1/Managers/1"),
		"/redfish/v1/Managers/1": map[string]interface{}{
			"Id": "1",
		},
	}
	payloads := map[string]any{
		"redfish_tree": tree,
		"redfish_fetch_errors": []map[string]interface{}{
			{"path": "/redfish/v1/Systems/1/Oem/Public/FRU", "error": "status 500"},
		},
	}

	result, err := ReplayRedfishFromRawPayloads(payloads, nil)
	if err != nil {
		t.Fatalf("replay failed: %v", err)
	}
	if result.Hardware == nil {
		t.Fatalf("expected hardware")
	}
	if model := result.Hardware.BoardInfo.ProductName; model != "" {
		t.Fatalf("expected empty model for warning test, got %q", model)
	}

	warned := false
	for i := 0; i < len(result.Events) && !warned; i++ {
		ev := result.Events[i]
		warned = ev.Source == "Redfish" && ev.EventType == "Collection Warning"
	}
	if !warned {
		t.Fatalf("expected collection warning event about missing server model")
	}
}