Update Inspur parsing and align release docs
This commit is contained in:
101
internal/parser/vendors/inspur/gpu_status.go
vendored
101
internal/parser/vendors/inspur/gpu_status.go
vendored
@@ -2,6 +2,7 @@ package inspur
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -15,38 +16,96 @@ func applyGPUStatusFromEvents(hw *models.HardwareConfig, events []models.Event)
|
||||
return
|
||||
}
|
||||
|
||||
faulty := make(map[int]bool)
|
||||
for _, e := range events {
|
||||
if !isGPUFaultEvent(e) {
|
||||
continue
|
||||
}
|
||||
|
||||
matches := reFaultGPU.FindAllStringSubmatch(e.Description, -1)
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(m[1])
|
||||
if err == nil && idx >= 0 {
|
||||
faulty[idx] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
gpuByIndex := make(map[int]*models.GPU)
|
||||
for i := range hw.GPUs {
|
||||
gpu := &hw.GPUs[i]
|
||||
idx, ok := extractLogicalGPUIndex(gpu.Slot)
|
||||
if ok && faulty[idx] {
|
||||
gpu.Status = "Critical"
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
gpuByIndex[idx] = gpu
|
||||
gpu.StatusHistory = nil
|
||||
gpu.ErrorDescription = ""
|
||||
}
|
||||
|
||||
if strings.TrimSpace(gpu.Status) == "" {
|
||||
relevantEvents := make([]models.Event, 0)
|
||||
for _, e := range events {
|
||||
if !isGPUFaultEvent(e) || len(extractFaultyGPUSet(e.Description)) == 0 {
|
||||
continue
|
||||
}
|
||||
relevantEvents = append(relevantEvents, e)
|
||||
}
|
||||
|
||||
if len(relevantEvents) == 0 {
|
||||
for _, gpu := range gpuByIndex {
|
||||
if strings.TrimSpace(gpu.Status) == "" {
|
||||
gpu.Status = "OK"
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
sort.Slice(relevantEvents, func(i, j int) bool {
|
||||
return relevantEvents[i].Timestamp.Before(relevantEvents[j].Timestamp)
|
||||
})
|
||||
|
||||
currentStatus := make(map[int]string, len(gpuByIndex))
|
||||
lastCriticalDetails := make(map[int]string, len(gpuByIndex))
|
||||
for idx := range gpuByIndex {
|
||||
currentStatus[idx] = "OK"
|
||||
}
|
||||
|
||||
for _, e := range relevantEvents {
|
||||
faultySet := extractFaultyGPUSet(e.Description)
|
||||
for idx, gpu := range gpuByIndex {
|
||||
newStatus := "OK"
|
||||
if faultySet[idx] {
|
||||
newStatus = "Critical"
|
||||
lastCriticalDetails[idx] = strings.TrimSpace(e.Description)
|
||||
}
|
||||
|
||||
if currentStatus[idx] != newStatus {
|
||||
gpu.StatusHistory = append(gpu.StatusHistory, models.StatusHistoryEntry{
|
||||
Status: newStatus,
|
||||
ChangedAt: e.Timestamp,
|
||||
Details: strings.TrimSpace(e.Description),
|
||||
})
|
||||
gpu.StatusChangedAt = e.Timestamp
|
||||
currentStatus[idx] = newStatus
|
||||
}
|
||||
|
||||
gpu.StatusCheckedAt = e.Timestamp
|
||||
}
|
||||
}
|
||||
|
||||
for idx, gpu := range gpuByIndex {
|
||||
gpu.Status = currentStatus[idx]
|
||||
if gpu.Status == "Critical" {
|
||||
gpu.ErrorDescription = lastCriticalDetails[idx]
|
||||
} else {
|
||||
gpu.ErrorDescription = ""
|
||||
}
|
||||
if gpu.StatusCheckedAt.IsZero() && strings.TrimSpace(gpu.Status) == "" {
|
||||
gpu.Status = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func extractFaultyGPUSet(description string) map[int]bool {
|
||||
faulty := make(map[int]bool)
|
||||
matches := reFaultGPU.FindAllStringSubmatch(description, -1)
|
||||
for _, m := range matches {
|
||||
if len(m) < 2 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(m[1])
|
||||
if err == nil && idx >= 0 {
|
||||
faulty[idx] = true
|
||||
}
|
||||
}
|
||||
return faulty
|
||||
}
|
||||
|
||||
func isGPUFaultEvent(e models.Event) bool {
|
||||
desc := strings.ToLower(e.Description)
|
||||
if strings.Contains(desc, "bios miss f_gpu") {
|
||||
|
||||
69
internal/parser/vendors/inspur/hgx_firmware_test.go
vendored
Normal file
69
internal/parser/vendors/inspur/hgx_firmware_test.go
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
package inspur
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
func TestAppendHGXFirmwareFromHWInfo_AppendsInventoryEntries(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
Firmware: []models.FirmwareInfo{
|
||||
{DeviceName: "BIOS", Version: "1.0.0"},
|
||||
},
|
||||
}
|
||||
|
||||
content := []byte(`
|
||||
{
|
||||
"@odata.id": "/redfish/v1/UpdateService/FirmwareInventory/HGX_FW_BMC_0",
|
||||
"Id": "HGX_FW_BMC_0",
|
||||
"Oem": {
|
||||
"Nvidia": {
|
||||
"ActiveFirmwareSlot": {"Version": "25.05-A"},
|
||||
"InactiveFirmwareSlot": {"Version": "25.04-B"}
|
||||
}
|
||||
},
|
||||
"Version": "25.05-A",
|
||||
"WriteProtected": false
|
||||
}
|
||||
{
|
||||
"@odata.id": "/redfish/v1/UpdateService/FirmwareInventory/HGX_FW_GPU_SXM_1",
|
||||
"Id": "HGX_FW_GPU_SXM_1",
|
||||
"Version": "97.00.C5.00.0E",
|
||||
"WriteProtected": false
|
||||
}
|
||||
{
|
||||
"@odata.id": "/redfish/v1/UpdateService/FirmwareInventory/HGX_Driver_GPU_SXM_1",
|
||||
"Id": "HGX_Driver_GPU_SXM_1",
|
||||
"Version": "",
|
||||
"WriteProtected": false
|
||||
}
|
||||
`)
|
||||
|
||||
appendHGXFirmwareFromHWInfo(content, hw)
|
||||
|
||||
if len(hw.Firmware) != 5 {
|
||||
t.Fatalf("expected 5 firmware entries after append, got %d", len(hw.Firmware))
|
||||
}
|
||||
|
||||
seen := make(map[string]string)
|
||||
for _, fw := range hw.Firmware {
|
||||
seen[fw.DeviceName] = fw.Version
|
||||
}
|
||||
|
||||
if seen["HGX_FW_BMC_0"] != "25.05-A" {
|
||||
t.Fatalf("expected HGX_FW_BMC_0 version 25.05-A, got %q", seen["HGX_FW_BMC_0"])
|
||||
}
|
||||
if seen["HGX_FW_BMC_0 Active Slot"] != "25.05-A" {
|
||||
t.Fatalf("expected active slot version, got %q", seen["HGX_FW_BMC_0 Active Slot"])
|
||||
}
|
||||
if seen["HGX_FW_BMC_0 Inactive Slot"] != "25.04-B" {
|
||||
t.Fatalf("expected inactive slot version, got %q", seen["HGX_FW_BMC_0 Inactive Slot"])
|
||||
}
|
||||
if seen["HGX_FW_GPU_SXM_1"] != "97.00.C5.00.0E" {
|
||||
t.Fatalf("expected GPU FW entry, got %q", seen["HGX_FW_GPU_SXM_1"])
|
||||
}
|
||||
if _, ok := seen["HGX_Driver_GPU_SXM_1"]; ok {
|
||||
t.Fatalf("did not expect empty version driver entry")
|
||||
}
|
||||
}
|
||||
@@ -24,6 +24,10 @@ func TestEnrichGPUsFromHGXHWInfo_UsesHGXLogicalMapping(t *testing.T) {
|
||||
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN3","SerialNumber":"SXM3SN"}
|
||||
# curl -X GET http://127.0.0.1/redfish/v1/Chassis/HGX_GPU_SXM_5/Assembly
|
||||
{"Name":"GPU Board Assembly","Model":"B200 180GB HBM3e","PartNumber":"PN5","SerialNumber":"SXM5SN"}
|
||||
{"Id":"HGX_FW_GPU_SXM_1","Version":"FW1"}
|
||||
{"Id":"HGX_FW_GPU_SXM_3","Version":"FW3"}
|
||||
{"Id":"HGX_FW_GPU_SXM_5","Version":"FW5"}
|
||||
{"Id":"HGX_InfoROM_GPU_SXM_3","Version":"IR3"}
|
||||
`)
|
||||
|
||||
enrichGPUsFromHGXHWInfo(content, hw)
|
||||
@@ -37,6 +41,15 @@ func TestEnrichGPUsFromHGXHWInfo_UsesHGXLogicalMapping(t *testing.T) {
|
||||
if hw.GPUs[2].SerialNumber != "SXM5SN" {
|
||||
t.Fatalf("expected #GPU0 to map to SXM5 serial, got %q", hw.GPUs[2].SerialNumber)
|
||||
}
|
||||
if hw.GPUs[0].Firmware != "FW3" {
|
||||
t.Fatalf("expected #GPU6 firmware FW3, got %q", hw.GPUs[0].Firmware)
|
||||
}
|
||||
if hw.GPUs[0].VideoBIOS != "IR3" {
|
||||
t.Fatalf("expected #GPU6 InfoROM in VideoBIOS IR3, got %q", hw.GPUs[0].VideoBIOS)
|
||||
}
|
||||
if hw.GPUs[2].Firmware != "FW5" {
|
||||
t.Fatalf("expected #GPU0 firmware FW5, got %q", hw.GPUs[2].Firmware)
|
||||
}
|
||||
for _, g := range hw.GPUs {
|
||||
if g.Slot == "#CPU0_PE1_E_BMC" {
|
||||
t.Fatalf("expected non-HGX BMC VGA entry to be filtered out")
|
||||
@@ -104,6 +117,44 @@ func TestApplyGPUStatusFromEvents_MarksFaultedGPU(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyGPUStatusFromEvents_UsesLatestEventAsCurrentStatusAndKeepsHistory(t *testing.T) {
|
||||
hw := &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
{Slot: "#GPU1"},
|
||||
{Slot: "#GPU3"},
|
||||
{Slot: "#GPU6"},
|
||||
},
|
||||
}
|
||||
|
||||
events := []models.Event{
|
||||
{
|
||||
ID: "17FFB002",
|
||||
Timestamp: time.Date(2026, 1, 12, 22, 51, 16, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS miss F_GPU1 F_GPU3 F_GPU6",
|
||||
},
|
||||
{
|
||||
ID: "17FFB002",
|
||||
Timestamp: time.Date(2026, 1, 12, 23, 5, 18, 0, time.FixedZone("UTC+8", 8*3600)),
|
||||
Description: "PCIe Present mismatch BIOS miss F_GPU6",
|
||||
},
|
||||
}
|
||||
|
||||
applyGPUStatusFromEvents(hw, events)
|
||||
|
||||
if hw.GPUs[0].Status != "OK" {
|
||||
t.Fatalf("expected #GPU1 to recover to OK on latest event, got %q", hw.GPUs[0].Status)
|
||||
}
|
||||
if hw.GPUs[1].Status != "OK" {
|
||||
t.Fatalf("expected #GPU3 to recover to OK on latest event, got %q", hw.GPUs[1].Status)
|
||||
}
|
||||
if hw.GPUs[2].Status != "Critical" {
|
||||
t.Fatalf("expected #GPU6 to remain Critical, got %q", hw.GPUs[2].Status)
|
||||
}
|
||||
if len(hw.GPUs[0].StatusHistory) == 0 {
|
||||
t.Fatalf("expected #GPU1 status history to be populated")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseIDLLog_ParsesStructuredJSONLine(t *testing.T) {
|
||||
content := []byte(`{ "MESSAGE": "|2026-01-12T23:05:18+08:00|PCIE|Assert|Critical|17FFB002|PCIe Present mismatch BIOS miss F_GPU6 - Assert|" }`)
|
||||
|
||||
|
||||
185
internal/parser/vendors/inspur/hgx_hwinfo.go
vendored
185
internal/parser/vendors/inspur/hgx_hwinfo.go
vendored
@@ -15,6 +15,18 @@ type hgxGPUAssemblyInfo struct {
|
||||
Serial string
|
||||
}
|
||||
|
||||
// hgxGPUFirmwareInfo holds the version strings collected for one HGX GPU
// (keyed by SXM number in parseHGXGPUFirmware).
//
// Firmware is the "Version" of the HGX_FW_GPU_SXM_<n> inventory object and
// InfoROM is the "Version" of the HGX_InfoROM_GPU_SXM_<n> object. Either may be
// empty when the corresponding object was absent or had no version.
type hgxGPUFirmwareInfo struct {
	Firmware, InfoROM string
}
|
||||
|
||||
// hgxFirmwareInventoryEntry is one firmware inventory object extracted from the
// HGX hardware-info dump by parseHGXFirmwareInventory.
type hgxFirmwareInventoryEntry struct {
	// ID is the object's "Id" value; Version is its top-level "Version".
	ID, Version string

	// ActiveVersion and InactiveVersion are the "Version" values found after
	// the "ActiveFirmwareSlot" and "InactiveFirmwareSlot" markers. They stay
	// empty when the object has no such slots.
	ActiveVersion, InactiveVersion string
}
|
||||
|
||||
// Logical GPU index mapping used by HGX B200 UI ordering.
|
||||
// Example from real logs/UI:
|
||||
// GPU0->SXM5, GPU1->SXM7, GPU2->SXM6, GPU3->SXM8, GPU4->SXM2, GPU5->SXM4, GPU6->SXM3, GPU7->SXM1.
|
||||
@@ -31,6 +43,10 @@ var hgxLogicalToSXM = map[int]int{
|
||||
|
||||
// Regular expressions used to pick values out of the HGX hardware-info dump.
// All of them are compiled once at package initialisation.
var (
	// reIDLine captures the value of an "Id" member on a single line.
	reIDLine = regexp.MustCompile(`"Id":\s*"([^"]+)"`)

	// reVersion captures the (possibly empty) value of a "Version" member.
	reVersion = regexp.MustCompile(`"Version":\s*"([^"]*)"`)

	// reSlotGPU captures the number following "gpu" in a slot label,
	// case-insensitively and tolerating an optional '#' and spaces.
	reSlotGPU = regexp.MustCompile(`(?i)gpu\s*#?\s*(\d+)`)

	// reHGXGPUBlock spans newlines ((?s)) and captures, in order: the SXM
	// number from the Assembly URL, then the Model, PartNumber and SerialNumber
	// of the following "GPU Board Assembly" object.
	reHGXGPUBlock = regexp.MustCompile(`(?s)/redfish/v1/Chassis/HGX_GPU_SXM_(\d+)/Assembly.*?"Name":\s*"GPU Board Assembly".*?"Model":\s*"([^"]+)".*?"PartNumber":\s*"([^"]+)".*?"SerialNumber":\s*"([^"]+)"`)

	// reHGXFWBlock captures the SXM number of an HGX_FW_GPU_SXM_<n> object and
	// the first "Version" value that follows its "Id".
	reHGXFWBlock = regexp.MustCompile(`(?s)"Id":\s*"HGX_FW_GPU_SXM_(\d+)".*?"Version":\s*"([^"]*)"`)

	// reHGXInfoROM is the same as reHGXFWBlock for HGX_InfoROM_GPU_SXM_<n>.
	reHGXInfoROM = regexp.MustCompile(`(?s)"Id":\s*"HGX_InfoROM_GPU_SXM_(\d+)".*?"Version":\s*"([^"]*)"`)
)
|
||||
|
||||
@@ -43,6 +59,7 @@ func enrichGPUsFromHGXHWInfo(content []byte, hw *models.HardwareConfig) {
|
||||
if len(bySXM) == 0 {
|
||||
return
|
||||
}
|
||||
fwBySXM := parseHGXGPUFirmware(content)
|
||||
|
||||
normalizeHGXGPUInventory(hw, bySXM)
|
||||
|
||||
@@ -72,6 +89,57 @@ func enrichGPUsFromHGXHWInfo(content []byte, hw *models.HardwareConfig) {
|
||||
if strings.TrimSpace(gpu.Manufacturer) == "" {
|
||||
gpu.Manufacturer = "NVIDIA"
|
||||
}
|
||||
if fw, ok := fwBySXM[sxm]; ok {
|
||||
if strings.TrimSpace(gpu.Firmware) == "" && strings.TrimSpace(fw.Firmware) != "" {
|
||||
gpu.Firmware = fw.Firmware
|
||||
}
|
||||
if strings.TrimSpace(gpu.VideoBIOS) == "" && strings.TrimSpace(fw.InfoROM) != "" {
|
||||
gpu.VideoBIOS = fw.InfoROM
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendHGXFirmwareFromHWInfo(content []byte, hw *models.HardwareConfig) {
|
||||
if hw == nil || len(content) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
entries := parseHGXFirmwareInventory(content)
|
||||
if len(entries) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
existing := make(map[string]bool, len(hw.Firmware))
|
||||
for _, fw := range hw.Firmware {
|
||||
key := strings.ToLower(strings.TrimSpace(fw.DeviceName) + "|" + strings.TrimSpace(fw.Version))
|
||||
existing[key] = true
|
||||
}
|
||||
|
||||
appendFW := func(name, version string) {
|
||||
name = strings.TrimSpace(name)
|
||||
version = strings.TrimSpace(version)
|
||||
if name == "" || version == "" {
|
||||
return
|
||||
}
|
||||
key := strings.ToLower(name + "|" + version)
|
||||
if existing[key] {
|
||||
return
|
||||
}
|
||||
existing[key] = true
|
||||
hw.Firmware = append(hw.Firmware, models.FirmwareInfo{
|
||||
DeviceName: name,
|
||||
Version: version,
|
||||
})
|
||||
}
|
||||
|
||||
for _, e := range entries {
|
||||
appendFW(e.ID, e.Version)
|
||||
|
||||
if e.ActiveVersion != "" && e.InactiveVersion != "" && e.ActiveVersion != e.InactiveVersion {
|
||||
appendFW(e.ID+" Active Slot", e.ActiveVersion)
|
||||
appendFW(e.ID+" Inactive Slot", e.InactiveVersion)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +165,123 @@ func parseHGXGPUAssembly(content []byte) map[int]hgxGPUAssemblyInfo {
|
||||
return result
|
||||
}
|
||||
|
||||
func parseHGXGPUFirmware(content []byte) map[int]hgxGPUFirmwareInfo {
|
||||
result := make(map[int]hgxGPUFirmwareInfo)
|
||||
|
||||
matchesFW := reHGXFWBlock.FindAllSubmatch(content, -1)
|
||||
for _, m := range matchesFW {
|
||||
if len(m) != 3 {
|
||||
continue
|
||||
}
|
||||
sxmIdx, err := strconv.Atoi(string(m[1]))
|
||||
if err != nil || sxmIdx <= 0 {
|
||||
continue
|
||||
}
|
||||
version := strings.TrimSpace(string(m[2]))
|
||||
if version == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
current := result[sxmIdx]
|
||||
if current.Firmware == "" {
|
||||
current.Firmware = version
|
||||
}
|
||||
result[sxmIdx] = current
|
||||
}
|
||||
|
||||
matchesInfoROM := reHGXInfoROM.FindAllSubmatch(content, -1)
|
||||
for _, m := range matchesInfoROM {
|
||||
if len(m) != 3 {
|
||||
continue
|
||||
}
|
||||
sxmIdx, err := strconv.Atoi(string(m[1]))
|
||||
if err != nil || sxmIdx <= 0 {
|
||||
continue
|
||||
}
|
||||
version := strings.TrimSpace(string(m[2]))
|
||||
if version == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
current := result[sxmIdx]
|
||||
if current.InfoROM == "" {
|
||||
current.InfoROM = version
|
||||
}
|
||||
result[sxmIdx] = current
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func parseHGXFirmwareInventory(content []byte) []hgxFirmwareInventoryEntry {
|
||||
lines := strings.Split(string(content), "\n")
|
||||
result := make([]hgxFirmwareInventoryEntry, 0)
|
||||
|
||||
var current *hgxFirmwareInventoryEntry
|
||||
section := ""
|
||||
|
||||
flush := func() {
|
||||
if current == nil {
|
||||
return
|
||||
}
|
||||
if current.Version == "" && current.ActiveVersion == "" && current.InactiveVersion == "" {
|
||||
current = nil
|
||||
section = ""
|
||||
return
|
||||
}
|
||||
result = append(result, *current)
|
||||
current = nil
|
||||
section = ""
|
||||
}
|
||||
|
||||
for _, line := range lines {
|
||||
if m := reIDLine.FindStringSubmatch(line); len(m) > 1 {
|
||||
flush()
|
||||
id := strings.TrimSpace(m[1])
|
||||
if strings.HasPrefix(id, "HGX_") {
|
||||
current = &hgxFirmwareInventoryEntry{ID: id}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if current == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.Contains(line, `"ActiveFirmwareSlot"`) {
|
||||
section = "active"
|
||||
}
|
||||
if strings.Contains(line, `"InactiveFirmwareSlot"`) {
|
||||
section = "inactive"
|
||||
}
|
||||
|
||||
if m := reVersion.FindStringSubmatch(line); len(m) > 1 {
|
||||
version := strings.TrimSpace(m[1])
|
||||
if version == "" {
|
||||
section = ""
|
||||
continue
|
||||
}
|
||||
switch section {
|
||||
case "active":
|
||||
if current.ActiveVersion == "" {
|
||||
current.ActiveVersion = version
|
||||
}
|
||||
case "inactive":
|
||||
if current.InactiveVersion == "" {
|
||||
current.InactiveVersion = version
|
||||
}
|
||||
default:
|
||||
// Keep top-level version from the last seen plain "Version" in current entry.
|
||||
current.Version = version
|
||||
}
|
||||
section = ""
|
||||
}
|
||||
}
|
||||
flush()
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func extractLogicalGPUIndex(slot string) (int, bool) {
|
||||
m := reSlotGPU.FindStringSubmatch(slot)
|
||||
if len(m) < 2 {
|
||||
|
||||
1
internal/parser/vendors/inspur/parser.go
vendored
1
internal/parser/vendors/inspur/parser.go
vendored
@@ -161,6 +161,7 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
|
||||
// Enrich GPU inventory from HGX Redfish snapshot (serial/model/part mapping).
|
||||
if f := parser.FindFileByName(files, "HGX_HWInfo_FWVersion.log"); f != nil && result.Hardware != nil {
|
||||
enrichGPUsFromHGXHWInfo(f.Content, result.Hardware)
|
||||
appendHGXFirmwareFromHWInfo(f.Content, result.Hardware)
|
||||
}
|
||||
|
||||
// Mark problematic GPUs from IDL errors like "BIOS miss F_GPU6".
|
||||
|
||||
Reference in New Issue
Block a user