Fix NVIDIA GPU/NVSwitch parsing and Reanimator export statuses

This commit is contained in:
2026-02-15 21:00:30 +03:00
parent 0af3cee9b6
commit c7b2a7ab29
12 changed files with 695 additions and 92 deletions

View File

@@ -0,0 +1,178 @@
package nvidia
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
var (
gpuNameWithSerialRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
gpuNameSlotOnlyRegex = regexp.MustCompile(`^SXM(\d+)$`)
skuModelRegex = regexp.MustCompile(`sku_hgx-([a-z0-9]+)-\d+-gpu`)
skuCodeRegex = regexp.MustCompile(`^(G\d{3})[.-](\d{4})`)
)
type testSpecData struct {
Actions []struct {
VirtualID string `json:"virtual_id"`
Args struct {
SKUToFile map[string]string `json:"sku_to_sku_json_file_map"`
} `json:"args"`
} `json:"actions"`
}
type inventoryFieldDiagSummary struct {
ModsRuns []struct {
ModsHeader []struct {
GPUName string `json:"GpuName"`
BoardInfo string `json:"BoardInfo"`
} `json:"ModsHeader"`
} `json:"ModsRuns"`
}
// ApplyGPUModelsFromSKU updates GPU model names using SKU mapping from testspec.json.
// Mapping source:
// - inventory/fieldiag_summary.json: GPUName -> BoardInfo(SKU)
// - testspec.json: SKU -> sku_hgx-... filename
func ApplyGPUModelsFromSKU(files []parser.ExtractedFile, result *models.AnalysisResult) {
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
return
}
skuToFile := parseSKUToFileMap(files)
if len(skuToFile) == 0 {
return
}
serialToSKU, slotToSKU := parseGPUSKUMapping(files)
if len(serialToSKU) == 0 && len(slotToSKU) == 0 {
return
}
for i := range result.Hardware.GPUs {
gpu := &result.Hardware.GPUs[i]
sku := ""
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
sku = serialToSKU[serial]
}
if sku == "" {
sku = slotToSKU[strings.TrimSpace(gpu.Slot)]
}
if sku == "" {
continue
}
model := resolveModelFromSKU(sku, skuToFile)
if model == "" {
continue
}
gpu.Model = model
}
}
func parseSKUToFileMap(files []parser.ExtractedFile) map[string]string {
specFile := parser.FindFileByName(files, "testspec.json")
if specFile == nil {
return nil
}
var spec testSpecData
if err := json.Unmarshal(specFile.Content, &spec); err != nil {
return nil
}
result := make(map[string]string)
for _, action := range spec.Actions {
for sku, file := range action.Args.SKUToFile {
normSKU := normalizeSKUCode(sku)
if normSKU == "" {
continue
}
result[normSKU] = strings.TrimSpace(file)
}
}
return result
}
func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[string]string) {
var summaryFile *parser.ExtractedFile
for _, f := range files {
path := strings.ToLower(f.Path)
if strings.Contains(path, "inventory/fieldiag_summary.json") ||
strings.Contains(path, "inventory\\fieldiag_summary.json") {
summaryFile = &f
break
}
}
if summaryFile == nil {
return nil, nil
}
var summary inventoryFieldDiagSummary
if err := json.Unmarshal(summaryFile.Content, &summary); err != nil {
return nil, nil
}
serialToSKU := make(map[string]string)
slotToSKU := make(map[string]string)
for _, run := range summary.ModsRuns {
for _, h := range run.ModsHeader {
sku := normalizeSKUCode(h.BoardInfo)
if sku == "" {
continue
}
gpuName := strings.TrimSpace(h.GPUName)
if matches := gpuNameWithSerialRegex.FindStringSubmatch(gpuName); len(matches) == 3 {
slotToSKU["GPUSXM"+matches[1]] = sku
serialToSKU[strings.TrimSpace(matches[2])] = sku
continue
}
if matches := gpuNameSlotOnlyRegex.FindStringSubmatch(gpuName); len(matches) == 2 {
slotToSKU["GPUSXM"+matches[1]] = sku
}
}
}
return serialToSKU, slotToSKU
}
func resolveModelFromSKU(sku string, skuToFile map[string]string) string {
file := strings.ToLower(strings.TrimSpace(skuToFile[normalizeSKUCode(sku)]))
if file == "" {
return ""
}
m := skuModelRegex.FindStringSubmatch(file)
if len(m) != 2 {
return ""
}
gpuFamily := strings.ToUpper(strings.TrimSpace(m[1]))
if gpuFamily == "" {
return ""
}
return fmt.Sprintf("NVIDIA %s SXM", gpuFamily)
}
func normalizeSKUCode(v string) string {
s := strings.TrimSpace(strings.ToUpper(v))
if s == "" {
return ""
}
if m := skuCodeRegex.FindStringSubmatch(s); len(m) == 3 {
return m[1] + "-" + m[2]
}
return s
}

View File

@@ -0,0 +1,56 @@
package nvidia
import (
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
func TestApplyGPUModelsFromSKU(t *testing.T) {
files := []parser.ExtractedFile{
{
Path: "inventory/fieldiag_summary.json",
Content: []byte(`{
"ModsRuns":[
{"ModsHeader":[
{"GpuName":"SXM5_SN_1653925025497","BoardInfo":"G520-0280"}
]}
]
}`),
},
{
Path: "testspec.json",
Content: []byte(`{
"actions":[
{
"virtual_id":"inventory",
"args":{
"sku_to_sku_json_file_map":{
"G520-0280":"sku_hgx-h200-8-gpu_141g_aircooled_field.json"
}
}
}
]
}`),
},
}
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
SerialNumber: "1653925025497",
Model: "NVIDIA Device 2335",
},
},
},
}
ApplyGPUModelsFromSKU(files, result)
if got := result.Hardware.GPUs[0].Model; got != "NVIDIA H200 SXM" {
t.Fatalf("expected model NVIDIA H200 SXM, got %q", got)
}
}

View File

@@ -14,7 +14,7 @@ import (
// parserVersion - version of this parser module
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.2.0"
const parserVersion = "1.2.4"
func init() {
parser.Register(&Parser{})
@@ -105,6 +105,7 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
result.Hardware = &models.HardwareConfig{
GPUs: make([]models.GPU, 0),
}
gpuStatuses := make(map[string]string)
// Parse output.log first (contains dmidecode system info)
// Find the output.log file that contains dmidecode output
@@ -133,18 +134,30 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
}
}
// Enhance GPU model names using SKU mapping from testspec + inventory summary.
ApplyGPUModelsFromSKU(files, result)
// Parse summary.json (test results summary)
if f := parser.FindFileByName(files, "summary.json"); f != nil {
events := ParseSummaryJSON(f.Content)
result.Events = append(result.Events, events...)
for componentID, status := range CollectGPUStatusesFromSummaryJSON(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
}
// Parse summary.csv (alternative format)
if f := parser.FindFileByName(files, "summary.csv"); f != nil {
csvEvents := ParseSummaryCSV(f.Content)
result.Events = append(result.Events, csvEvents...)
for componentID, status := range CollectGPUStatusesFromSummaryCSV(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
}
// Apply per-GPU PASS/FAIL status derived from summary files.
ApplyGPUStatuses(result, gpuStatuses)
// Parse GPU field diagnostics logs
gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")
for _, f := range gpuFieldiagFiles {

View File

@@ -104,6 +104,82 @@ func TestNVIDIAParser_RealArchive(t *testing.T) {
t.Logf("\nSXM2 failure events: %d", sxm2Events)
}
func TestNVIDIAParser_GPUStatusFromSummary_RealArchive07900(t *testing.T) {
archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
t.Skip("Test archive not found, skipping test")
}
files, err := parser.ExtractArchive(archivePath)
if err != nil {
t.Fatalf("Failed to extract archive: %v", err)
}
p := &Parser{}
result, err := p.Parse(files)
if err != nil {
t.Fatalf("Failed to parse archive: %v", err)
}
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
t.Fatalf("expected GPUs in parsed result")
}
statusBySerial := make(map[string]string, len(result.Hardware.GPUs))
for _, gpu := range result.Hardware.GPUs {
if gpu.SerialNumber != "" {
statusBySerial[gpu.SerialNumber] = gpu.Status
}
}
if got := statusBySerial["1653925025497"]; got != "FAIL" {
t.Fatalf("expected GPU serial 1653925025497 status FAIL, got %q", got)
}
for serial, st := range statusBySerial {
if serial == "1653925025497" {
continue
}
if st != "PASS" {
t.Fatalf("expected non-failing GPU serial %s status PASS, got %q", serial, st)
}
}
}
func TestNVIDIAParser_GPUModelFromSKU_RealArchive07900(t *testing.T) {
archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
t.Skip("Test archive not found, skipping test")
}
files, err := parser.ExtractArchive(archivePath)
if err != nil {
t.Fatalf("Failed to extract archive: %v", err)
}
p := &Parser{}
result, err := p.Parse(files)
if err != nil {
t.Fatalf("Failed to parse archive: %v", err)
}
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
t.Fatalf("expected GPUs in parsed result")
}
found := false
for _, gpu := range result.Hardware.GPUs {
if gpu.Model == "NVIDIA H200 SXM" {
found = true
break
}
}
if !found {
t.Fatalf("expected at least one GPU model NVIDIA H200 SXM")
}
}
func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) &&
(s[:len(substr)] == substr || s[len(s)-len(substr):] == substr ||
@@ -118,4 +194,3 @@ func findSubstring(s, substr string) bool {
}
return false
}

View File

@@ -4,6 +4,7 @@ import (
"encoding/csv"
"encoding/json"
"fmt"
"regexp"
"strings"
"time"
@@ -20,6 +21,8 @@ type SummaryEntry struct {
IgnoreError string `json:"Ignore Error"`
}
var gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
// ParseSummaryJSON parses summary.json file and returns events
func ParseSummaryJSON(content []byte) []models.Event {
var entries []SummaryEntry
@@ -92,6 +95,124 @@ func ParseSummaryCSV(content []byte) []models.Event {
return events
}
// CollectGPUStatusesFromSummaryJSON extracts per-GPU PASS/FAIL status from summary.json.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
var entries []SummaryEntry
if err := json.Unmarshal(content, &entries); err != nil {
return nil
}
statuses := make(map[string]string)
for _, entry := range entries {
component := strings.TrimSpace(entry.ComponentID)
if component == "" || !gpuComponentIDRegex.MatchString(component) {
continue
}
current := statuses[component]
next := "PASS"
if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
next = "FAIL"
}
statuses[component] = mergeGPUStatus(current, next)
}
return statuses
}
// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
reader := csv.NewReader(strings.NewReader(string(content)))
records, err := reader.ReadAll()
if err != nil {
return nil
}
statuses := make(map[string]string)
for i, record := range records {
if i == 0 || len(record) < 7 {
continue
}
component := strings.TrimSpace(record[5])
if component == "" || !gpuComponentIDRegex.MatchString(component) {
continue
}
errorCode := strings.TrimSpace(record[0])
notes := strings.TrimSpace(record[6])
current := statuses[component]
next := "PASS"
if !isSummaryCSVRecordPassing(errorCode, notes) {
next = "FAIL"
}
statuses[component] = mergeGPUStatus(current, next)
}
return statuses
}
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
_ = errorCode
return strings.TrimSpace(notes) == "OK"
}
func isSummaryCSVRecordPassing(errorCode, notes string) bool {
_ = errorCode
return strings.TrimSpace(notes) == "OK"
}
func mergeGPUStatus(current, next string) string {
// FAIL has highest priority.
if current == "FAIL" || next == "FAIL" {
return "FAIL"
}
if current == "PASS" || next == "PASS" {
return "PASS"
}
return ""
}
// ApplyGPUStatuses applies aggregated PASS/FAIL statuses from summary components to parsed GPUs.
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) {
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentStatuses) == 0 {
return
}
slotStatus := make(map[string]string) // key: GPUSXM<idx>
serialStatus := make(map[string]string) // key: GPU serial
for componentID, status := range componentStatuses {
matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID))
if len(matches) != 3 {
continue
}
slotKey := "GPUSXM" + matches[1]
serialKey := strings.TrimSpace(matches[2])
slotStatus[slotKey] = mergeGPUStatus(slotStatus[slotKey], status)
if serialKey != "" {
serialStatus[serialKey] = mergeGPUStatus(serialStatus[serialKey], status)
}
}
for i := range result.Hardware.GPUs {
gpu := &result.Hardware.GPUs[i]
next := ""
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
next = serialStatus[serial]
}
if next == "" {
next = slotStatus[strings.TrimSpace(gpu.Slot)]
}
if next != "" {
gpu.Status = next
}
}
}
// formatSummaryDescription creates a human-readable description from summary entry
func formatSummaryDescription(entry SummaryEntry) string {
component := entry.ComponentID

View File

@@ -0,0 +1,46 @@
package nvidia
import (
"strings"
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
)
func TestApplyGPUStatuses_FromSummaryCSV_FailAndPass(t *testing.T) {
csvData := strings.Join([]string{
"ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError",
"0,gpumem,gpumem,,GPU,SXM1_SN_111,OK,1,,,False",
"363,gpumem,gpumem,,GPU,SXM5_SN_1653925025497,Row remapping failed,1,,,False",
"0,gpu_fieldiag,gpu_fieldiag,,GPU,SXM1_SN_111,OK,1,,,False",
"0,gpu_fieldiag,gpu_fieldiag,,GPU,SXM2_SN_222,OK,1,,,False",
}, "\n")
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{Slot: "GPUSXM1", SerialNumber: "111"},
{Slot: "GPUSXM2", SerialNumber: "222"},
{Slot: "GPUSXM5", SerialNumber: "1653925025497"},
},
},
}
statuses := CollectGPUStatusesFromSummaryCSV([]byte(csvData))
ApplyGPUStatuses(result, statuses)
bySerial := map[string]string{}
for _, gpu := range result.Hardware.GPUs {
bySerial[gpu.SerialNumber] = gpu.Status
}
if bySerial["1653925025497"] != "FAIL" {
t.Fatalf("expected serial 1653925025497 status FAIL, got %q", bySerial["1653925025497"])
}
if bySerial["111"] != "PASS" {
t.Fatalf("expected serial 111 status PASS, got %q", bySerial["111"])
}
if bySerial["222"] != "PASS" {
t.Fatalf("expected serial 222 status PASS, got %q", bySerial["222"])
}
}

View File

@@ -3,6 +3,7 @@ package nvidia
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
@@ -53,6 +54,8 @@ type Property struct {
Value interface{} `json:"value"` // Can be string or number
}
var nvswitchComponentIDRegex = regexp.MustCompile(`^(NVSWITCH\d+|NVSWITCHNVSWITCH\d+)$`)
// GetValueAsString returns the value as a string
func (p *Property) GetValueAsString() string {
switch v := p.Value.(type) {
@@ -107,7 +110,7 @@ func parseInventoryComponents(components []Component, result *models.AnalysisRes
}
// Parse NVSwitch components
if strings.HasPrefix(comp.ComponentID, "NVSWITCHNVSWITCH") {
if isNVSwitchComponentID(comp.ComponentID) {
nvswitch := parseNVSwitchComponent(comp)
if nvswitch != nil {
// Add as PCIe device for now
@@ -217,7 +220,7 @@ func parseGPUComponent(comp Component) *models.GPU {
// parseNVSwitchComponent parses NVSwitch component information
func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
device := &models.PCIeDevice{
Slot: comp.ComponentID, // e.g., "NVSWITCHNVSWITCH0"
Slot: normalizeNVSwitchSlot(comp.ComponentID),
}
var vendorIDStr, deviceIDStr, vbios, pciID string
@@ -279,3 +282,15 @@ func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
return device
}
func normalizeNVSwitchSlot(componentID string) string {
slot := strings.TrimSpace(componentID)
if strings.HasPrefix(slot, "NVSWITCHNVSWITCH") {
return strings.Replace(slot, "NVSWITCHNVSWITCH", "NVSWITCH", 1)
}
return slot
}
func isNVSwitchComponentID(componentID string) bool {
return nvswitchComponentIDRegex.MatchString(strings.TrimSpace(componentID))
}

View File

@@ -0,0 +1,46 @@
package nvidia
import (
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
)
func TestParseInventoryComponents_IgnoresNVSwitchPropertyChecks(t *testing.T) {
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{},
}
components := []Component{
{
ComponentID: "NVSWITCHNVSWITCH1",
Properties: []Property{
{ID: "VendorID", Value: "10de"},
{ID: "DeviceID", Value: "22a3"},
{ID: "PCIID", Value: "0000:06:00.0"},
},
},
{
ComponentID: "NVSWITCHNum",
Properties: []Property{
{ID: "NVSWITCHNum", Value: 4},
},
},
{
ComponentID: "NVSWITCH_NVSWITCH1_VendorID",
Properties: []Property{
{ID: "NVSWITCH_NVSWITCH1_VendorID", Value: "10de"},
},
},
}
parseInventoryComponents(components, result)
if got := len(result.Hardware.PCIeDevices); got != 1 {
t.Fatalf("expected exactly 1 parsed NVSwitch device, got %d", got)
}
if result.Hardware.PCIeDevices[0].Slot != "NVSWITCH1" {
t.Fatalf("expected slot NVSWITCH1, got %q", result.Hardware.PCIeDevices[0].Slot)
}
}

View File

@@ -0,0 +1,35 @@
package nvidia
import "testing"
func TestParseNVSwitchComponent_NormalizesDuplicatedPrefixInSlot(t *testing.T) {
comp := Component{
ComponentID: "NVSWITCHNVSWITCH1",
Properties: []Property{
{ID: "VendorID", Value: "10de"},
{ID: "DeviceID", Value: "22a3"},
{ID: "Vendor", Value: "NVIDIA Corporation"},
{ID: "PCIID", Value: "0000:06:00.0"},
{ID: "PCISpeed", Value: "16GT/s"},
{ID: "PCIWidth", Value: "x2"},
{ID: "VBIOS_version", Value: "96.10.6D.00.01"},
},
}
device := parseNVSwitchComponent(comp)
if device == nil {
t.Fatal("expected non-nil NVSwitch device")
}
if device.Slot != "NVSWITCH1" {
t.Fatalf("expected normalized slot NVSWITCH1, got %q", device.Slot)
}
if device.BDF != "0000:06:00.0" {
t.Fatalf("expected BDF 0000:06:00.0, got %q", device.BDF)
}
if device.DeviceClass != "NVSwitch" {
t.Fatalf("expected device class NVSwitch, got %q", device.DeviceClass)
}
}