Fix NVIDIA GPU/NVSwitch parsing and Reanimator export statuses

This commit is contained in:
2026-02-15 21:00:30 +03:00
parent 0af3cee9b6
commit c7b2a7ab29
12 changed files with 695 additions and 92 deletions

View File

@@ -27,8 +27,6 @@ func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, erro
// Determine target host (optional field) // Determine target host (optional field)
targetHost := inferTargetHost(result.TargetHost, result.Filename) targetHost := inferTargetHost(result.TargetHost, result.Filename)
boardSerial := result.Hardware.BoardInfo.SerialNumber
export := &ReanimatorExport{ export := &ReanimatorExport{
Filename: result.Filename, Filename: result.Filename,
SourceType: normalizeSourceType(result.SourceType), SourceType: normalizeSourceType(result.SourceType),
@@ -41,7 +39,7 @@ func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, erro
CPUs: convertCPUs(result.Hardware.CPUs), CPUs: convertCPUs(result.Hardware.CPUs),
Memory: convertMemory(result.Hardware.Memory), Memory: convertMemory(result.Hardware.Memory),
Storage: convertStorage(result.Hardware.Storage), Storage: convertStorage(result.Hardware.Storage),
PCIeDevices: convertPCIeDevices(result.Hardware, boardSerial), PCIeDevices: convertPCIeDevices(result.Hardware),
PowerSupplies: convertPowerSupplies(result.Hardware.PowerSupply), PowerSupplies: convertPowerSupplies(result.Hardware.PowerSupply),
}, },
} }
@@ -174,16 +172,12 @@ func convertStorage(storage []models.Storage) []ReanimatorStorage {
} }
// convertPCIeDevices converts PCIe devices, GPUs, and network adapters to Reanimator format // convertPCIeDevices converts PCIe devices, GPUs, and network adapters to Reanimator format
func convertPCIeDevices(hw *models.HardwareConfig, boardSerial string) []ReanimatorPCIe { func convertPCIeDevices(hw *models.HardwareConfig) []ReanimatorPCIe {
result := make([]ReanimatorPCIe, 0) result := make([]ReanimatorPCIe, 0)
// Convert regular PCIe devices // Convert regular PCIe devices
for _, pcie := range hw.PCIeDevices { for _, pcie := range hw.PCIeDevices {
serialNumber := pcie.SerialNumber serialNumber := normalizedSerial(pcie.SerialNumber)
if serialNumber == "" || serialNumber == "N/A" {
// Generate serial number
serialNumber = generatePCIeSerialNumber(boardSerial, pcie.Slot, pcie.BDF)
}
// Determine model (prefer PartNumber, fallback to DeviceClass) // Determine model (prefer PartNumber, fallback to DeviceClass)
model := pcie.PartNumber model := pcie.PartNumber
@@ -211,11 +205,7 @@ func convertPCIeDevices(hw *models.HardwareConfig, boardSerial string) []Reanima
// Convert GPUs as PCIe devices // Convert GPUs as PCIe devices
for _, gpu := range hw.GPUs { for _, gpu := range hw.GPUs {
serialNumber := gpu.SerialNumber serialNumber := normalizedSerial(gpu.SerialNumber)
if serialNumber == "" {
// Generate serial number
serialNumber = generatePCIeSerialNumber(boardSerial, gpu.Slot, gpu.BDF)
}
// Determine device class // Determine device class
deviceClass := "DisplayController" deviceClass := "DisplayController"
@@ -244,11 +234,7 @@ func convertPCIeDevices(hw *models.HardwareConfig, boardSerial string) []Reanima
continue continue
} }
serialNumber := nic.SerialNumber serialNumber := normalizedSerial(nic.SerialNumber)
if serialNumber == "" {
// Generate serial number
serialNumber = generatePCIeSerialNumber(boardSerial, nic.Slot, "")
}
result = append(result, ReanimatorPCIe{ result = append(result, ReanimatorPCIe{
Slot: nic.Slot, Slot: nic.Slot,
@@ -339,17 +325,17 @@ func inferCPUManufacturer(model string) string {
return "" return ""
} }
// generatePCIeSerialNumber generates a serial number for PCIe device func normalizedSerial(serial string) string {
func generatePCIeSerialNumber(boardSerial, slot, bdf string) string { s := strings.TrimSpace(serial)
if slot != "" { if s == "" {
return fmt.Sprintf("%s-PCIE-%s", boardSerial, slot) return ""
} }
if bdf != "" { switch strings.ToUpper(s) {
// Use BDF as identifier (e.g., "0000:18:00.0" -> "0000-18-00-0") case "N/A", "NA", "NONE", "NULL", "UNKNOWN", "-":
safeBDF := strings.ReplaceAll(strings.ReplaceAll(bdf, ":", "-"), ".", "-") return ""
return fmt.Sprintf("%s-PCIE-%s", boardSerial, safeBDF) default:
return s
} }
return fmt.Sprintf("%s-PCIE-UNKNOWN", boardSerial)
} }
// inferStorageStatus determines storage device status // inferStorageStatus determines storage device status
@@ -392,10 +378,14 @@ func normalizeStatus(status string, allowEmpty bool) string {
switch strings.ToLower(strings.TrimSpace(status)) { switch strings.ToLower(strings.TrimSpace(status)) {
case "ok": case "ok":
return "OK" return "OK"
case "pass":
return "OK"
case "warning": case "warning":
return "Warning" return "Warning"
case "critical": case "critical":
return "Critical" return "Critical"
case "fail":
return "Critical"
case "unknown": case "unknown":
return "Unknown" return "Unknown"
case "empty": case "empty":

View File

@@ -111,42 +111,39 @@ func TestInferCPUManufacturer(t *testing.T) {
} }
} }
func TestGeneratePCIeSerialNumber(t *testing.T) { func TestNormalizedSerial(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
boardSerial string in string
slot string want string
bdf string
want string
}{ }{
{ {
name: "with slot", name: "empty",
boardSerial: "TEST123", in: "",
slot: "PCIeCard1", want: "",
bdf: "0000:18:00.0",
want: "TEST123-PCIE-PCIeCard1",
}, },
{ {
name: "without slot, with bdf", name: "n_a",
boardSerial: "TEST123", in: "N/A",
slot: "", want: "",
bdf: "0000:18:00.0",
want: "TEST123-PCIE-0000-18-00-0",
}, },
{ {
name: "without slot and bdf", name: "unknown",
boardSerial: "TEST123", in: "unknown",
slot: "", want: "",
bdf: "", },
want: "TEST123-PCIE-UNKNOWN", {
name: "normal",
in: "SN123",
want: "SN123",
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
got := generatePCIeSerialNumber(tt.boardSerial, tt.slot, tt.bdf) got := normalizedSerial(tt.in)
if got != tt.want { if got != tt.want {
t.Errorf("generatePCIeSerialNumber() = %q, want %q", got, tt.want) t.Errorf("normalizedSerial() = %q, want %q", got, tt.want)
} }
}) })
} }
@@ -184,6 +181,15 @@ func TestInferStorageStatus(t *testing.T) {
} }
} }
// TestNormalizeStatus_PassFail checks that the upper-case diagnostic verdicts
// "PASS" and "FAIL" are normalized to "OK" and "Critical" respectively.
func TestNormalizeStatus_PassFail(t *testing.T) {
	passResult := normalizeStatus("PASS", false)
	if passResult != "OK" {
		t.Fatalf("expected PASS -> OK, got %q", passResult)
	}

	failResult := normalizeStatus("FAIL", false)
	if failResult != "Critical" {
		t.Fatalf("expected FAIL -> Critical, got %q", failResult)
	}
}
func TestConvertCPUs(t *testing.T) { func TestConvertCPUs(t *testing.T) {
cpus := []models.CPU{ cpus := []models.CPU{
{ {
@@ -323,17 +329,16 @@ func TestConvertPCIeDevices(t *testing.T) {
}, },
} }
boardSerial := "TEST123" result := convertPCIeDevices(hw)
result := convertPCIeDevices(hw, boardSerial)
// Should have: 2 PCIe devices + 1 GPU + 1 NIC = 4 total // Should have: 2 PCIe devices + 1 GPU + 1 NIC = 4 total
if len(result) != 4 { if len(result) != 4 {
t.Fatalf("expected 4 PCIe devices total, got %d", len(result)) t.Fatalf("expected 4 PCIe devices total, got %d", len(result))
} }
// Check that serial was generated for second PCIe device // Check that serial is empty for second PCIe device (no auto-generation)
if result[1].SerialNumber != "TEST123-PCIE-PCIeCard2" { if result[1].SerialNumber != "" {
t.Errorf("expected generated serial TEST123-PCIE-PCIeCard2, got %q", result[1].SerialNumber) t.Errorf("expected empty serial for missing device serial, got %q", result[1].SerialNumber)
} }
// Check GPU was included // Check GPU was included
@@ -352,6 +357,29 @@ func TestConvertPCIeDevices(t *testing.T) {
} }
} }
// TestConvertPCIeDevices_NVSwitchWithoutSerialRemainsEmpty checks that a PCIe
// device without a serial number is converted with an empty serial rather than
// a synthesized one.
func TestConvertPCIeDevices_NVSwitchWithoutSerialRemainsEmpty(t *testing.T) {
	// SerialNumber is deliberately left unset; it should remain empty.
	nvswitch := models.PCIeDevice{
		Slot:        "NVSWITCH1",
		DeviceClass: "NVSwitch",
		BDF:         "0000:06:00.0",
	}
	hw := &models.HardwareConfig{
		PCIeDevices: []models.PCIeDevice{nvswitch},
	}

	converted := convertPCIeDevices(hw)

	if count := len(converted); count != 1 {
		t.Fatalf("expected 1 PCIe device, got %d", count)
	}
	if serial := converted[0].SerialNumber; serial != "" {
		t.Fatalf("expected empty NVSwitch serial, got %q", serial)
	}
}
func TestConvertPowerSupplies(t *testing.T) { func TestConvertPowerSupplies(t *testing.T) {
psus := []models.PSU{ psus := []models.PSU{
{ {

View File

@@ -0,0 +1,178 @@
package nvidia
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// Patterns used to map GPUs to SKUs and SKUs to model names.
var (
// gpuNameWithSerialRegex matches GPU names of the form "SXM<index>_SN_<serial>",
// capturing the numeric index (group 1) and the serial text (group 2).
gpuNameWithSerialRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
// gpuNameSlotOnlyRegex matches GPU names of the form "SXM<index>" with no serial,
// capturing the numeric index (group 1).
gpuNameSlotOnlyRegex = regexp.MustCompile(`^SXM(\d+)$`)
// skuModelRegex finds "sku_hgx-<family>-<count>-gpu" anywhere in a (lower-cased)
// file name and captures the lower-case alphanumeric family token (group 1).
skuModelRegex = regexp.MustCompile(`sku_hgx-([a-z0-9]+)-\d+-gpu`)
// skuCodeRegex matches a leading "G" plus three digits, a '.' or '-' separator,
// and four digits; anything after those four digits is ignored.
skuCodeRegex = regexp.MustCompile(`^(G\d{3})[.-](\d{4})`)
)
// testSpecData is the subset of testspec.json decoded by this file: a list of
// actions, each of which may carry a SKU -> SKU JSON file name map in its args.
type testSpecData struct {
Actions []struct {
// VirtualID is decoded from "virtual_id"; it is not read by the code in this view.
VirtualID string `json:"virtual_id"`
Args struct {
// SKUToFile maps a SKU code to the name of its SKU description file.
SKUToFile map[string]string `json:"sku_to_sku_json_file_map"`
} `json:"args"`
} `json:"actions"`
}
// inventoryFieldDiagSummary is the subset of inventory/fieldiag_summary.json
// decoded by this file: for every run, the header entries that pair a GPU name
// with its board info string.
type inventoryFieldDiagSummary struct {
ModsRuns []struct {
ModsHeader []struct {
// GPUName is matched against the "SXM<index>[_SN_<serial>]" patterns.
GPUName string `json:"GpuName"`
// BoardInfo is treated as the SKU code of the GPU.
BoardInfo string `json:"BoardInfo"`
} `json:"ModsHeader"`
} `json:"ModsRuns"`
}
// ApplyGPUModelsFromSKU overwrites the Model of every parsed GPU for which a
// model name can be derived from its SKU.
//
// Two inputs are combined:
//   - inventory/fieldiag_summary.json: GPUName -> BoardInfo (SKU)
//   - testspec.json: SKU -> sku_hgx-... file name
//
// A GPU is matched by its trimmed serial number first and by its trimmed slot
// name second. GPUs with no SKU, or whose SKU does not resolve to a model, are
// left untouched. The function is a no-op when result has no GPUs or when
// either mapping is empty.
func ApplyGPUModelsFromSKU(files []parser.ExtractedFile, result *models.AnalysisResult) {
	if result == nil || result.Hardware == nil {
		return
	}
	gpus := result.Hardware.GPUs
	if len(gpus) == 0 {
		return
	}

	skuToFile := parseSKUToFileMap(files)
	if len(skuToFile) == 0 {
		return
	}

	skuBySerial, skuBySlot := parseGPUSKUMapping(files)
	if len(skuBySerial) == 0 && len(skuBySlot) == 0 {
		return
	}

	// gpus shares its backing array with result.Hardware.GPUs, so writes
	// through the index update the result in place.
	for idx := range gpus {
		var sku string
		if sn := strings.TrimSpace(gpus[idx].SerialNumber); sn != "" {
			sku = skuBySerial[sn]
		}
		if sku == "" {
			sku = skuBySlot[strings.TrimSpace(gpus[idx].Slot)]
		}
		if sku == "" {
			continue
		}

		if model := resolveModelFromSKU(sku, skuToFile); model != "" {
			gpus[idx].Model = model
		}
	}
}
// parseSKUToFileMap reads testspec.json and returns a map from normalized SKU
// code to the trimmed SKU file name, merged across all actions. It returns nil
// when the file is absent or is not valid JSON. SKUs that normalize to the
// empty string are dropped.
func parseSKUToFileMap(files []parser.ExtractedFile) map[string]string {
	specFile := parser.FindFileByName(files, "testspec.json")
	if specFile == nil {
		return nil
	}

	var spec testSpecData
	if json.Unmarshal(specFile.Content, &spec) != nil {
		return nil
	}

	mapping := map[string]string{}
	for i := range spec.Actions {
		for rawSKU, fileName := range spec.Actions[i].Args.SKUToFile {
			if key := normalizeSKUCode(rawSKU); key != "" {
				mapping[key] = strings.TrimSpace(fileName)
			}
		}
	}
	return mapping
}
// parseGPUSKUMapping reads inventory/fieldiag_summary.json and returns two
// lookup tables keyed by GPU identity, both mapping to a normalized SKU code:
//
//   - the first is keyed by GPU serial number (from "SXM<n>_SN_<serial>" names);
//   - the second is keyed by slot name in the form "GPUSXM<n>".
//
// The summary file is located by a case-insensitive substring match on the
// path, accepting both '/' and '\' separators. Both results are nil when the
// file is missing or is not valid JSON. Header entries whose BoardInfo
// normalizes to the empty string are ignored.
func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[string]string) {
	var summaryFile *parser.ExtractedFile
	// Index the slice instead of ranging by value: this points at the real
	// element rather than at a per-iteration copy of it.
	for i := range files {
		path := strings.ToLower(files[i].Path)
		if strings.Contains(path, "inventory/fieldiag_summary.json") ||
			strings.Contains(path, "inventory\\fieldiag_summary.json") {
			summaryFile = &files[i]
			break
		}
	}
	if summaryFile == nil {
		return nil, nil
	}

	var summary inventoryFieldDiagSummary
	if err := json.Unmarshal(summaryFile.Content, &summary); err != nil {
		return nil, nil
	}

	serialToSKU := make(map[string]string)
	slotToSKU := make(map[string]string)

	for _, run := range summary.ModsRuns {
		for _, h := range run.ModsHeader {
			sku := normalizeSKUCode(h.BoardInfo)
			if sku == "" {
				continue
			}

			gpuName := strings.TrimSpace(h.GPUName)
			if matches := gpuNameWithSerialRegex.FindStringSubmatch(gpuName); len(matches) == 3 {
				slotToSKU["GPUSXM"+matches[1]] = sku
				// A serial made only of whitespace trims to "", which no
				// lookup ever uses; do not store it.
				if serial := strings.TrimSpace(matches[2]); serial != "" {
					serialToSKU[serial] = sku
				}
				continue
			}
			if matches := gpuNameSlotOnlyRegex.FindStringSubmatch(gpuName); len(matches) == 2 {
				slotToSKU["GPUSXM"+matches[1]] = sku
			}
		}
	}

	return serialToSKU, slotToSKU
}
// resolveModelFromSKU turns a SKU code into a display model name of the form
// "NVIDIA <FAMILY> SXM". The SKU is normalized and looked up in skuToFile; the
// family token is taken from the "sku_hgx-<family>-<n>-gpu" part of the file
// name and upper-cased. An empty string is returned when the SKU is unknown or
// the file name does not match that pattern.
func resolveModelFromSKU(sku string, skuToFile map[string]string) string {
	fileName := strings.TrimSpace(skuToFile[normalizeSKUCode(sku)])
	if fileName == "" {
		return ""
	}

	// The pattern only contains lower-case letters, so match case-insensitively
	// by lower-casing the file name first.
	groups := skuModelRegex.FindStringSubmatch(strings.ToLower(fileName))
	if len(groups) != 2 {
		return ""
	}

	if family := strings.ToUpper(strings.TrimSpace(groups[1])); family != "" {
		return fmt.Sprintf("NVIDIA %s SXM", family)
	}
	return ""
}
// normalizeSKUCode canonicalizes a SKU string. The input is upper-cased and
// trimmed; if it starts with "G###" followed by '.' or '-' and four digits,
// only that prefix is kept, rewritten as "G###-####". Any other non-empty
// input is returned upper-cased and trimmed. Blank input yields "".
func normalizeSKUCode(v string) string {
	code := strings.TrimSpace(strings.ToUpper(v))
	if code == "" {
		return ""
	}

	parts := skuCodeRegex.FindStringSubmatch(code)
	if len(parts) != 3 {
		return code
	}
	return parts[1] + "-" + parts[2]
}

View File

@@ -0,0 +1,56 @@
package nvidia
import (
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// TestApplyGPUModelsFromSKU feeds a minimal fieldiag summary and testspec and
// expects the GPU's placeholder model to be replaced by "NVIDIA H200 SXM".
func TestApplyGPUModelsFromSKU(t *testing.T) {
	// Maps GPU "SXM5_SN_1653925025497" to SKU "G520-0280".
	summaryJSON := []byte(`{
"ModsRuns":[
{"ModsHeader":[
{"GpuName":"SXM5_SN_1653925025497","BoardInfo":"G520-0280"}
]}
]
}`)
	// Maps SKU "G520-0280" to an hgx-h200 SKU file name.
	testSpecJSON := []byte(`{
"actions":[
{
"virtual_id":"inventory",
"args":{
"sku_to_sku_json_file_map":{
"G520-0280":"sku_hgx-h200-8-gpu_141g_aircooled_field.json"
}
}
}
]
}`)

	files := []parser.ExtractedFile{
		{Path: "inventory/fieldiag_summary.json", Content: summaryJSON},
		{Path: "testspec.json", Content: testSpecJSON},
	}

	gpu := models.GPU{
		Slot:         "GPUSXM5",
		SerialNumber: "1653925025497",
		Model:        "NVIDIA Device 2335",
	}
	result := &models.AnalysisResult{
		Hardware: &models.HardwareConfig{
			GPUs: []models.GPU{gpu},
		},
	}

	ApplyGPUModelsFromSKU(files, result)

	got := result.Hardware.GPUs[0].Model
	if got != "NVIDIA H200 SXM" {
		t.Fatalf("expected model NVIDIA H200 SXM, got %q", got)
	}
}

View File

@@ -14,7 +14,7 @@ import (
// parserVersion - version of this parser module // parserVersion - version of this parser module
// IMPORTANT: Increment this version when making changes to parser logic! // IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.2.0" const parserVersion = "1.2.4"
func init() { func init() {
parser.Register(&Parser{}) parser.Register(&Parser{})
@@ -105,6 +105,7 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
result.Hardware = &models.HardwareConfig{ result.Hardware = &models.HardwareConfig{
GPUs: make([]models.GPU, 0), GPUs: make([]models.GPU, 0),
} }
gpuStatuses := make(map[string]string)
// Parse output.log first (contains dmidecode system info) // Parse output.log first (contains dmidecode system info)
// Find the output.log file that contains dmidecode output // Find the output.log file that contains dmidecode output
@@ -133,18 +134,30 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
} }
} }
// Enhance GPU model names using SKU mapping from testspec + inventory summary.
ApplyGPUModelsFromSKU(files, result)
// Parse summary.json (test results summary) // Parse summary.json (test results summary)
if f := parser.FindFileByName(files, "summary.json"); f != nil { if f := parser.FindFileByName(files, "summary.json"); f != nil {
events := ParseSummaryJSON(f.Content) events := ParseSummaryJSON(f.Content)
result.Events = append(result.Events, events...) result.Events = append(result.Events, events...)
for componentID, status := range CollectGPUStatusesFromSummaryJSON(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
} }
// Parse summary.csv (alternative format) // Parse summary.csv (alternative format)
if f := parser.FindFileByName(files, "summary.csv"); f != nil { if f := parser.FindFileByName(files, "summary.csv"); f != nil {
csvEvents := ParseSummaryCSV(f.Content) csvEvents := ParseSummaryCSV(f.Content)
result.Events = append(result.Events, csvEvents...) result.Events = append(result.Events, csvEvents...)
for componentID, status := range CollectGPUStatusesFromSummaryCSV(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
} }
// Apply per-GPU PASS/FAIL status derived from summary files.
ApplyGPUStatuses(result, gpuStatuses)
// Parse GPU field diagnostics logs // Parse GPU field diagnostics logs
gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log") gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")
for _, f := range gpuFieldiagFiles { for _, f := range gpuFieldiagFiles {

View File

@@ -104,6 +104,82 @@ func TestNVIDIAParser_RealArchive(t *testing.T) {
t.Logf("\nSXM2 failure events: %d", sxm2Events) t.Logf("\nSXM2 failure events: %d", sxm2Events)
} }
// TestNVIDIAParser_GPUStatusFromSummary_RealArchive07900 parses a real example
// archive (skipped when the archive is not present) and expects exactly one
// known GPU serial to be FAIL while every other GPU with a serial is PASS.
func TestNVIDIAParser_GPUStatusFromSummary_RealArchive07900(t *testing.T) {
	const failingSerial = "1653925025497"

	archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
	if _, statErr := os.Stat(archivePath); os.IsNotExist(statErr) {
		t.Skip("Test archive not found, skipping test")
	}

	files, err := parser.ExtractArchive(archivePath)
	if err != nil {
		t.Fatalf("Failed to extract archive: %v", err)
	}

	result, err := (&Parser{}).Parse(files)
	if err != nil {
		t.Fatalf("Failed to parse archive: %v", err)
	}
	if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
		t.Fatalf("expected GPUs in parsed result")
	}

	// Index statuses by serial; GPUs without a serial are not checked.
	statusBySerial := make(map[string]string, len(result.Hardware.GPUs))
	for i := range result.Hardware.GPUs {
		serial := result.Hardware.GPUs[i].SerialNumber
		if serial == "" {
			continue
		}
		statusBySerial[serial] = result.Hardware.GPUs[i].Status
	}

	if got := statusBySerial[failingSerial]; got != "FAIL" {
		t.Fatalf("expected GPU serial 1653925025497 status FAIL, got %q", got)
	}

	for serial, st := range statusBySerial {
		if serial != failingSerial && st != "PASS" {
			t.Fatalf("expected non-failing GPU serial %s status PASS, got %q", serial, st)
		}
	}
}
// TestNVIDIAParser_GPUModelFromSKU_RealArchive07900 parses a real example
// archive (skipped when the archive is not present) and expects at least one
// GPU to carry the SKU-derived model name "NVIDIA H200 SXM".
func TestNVIDIAParser_GPUModelFromSKU_RealArchive07900(t *testing.T) {
	archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
	if _, statErr := os.Stat(archivePath); os.IsNotExist(statErr) {
		t.Skip("Test archive not found, skipping test")
	}

	files, err := parser.ExtractArchive(archivePath)
	if err != nil {
		t.Fatalf("Failed to extract archive: %v", err)
	}

	result, err := (&Parser{}).Parse(files)
	if err != nil {
		t.Fatalf("Failed to parse archive: %v", err)
	}
	if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
		t.Fatalf("expected GPUs in parsed result")
	}

	// Succeed as soon as one matching GPU is seen.
	for i := range result.Hardware.GPUs {
		if result.Hardware.GPUs[i].Model == "NVIDIA H200 SXM" {
			return
		}
	}
	t.Fatalf("expected at least one GPU model NVIDIA H200 SXM")
}
func contains(s, substr string) bool { func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && return len(s) >= len(substr) && (s == substr || len(s) > len(substr) &&
(s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || (s[:len(substr)] == substr || s[len(s)-len(substr):] == substr ||
@@ -118,4 +194,3 @@ func findSubstring(s, substr string) bool {
} }
return false return false
} }

View File

@@ -4,6 +4,7 @@ import (
"encoding/csv" "encoding/csv"
"encoding/json" "encoding/json"
"fmt" "fmt"
"regexp"
"strings" "strings"
"time" "time"
@@ -20,6 +21,8 @@ type SummaryEntry struct {
IgnoreError string `json:"Ignore Error"` IgnoreError string `json:"Ignore Error"`
} }
var gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
// ParseSummaryJSON parses summary.json file and returns events // ParseSummaryJSON parses summary.json file and returns events
func ParseSummaryJSON(content []byte) []models.Event { func ParseSummaryJSON(content []byte) []models.Event {
var entries []SummaryEntry var entries []SummaryEntry
@@ -92,6 +95,124 @@ func ParseSummaryCSV(content []byte) []models.Event {
return events return events
} }
// CollectGPUStatusesFromSummaryJSON derives a PASS/FAIL verdict per GPU from
// the entries of summary.json.
//
// The returned map is keyed by the trimmed component ID exactly as it appears
// in the summary (e.g. "SXM5_SN_1653925025497"); entries whose component ID
// does not have that "SXM<n>_SN_<serial>" shape are ignored. Multiple entries
// for the same component are merged so that a single failing entry makes the
// component FAIL. nil is returned when content is not a valid JSON entry list.
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
	var entries []SummaryEntry
	if json.Unmarshal(content, &entries) != nil {
		return nil
	}

	verdicts := make(map[string]string)
	for i := range entries {
		id := strings.TrimSpace(entries[i].ComponentID)
		if id == "" || !gpuComponentIDRegex.MatchString(id) {
			continue
		}

		verdict := "FAIL"
		if isSummaryJSONRecordPassing(entries[i].ErrorCode, entries[i].Notes) {
			verdict = "PASS"
		}
		verdicts[id] = mergeGPUStatus(verdicts[id], verdict)
	}
	return verdicts
}
// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
//
// The first record is treated as the header and skipped. For every other
// record, column 0 is read as the error code, column 5 as the component ID and
// column 6 as the notes. Records with fewer than seven columns, or whose
// component ID is not of the "SXM<n>_SN_<serial>" form, are skipped. Several
// records for one component are merged so that any failing record makes the
// component FAIL.
//
// nil is returned only when the content is not parseable as CSV at all (for
// example, because of malformed quoting).
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Allow a variable number of fields per record. With the default setting
	// a single row whose column count differs from the header makes ReadAll
	// fail, which would discard the statuses of every GPU; short rows are
	// instead skipped one by one by the length check below.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	statuses := make(map[string]string)
	for i, record := range records {
		// i == 0 is the header row.
		if i == 0 || len(record) < 7 {
			continue
		}

		component := strings.TrimSpace(record[5])
		if component == "" || !gpuComponentIDRegex.MatchString(component) {
			continue
		}

		errorCode := strings.TrimSpace(record[0])
		notes := strings.TrimSpace(record[6])

		next := "PASS"
		if !isSummaryCSVRecordPassing(errorCode, notes) {
			next = "FAIL"
		}
		statuses[component] = mergeGPUStatus(statuses[component], next)
	}

	return statuses
}
// isSummaryJSONRecordPassing reports whether a summary.json record counts as
// passing. Only the notes are consulted: the record passes when they are
// exactly "OK" after trimming whitespace. errorCode is accepted but ignored.
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
	_ = errorCode // intentionally not part of the decision
	switch strings.TrimSpace(notes) {
	case "OK":
		return true
	default:
		return false
	}
}
// isSummaryCSVRecordPassing reports whether a summary.csv record counts as
// passing. Only the notes are consulted: the record passes when they are
// exactly "OK" after trimming whitespace. errorCode is accepted but ignored.
func isSummaryCSVRecordPassing(errorCode, notes string) bool {
	_ = errorCode // intentionally not part of the decision
	trimmedNotes := strings.TrimSpace(notes)
	return trimmedNotes == "OK"
}
// mergeGPUStatus combines two status values into one. "FAIL" on either side
// wins; otherwise "PASS" on either side wins; if neither value is "FAIL" or
// "PASS" the result is the empty string.
func mergeGPUStatus(current, next string) string {
	switch {
	case current == "FAIL", next == "FAIL":
		// FAIL has highest priority.
		return "FAIL"
	case current == "PASS", next == "PASS":
		return "PASS"
	default:
		return ""
	}
}
// ApplyGPUStatuses writes aggregated PASS/FAIL statuses onto the parsed GPUs.
//
// componentStatuses is keyed by summary component ID ("SXM<n>_SN_<serial>");
// keys of any other shape are ignored. Each status is indexed both by serial
// and by slot name "GPUSXM<n>". A GPU takes the status found under its trimmed
// serial number, falling back to the status found under its trimmed slot name.
// GPUs with no matching status keep their current Status. Nothing happens when
// result has no GPUs or componentStatuses is empty.
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) {
	if result == nil || result.Hardware == nil {
		return
	}
	gpus := result.Hardware.GPUs
	if len(gpus) == 0 || len(componentStatuses) == 0 {
		return
	}

	statusBySlot := map[string]string{}   // key: GPUSXM<idx>
	statusBySerial := map[string]string{} // key: GPU serial
	for id, st := range componentStatuses {
		parts := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(id))
		if len(parts) != 3 {
			continue
		}

		slot := "GPUSXM" + parts[1]
		statusBySlot[slot] = mergeGPUStatus(statusBySlot[slot], st)

		if sn := strings.TrimSpace(parts[2]); sn != "" {
			statusBySerial[sn] = mergeGPUStatus(statusBySerial[sn], st)
		}
	}

	// gpus shares its backing array with result.Hardware.GPUs, so writes
	// through the index update the result in place.
	for idx := range gpus {
		var status string
		if sn := strings.TrimSpace(gpus[idx].SerialNumber); sn != "" {
			status = statusBySerial[sn]
		}
		if status == "" {
			status = statusBySlot[strings.TrimSpace(gpus[idx].Slot)]
		}
		if status == "" {
			continue
		}
		gpus[idx].Status = status
	}
}
// formatSummaryDescription creates a human-readable description from summary entry // formatSummaryDescription creates a human-readable description from summary entry
func formatSummaryDescription(entry SummaryEntry) string { func formatSummaryDescription(entry SummaryEntry) string {
component := entry.ComponentID component := entry.ComponentID

View File

@@ -0,0 +1,46 @@
package nvidia
import (
"strings"
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
)
// TestApplyGPUStatuses_FromSummaryCSV_FailAndPass runs a small summary.csv
// through status collection and application: the GPU with a non-OK note must
// end up FAIL, the GPUs with only OK notes must end up PASS.
func TestApplyGPUStatuses_FromSummaryCSV_FailAndPass(t *testing.T) {
	rows := []string{
		"ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError",
		"0,gpumem,gpumem,,GPU,SXM1_SN_111,OK,1,,,False",
		"363,gpumem,gpumem,,GPU,SXM5_SN_1653925025497,Row remapping failed,1,,,False",
		"0,gpu_fieldiag,gpu_fieldiag,,GPU,SXM1_SN_111,OK,1,,,False",
		"0,gpu_fieldiag,gpu_fieldiag,,GPU,SXM2_SN_222,OK,1,,,False",
	}
	csvData := strings.Join(rows, "\n")

	gpus := []models.GPU{
		{Slot: "GPUSXM1", SerialNumber: "111"},
		{Slot: "GPUSXM2", SerialNumber: "222"},
		{Slot: "GPUSXM5", SerialNumber: "1653925025497"},
	}
	result := &models.AnalysisResult{
		Hardware: &models.HardwareConfig{GPUs: gpus},
	}

	ApplyGPUStatuses(result, CollectGPUStatusesFromSummaryCSV([]byte(csvData)))

	bySerial := make(map[string]string)
	for i := range result.Hardware.GPUs {
		bySerial[result.Hardware.GPUs[i].SerialNumber] = result.Hardware.GPUs[i].Status
	}

	if got := bySerial["1653925025497"]; got != "FAIL" {
		t.Fatalf("expected serial 1653925025497 status FAIL, got %q", got)
	}
	if got := bySerial["111"]; got != "PASS" {
		t.Fatalf("expected serial 111 status PASS, got %q", got)
	}
	if got := bySerial["222"]; got != "PASS" {
		t.Fatalf("expected serial 222 status PASS, got %q", got)
	}
}

View File

@@ -3,6 +3,7 @@ package nvidia
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"regexp"
"strings" "strings"
"git.mchus.pro/mchus/logpile/internal/models" "git.mchus.pro/mchus/logpile/internal/models"
@@ -53,6 +54,8 @@ type Property struct {
Value interface{} `json:"value"` // Can be string or number Value interface{} `json:"value"` // Can be string or number
} }
var nvswitchComponentIDRegex = regexp.MustCompile(`^(NVSWITCH\d+|NVSWITCHNVSWITCH\d+)$`)
// GetValueAsString returns the value as a string // GetValueAsString returns the value as a string
func (p *Property) GetValueAsString() string { func (p *Property) GetValueAsString() string {
switch v := p.Value.(type) { switch v := p.Value.(type) {
@@ -107,7 +110,7 @@ func parseInventoryComponents(components []Component, result *models.AnalysisRes
} }
// Parse NVSwitch components // Parse NVSwitch components
if strings.HasPrefix(comp.ComponentID, "NVSWITCHNVSWITCH") { if isNVSwitchComponentID(comp.ComponentID) {
nvswitch := parseNVSwitchComponent(comp) nvswitch := parseNVSwitchComponent(comp)
if nvswitch != nil { if nvswitch != nil {
// Add as PCIe device for now // Add as PCIe device for now
@@ -217,7 +220,7 @@ func parseGPUComponent(comp Component) *models.GPU {
// parseNVSwitchComponent parses NVSwitch component information // parseNVSwitchComponent parses NVSwitch component information
func parseNVSwitchComponent(comp Component) *models.PCIeDevice { func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
device := &models.PCIeDevice{ device := &models.PCIeDevice{
Slot: comp.ComponentID, // e.g., "NVSWITCHNVSWITCH0" Slot: normalizeNVSwitchSlot(comp.ComponentID),
} }
var vendorIDStr, deviceIDStr, vbios, pciID string var vendorIDStr, deviceIDStr, vbios, pciID string
@@ -279,3 +282,15 @@ func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
return device return device
} }
// normalizeNVSwitchSlot returns the trimmed component ID with a leading
// duplicated prefix "NVSWITCHNVSWITCH" collapsed to a single "NVSWITCH".
// IDs that do not start with the duplicated prefix are returned trimmed but
// otherwise unchanged.
func normalizeNVSwitchSlot(componentID string) string {
	const duplicatedPrefix = "NVSWITCHNVSWITCH"

	trimmed := strings.TrimSpace(componentID)
	rest := strings.TrimPrefix(trimmed, duplicatedPrefix)
	if len(rest) == len(trimmed) {
		// Nothing was removed, so the duplicated prefix is not present.
		return trimmed
	}
	return "NVSWITCH" + rest
}
// isNVSwitchComponentID reports whether the trimmed component ID matches
// nvswitchComponentIDRegex in full.
func isNVSwitchComponentID(componentID string) bool {
	id := strings.TrimSpace(componentID)
	return nvswitchComponentIDRegex.MatchString(id)
}

View File

@@ -0,0 +1,46 @@
package nvidia
import (
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
)
func TestParseInventoryComponents_IgnoresNVSwitchPropertyChecks(t *testing.T) {
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{},
}
components := []Component{
{
ComponentID: "NVSWITCHNVSWITCH1",
Properties: []Property{
{ID: "VendorID", Value: "10de"},
{ID: "DeviceID", Value: "22a3"},
{ID: "PCIID", Value: "0000:06:00.0"},
},
},
{
ComponentID: "NVSWITCHNum",
Properties: []Property{
{ID: "NVSWITCHNum", Value: 4},
},
},
{
ComponentID: "NVSWITCH_NVSWITCH1_VendorID",
Properties: []Property{
{ID: "NVSWITCH_NVSWITCH1_VendorID", Value: "10de"},
},
},
}
parseInventoryComponents(components, result)
if got := len(result.Hardware.PCIeDevices); got != 1 {
t.Fatalf("expected exactly 1 parsed NVSwitch device, got %d", got)
}
if result.Hardware.PCIeDevices[0].Slot != "NVSWITCH1" {
t.Fatalf("expected slot NVSWITCH1, got %q", result.Hardware.PCIeDevices[0].Slot)
}
}

View File

@@ -0,0 +1,35 @@
package nvidia
import "testing"
func TestParseNVSwitchComponent_NormalizesDuplicatedPrefixInSlot(t *testing.T) {
comp := Component{
ComponentID: "NVSWITCHNVSWITCH1",
Properties: []Property{
{ID: "VendorID", Value: "10de"},
{ID: "DeviceID", Value: "22a3"},
{ID: "Vendor", Value: "NVIDIA Corporation"},
{ID: "PCIID", Value: "0000:06:00.0"},
{ID: "PCISpeed", Value: "16GT/s"},
{ID: "PCIWidth", Value: "x2"},
{ID: "VBIOS_version", Value: "96.10.6D.00.01"},
},
}
device := parseNVSwitchComponent(comp)
if device == nil {
t.Fatal("expected non-nil NVSwitch device")
}
if device.Slot != "NVSWITCH1" {
t.Fatalf("expected normalized slot NVSWITCH1, got %q", device.Slot)
}
if device.BDF != "0000:06:00.0" {
t.Fatalf("expected BDF 0000:06:00.0, got %q", device.BDF)
}
if device.DeviceClass != "NVSwitch" {
t.Fatalf("expected device class NVSwitch, got %q", device.DeviceClass)
}
}

View File

@@ -312,7 +312,7 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// From FRU // From FRU
for _, fru := range result.FRU { for _, fru := range result.FRU {
if fru.SerialNumber == "" { if !hasUsableSerial(fru.SerialNumber) {
continue continue
} }
name := fru.ProductName name := fru.ProductName
@@ -320,8 +320,8 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
name = fru.Description name = fru.Description
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: name, Component: name,
SerialNumber: fru.SerialNumber, SerialNumber: strings.TrimSpace(fru.SerialNumber),
Manufacturer: fru.Manufacturer, Manufacturer: fru.Manufacturer,
PartNumber: fru.PartNumber, PartNumber: fru.PartNumber,
Category: "FRU", Category: "FRU",
@@ -331,10 +331,10 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// From Hardware // From Hardware
if result.Hardware != nil { if result.Hardware != nil {
// Board // Board
if result.Hardware.BoardInfo.SerialNumber != "" { if hasUsableSerial(result.Hardware.BoardInfo.SerialNumber) {
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: result.Hardware.BoardInfo.ProductName, Component: result.Hardware.BoardInfo.ProductName,
SerialNumber: result.Hardware.BoardInfo.SerialNumber, SerialNumber: strings.TrimSpace(result.Hardware.BoardInfo.SerialNumber),
Manufacturer: result.Hardware.BoardInfo.Manufacturer, Manufacturer: result.Hardware.BoardInfo.Manufacturer,
PartNumber: result.Hardware.BoardInfo.PartNumber, PartNumber: result.Hardware.BoardInfo.PartNumber,
Category: "Board", Category: "Board",
@@ -343,24 +343,20 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// CPUs // CPUs
for _, cpu := range result.Hardware.CPUs { for _, cpu := range result.Hardware.CPUs {
sn := cpu.SerialNumber if !hasUsableSerial(cpu.SerialNumber) {
if sn == "" {
sn = cpu.PPIN // Use PPIN as fallback identifier
}
if sn == "" {
continue continue
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: cpu.Model, Component: cpu.Model,
Location: fmt.Sprintf("CPU%d", cpu.Socket), Location: fmt.Sprintf("CPU%d", cpu.Socket),
SerialNumber: sn, SerialNumber: strings.TrimSpace(cpu.SerialNumber),
Category: "CPU", Category: "CPU",
}) })
} }
// Memory DIMMs // Memory DIMMs
for _, mem := range result.Hardware.Memory { for _, mem := range result.Hardware.Memory {
if mem.SerialNumber == "" { if !hasUsableSerial(mem.SerialNumber) {
continue continue
} }
location := mem.Location location := mem.Location
@@ -370,7 +366,7 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: mem.PartNumber, Component: mem.PartNumber,
Location: location, Location: location,
SerialNumber: mem.SerialNumber, SerialNumber: strings.TrimSpace(mem.SerialNumber),
Manufacturer: mem.Manufacturer, Manufacturer: mem.Manufacturer,
PartNumber: mem.PartNumber, PartNumber: mem.PartNumber,
Category: "Memory", Category: "Memory",
@@ -379,13 +375,13 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// Storage // Storage
for _, stor := range result.Hardware.Storage { for _, stor := range result.Hardware.Storage {
if stor.SerialNumber == "" { if !hasUsableSerial(stor.SerialNumber) {
continue continue
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: stor.Model, Component: stor.Model,
Location: stor.Slot, Location: stor.Slot,
SerialNumber: stor.SerialNumber, SerialNumber: strings.TrimSpace(stor.SerialNumber),
Manufacturer: stor.Manufacturer, Manufacturer: stor.Manufacturer,
Category: "Storage", Category: "Storage",
}) })
@@ -393,7 +389,7 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// GPUs // GPUs
for _, gpu := range result.Hardware.GPUs { for _, gpu := range result.Hardware.GPUs {
if gpu.SerialNumber == "" { if !hasUsableSerial(gpu.SerialNumber) {
continue continue
} }
model := gpu.Model model := gpu.Model
@@ -403,7 +399,7 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: model, Component: model,
Location: gpu.Slot, Location: gpu.Slot,
SerialNumber: gpu.SerialNumber, SerialNumber: strings.TrimSpace(gpu.SerialNumber),
Manufacturer: gpu.Manufacturer, Manufacturer: gpu.Manufacturer,
Category: "GPU", Category: "GPU",
}) })
@@ -411,13 +407,13 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// PCIe devices // PCIe devices
for _, pcie := range result.Hardware.PCIeDevices { for _, pcie := range result.Hardware.PCIeDevices {
if pcie.SerialNumber == "" { if !hasUsableSerial(pcie.SerialNumber) {
continue continue
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: pcie.DeviceClass, Component: pcie.DeviceClass,
Location: pcie.Slot, Location: pcie.Slot,
SerialNumber: pcie.SerialNumber, SerialNumber: strings.TrimSpace(pcie.SerialNumber),
Manufacturer: pcie.Manufacturer, Manufacturer: pcie.Manufacturer,
PartNumber: pcie.PartNumber, PartNumber: pcie.PartNumber,
Category: "PCIe", Category: "PCIe",
@@ -426,43 +422,47 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
// Network cards // Network cards
for _, nic := range result.Hardware.NetworkCards { for _, nic := range result.Hardware.NetworkCards {
if nic.SerialNumber == "" { if !hasUsableSerial(nic.SerialNumber) {
continue continue
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: nic.Model, Component: nic.Model,
SerialNumber: nic.SerialNumber, SerialNumber: strings.TrimSpace(nic.SerialNumber),
Category: "Network", Category: "Network",
}) })
} }
// Power supplies // Power supplies
for _, psu := range result.Hardware.PowerSupply { for _, psu := range result.Hardware.PowerSupply {
if psu.SerialNumber == "" { if !hasUsableSerial(psu.SerialNumber) {
continue continue
} }
serials = append(serials, SerialEntry{ serials = append(serials, SerialEntry{
Component: psu.Model, Component: psu.Model,
Location: psu.Slot, Location: psu.Slot,
SerialNumber: psu.SerialNumber, SerialNumber: strings.TrimSpace(psu.SerialNumber),
Manufacturer: psu.Vendor, Manufacturer: psu.Vendor,
Category: "PSU", Category: "PSU",
}) })
} }
// Firmware (using version as "serial number" for display)
for _, fw := range result.Hardware.Firmware {
serials = append(serials, SerialEntry{
Component: fw.DeviceName,
SerialNumber: fw.Version,
Category: "Firmware",
})
}
} }
jsonResponse(w, serials) jsonResponse(w, serials)
} }
// hasUsableSerial reports whether serial holds a real serial number. After
// trimming whitespace, the empty string and the case-insensitive placeholders
// "N/A", "NA", "NONE", "NULL", "UNKNOWN" and "-" are rejected; everything
// else is accepted.
func hasUsableSerial(serial string) bool {
	trimmed := strings.TrimSpace(serial)
	switch strings.ToUpper(trimmed) {
	case "", "N/A", "NA", "NONE", "NULL", "UNKNOWN", "-":
		return false
	}
	return true
}
func (s *Server) handleGetFirmware(w http.ResponseWriter, r *http.Request) { func (s *Server) handleGetFirmware(w http.ResponseWriter, r *http.Request) {
result := s.GetResult() result := s.GetResult()
if result == nil || result.Hardware == nil { if result == nil || result.Hardware == nil {