Files
logpile/internal/parser/vendors/nvidia/parser.go

227 lines
7.2 KiB
Go

// Package nvidia provides a parser for NVIDIA Field Diagnostics archives.
// Tested with: HGX Field Diag (works with various server vendors).
//
// IMPORTANT: Increment parserVersion when modifying parser logic!
// This helps track which version was used to parse specific logs.
package nvidia
import (
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// parserVersion is the version of this parser module, as returned by
// Parser.Version.
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.3.0"
// init registers a Parser instance with the parser package when this package
// is loaded.
func init() {
	parser.Register(new(Parser))
}
// Parser implements VendorParser for NVIDIA Field Diagnostics archives.
// It has no fields; an instance is registered with the parser package in init.
type Parser struct{}
// Name reports the human-readable name of this parser.
func (p *Parser) Name() string {
	const displayName = "NVIDIA Field Diagnostics Parser"
	return displayName
}
// Vendor reports the vendor identifier handled by this parser.
func (p *Parser) Vendor() string {
	const vendorID = "nvidia"
	return vendorID
}
// Version reports the parser version, i.e. the parserVersion constant.
// IMPORTANT: Update parserVersion constant when modifying parser logic!
func (p *Parser) Version() string {
	const current string = parserVersion
	return current
}
// Detect scores how likely the archive is an NVIDIA Field Diagnostics bundle.
// It returns a confidence between 0 and 100.
//
// Each file is matched against several indicators on its lower-cased path
// (and, for some indicators, its content); every match adds a fixed weight to
// the running score. As soon as the score reaches 100 the scan stops and 100
// is returned.
//
// NOTE(review): weights are added once per matching file, not once per
// indicator, so several files matching the same indicator can saturate the
// score on their own — confirm this is intended.
func (p *Parser) Detect(files []parser.ExtractedFile) int {
	const (
		weightUnifiedSummary = 40
		weightSummaryJSON    = 20
		weightSummaryCSV     = 15
		weightFieldiagDir    = 15
		weightDmidecodeLog   = 10
		maxConfidence        = 100
	)

	score := 0
	for i := range files {
		file := &files[i]
		lowerPath := strings.ToLower(file.Path)

		// Strongest indicator: a unified summary that really carries the
		// Field Diag markers, not just the file name.
		if strings.HasSuffix(lowerPath, "unified_summary.json") &&
			containsNvidiaFieldDiagMarkers(file.Content) {
			score += weightUnifiedSummary
		}

		// Plain summary.json; anything with "unified_" in its path is excluded
		// so the unified summary is not counted here as well.
		if !strings.Contains(lowerPath, "unified_") && strings.HasSuffix(lowerPath, "summary.json") {
			score += weightSummaryJSON
		}

		if strings.HasSuffix(lowerPath, "summary.csv") {
			score += weightSummaryCSV
		}

		if strings.Contains(lowerPath, "gpu_fieldiag/") {
			score += weightFieldiagDir
		}

		// An output.log counts only when it looks like dmidecode output.
		if strings.HasSuffix(lowerPath, "output.log") {
			text := string(file.Content)
			if strings.Contains(text, "dmidecode") || strings.Contains(text, "System Information") {
				score += weightDmidecodeLog
			}
		}

		// Cap at 100.
		if score >= maxConfidence {
			return maxConfidence
		}
	}
	return score
}
// containsNvidiaFieldDiagMarkers reports whether content contains every one of
// the substrings "runInfo", "diagVersion" and "HGX Field Diag". The match is
// case-sensitive.
func containsNvidiaFieldDiagMarkers(content []byte) bool {
	text := string(content)
	for _, marker := range []string{"runInfo", "diagVersion", "HGX Field Diag"} {
		if !strings.Contains(text, marker) {
			return false
		}
	}
	return true
}
// Parse parses an NVIDIA Field Diagnostics archive and returns the collected
// analysis result.
//
// Parsing is best-effort: every input file is optional, and an error returned
// by one file's parser does not stop the remaining files from being
// processed. The returned error is always nil.
//
// The steps run in a fixed order because later steps enrich data produced by
// earlier ones (for example, firmware versions are applied only after GPU
// model names have been resolved).
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
	result := &models.AnalysisResult{
		Events:  make([]models.Event, 0),
		FRU:     make([]models.FRUInfo, 0),
		Sensors: make([]models.SensorReading, 0),
	}
	// Initialize hardware config.
	result.Hardware = &models.HardwareConfig{
		GPUs: make([]models.GPU, 0),
	}

	gpuStatuses := make(map[string]string)
	gpuFailureDetails := make(map[string]string)
	nvswitchStatuses := make(map[string]string)

	// recordFailureDetail keeps the first non-blank failure detail seen for a
	// component; a later source never overwrites an earlier one.
	recordFailureDetail := func(componentID, detail string) {
		detail = strings.TrimSpace(detail)
		if detail == "" {
			return
		}
		if _, exists := gpuFailureDetails[componentID]; !exists {
			gpuFailureDetails[componentID] = detail
		}
	}

	// Errors from the individual file parsers below are deliberately
	// discarded: a malformed file must not prevent the other files from
	// contributing to the result.

	// Parse output.log first (the one that contains dmidecode system info).
	if f := findDmidecodeOutputLog(files); f != nil {
		_ = ParseOutputLog(f.Content, result)
	}

	// Parse unified_summary.json (contains detailed component info).
	if f := parser.FindFileByName(files, "unified_summary.json"); f != nil {
		_ = ParseUnifiedSummary(f.Content, result)
	}

	// Parse inventory/output.log (contains GPU serial numbers from lspci).
	if f := findInventoryOutputLog(files); f != nil {
		_ = ParseInventoryLog(f.Content, result)
	}

	// Parse inventory/inventory.log to enrich PCI BDF mapping for components.
	if f := findInventoryInfoLog(files); f != nil {
		_ = ApplyInventoryPCIIDs(f.Content, result)
	}

	// Enhance GPU model names using SKU mapping from testspec + inventory summary.
	ApplyGPUModelsFromSKU(files, result)

	// Parse inventory/nvflash_verbose.log and apply firmware versions by BDF + IDs.
	// This runs after GPU model/part-number enrichment so the firmware tab
	// uses the final model labels.
	if f := findNVFlashVerboseLog(files); f != nil {
		_ = ParseNVFlashVerboseLog(f.Content, result)
	}

	// Parse summary.json (test results summary).
	if f := parser.FindFileByName(files, "summary.json"); f != nil {
		result.Events = append(result.Events, ParseSummaryJSON(f.Content)...)
		for componentID, status := range CollectGPUStatusesFromSummaryJSON(f.Content) {
			gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
		}
		for slot, status := range CollectNVSwitchStatusesFromSummaryJSON(f.Content) {
			nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
		}
		for componentID, detail := range CollectGPUFailureDetailsFromSummaryJSON(f.Content) {
			recordFailureDetail(componentID, detail)
		}
	}

	// Parse summary.csv (alternative format). It is merged after summary.json,
	// so failure details already taken from the JSON summary are kept.
	if f := parser.FindFileByName(files, "summary.csv"); f != nil {
		result.Events = append(result.Events, ParseSummaryCSV(f.Content)...)
		for componentID, status := range CollectGPUStatusesFromSummaryCSV(f.Content) {
			gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
		}
		for slot, status := range CollectNVSwitchStatusesFromSummaryCSV(f.Content) {
			nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
		}
		for componentID, detail := range CollectGPUFailureDetailsFromSummaryCSV(f.Content) {
			recordFailureDetail(componentID, detail)
		}
	}

	// Apply per-component status, failure details and check times derived
	// from the summary files.
	ApplyGPUStatuses(result, gpuStatuses)
	ApplyGPUFailureDetails(result, gpuFailureDetails)
	ApplyNVSwitchStatuses(result, nvswitchStatuses)
	ApplyGPUAndNVSwitchCheckTimes(result, CollectGPUAndNVSwitchCheckTimes(files))

	// TODO: individual GPU diagnostic logs under gpu_fieldiag/ are not parsed
	// yet; for now the summary files are the only source of test results.

	return result, nil
}
// findDmidecodeOutputLog returns the first file whose lower-cased path ends in
// "output.log" and whose content contains both "dmidecode" and
// "System Information". It returns nil when no such file exists.
//
// The returned pointer refers to the element inside files rather than to a
// copy of it, so no per-iteration copy of the file is made.
func findDmidecodeOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
	for i := range files {
		// Index into the slice instead of taking the address of a range
		// variable, which would alias (or heap-copy) the loop variable.
		f := &files[i]

		// Look for output.log files only.
		if !strings.HasSuffix(strings.ToLower(f.Path), "output.log") {
			continue
		}

		// Require both markers so that unrelated output.log files are skipped.
		content := string(f.Content)
		if strings.Contains(content, "dmidecode") && strings.Contains(content, "System Information") {
			return f
		}
	}
	return nil
}