227 lines
7.2 KiB
Go
227 lines
7.2 KiB
Go
// Package nvidia provides a parser for NVIDIA Field Diagnostics archives.
// Tested with: HGX Field Diag (works with various server vendors).
//
// IMPORTANT: Increment parserVersion when modifying parser logic!
// This helps track which version was used to parse specific logs.
package nvidia
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
"git.mchus.pro/mchus/logpile/internal/parser"
|
|
)
|
|
|
|
// parserVersion is the version of this parser module.
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.3.0"
|
|
|
|
func init() {
|
|
parser.Register(&Parser{})
|
|
}
|
|
|
|
// Parser is the VendorParser implementation for NVIDIA Field Diagnostics
// archives. It has no fields; all behavior lives in its methods.
type Parser struct{}
|
|
|
|
// Name returns human-readable parser name
|
|
func (p *Parser) Name() string {
|
|
return "NVIDIA Field Diagnostics Parser"
|
|
}
|
|
|
|
// Vendor returns vendor identifier
|
|
func (p *Parser) Vendor() string {
|
|
return "nvidia"
|
|
}
|
|
|
|
// Version returns parser version
|
|
// IMPORTANT: Update parserVersion constant when modifying parser logic!
|
|
func (p *Parser) Version() string {
|
|
return parserVersion
|
|
}
|
|
|
|
// Detect checks if archive matches NVIDIA Field Diagnostics format
|
|
// Returns confidence 0-100
|
|
func (p *Parser) Detect(files []parser.ExtractedFile) int {
|
|
confidence := 0
|
|
|
|
for _, f := range files {
|
|
path := strings.ToLower(f.Path)
|
|
|
|
// Strong indicators for NVIDIA Field Diagnostics format
|
|
if strings.HasSuffix(path, "unified_summary.json") {
|
|
// Check if it's really NVIDIA Field Diag format
|
|
if containsNvidiaFieldDiagMarkers(f.Content) {
|
|
confidence += 40
|
|
}
|
|
}
|
|
|
|
if strings.HasSuffix(path, "summary.json") && !strings.Contains(path, "unified_") {
|
|
confidence += 20
|
|
}
|
|
|
|
if strings.HasSuffix(path, "summary.csv") {
|
|
confidence += 15
|
|
}
|
|
|
|
if strings.Contains(path, "gpu_fieldiag/") {
|
|
confidence += 15
|
|
}
|
|
|
|
if strings.HasSuffix(path, "output.log") {
|
|
// Check if it contains dmidecode output
|
|
if strings.Contains(string(f.Content), "dmidecode") ||
|
|
strings.Contains(string(f.Content), "System Information") {
|
|
confidence += 10
|
|
}
|
|
}
|
|
|
|
// Cap at 100
|
|
if confidence >= 100 {
|
|
return 100
|
|
}
|
|
}
|
|
|
|
return confidence
|
|
}
|
|
|
|
// containsNvidiaFieldDiagMarkers reports whether content contains every
// substring expected in an NVIDIA Field Diagnostics summary: "runInfo",
// "diagVersion" and "HGX Field Diag". Matching is case-sensitive.
func containsNvidiaFieldDiagMarkers(content []byte) bool {
	text := string(content)
	for _, marker := range []string{"runInfo", "diagVersion", "HGX Field Diag"} {
		if !strings.Contains(text, marker) {
			return false
		}
	}
	return true
}
|
|
|
|
// Parse parses NVIDIA Field Diagnostics archive
|
|
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
|
|
result := &models.AnalysisResult{
|
|
Events: make([]models.Event, 0),
|
|
FRU: make([]models.FRUInfo, 0),
|
|
Sensors: make([]models.SensorReading, 0),
|
|
}
|
|
|
|
// Initialize hardware config
|
|
result.Hardware = &models.HardwareConfig{
|
|
GPUs: make([]models.GPU, 0),
|
|
}
|
|
gpuStatuses := make(map[string]string)
|
|
gpuFailureDetails := make(map[string]string)
|
|
nvswitchStatuses := make(map[string]string)
|
|
|
|
// Parse output.log first (contains dmidecode system info)
|
|
// Find the output.log file that contains dmidecode output
|
|
outputLogFile := findDmidecodeOutputLog(files)
|
|
if outputLogFile != nil {
|
|
if err := ParseOutputLog(outputLogFile.Content, result); err != nil {
|
|
// Log error but continue parsing other files
|
|
_ = err // Ignore error for now
|
|
}
|
|
}
|
|
|
|
// Parse unified_summary.json (contains detailed component info)
|
|
if f := parser.FindFileByName(files, "unified_summary.json"); f != nil {
|
|
if err := ParseUnifiedSummary(f.Content, result); err != nil {
|
|
// Log error but continue parsing other files
|
|
_ = err // Ignore error for now
|
|
}
|
|
}
|
|
|
|
// Parse inventory/output.log (contains GPU serial numbers from lspci)
|
|
inventoryLogFile := findInventoryOutputLog(files)
|
|
if inventoryLogFile != nil {
|
|
if err := ParseInventoryLog(inventoryLogFile.Content, result); err != nil {
|
|
// Log error but continue parsing other files
|
|
_ = err // Ignore error for now
|
|
}
|
|
}
|
|
|
|
// Parse inventory/inventory.log to enrich PCI BDF mapping for components.
|
|
inventoryInfoLog := findInventoryInfoLog(files)
|
|
if inventoryInfoLog != nil {
|
|
if err := ApplyInventoryPCIIDs(inventoryInfoLog.Content, result); err != nil {
|
|
_ = err
|
|
}
|
|
}
|
|
|
|
// Enhance GPU model names using SKU mapping from testspec + inventory summary.
|
|
ApplyGPUModelsFromSKU(files, result)
|
|
|
|
// Parse inventory/nvflash_verbose.log and apply firmware versions by BDF + IDs.
|
|
// This runs after GPU model/part-number enrichment so firmware tab uses final model labels.
|
|
nvflashVerbose := findNVFlashVerboseLog(files)
|
|
if nvflashVerbose != nil {
|
|
if err := ParseNVFlashVerboseLog(nvflashVerbose.Content, result); err != nil {
|
|
_ = err
|
|
}
|
|
}
|
|
|
|
// Parse summary.json (test results summary)
|
|
if f := parser.FindFileByName(files, "summary.json"); f != nil {
|
|
events := ParseSummaryJSON(f.Content)
|
|
result.Events = append(result.Events, events...)
|
|
for componentID, status := range CollectGPUStatusesFromSummaryJSON(f.Content) {
|
|
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
|
|
}
|
|
for slot, status := range CollectNVSwitchStatusesFromSummaryJSON(f.Content) {
|
|
nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
|
|
}
|
|
for componentID, detail := range CollectGPUFailureDetailsFromSummaryJSON(f.Content) {
|
|
if _, exists := gpuFailureDetails[componentID]; !exists && strings.TrimSpace(detail) != "" {
|
|
gpuFailureDetails[componentID] = strings.TrimSpace(detail)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse summary.csv (alternative format)
|
|
if f := parser.FindFileByName(files, "summary.csv"); f != nil {
|
|
csvEvents := ParseSummaryCSV(f.Content)
|
|
result.Events = append(result.Events, csvEvents...)
|
|
for componentID, status := range CollectGPUStatusesFromSummaryCSV(f.Content) {
|
|
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
|
|
}
|
|
for slot, status := range CollectNVSwitchStatusesFromSummaryCSV(f.Content) {
|
|
nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
|
|
}
|
|
for componentID, detail := range CollectGPUFailureDetailsFromSummaryCSV(f.Content) {
|
|
if _, exists := gpuFailureDetails[componentID]; !exists && strings.TrimSpace(detail) != "" {
|
|
gpuFailureDetails[componentID] = strings.TrimSpace(detail)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply per-GPU PASS/FAIL status derived from summary files.
|
|
ApplyGPUStatuses(result, gpuStatuses)
|
|
ApplyGPUFailureDetails(result, gpuFailureDetails)
|
|
ApplyNVSwitchStatuses(result, nvswitchStatuses)
|
|
ApplyGPUAndNVSwitchCheckTimes(result, CollectGPUAndNVSwitchCheckTimes(files))
|
|
|
|
// Parse GPU field diagnostics logs
|
|
gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")
|
|
for _, f := range gpuFieldiagFiles {
|
|
// Parse individual GPU diagnostic logs if needed
|
|
// For now, we focus on summary files
|
|
_ = f
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// findDmidecodeOutputLog finds the output.log file that contains dmidecode output
|
|
func findDmidecodeOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
|
|
for _, f := range files {
|
|
// Look for output.log files
|
|
if !strings.HasSuffix(strings.ToLower(f.Path), "output.log") {
|
|
continue
|
|
}
|
|
|
|
// Check if it contains dmidecode output
|
|
content := string(f.Content)
|
|
if strings.Contains(content, "dmidecode") &&
|
|
strings.Contains(content, "System Information") {
|
|
return &f
|
|
}
|
|
}
|
|
return nil
|
|
}
|