New parsers: - NVIDIA Field Diagnostics parser with dmidecode output support - NVIDIA Bug Report parser with comprehensive hardware extraction - Supermicro crashdump (CDump.txt) parser - Generic fallback parser for unrecognized text files Enhanced GPU parsing (nvidia-bug-report): - Model and manufacturer detection (NVIDIA H100 80GB HBM3) - UUID, Video BIOS version, IRQ information - Bus location (BDF), DMA size/mask, device minor - PCIe bus type details New hardware detection (nvidia-bug-report): - System Information: server S/N, UUID, manufacturer, product name - CPU: model, S/N, cores, threads, frequencies from dmidecode - Memory: P/N, S/N, manufacturer, speed for all DIMMs - Power Supplies: manufacturer, model, S/N, wattage, status - Network Adapters: Ethernet/InfiniBand controllers with VPD data - Model, P/N, S/N from lspci Vital Product Data - Port count/type detection (QSFP56, OSFP, etc.) - Support for ConnectX-6/7 adapters Archive handling improvements: - Plain .gz file support (not just tar.gz) - Increased size limit for plain gzip files (50MB) - Better error handling for mixed archive formats Web interface enhancements: - Display parser name and filename badges - Improved file info section with visual indicators Co-Authored-By: Claude (qwen3-coder:480b) <noreply@anthropic.com>
262 lines
6.6 KiB
Go
262 lines
6.6 KiB
Go
package supermicro
|
|
|
|
import (
	"encoding/json"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"git.mchus.pro/mchus/logpile/internal/models"
)
|
|
|
|
// CrashDumpData represents the structure of CDump.txt
|
|
type CrashDumpData struct {
|
|
CrashData struct {
|
|
METADATA Metadata `json:"METADATA"`
|
|
PROCESSORS ProcessorsData `json:"PROCESSORS"`
|
|
} `json:"crash_data"`
|
|
}
|
|
|
|
// ProcessorsData contains processor crash data
|
|
type ProcessorsData struct {
|
|
Version string `json:"_version"`
|
|
CPU0 Processors `json:"cpu0"`
|
|
CPU1 Processors `json:"cpu1"`
|
|
}
|
|
|
|
// Metadata contains crashdump metadata
|
|
type Metadata struct {
|
|
CPU0 CPUMetadata `json:"cpu0"`
|
|
CPU1 CPUMetadata `json:"cpu1"`
|
|
BMCFWVer string `json:"bmc_fw_ver"`
|
|
BIOSId string `json:"bios_id"`
|
|
MEFWVer string `json:"me_fw_ver"`
|
|
Timestamp string `json:"timestamp"`
|
|
TriggerType string `json:"trigger_type"`
|
|
PlatformName string `json:"platform_name"`
|
|
CrashdumpVer string `json:"crashdump_ver"`
|
|
ResetDetected string `json:"_reset_detected"`
|
|
}
|
|
|
|
// CPUMetadata holds the per-socket fields of the METADATA section. All
// values are stored exactly as they appear in the JSON (strings).
type CPUMetadata struct {
	// CPUID being empty means the socket is treated as absent by parseCPUInfo.
	CPUID    string `json:"cpuid"`
	CoreMask string `json:"core_mask"`
	CHACount string `json:"cha_count"`
	// CoreCount is interpreted as a hexadecimal number (optional "0x" prefix).
	CoreCount string `json:"core_count"`
	// PPIN is ignored by parseCPUInfo when empty or equal to "0x0".
	PPIN string `json:"ppin"`
	// UcodePatchVer is surfaced as a "CPU<n> Microcode" firmware entry.
	UcodePatchVer string `json:"ucode_patch_ver"`
}
|
|
|
|
// Processors contains processor crash data
|
|
type Processors struct {
|
|
MCA MCAData `json:"MCA"`
|
|
}
|
|
|
|
// MCAData holds the Machine Check Architecture section for one CPU.
type MCAData struct {
	// Uncore is keyed by bank name. Values are left untyped because
	// parseMCAErrors only inspects entries that decode to a JSON object and
	// skips everything else.
	Uncore map[string]interface{} `json:"uncore"`
}
|
|
|
|
// ParseCrashDump parses CDump.txt file
|
|
func ParseCrashDump(content []byte, result *models.AnalysisResult) error {
|
|
var data CrashDumpData
|
|
if err := json.Unmarshal(content, &data); err != nil {
|
|
return fmt.Errorf("failed to parse CDump.txt: %w", err)
|
|
}
|
|
|
|
// Initialize Hardware.Firmware slice if nil
|
|
if result.Hardware.Firmware == nil {
|
|
result.Hardware.Firmware = make([]models.FirmwareInfo, 0)
|
|
}
|
|
|
|
// Parse metadata
|
|
parseMetadata(&data.CrashData.METADATA, result)
|
|
|
|
// Parse CPU information
|
|
parseCPUInfo(&data.CrashData.METADATA, result)
|
|
|
|
// Parse MCA errors
|
|
parseMCAErrors(&data.CrashData, result)
|
|
|
|
return nil
|
|
}
|
|
|
|
// parseMetadata extracts metadata information
|
|
func parseMetadata(metadata *Metadata, result *models.AnalysisResult) {
|
|
// Store firmware versions in HardwareConfig.Firmware
|
|
if metadata.BMCFWVer != "" {
|
|
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
|
DeviceName: "BMC",
|
|
Version: metadata.BMCFWVer,
|
|
})
|
|
}
|
|
|
|
if metadata.BIOSId != "" {
|
|
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
|
DeviceName: "BIOS",
|
|
Version: metadata.BIOSId,
|
|
})
|
|
}
|
|
|
|
if metadata.MEFWVer != "" {
|
|
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
|
DeviceName: "ME",
|
|
Version: metadata.MEFWVer,
|
|
})
|
|
}
|
|
|
|
// Create event for crashdump trigger
|
|
timestamp := time.Now()
|
|
if metadata.Timestamp != "" {
|
|
if t, err := time.Parse(time.RFC3339, metadata.Timestamp); err == nil {
|
|
timestamp = t
|
|
}
|
|
}
|
|
|
|
triggerType := metadata.TriggerType
|
|
if triggerType == "" {
|
|
triggerType = "Unknown"
|
|
}
|
|
|
|
severity := models.SeverityInfo
|
|
if metadata.ResetDetected != "" && metadata.ResetDetected != "NONE" {
|
|
severity = models.SeverityWarning
|
|
}
|
|
|
|
result.Events = append(result.Events, models.Event{
|
|
Timestamp: timestamp,
|
|
Source: "Crashdump",
|
|
EventType: "System Crashdump",
|
|
Description: fmt.Sprintf("Crashdump collected (%s)", triggerType),
|
|
Severity: severity,
|
|
RawData: fmt.Sprintf("Version: %s, Reset: %s", metadata.CrashdumpVer, metadata.ResetDetected),
|
|
})
|
|
}
|
|
|
|
// parseCPUInfo extracts CPU information
|
|
func parseCPUInfo(metadata *Metadata, result *models.AnalysisResult) {
|
|
cpus := []struct {
|
|
socket int
|
|
data CPUMetadata
|
|
}{
|
|
{0, metadata.CPU0},
|
|
{1, metadata.CPU1},
|
|
}
|
|
|
|
for _, cpu := range cpus {
|
|
if cpu.data.CPUID == "" {
|
|
continue
|
|
}
|
|
|
|
// Parse core count
|
|
coreCount := 0
|
|
if cpu.data.CoreCount != "" {
|
|
if count, err := strconv.ParseInt(strings.TrimPrefix(cpu.data.CoreCount, "0x"), 16, 64); err == nil {
|
|
coreCount = int(count)
|
|
}
|
|
}
|
|
|
|
cpuModel := models.CPU{
|
|
Socket: cpu.socket,
|
|
Model: fmt.Sprintf("Intel CPU (CPUID: %s)", cpu.data.CPUID),
|
|
Cores: coreCount,
|
|
}
|
|
|
|
// Add PPIN
|
|
if cpu.data.PPIN != "" && cpu.data.PPIN != "0x0" {
|
|
cpuModel.PPIN = cpu.data.PPIN
|
|
}
|
|
|
|
result.Hardware.CPUs = append(result.Hardware.CPUs, cpuModel)
|
|
|
|
// Add microcode version to firmware list
|
|
if cpu.data.UcodePatchVer != "" {
|
|
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
|
DeviceName: fmt.Sprintf("CPU%d Microcode", cpu.socket),
|
|
Version: cpu.data.UcodePatchVer,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// parseMCAErrors extracts Machine Check Architecture errors
|
|
func parseMCAErrors(crashData *struct {
|
|
METADATA Metadata `json:"METADATA"`
|
|
PROCESSORS ProcessorsData `json:"PROCESSORS"`
|
|
}, result *models.AnalysisResult) {
|
|
timestamp := time.Now()
|
|
if crashData.METADATA.Timestamp != "" {
|
|
if t, err := time.Parse(time.RFC3339, crashData.METADATA.Timestamp); err == nil {
|
|
timestamp = t
|
|
}
|
|
}
|
|
|
|
// Parse each CPU's MCA data
|
|
cpuProcs := []struct {
|
|
name string
|
|
data Processors
|
|
}{
|
|
{"cpu0", crashData.PROCESSORS.CPU0},
|
|
{"cpu1", crashData.PROCESSORS.CPU1},
|
|
}
|
|
|
|
for _, cpu := range cpuProcs {
|
|
if cpu.data.MCA.Uncore == nil {
|
|
continue
|
|
}
|
|
|
|
// Check each MCA bank for errors
|
|
for bankName, bankDataRaw := range cpu.data.MCA.Uncore {
|
|
bankData, ok := bankDataRaw.(map[string]interface{})
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
// Look for status register
|
|
statusKey := strings.ToLower(bankName) + "_status"
|
|
statusRaw, ok := bankData[statusKey]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
statusStr, ok := statusRaw.(string)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
// Parse status value
|
|
status, err := strconv.ParseUint(strings.TrimPrefix(statusStr, "0x"), 16, 64)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// Check if MCA error is valid (bit 63 = Valid)
|
|
if status&(1<<63) != 0 {
|
|
// MCA error detected
|
|
severity := models.SeverityWarning
|
|
if status&(1<<61) != 0 { // UC bit = uncorrected error
|
|
severity = models.SeverityCritical
|
|
}
|
|
|
|
description := fmt.Sprintf("MCA Error in %s bank %s", cpu.name, bankName)
|
|
if status&(1<<61) != 0 {
|
|
description += " (Uncorrected)"
|
|
} else {
|
|
description += " (Corrected)"
|
|
}
|
|
|
|
result.Events = append(result.Events, models.Event{
|
|
Timestamp: timestamp,
|
|
Source: "MCA",
|
|
EventType: "Machine Check",
|
|
Description: description,
|
|
Severity: severity,
|
|
RawData: fmt.Sprintf("Status: %s, CPU: %s, Bank: %s", statusStr, cpu.name, bankName),
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|