v1.3.0: Add multiple vendor parsers and enhanced hardware detection

New parsers:
- NVIDIA Field Diagnostics parser with dmidecode output support
- NVIDIA Bug Report parser with comprehensive hardware extraction
- Supermicro crashdump (CDump.txt) parser
- Generic fallback parser for unrecognized text files

Enhanced GPU parsing (nvidia-bug-report):
- Model and manufacturer detection (NVIDIA H100 80GB HBM3)
- UUID, Video BIOS version, IRQ information
- Bus location (BDF), DMA size/mask, device minor
- PCIe bus type details

New hardware detection (nvidia-bug-report):
- System Information: server S/N, UUID, manufacturer, product name
- CPU: model, S/N, cores, threads, frequencies from dmidecode
- Memory: P/N, S/N, manufacturer, speed for all DIMMs
- Power Supplies: manufacturer, model, S/N, wattage, status
- Network Adapters: Ethernet/InfiniBand controllers with VPD data
  - Model, P/N, S/N from lspci Vital Product Data
  - Port count/type detection (QSFP56, OSFP, etc.)
  - Support for ConnectX-6/7 adapters

Archive handling improvements:
- Plain .gz file support (not just tar.gz)
- Increased size limit for plain gzip files (50MB)
- Better error handling for mixed archive formats

Web interface enhancements:
- Display parser name and filename badges
- Improved file info section with visual indicators

Co-Authored-By: Claude (qwen3-coder:480b) <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-01-30 17:19:47 +03:00
parent 21f4e5a67e
commit 70cd541d9e
24 changed files with 2930 additions and 12 deletions

View File

@@ -3,6 +3,7 @@ package parser
import (
"archive/tar"
"archive/zip"
"bytes"
"compress/gzip"
"fmt"
"io"
@@ -24,6 +25,8 @@ func ExtractArchive(archivePath string) ([]ExtractedFile, error) {
switch ext {
case ".gz", ".tgz":
return extractTarGz(archivePath)
case ".tar":
return extractTar(archivePath)
case ".zip":
return extractZip(archivePath)
default:
@@ -37,7 +40,9 @@ func ExtractArchiveFromReader(r io.Reader, filename string) ([]ExtractedFile, er
switch ext {
case ".gz", ".tgz":
return extractTarGzFromReader(r)
return extractTarGzFromReader(r, filename)
case ".tar":
return extractTarFromReader(r)
default:
return nil, fmt.Errorf("unsupported archive format: %s", ext)
}
@@ -50,17 +55,21 @@ func extractTarGz(archivePath string) ([]ExtractedFile, error) {
}
defer f.Close()
return extractTarGzFromReader(f)
return extractTarGzFromReader(f, filepath.Base(archivePath))
}
func extractTarGzFromReader(r io.Reader) ([]ExtractedFile, error) {
gzr, err := gzip.NewReader(r)
func extractTar(archivePath string) ([]ExtractedFile, error) {
f, err := os.Open(archivePath)
if err != nil {
return nil, fmt.Errorf("gzip reader: %w", err)
return nil, fmt.Errorf("open archive: %w", err)
}
defer gzr.Close()
defer f.Close()
tr := tar.NewReader(gzr)
return extractTarFromReader(f)
}
func extractTarFromReader(r io.Reader) ([]ExtractedFile, error) {
tr := tar.NewReader(r)
var files []ExtractedFile
for {
@@ -96,6 +105,75 @@ func extractTarGzFromReader(r io.Reader) ([]ExtractedFile, error) {
return files, nil
}
// extractTarGzFromReader decompresses a gzip stream and returns the files it
// contains.
//
// The decompressed payload is probed as a tar archive first. If the very first
// tar header cannot be read, the payload is treated as a single gzip-compressed
// file and returned as one entry. That entry is named after the gzip header's
// Name field when present, otherwise after filename with a trailing ".gz"
// removed.
//
// Limits: at most 50MB of decompressed data is buffered (anything beyond that
// is silently dropped by the LimitReader), and tar entries larger than 10MB
// are skipped.
func extractTarGzFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
	gzr, err := gzip.NewReader(r)
	if err != nil {
		return nil, fmt.Errorf("gzip reader: %w", err)
	}
	defer gzr.Close()

	// Buffer the whole payload so it can be re-interpreted as a plain file
	// when it turns out not to be a tar archive.
	decompressed, err := io.ReadAll(io.LimitReader(gzr, 50*1024*1024))
	if err != nil {
		return nil, fmt.Errorf("read gzip content: %w", err)
	}

	tr := tar.NewReader(bytes.NewReader(decompressed))
	header, err := tr.Next()
	if err != nil {
		// Not a tar archive - treat as a single gzipped file.
		//   - "invalid tar header": first 512-byte block fails header validation
		//   - io.EOF: empty payload
		//   - io.ErrUnexpectedEOF: payload shorter than one 512-byte tar block,
		//     which is what a small plain-text .gz file produces
		notTar := err == io.EOF ||
			err == io.ErrUnexpectedEOF ||
			strings.Contains(err.Error(), "invalid tar header")
		if !notTar {
			return nil, fmt.Errorf("tar read: %w", err)
		}

		// Prefer the original name stored in the gzip header; fall back to
		// the supplied filename without its .gz extension.
		baseName := strings.TrimSuffix(filename, ".gz")
		if gzr.Name != "" {
			baseName = gzr.Name
		}
		return []ExtractedFile{
			{
				Path:    baseName,
				Content: decompressed,
			},
		}, nil
	}

	// It's a valid tar archive, process it entry by entry.
	var files []ExtractedFile
	for {
		// Skip directories and large files (>10MB).
		if header.Typeflag != tar.TypeDir && header.Size <= 10*1024*1024 {
			content, err := io.ReadAll(tr)
			if err != nil {
				return nil, fmt.Errorf("read file %s: %w", header.Name, err)
			}
			files = append(files, ExtractedFile{
				Path:    header.Name,
				Content: content,
			})
		}

		// Read next header
		header, err = tr.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("tar read: %w", err)
		}
	}

	return files, nil
}
func extractZip(archivePath string) ([]ExtractedFile, error) {
r, err := zip.OpenReader(archivePath)
if err != nil {

View File

@@ -0,0 +1,72 @@
# Generic Text File Parser
Fallback парсер для текстовых файлов, которые не распознаны другими парсерами.
## Назначение
Этот парсер обрабатывает любые текстовые файлы, которые:
- Не являются архивами специфичных вендоров
- Содержат текстовую информацию (не бинарные данные)
- Представляют собой одиночные .gz файлы или простые текстовые файлы
## Приоритет
**Confidence score: 15** (низкий приоритет)
Этот парсер срабатывает только если ни один другой парсер не подошел с более высоким confidence.
## Поддерживаемые файлы
### Автоматически распознаваемые типы
1. **NVIDIA Bug Report** (`nvidia-bug-report-*.log.gz`)
- Извлекает информацию о драйвере NVIDIA
- Находит GPU устройства
- Показывает версию драйвера
2. **Любые текстовые файлы**
- Проверяет, что содержимое - текст (не бинарные данные)
- Показывает базовую информацию о файле
## Извлекаемые данные
### Events
- **Text File**: Базовая информация о загруженном файле
- **Driver Info**: Информация о NVIDIA драйвере (для nvidia-bug-report)
- **GPU Device**: Обнаруженные GPU устройства (для nvidia-bug-report)
## Пример использования
```bash
# Запуск с nvidia-bug-report
./logpile --file nvidia-bug-report-*.log.gz
# Запуск с любым текстовым файлом
./logpile --file system.log.gz
```
## Версионирование
**Текущая версия парсера:** 1.0.0
## Ограничения
1. Этот парсер предоставляет только базовую информацию
2. Не выполняет глубокий анализ содержимого
3. Для детального анализа специфичных логов рекомендуется создать отдельный специализированный парсер
## Расширение
Чтобы добавить поддержку нового типа файлов:
1. Добавьте проверку в функцию `Parse()`
2. Создайте функцию `parseXXX()` для извлечения специфичной информации
3. Увеличьте версию парсера
Пример:
```go
if strings.Contains(strings.ToLower(file.Path), "custom-log") {
parseCustomLog(content, result)
}
```

View File

@@ -0,0 +1,147 @@
// Package generic provides a fallback parser for unrecognized text files
package generic
import (
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// parserVersion is the version string of this parser module.
// It is the value returned by (*Parser).Version.
const parserVersion = "1.0.0"
// init registers the generic parser with the parser package when this
// package is initialized.
func init() {
parser.Register(&Parser{})
}
// Parser implements VendorParser for generic text files.
// It carries no state; all behavior lives in its methods.
type Parser struct{}
// Name returns the human-readable parser name.
func (p *Parser) Name() string {
return "Generic Text File Parser"
}
// Vendor returns the vendor identifier of this parser ("generic").
func (p *Parser) Vendor() string {
return "generic"
}
// Version returns the parser version (the parserVersion constant).
func (p *Parser) Version() string {
return parserVersion
}
// Detect reports how confident this fallback parser is that it can handle
// the given files, on a 0-100 scale.
//
// It only claims input consisting of exactly one file whose content passes
// isLikelyText, and even then returns a deliberately low score so that any
// more specific parser takes priority.
func (p *Parser) Detect(files []parser.ExtractedFile) int {
	const fallbackConfidence = 15

	if len(files) == 1 && isLikelyText(files[0].Content) {
		return fallbackConfidence
	}
	return 0
}
// isLikelyText checks if content is likely text (not binary)
func isLikelyText(content []byte) bool {
// Check first 512 bytes for binary data
sample := content
if len(content) > 512 {
sample = content[:512]
}
binaryCount := 0
for _, b := range sample {
// Count non-printable characters (excluding common whitespace)
if b < 32 && b != '\n' && b != '\r' && b != '\t' {
binaryCount++
}
if b == 0 { // NULL byte is a strong indicator of binary
binaryCount += 10
}
}
// If less than 5% binary, consider it text
return binaryCount < len(sample)/20
}
// Parse builds an analysis result for a generic text file.
//
// Only the first file is examined. A single informational event recording
// the file name is always emitted; when the file path contains
// "nvidia-bug-report" (case-insensitive) the content is additionally scanned
// by parseNvidiaBugReport. With no files an empty result is returned.
// The returned error is always nil.
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
	result := &models.AnalysisResult{
		Events:   make([]models.Event, 0),
		FRU:      make([]models.FRUInfo, 0),
		Sensors:  make([]models.SensorReading, 0),
		Hardware: &models.HardwareConfig{},
	}
	if len(files) == 0 {
		return result, nil
	}

	first := files[0]

	// Always record which file was loaded.
	loaded := models.Event{
		Timestamp:   time.Now(),
		Source:      "File",
		EventType:   "Text File",
		Description: "Generic text file loaded",
		Severity:    models.SeverityInfo,
		RawData:     "Filename: " + first.Path,
	}
	result.Events = append(result.Events, loaded)

	// Extract some basic info from file types we know about.
	lowerPath := strings.ToLower(first.Path)
	if strings.Contains(lowerPath, "nvidia-bug-report") {
		parseNvidiaBugReport(string(first.Content), result)
	}

	return result, nil
}
// parseNvidiaBugReport scans the text of an nvidia-bug-report log line by
// line and appends informational events to result:
//
//   - a "Driver Info" event for every line containing "NVRM version:" or
//     "nvidia-smi";
//   - a "GPU Device" event for every line containing both
//     "/proc/driver/nvidia/gpus/" and "***".
//
// The trimmed line is stored as the event's RawData. All events share one
// timestamp taken when the scan starts.
func parseNvidiaBugReport(content string, result *models.AnalysisResult) {
	now := time.Now()

	for _, line := range strings.Split(content, "\n") {
		// NVIDIA driver version. Every matching line is reported, including
		// those near the end of the file: no following lines are read, so
		// there is nothing to bounds-check.
		if strings.Contains(line, "NVRM version:") || strings.Contains(line, "nvidia-smi") {
			result.Events = append(result.Events, models.Event{
				Timestamp:   now,
				Source:      "NVIDIA Driver",
				EventType:   "Driver Info",
				Description: "NVIDIA driver information found",
				Severity:    models.SeverityInfo,
				RawData:     strings.TrimSpace(line),
			})
		}

		// GPU devices
		if strings.Contains(line, "/proc/driver/nvidia/gpus/") && strings.Contains(line, "***") {
			result.Events = append(result.Events, models.Event{
				Timestamp:   now,
				Source:      "GPU",
				EventType:   "GPU Device",
				Description: "GPU device detected",
				Severity:    models.SeverityInfo,
				RawData:     strings.TrimSpace(line),
			})
		}
	}
}

175
internal/parser/vendors/nvidia/README.md vendored Normal file
View File

@@ -0,0 +1,175 @@
# NVIDIA Field Diagnostics Parser
Парсер для диагностических архивов NVIDIA HGX Field Diagnostics.
Универсальный парсер, не привязанный к конкретному производителю серверов.
## Поддерживаемые архивы
- NVIDIA HGX Field Diag (работает с любыми серверами: Supermicro, Dell, HPE, и т.д.)
- Архивы с результатами GPU диагностики NVIDIA
## Формат архива
Парсер работает с архивами в формате:
- `.tar` (несжатый tar)
- `.tar.gz` (сжатый gzip)
## Распознаваемые файлы
### Основные файлы
1. **output.log** - вывод dmidecode с информацией о системе
- Производитель сервера (Manufacturer)
- Модель сервера (Product Name) - например, SYS-821GE-TNHR
- Серийный номер сервера (Serial Number) - например, A514359X5A07900
- UUID, SKU Number, Family
2. **unified_summary.json** - детальная информация о системе и компонентах
- Информация о GPU (модель, производитель, VBIOS, PCI адреса)
- Информация о NVSwitch (VendorID, DeviceID, Link speed/width)
- Информация о производителе и модели сервера
3. **summary.json** - результаты тестов диагностики
- Результаты тестов GPU (inforom, checkinforom, gpumem, gpustress, pcie, nvlink, nvswitch, power)
- Коды ошибок и статусы тестов
4. **summary.csv** - альтернативный формат результатов тестов
### Дополнительные файлы
- `gpu_fieldiag/*.log` - детальные логи диагностики каждого GPU
- `inventory/*.json` - дополнительная информация о конфигурации
## Извлекаемые данные
### Hardware Configuration
#### GPUs
```json
{
"slot": "GPUSXM1",
"model": "NVIDIA Device 2335",
"manufacturer": "NVIDIA Corporation",
"firmware": "96.00.D0.00.03",
"bdf": "0000:3a:00.0"
}
```
#### NVSwitch (как PCIe устройства)
```json
{
"slot": "NVSWITCHNVSWITCH0",
"device_class": "NVSwitch",
"manufacturer": "NVIDIA Corporation",
"vendor_id": 4318,
"device_id": 8867,
"bdf": "0000:05:00.0",
"link_speed": "16GT/s",
"link_width": 2
}
```
### Events
События создаются для:
- **Предупреждений и ошибок** тестов диагностики
- Примеры событий:
- `Row remapping failed` - ошибка памяти GPU (Warning)
- Различные тесты: connectivity, gpumem, gpustress, pcie, nvlink, nvswitch, power
Уровни severity:
- `info` - информационные события (тесты прошли успешно)
- `warning` - предупреждения (например, Row remapping failed)
- `critical` - критические ошибки (коды ошибок 300+)
## Пример использования
```bash
# Запуск веб-интерфейса
./logpile --file /path/to/A514359X5A07900_logs-20260122-074208.tar
# Веб-интерфейс будет доступен на http://localhost:8082
```
## Автоопределение
Парсер автоматически определяет архивы NVIDIA Field Diag по наличию:
- `unified_summary.json` с маркером "HGX Field Diag"
- `summary.json` и `summary.csv` с результатами тестов
- Директории `gpu_fieldiag/`
Confidence score:
- `unified_summary.json` с маркером "HGX Field Diag": +40
- `summary.json`: +20
- `summary.csv`: +15
- `gpu_fieldiag/` directory: +15
- `output.log` с выводом dmidecode: +10
## Версионирование
**Текущая версия парсера:** 1.1.0
При модификации логики парсера необходимо увеличивать версию в константе `parserVersion` в файле `parser.go`.
### История версий
- **1.1.0** - Добавлен парсинг output.log (dmidecode) для извлечения модели и серийного номера сервера
- **1.0.0** - Первоначальная версия с парсингом unified_summary.json и summary.json/csv
## Примеры данных
### Пример unified_summary.json
```json
{
"runInfo": {
"diagVersion": "24287-XXXX-FLD-42658",
"diagName": "HGX Field Diag",
"finalResult": "FAIL",
"errorCode": 363
},
"tests": [{
"virtualId": "inventory",
"components": [{
"componentId": "GPUSXM1",
"properties": [
{"id": "Manufacturer", "value": "Any Server Vendor"},
{"id": "VendorID", "value": "10de"},
{"id": "DeviceID", "value": "2335"}
]
}]
}]
}
```
### Пример summary.json
```json
[
{
"Error Code": "005-000-1-000000000363",
"Test": "gpumem",
"Component ID": "SXM5_SN_1653925025497",
"Notes": "Row remapping failed",
"Virtual ID": "gpumem"
}
]
```
## Известные ограничения
1. Парсер фокусируется на данных из `unified_summary.json` и `summary.json`
2. Детальные логи из `gpu_fieldiag/*.log` пока не парсятся
3. Информация о CPU, памяти и дисках не извлекается (в архиве отсутствует)
## Разработка
### Добавление новых полей
1. Изучите структуру JSON в архиве
2. Добавьте поля в структуры `Component` или `Property`
3. Обновите функции `parseGPUComponent` или `parseNVSwitchComponent`
4. Увеличьте версию парсера
### Добавление новых типов файлов
1. Создайте новый файл с парсером (например, `gpu_logs.go`)
2. Добавьте парсинг в функцию `Parse()` в `parser.go`
3. Обновите документацию

View File

@@ -0,0 +1,68 @@
package nvidia
import (
"bufio"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// ParseOutputLog reads dmidecode output from an output.log file and copies
// the "System Information" fields into result.Hardware.BoardInfo.
//
// A section starts at any line containing "System Information" and ends at
// the next line starting with "Handle ". Inside it, "key: value" lines with a
// non-empty value are mapped as follows:
//
//	Manufacturer  -> BoardInfo.Manufacturer
//	Product Name  -> BoardInfo.ProductName
//	Serial Number -> BoardInfo.SerialNumber
//	Version       -> BoardInfo.PartNumber (only when still empty)
//
// Other fields (UUID, Family, ...) are currently ignored. The returned error
// is the scanner's error, if any. result.Hardware must be non-nil.
func ParseOutputLog(content []byte, result *models.AnalysisResult) error {
	scanner := bufio.NewScanner(strings.NewReader(string(content)))

	inSection := false
	for scanner.Scan() {
		raw := scanner.Text()
		text := strings.TrimSpace(raw)

		switch {
		case strings.Contains(text, "System Information"):
			// Section header
			inSection = true
			continue
		case inSection && strings.HasPrefix(text, "Handle "):
			// The next DMI handle closes the section
			inSection = false
			continue
		case !inSection, !strings.Contains(raw, ":"):
			continue
		}

		// Split once at the first colon.
		colon := strings.Index(text, ":")
		key := strings.TrimSpace(text[:colon])
		val := strings.TrimSpace(text[colon+1:])
		if val == "" {
			continue
		}

		switch key {
		case "Manufacturer":
			result.Hardware.BoardInfo.Manufacturer = val
		case "Product Name":
			result.Hardware.BoardInfo.ProductName = val
		case "Serial Number":
			result.Hardware.BoardInfo.SerialNumber = val
		case "Version":
			// The system version is kept in the part number slot, unless
			// something already filled it.
			if result.Hardware.BoardInfo.PartNumber == "" {
				result.Hardware.BoardInfo.PartNumber = val
			}
		}
	}

	return scanner.Err()
}

166
internal/parser/vendors/nvidia/parser.go vendored Normal file
View File

@@ -0,0 +1,166 @@
// Package nvidia provides parser for NVIDIA Field Diagnostics archives
// Tested with: HGX Field Diag (works with various server vendors)
//
// IMPORTANT: Increment parserVersion when modifying parser logic!
// This helps track which version was used to parse specific logs.
package nvidia
import (
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// parserVersion is the version string of this parser module; it is the value
// returned by (*Parser).Version.
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.1.0"
// init registers the NVIDIA Field Diagnostics parser with the parser package
// when this package is initialized.
func init() {
parser.Register(&Parser{})
}
// Parser implements VendorParser for NVIDIA Field Diagnostics archives.
// It carries no state; all behavior lives in its methods.
type Parser struct{}
// Name returns the human-readable parser name.
func (p *Parser) Name() string {
return "NVIDIA Field Diagnostics Parser"
}
// Vendor returns the vendor identifier of this parser ("nvidia").
func (p *Parser) Vendor() string {
return "nvidia"
}
// Version returns the parser version (the parserVersion constant).
// IMPORTANT: Update parserVersion constant when modifying parser logic!
func (p *Parser) Version() string {
return parserVersion
}
// Detect reports how confident the parser is that files form an NVIDIA Field
// Diagnostics archive, on a 0-100 scale.
//
// Each indicator contributes its weight once, no matter how many files match
// it, so a directory full of gpu_fieldiag logs cannot reach full confidence
// on its own:
//
//	unified_summary.json containing the Field Diag markers  +40
//	summary.json (other than unified_summary.json)         +20
//	summary.csv                                             +15
//	any path containing gpu_fieldiag/                       +15
//	output.log mentioning dmidecode or System Information   +10
//
// Paths are matched case-insensitively.
func (p *Parser) Detect(files []parser.ExtractedFile) int {
	var unified, summaryJSON, summaryCSV, fieldiagDir, dmidecodeLog bool

	for _, f := range files {
		path := strings.ToLower(f.Path)

		// Strong indicator: the unified summary must really look like
		// Field Diag output, not just carry the right name.
		if !unified && strings.HasSuffix(path, "unified_summary.json") {
			unified = containsNvidiaFieldDiagMarkers(f.Content)
		}

		if strings.HasSuffix(path, "summary.json") && !strings.Contains(path, "unified_") {
			summaryJSON = true
		}

		if strings.HasSuffix(path, "summary.csv") {
			summaryCSV = true
		}

		if strings.Contains(path, "gpu_fieldiag/") {
			fieldiagDir = true
		}

		if !dmidecodeLog && strings.HasSuffix(path, "output.log") {
			content := string(f.Content)
			dmidecodeLog = strings.Contains(content, "dmidecode") ||
				strings.Contains(content, "System Information")
		}
	}

	confidence := 0
	if unified {
		confidence += 40
	}
	if summaryJSON {
		confidence += 20
	}
	if summaryCSV {
		confidence += 15
	}
	if fieldiagDir {
		confidence += 15
	}
	if dmidecodeLog {
		confidence += 10
	}

	// Cap at 100 in case the weights above are ever changed.
	if confidence > 100 {
		confidence = 100
	}
	return confidence
}
// containsNvidiaFieldDiagMarkers reports whether content contains all of the
// substrings "runInfo", "diagVersion" and "HGX Field Diag". The check is a
// plain case-sensitive substring search; the content is not parsed as JSON.
func containsNvidiaFieldDiagMarkers(content []byte) bool {
	text := string(content)
	for _, marker := range []string{"runInfo", "diagVersion", "HGX Field Diag"} {
		if !strings.Contains(text, marker) {
			return false
		}
	}
	return true
}
// Parse extracts hardware information and diagnostic events from an NVIDIA
// Field Diagnostics archive.
//
// Sources are processed in this order, each one optional:
//  1. output.log with dmidecode output  -> board information
//  2. unified_summary.json              -> component inventory
//  3. summary.json                      -> test result events
//  4. summary.csv                       -> test result events
//
// Parsing is best effort: a failure in output.log or unified_summary.json is
// ignored and the remaining files are still processed. The returned error is
// always nil.
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
	result := &models.AnalysisResult{
		Events:  make([]models.Event, 0),
		FRU:     make([]models.FRUInfo, 0),
		Sensors: make([]models.SensorReading, 0),
		Hardware: &models.HardwareConfig{
			GPUs: make([]models.GPU, 0),
		},
	}

	// dmidecode system info goes first so that real board data takes
	// precedence over values found later in unified_summary.json.
	if dmi := findDmidecodeOutputLog(files); dmi != nil {
		_ = ParseOutputLog(dmi.Content, result) // best effort: error intentionally ignored for now
	}

	// Detailed component info
	if unified := parser.FindFileByName(files, "unified_summary.json"); unified != nil {
		_ = ParseUnifiedSummary(unified.Content, result) // best effort: error intentionally ignored for now
	}

	// Test results summary
	if summary := parser.FindFileByName(files, "summary.json"); summary != nil {
		result.Events = append(result.Events, ParseSummaryJSON(summary.Content)...)
	}

	// Alternative CSV format of the test results
	if summaryCSV := parser.FindFileByName(files, "summary.csv"); summaryCSV != nil {
		result.Events = append(result.Events, ParseSummaryCSV(summaryCSV.Content)...)
	}

	// Individual GPU diagnostic logs are located but not parsed yet;
	// for now the summary files are the only source of events.
	_ = parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")

	return result, nil
}
// findDmidecodeOutputLog returns the first file whose path ends in
// "output.log" (case-insensitive) and whose content contains both
// "dmidecode" and "System Information". It returns nil when no file matches.
// The result points to a copy of the matching entry, not into files.
func findDmidecodeOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
	for i := range files {
		candidate := files[i]

		if !strings.HasSuffix(strings.ToLower(candidate.Path), "output.log") {
			continue
		}

		text := string(candidate.Content)
		if !strings.Contains(text, "dmidecode") || !strings.Contains(text, "System Information") {
			continue
		}
		return &candidate
	}
	return nil
}

View File

@@ -0,0 +1,152 @@
package nvidia
import (
"encoding/csv"
"encoding/json"
"fmt"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
)
// SummaryEntry represents a single test result entry.
// ParseSummaryJSON decodes summary.json as a JSON array of these entries;
// the struct tags give the exact JSON keys, which contain spaces.
type SummaryEntry struct {
ErrorCode string `json:"Error Code"`
Test string `json:"Test"`
ComponentID string `json:"Component ID"`
Notes string `json:"Notes"`
VirtualID string `json:"Virtual ID"`
IgnoreError string `json:"Ignore Error"`
}
// ParseSummaryJSON decodes summary.json (a JSON array of SummaryEntry) and
// returns one event per entry that is not a clean pass.
//
// An entry counts as a clean pass only when its notes are "OK" and its error
// code is exactly "001-000-1-000000000000"; such entries produce no event.
// If the content is not valid JSON the function returns nil. The summary
// carries no timestamps, so all events are stamped with the current time.
func ParseSummaryJSON(content []byte) []models.Event {
	var entries []SummaryEntry
	if json.Unmarshal(content, &entries) != nil {
		return nil
	}

	const passCode = "001-000-1-000000000000"

	events := make([]models.Event, 0)
	now := time.Now()

	for _, e := range entries {
		if e.Notes == "OK" && e.ErrorCode == passCode {
			continue // clean pass, nothing to report
		}

		events = append(events, models.Event{
			Timestamp:   now,
			Source:      "GPU Field Diagnostics",
			EventType:   e.Test,
			Description: formatSummaryDescription(e),
			Severity:    getSeverityFromErrorCode(e.ErrorCode, e.Notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", e.Test, e.ComponentID, e.ErrorCode),
		})
	}

	return events
}
// ParseSummaryCSV parses summary.csv and returns one event per row that is
// not a clean pass.
//
// Expected column layout:
//
//	ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
//
// The first row is treated as a header and skipped. Rows with fewer than 7
// columns are ignored. A row is a clean pass when its notes are "OK" and its
// error code is "0" or starts with "048-000-0" or "001-000-1".
//
// Rows may have differing numbers of columns. If the content is not valid
// CSV the function returns nil. All events are stamped with the current time.
func ParseSummaryCSV(content []byte) []models.Event {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Accept ragged rows: by default the reader fails the whole file as soon
	// as one row has a different field count than the first, which would
	// discard every event. Short rows are filtered below instead.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	timestamp := time.Now()

	for i, record := range records {
		if i == 0 {
			continue // Skip header
		}
		if len(record) < 7 {
			continue
		}

		errorCode := record[0]
		test := record[1]
		componentID := record[5]
		notes := record[6]

		cleanCode := errorCode == "0" ||
			strings.HasPrefix(errorCode, "048-000-0") ||
			strings.HasPrefix(errorCode, "001-000-1")

		// Only create events for failures or warnings
		if notes == "OK" && cleanCode {
			continue
		}

		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   test,
			Description: formatCSVDescription(test, componentID, notes, errorCode),
			Severity:    getSeverityFromErrorCode(errorCode, notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
		})
	}

	return events
}
// formatSummaryDescription renders a summary.json entry as a one-line,
// human-readable message. The component is identified by its ComponentID,
// falling back to the VirtualID when the ComponentID is empty. Entries whose
// notes are "OK" are described as passed, all others as failed together with
// the notes and the error code.
func formatSummaryDescription(entry SummaryEntry) string {
	target := entry.ComponentID
	if target == "" {
		target = entry.VirtualID
	}

	if entry.Notes != "OK" {
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, target, entry.Notes, entry.ErrorCode)
	}
	return fmt.Sprintf("%s test passed for %s", entry.Test, target)
}
// formatCSVDescription renders the fields of a summary.csv row as a one-line,
// human-readable message. Rows whose notes are "OK" are described as passed;
// all others as failed, together with the notes and the error code.
func formatCSVDescription(test, component, notes, errorCode string) string {
	passed := notes == "OK"
	if !passed {
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode)
	}
	return fmt.Sprintf("%s test passed for %s", test, component)
}
// getSeverityFromErrorCode maps a diagnostic error code and its notes to an
// event severity. Rules are evaluated in order; the first match wins:
//
//  1. notes == "OK"                                  -> info
//  2. notes contain "Row remapping failed"           -> warning
//  3. error code empty or "0"                        -> info
//  4. code starts with "001-000-1" or "048-000-0"    -> info
//  5. first three characters compare >= "300"        -> critical
//  6. anything else                                  -> warning
//
// Error codes have the shape XXX-YYY-Z-ZZZZZZZZZZZZ.
// NOTE(review): rule 5 is a lexical string comparison of the code's leading
// three characters - confirm that this prefix (rather than the trailing
// number) is the part that is meant to carry the "300+" range.
func getSeverityFromErrorCode(errorCode, notes string) models.Severity {
	switch {
	case notes == "OK":
		return models.SeverityInfo
	case strings.Contains(notes, "Row remapping failed"):
		return models.SeverityWarning
	case errorCode == "", errorCode == "0":
		return models.SeverityInfo
	case strings.HasPrefix(errorCode, "001-000-1"), strings.HasPrefix(errorCode, "048-000-0"):
		return models.SeverityInfo
	case len(errorCode) > 2 && errorCode[:3] >= "300":
		return models.SeverityCritical
	}
	return models.SeverityWarning
}

View File

@@ -0,0 +1,281 @@
package nvidia
import (
"encoding/json"
"fmt"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// UnifiedSummaryData represents the top-level structure of
// unified_summary.json: the run metadata ("runInfo") and the list of
// executed tests ("tests").
type UnifiedSummaryData struct {
RunInfo RunInfo `json:"runInfo"`
Tests []Test `json:"tests"`
}
// RunInfo contains information about the diagnostic run, decoded from the
// "runInfo" object of unified_summary.json. Time values are kept as the raw
// strings found in the file; they are not parsed here.
type RunInfo struct {
TimeInfo struct {
StartTime string `json:"startTime"`
EndTime string `json:"endTime"`
TotalDuration string `json:"totalDuration"`
} `json:"timeInfo"`
DiagVersion string `json:"diagVersion"`
BaseVersion string `json:"baseVersion"`
FinalResult string `json:"finalResult"`
ErrorCode int `json:"errorCode"`
DiagName string `json:"diagName"`
RunLevel string `json:"runLevel"`
}
// Test represents a diagnostic test, one element of the "tests" array in
// unified_summary.json. ParseUnifiedSummary reads hardware details from the
// test whose VirtualID or Action is "inventory".
type Test struct {
VirtualID string `json:"virtualId"`
Action string `json:"action"`
StartTime string `json:"startTime"`
EndTime string `json:"endTime"`
Components []Component `json:"components"`
}
// Component represents a hardware component reported by a test, together
// with its result fields and a list of id/value properties.
type Component struct {
ComponentID string `json:"componentId"`
ErrorCode string `json:"errorCode"`
Notes string `json:"notes"`
Result string `json:"result"`
Properties []Property `json:"properties"`
}
// Property represents a component property: an identifier and a value that
// may be a JSON string, number, boolean or null.
type Property struct {
	ID    string      `json:"id"`
	Value interface{} `json:"value"` // Can be string or number
}

// GetValueAsString returns the value as a string.
//
//   - a missing or null value yields "" (not "<nil>"), so callers that skip
//     empty values also skip nulls;
//   - strings are returned unchanged;
//   - whole numbers are printed without a decimal point or exponent;
//   - fractional numbers keep their fractional part;
//   - anything else is formatted with %v.
func (p *Property) GetValueAsString() string {
	switch v := p.Value.(type) {
	case nil:
		return ""
	case string:
		return v
	case float64:
		// JSON numbers decode to float64. Keep the fraction only for
		// ordinary magnitudes where the int64 round-trip is well defined;
		// everything else is printed as a plain integer.
		if v > -1e15 && v < 1e15 && v != float64(int64(v)) {
			return fmt.Sprintf("%v", v)
		}
		return fmt.Sprintf("%.0f", v)
	case int:
		return fmt.Sprintf("%d", v)
	default:
		return fmt.Sprintf("%v", v)
	}
}
// ParseUnifiedSummary decodes unified_summary.json and fills result.Hardware
// from the inventory test(s) it contains.
//
// If no product name is known yet, the placeholder "GPU Server (Field Diag)"
// is set before the inventory is processed. Every test whose VirtualID or
// Action equals "inventory" is passed to parseInventoryComponents.
// An error is returned only when the content is not valid JSON.
// result.Hardware must be non-nil.
func ParseUnifiedSummary(content []byte, result *models.AnalysisResult) error {
	var summary UnifiedSummaryData
	if err := json.Unmarshal(content, &summary); err != nil {
		return fmt.Errorf("failed to parse unified_summary.json: %w", err)
	}

	// Placeholder product name; real data parsed earlier is left untouched.
	if result.Hardware.BoardInfo.ProductName == "" {
		result.Hardware.BoardInfo.ProductName = "GPU Server (Field Diag)"
	}

	for i := range summary.Tests {
		t := &summary.Tests[i]
		if t.VirtualID != "inventory" && t.Action != "inventory" {
			continue
		}
		parseInventoryComponents(t.Components, result)
	}

	return nil
}
// parseInventoryComponents walks the components of an inventory test and
// records them in result.Hardware:
//
//   - components recognized by parseSystemInfo update the board information;
//   - components whose ID starts with "GPUSXM" are appended to GPUs;
//   - components whose ID starts with "NVSWITCHNVSWITCH" are appended to
//     PCIeDevices (there is no dedicated NVSwitch list).
//
// Any other component is ignored.
func parseInventoryComponents(components []Component, result *models.AnalysisResult) {
	for _, comp := range components {
		switch {
		case parseSystemInfo(comp, result):
			// System/board component: already handled by parseSystemInfo.

		case strings.HasPrefix(comp.ComponentID, "GPUSXM"):
			if gpu := parseGPUComponent(comp); gpu != nil {
				result.Hardware.GPUs = append(result.Hardware.GPUs, *gpu)
			}

		case strings.HasPrefix(comp.ComponentID, "NVSWITCHNVSWITCH"):
			if nvswitch := parseNVSwitchComponent(comp); nvswitch != nil {
				result.Hardware.PCIeDevices = append(result.Hardware.PCIeDevices, *nvswitch)
			}
		}
	}
}
// parseSystemInfo copies system/board properties of comp into
// result.Hardware.BoardInfo and reports whether comp was a system component.
//
// A component is a system component when its ID is exactly "Inventory" or,
// compared case-insensitively, contains "BASEBOARD", "SYSTEM", "MOTHERBOARD"
// or "BOARD". For such a component, non-empty properties are applied without
// overwriting values that are already set; the only value that may be
// replaced is the placeholder product name "GPU Server (Field Diag)".
func parseSystemInfo(comp Component, result *models.AnalysisResult) bool {
	upperID := strings.ToUpper(comp.ComponentID)

	isSystem := comp.ComponentID == "Inventory"
	for _, keyword := range []string{"BASEBOARD", "SYSTEM", "MOTHERBOARD", "BOARD"} {
		if isSystem {
			break
		}
		isSystem = strings.Contains(upperID, keyword)
	}
	if !isSystem {
		return false
	}

	hw := result.Hardware
	for _, prop := range comp.Properties {
		value := prop.GetValueAsString()
		if value == "" {
			continue
		}

		switch prop.ID {
		case "Manufacturer", "BoardManufacturer", "SystemManufacturer":
			if hw.BoardInfo.Manufacturer == "" {
				hw.BoardInfo.Manufacturer = value
			}

		case "ProductName", "Product", "Model", "ModelName", "BoardProduct", "SystemProduct":
			// Real data (e.g. from output.log) wins; only an empty value or
			// the default placeholder may be replaced.
			switch hw.BoardInfo.ProductName {
			case "", "GPU Server (Field Diag)":
				hw.BoardInfo.ProductName = value
			}

		case "SerialNumber", "Serial", "BoardSerial", "SystemSerial":
			if hw.BoardInfo.SerialNumber == "" {
				hw.BoardInfo.SerialNumber = value
			}

		case "PartNumber", "BoardPartNumber":
			if hw.BoardInfo.PartNumber == "" {
				hw.BoardInfo.PartNumber = value
			}
		}
	}

	return true
}
// parseGPUComponent converts an inventory component into a models.GPU.
//
// The component ID (e.g. "GPUSXM1") becomes the slot. Properties are mapped
// as Vendor -> Manufacturer, DeviceName -> Model, VBIOS_version -> Firmware
// and PCIID -> BDF. When the model is missing or contains "Device", and a
// DeviceID is known, the model becomes "NVIDIA Device <DEVICEID>" with the ID
// upper-cased. The result is never nil.
func parseGPUComponent(comp Component) *models.GPU {
	gpu := &models.GPU{
		Slot: comp.ComponentID, // e.g., "GPUSXM1"
	}

	deviceID := ""
	for _, prop := range comp.Properties {
		switch prop.ID {
		case "DeviceID":
			deviceID = prop.GetValueAsString()
		case "Vendor":
			gpu.Manufacturer = prop.GetValueAsString()
		case "DeviceName":
			gpu.Model = prop.GetValueAsString()
		case "VBIOS_version":
			gpu.Firmware = prop.GetValueAsString()
		case "PCIID":
			gpu.BDF = prop.GetValueAsString()
		}
	}

	// Fall back to a model name built from the device ID when no usable
	// model name was reported.
	needsModel := gpu.Model == "" || strings.Contains(gpu.Model, "Device")
	if needsModel && deviceID != "" {
		gpu.Model = fmt.Sprintf("NVIDIA Device %s", strings.ToUpper(deviceID))
	}

	return gpu
}
// parseNVSwitchComponent converts an inventory component into a
// models.PCIeDevice with device class "NVSwitch".
//
// The component ID (e.g. "NVSWITCHNVSWITCH0") becomes the slot. Properties
// are mapped as follows:
//
//	VendorID, DeviceID -> VendorID, DeviceID (scanned with %x)
//	Vendor             -> Manufacturer
//	VBIOS_version      -> PartNumber (no dedicated firmware field is used)
//	PCIID              -> BDF
//	PCISpeed           -> LinkSpeed and MaxLinkSpeed
//	PCIWidth           -> LinkWidth and MaxLinkWidth (scanned as "x%d")
//
// InfoROM_version is recognized but not stored. Values that fail to scan
// leave the corresponding field at its zero value. The result is never nil.
func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
	device := &models.PCIeDevice{
		Slot:        comp.ComponentID, // e.g., "NVSWITCHNVSWITCH0"
		DeviceClass: "NVSwitch",
	}

	// Numeric values are collected first and scanned once after the loop.
	var rawVendorID, rawDeviceID, rawWidth string

	for _, prop := range comp.Properties {
		switch prop.ID {
		case "VendorID":
			rawVendorID = prop.GetValueAsString()
		case "DeviceID":
			rawDeviceID = prop.GetValueAsString()
		case "Vendor":
			device.Manufacturer = prop.GetValueAsString()
		case "VBIOS_version":
			// Kept in the part number field.
			device.PartNumber = prop.GetValueAsString()
		case "InfoROM_version":
			// Not stored anywhere yet.
		case "PCIID":
			device.BDF = prop.GetValueAsString()
		case "PCISpeed":
			speed := prop.GetValueAsString()
			device.LinkSpeed = speed
			device.MaxLinkSpeed = speed
		case "PCIWidth":
			rawWidth = prop.GetValueAsString()
		}
	}

	if rawVendorID != "" {
		fmt.Sscanf(rawVendorID, "%x", &device.VendorID)
	}
	if rawDeviceID != "" {
		fmt.Sscanf(rawDeviceID, "%x", &device.DeviceID)
	}
	if rawWidth != "" {
		fmt.Sscanf(rawWidth, "x%d", &device.LinkWidth)
		device.MaxLinkWidth = device.LinkWidth
	}

	return device
}

View File

@@ -0,0 +1,275 @@
# NVIDIA Bug Report Parser
Парсер для файлов nvidia-bug-report, генерируемых скриптом `nvidia-bug-report.sh`.
## Назначение
Этот парсер обрабатывает диагностические логи драйверов NVIDIA и извлекает:
- Информацию о системе, процессорах, модулях памяти и блоках питания (из dmidecode)
- Информацию о сетевых адаптерах (из lspci)
- Информацию о GPU-устройствах
- Версию драйвера NVIDIA
## Формат файла
- Имя файла: `nvidia-bug-report-*.log.gz`
- Формат: Gzip-сжатый текстовый файл
- Генерируется скриптом `nvidia-bug-report.sh`
## Confidence Score
**85** - высокий приоритет для файлов nvidia-bug-report
## Извлекаемые данные
### 1. System Information (из dmidecode)
Информация о сервере:
- **Serial Number**: Серийный номер сервера (например, 2KD501412)
- **UUID**: Уникальный идентификатор системы (например, 2e4054bc-1dd2-11b2-0284-6b0a21737950)
- **Manufacturer**: Производитель сервера
- **Product Name**: Модель сервера
- **Version**: Версия системы
### 2. CPU Information (из dmidecode)
Для каждого процессора извлекается:
- **Model**: Модель процессора (например, Intel(R) Xeon(R) Platinum 8480+)
- **Serial Number**: Серийный номер (например, 5DB0D6C0DD30ABD8)
- **Core Count**: Количество ядер (например, 56)
- **Thread Count**: Количество потоков (например, 112)
- **Max Speed**: Максимальная частота (например, 3800 MHz)
- **Current Speed**: Текущая частота (например, 2000 MHz)
Пример:
```
Socket 0: Intel(R) Xeon(R) Platinum 8480+
Serial Number: 5DB0D6C0DD30ABD8
Cores: 56, Threads: 112
Frequency: 2000 MHz (Max: 3800 MHz)
```
### 3. Memory Modules (из dmidecode)
Для каждого модуля памяти извлекается:
- **Slot/Location**: Например, CPU0_C0D0
- **Size**: Размер в GB (например, 64 GB)
- **Type**: Тип памяти (DDR5, DDR4, etc.)
- **Manufacturer**: Производитель (Hynix, Samsung, Micron, etc.)
- **Part Number**: P/N модуля (например, HMCG94AGBRA179N)
- **Serial Number**: S/N модуля (например, 80AD0224322B3834E6)
- **Speed**: Max/Current скорость (например, 5600/4400 MHz)
- **Ranks**: Количество рангов
Пример:
```
Slot: CPU0_C0D0
Size: 64 GB
Type: DDR5
Manufacturer: Hynix
Part Number: HMCG94AGBRA179N
Serial Number: 80AD0224322B3834E6
Speed: 5600 MT/s (configured: 4400 MT/s)
Ranks: 2
```
### 4. Power Supplies (из dmidecode)
Для каждого блока питания извлекается:
- **Location**: Позиция (например, PSU0, PSU1)
- **Manufacturer**: Производитель (например, DELTA, Great Wall)
- **Model Part Number**: Модель БП (например, V0310DT000000000)
- **Serial Number**: Серийный номер (например, DGPLV251500LZ)
- **Max Power Capacity**: Максимальная мощность (например, 2700 W)
- **Revision**: Версия прошивки (например, 00.01.04)
- **Status**: Статус (например, Present, OK)
Пример:
```
PSU0: V0310DT000000000 (DELTA)
Serial Number: DGPLV251500LZ
Power: 2700 W, Revision: 00.01.04
Status: Present, OK
```
### 5. Network Adapters (из lspci)
Для каждого сетевого адаптера (Ethernet, Network, InfiniBand) извлекается:
- **Model**: Полное название модели из VPD (например, "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16")
- **Location**: PCI BDF адрес (например, 0000:0e:00.0)
- **Slot**: Физический слот (например, 108)
- **Part Number**: P/N адаптера (например, MCX75310AAS-NEAT)
- **Serial Number**: S/N адаптера (например, MT2430600249)
- **Vendor**: Производитель (Mellanox, NVIDIA)
- **Vendor ID / Device ID**: PCI идентификаторы (например, 15b3:1021)
- **Port Count**: Количество портов (определяется из модели: Dual-port = 2, Single-port = 1)
- **Port Type**: Тип портов (QSFP56, OSFP, SFP+)
Пример:
```
0000:0e:00.0: NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP
Slot: 108
P/N: MCX75310AAS-NEAT
S/N: MT2430600249
Ports: 1 x OSFP
```
### 6. GPU Devices
Для каждого GPU извлекается:
- **Model**: Модель GPU (например, NVIDIA H100 80GB HBM3)
- **BDF (Bus:Device.Function)**: PCI адрес (например, 0000:0f:00.0)
- **UUID**: Уникальный идентификатор GPU (например, GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3)
- **Video BIOS**: Версия BIOS видеокарты (например, 96.00.99.00.01)
- **IRQ**: Прерывание (например, 17)
- **Bus Type**: Тип шины (PCIe)
- **DMA Size**: Размер DMA (например, 52 bits)
- **DMA Mask**: Маска DMA (например, 0xfffffffffffff)
- **Device Minor**: Номер устройства (например, 0)
- **Manufacturer**: NVIDIA
Пример:
```
0000:0f:00.0: NVIDIA H100 80GB HBM3
UUID: GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3
Video BIOS: 96.00.99.00.01
IRQ: 17
```
### 7. Events
- **Memory Configuration**: Сводка по модулям памяти (количество, производители, общий размер)
- **GPU Detection**: Обнаруженные GPU устройства
- **Driver Version**: Версия NVIDIA драйвера
## Пример использования
```bash
# Запуск с nvidia-bug-report файлом
./logpile --file nvidia-bug-report-2KD501412.log.gz
# Веб-интерфейс будет доступен на http://localhost:8082
```
## Пример вывода
```
✓ Detected vendor: NVIDIA Bug Report Parser
✓ CPUs: 2
✓ Memory: 32 modules
✓ Power Supplies: 8
✓ GPUs: 8
✓ Network Adapters: 12
System Information:
Serial Number: 2KD501412
UUID: 2e4054bc-1dd2-11b2-0284-6b0a21737950
Version: 0
CPU Information:
Socket 0: Intel(R) Xeon(R) Platinum 8480+
S/N: 5DB0D6C0DD30ABD8, Cores: 56, Threads: 112
Socket 1: Intel(R) Xeon(R) Platinum 8480+
S/N: 5DB017C05685B3ED, Cores: 56, Threads: 112
Power Supplies:
PSU0: V0310DT000000000 (DELTA)
S/N: DGPLV251500LZ
Power: 2700 W, Revision: 00.01.04
Status: Present, OK
PSU1: V0310DT000000000 (DELTA)
S/N: DGPLV251500GY
Power: 2700 W, Revision: 00.01.04
Status: Present, OK
[... 6 more PSUs ...]
Memory Modules:
CPU0_C0D0: 64 GB, Hynix
P/N: HMCG94AGBRA179N, S/N: 80AD0224322B3834E6
Type: DDR5, Speed: 4400/5600 MHz
[... 31 more modules ...]
Network Adapters: 12 devices
0000:0e:00.0: NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP
Slot: 108
P/N: MCX75310AAS-NEAT
S/N: MT2430600249
Ports: 1 x OSFP
0000:1f:00.0: ConnectX-6 Dx EN adapter card, 100GbE, Dual-port QSFP56
Slot: 12
P/N: MCX623106AN-CDAT
S/N: MT2434J00PCD
Ports: 2 x QSFP56
[... 10 more adapters ...]
GPUs: 8 devices
0000:0f:00.0: NVIDIA H100 80GB HBM3
UUID: GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3
Video BIOS: 96.00.99.00.01
IRQ: 17
0000:34:00.0: NVIDIA H100 80GB HBM3
UUID: GPU-fa796345-c23a-54aa-1b67-709ac2542852
Video BIOS: 96.00.99.00.01
IRQ: 16
[... 6 more GPUs ...]
```
## Версионирование
**Текущая версия парсера:** 1.0.0
### История версий
- **1.0.0** - Первоначальная версия с парсингом System Info, CPU, Memory, PSU, GPU, Network Adapters и Driver
## Структура данных
Парсер использует следующие секции в bug report:
1. **dmidecode output (System Information)** - для извлечения информации о сервере
2. **dmidecode output (Processor Information)** - для извлечения информации о CPU
3. **dmidecode output (Memory Device)** - для извлечения информации о памяти
4. **dmidecode output (System Power Supply)** - для извлечения информации о блоках питания
5. **lspci -vvv output (Ethernet/Network/Infiniband controller)** - для извлечения информации о сетевых адаптерах
6. **lspci VPD (Vital Product Data)** - для извлечения P/N, S/N и модели сетевых адаптеров
7. **/proc/driver/nvidia/gpus/.../information** - для детальной информации о GPU
8. **NVRM version** - для версии драйвера
## Известные ограничения
1. Ошибки и предупреждения из логов пока не извлекаются
2. Некоторые специфичные характеристики GPU (температура, утилизация) не парсятся
3. Информация о производительности и метрики GPU требуют парсинга других секций
## Расширение
Возможные направления развития парсера:
1. **Ошибки драйвера**: Парсить секции с ошибками NVIDIA драйвера
2. **nvidia-smi output**: Извлекать детальную информацию из вывода nvidia-smi (температура, утилизация)
3. **GPU производительность**: Парсить метрики производительности и использования памяти GPU
4. **PCIe информация**: Извлекать детали о PCIe конфигурации (скорость линка, ширина)
## Пример структуры файла
```
Start of NVIDIA bug report log file
nvidia-bug-report.sh Version: 34275561
Date: Thu Jul 17 18:18:18 EDT 2025
[... system info ...]
Memory Device
Data Width: 64 bits
Size: 64 GB
Form Factor: DIMM
Locator: CPU0_C0D0
Type: DDR5
Speed: 5600 MT/s
Manufacturer: Hynix
Serial Number: 80AD0224322B3834E6
Part Number: HMCG94AGBRA179N
[... more memory modules ...]
*** /proc/driver/nvidia/./gpus/0000:0f:00.0/power
[... GPU info ...]
```

View File

@@ -0,0 +1,140 @@
package nvidia_bug_report
import (
"bufio"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// parseCPUInfo extracts CPU information from the dmidecode "Processor
// Information" sections found in content and appends one models.CPU per
// populated socket to result.Hardware.CPUs.
//
// A section starts at a line containing "Processor Information" and ends at
// the next empty line or "Handle " line. Socket numbers are assigned in the
// order the section headers appear, including sections that are later skipped,
// so the number reflects the position of the section in the dump. A CPU is
// stored only if a model ("Version" field) was found and its status contains
// "Populated".
func parseCPUInfo(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	// currentCPU is non-nil exactly while a Processor Information section is
	// being read.
	var currentCPU *models.CPU
	cpuSocket := 0

	// flush stores the CPU collected so far (if it has a model) and leaves the
	// current section.
	flush := func() {
		if currentCPU != nil && currentCPU.Model != "" {
			result.Hardware.CPUs = append(result.Hardware.CPUs, *currentCPU)
		}
		currentCPU = nil
	}

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)

		// Start of Processor Information section
		if strings.Contains(trimmed, "Processor Information") {
			// Keep a CPU that is still open when sections follow each other
			// without a separating blank or "Handle" line.
			flush()
			currentCPU = &models.CPU{
				Socket: cpuSocket,
			}
			cpuSocket++
			continue
		}

		if currentCPU == nil {
			continue
		}

		// End of current section (empty line or new section with Handle)
		if trimmed == "" || strings.HasPrefix(trimmed, "Handle ") {
			flush()
			continue
		}

		// Parse "Field: value" lines within the section
		parts := strings.SplitN(trimmed, ":", 2)
		if len(parts) != 2 {
			continue
		}

		field := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		// Placeholder values carry no information
		if value == "" || value == "Not Specified" || value == "Unknown" || value == "UNKNOWN" || value == "<OUT OF SPEC>" {
			continue
		}

		switch field {
		case "Version":
			// dmidecode reports the CPU model name in the Version field
			currentCPU.Model = value
		case "Serial Number":
			currentCPU.SerialNumber = value
		case "Core Count":
			if cores, err := strconv.Atoi(value); err == nil {
				currentCPU.Cores = cores
			}
		case "Thread Count":
			if threads, err := strconv.Atoi(value); err == nil {
				currentCPU.Threads = threads
			}
		case "Max Speed":
			// Speed like "3800 MHz"
			if speed := parseCPUSpeed(value); speed > 0 {
				currentCPU.MaxFreqMHz = speed
			}
		case "Current Speed":
			// Speed like "2000 MHz"
			if speed := parseCPUSpeed(value); speed > 0 {
				currentCPU.FrequencyMHz = speed
			}
		case "Status":
			// Status like "Populated, Enabled"; anything else means the
			// socket is empty, so the whole section is discarded.
			if !strings.Contains(value, "Populated") {
				currentCPU = nil
			}
		case "Part Number", "Core Enabled", "Voltage":
			// Recognised but not stored: the CPU model has no field for them.
		}
	}

	// Save last CPU if the content ended inside a section
	flush()
}
// parseCPUSpeed parses CPU speed strings like "3800 MHz" or "2.0 GHz" and
// returns the speed in MHz, rounded to the nearest integer.
//
// The string must consist of a number and a unit separated by whitespace; the
// unit is matched case-insensitively. It returns 0 when the string is
// malformed, the unit is neither MHz nor GHz, or the speed is not positive.
func parseCPUSpeed(speedStr string) int {
	parts := strings.Fields(speedStr)
	if len(parts) < 2 {
		return 0
	}

	// The number may be an integer or a decimal fraction. Integers go through
	// Atoi so that forms such as "1e3" or "Inf" stay rejected.
	number := parts[0]
	var value float64
	if strings.Contains(number, ".") {
		f, err := strconv.ParseFloat(number, 64)
		if err != nil {
			return 0
		}
		value = f
	} else {
		n, err := strconv.Atoi(number)
		if err != nil {
			return 0
		}
		value = float64(n)
	}

	var mhz float64
	switch strings.ToUpper(parts[1]) {
	case "MHZ":
		mhz = value
	case "GHZ":
		mhz = value * 1000
	default:
		return 0
	}

	if mhz <= 0 {
		return 0
	}

	// Round instead of truncating: a decimal such as 1.005 is not exactly
	// representable, so 1.005*1000 evaluates to 1004.99… and plain truncation
	// would report 1004 MHz.
	return int(mhz + 0.5)
}

View File

@@ -0,0 +1,170 @@
package nvidia_bug_report
import (
"bufio"
"regexp"
"strconv"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
)
// gpuInfoPathRe captures the PCI address (domain:bus:device.function) from the
// header line of a /proc/driver/nvidia/.../gpus/<address>/information section.
// It is compiled once here rather than for every matching line.
var gpuInfoPathRe = regexp.MustCompile(`/gpus/([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.[\da-f])`)

// parseGPUInfo extracts GPU information from the bug report.
//
// Every "/proc/driver/nvidia/.../gpus/<address>/information" header (lines
// containing "ls:" are ignored) starts a new GPU whose BDF is taken from the
// path. The "Field: value" lines that follow, up to the next line starting
// with "___" or "***", fill in the GPU. All GPUs found are appended to
// result.Hardware.GPUs, and if that list is non-empty a "GPU Detection" event
// summarising it is appended to result.Events.
func parseGPUInfo(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	var currentGPU *models.GPU
	inGPUInfo := false

	for scanner.Scan() {
		line := scanner.Text()

		// Look for GPU information section markers (but skip ls listings)
		if strings.Contains(line, "/proc/driver/nvidia") && strings.Contains(line, "/gpus/") &&
			strings.Contains(line, "/information") && !strings.Contains(line, "ls:") {
			if matches := gpuInfoPathRe.FindStringSubmatch(line); len(matches) > 1 {
				// Save previous GPU if exists
				if currentGPU != nil {
					result.Hardware.GPUs = append(result.Hardware.GPUs, *currentGPU)
				}

				// Start new GPU entry
				currentGPU = &models.GPU{
					BDF:          matches[1],
					Manufacturer: "NVIDIA",
				}
				inGPUInfo = true
				continue
			}
		}

		if !inGPUInfo {
			continue
		}

		// End of GPU info section (separator line or new section, but not ls lines)
		if strings.HasPrefix(line, "___") || (strings.HasPrefix(line, "***") && !strings.Contains(line, "ls:")) {
			inGPUInfo = false
			continue
		}

		if currentGPU == nil {
			continue
		}

		// Split on the first colon only: values such as the bus location
		// contain colons themselves.
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}

		field := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])
		if value == "" {
			continue
		}

		switch field {
		case "Model":
			currentGPU.Model = value
		case "IRQ":
			if irq, err := strconv.Atoi(value); err == nil {
				currentGPU.IRQ = irq
			}
		case "GPU UUID":
			currentGPU.UUID = value
		case "Video BIOS":
			currentGPU.VideoBIOS = value
		case "Bus Type":
			currentGPU.BusType = value
		case "DMA Size":
			currentGPU.DMASize = value
		case "DMA Mask":
			currentGPU.DMAMask = value
		case "Bus Location":
			// The BDF was already taken from the path; the value inside the
			// information section is more explicit, so it takes precedence.
			currentGPU.BDF = value
		case "Device Minor":
			if minor, err := strconv.Atoi(value); err == nil {
				currentGPU.DeviceMinor = minor
			}
		case "GPU Excluded":
			// Only an explicit "Yes" changes the status
			if strings.EqualFold(value, "yes") {
				currentGPU.Status = "Excluded"
			}
		}
	}

	// Save last GPU if exists
	if currentGPU != nil {
		result.Hardware.GPUs = append(result.Hardware.GPUs, *currentGPU)
	}

	// Create event for GPU summary
	if len(result.Hardware.GPUs) > 0 {
		result.Events = append(result.Events, models.Event{
			Timestamp:   time.Now(),
			Source:      "NVIDIA Driver",
			EventType:   "GPU Detection",
			Description: "NVIDIA GPUs detected",
			Severity:    models.SeverityInfo,
			RawData:     formatGPUSummary(result.Hardware.GPUs),
		})
	}
}
// parseDriverVersion extracts the NVIDIA driver version.
//
// It looks for the first "NVRM version:" marker in content and records the
// rest of that line (trimmed) as the raw data of a "Driver Version" event
// appended to result.Events. Nothing is added when the marker is absent.
//
// The content is searched directly instead of line by line through a
// bufio.Scanner, whose default 64 KiB line limit would silently end the search
// at the first over-long line of the report.
func parseDriverVersion(content string, result *models.AnalysisResult) {
	const marker = "NVRM version:"

	start := strings.Index(content, marker)
	if start < 0 {
		return
	}

	// Keep only the remainder of the line holding the marker
	version := content[start+len(marker):]
	if eol := strings.IndexByte(version, '\n'); eol >= 0 {
		version = version[:eol]
	}
	// If the marker is repeated on the same line, the version is the text
	// between the first two occurrences.
	if next := strings.Index(version, marker); next >= 0 {
		version = version[:next]
	}

	result.Events = append(result.Events, models.Event{
		Timestamp:   time.Now(),
		Source:      "NVIDIA Driver",
		EventType:   "Driver Version",
		Description: "NVIDIA driver version detected",
		Severity:    models.SeverityInfo,
		RawData:     strings.TrimSpace(version),
	})
}
// formatGPUSummary renders the GPUs as a single "; "-separated line. Each
// entry is the GPU's BDF, followed by the model in parentheses when one is
// known. An empty list yields an empty string.
func formatGPUSummary(gpus []models.GPU) string {
	entries := make([]string, 0, len(gpus))
	for _, g := range gpus {
		entry := g.BDF
		if g.Model != "" {
			entry += " (" + g.Model + ")"
		}
		entries = append(entries, entry)
	}
	return strings.Join(entries, "; ")
}

View File

@@ -0,0 +1,183 @@
package nvidia_bug_report
import (
"bufio"
"strconv"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
)
// parseMemoryModules extracts memory module information from the dmidecode
// "Memory Device" sections found in content.
//
// A section starts at a line containing "Memory Device" (but not "Array") and
// ends at the next empty line or "Handle " line. A module is appended to
// result.Hardware.Memory only if it has a locator and a non-zero size, which
// leaves out empty slots. If any modules are present afterwards, a "Memory
// Configuration" summary event is appended to result.Events.
func parseMemoryModules(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	// currentModule is non-nil exactly while a Memory Device section is being
	// read.
	var currentModule *models.MemoryDIMM

	// flush stores the module collected so far (if it is a populated slot) and
	// leaves the current section.
	flush := func() {
		if currentModule != nil && currentModule.Slot != "" && currentModule.SizeMB > 0 {
			result.Hardware.Memory = append(result.Hardware.Memory, *currentModule)
		}
		currentModule = nil
	}

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)

		// Start of Memory Device section
		if strings.Contains(trimmed, "Memory Device") && !strings.Contains(trimmed, "Array") {
			// Keep a module that is still open when sections follow each
			// other without a separating blank or "Handle" line.
			flush()
			currentModule = &models.MemoryDIMM{
				Present: true,
			}
			continue
		}

		if currentModule == nil {
			continue
		}

		// End of current section (empty line or new section)
		if trimmed == "" || strings.HasPrefix(trimmed, "Handle ") {
			flush()
			continue
		}

		// Parse "Field: value" lines within the section
		parts := strings.SplitN(trimmed, ":", 2)
		if len(parts) != 2 {
			continue
		}

		field := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		// Placeholder values carry no information
		if value == "" || value == "Not Specified" || value == "Unknown" || value == "NO DIMM" {
			continue
		}

		switch field {
		case "Size":
			// Size like "64 GB" or "32768 MB"
			currentModule.SizeMB = parseMemorySize(value)
		case "Locator":
			currentModule.Slot = value
			currentModule.Location = value
		case "Bank Locator":
			// Only used as the location when no locator was seen yet
			if currentModule.Location == "" {
				currentModule.Location = value
			}
		case "Type":
			currentModule.Type = value
		case "Type Detail":
			currentModule.Technology = value
		case "Speed":
			// Speed like "5600 MT/s"
			currentModule.MaxSpeedMHz = parseMemorySpeed(value)
		case "Configured Memory Speed":
			currentModule.CurrentSpeedMHz = parseMemorySpeed(value)
		case "Manufacturer":
			currentModule.Manufacturer = value
		case "Serial Number":
			currentModule.SerialNumber = value
		case "Part Number":
			currentModule.PartNumber = value
		case "Rank":
			if rank, err := strconv.Atoi(value); err == nil {
				currentModule.Ranks = rank
			}
		}
	}

	// Save last module if the content ended inside a section
	flush()

	// Create event for memory summary
	if len(result.Hardware.Memory) > 0 {
		// Add up megabytes first and convert once: dividing each module
		// separately would count any module smaller than 1 GiB as zero.
		totalMemoryMB := 0
		for _, mem := range result.Hardware.Memory {
			totalMemoryMB += mem.SizeMB
		}

		result.Events = append(result.Events, models.Event{
			Timestamp:   time.Now(),
			Source:      "DMI",
			EventType:   "Memory Configuration",
			Description: "Memory modules detected",
			Severity:    models.SeverityInfo,
			RawData:     formatMemorySummary(result.Hardware.Memory, totalMemoryMB/1024),
		})
	}
}
// parseMemorySize converts a size string such as "64 GB" or "32768 MB" into
// megabytes. The string must be an integer and a unit (MB, GB or TB, in any
// letter case) separated by whitespace; anything else yields 0.
func parseMemorySize(sizeStr string) int {
	fields := strings.Fields(sizeStr)
	if len(fields) < 2 {
		return 0
	}

	amount, err := strconv.Atoi(fields[0])
	if err != nil {
		return 0
	}

	// Number of megabytes in one unit; stays 0 for unrecognised units.
	var mbPerUnit int
	switch strings.ToUpper(fields[1]) {
	case "MB":
		mbPerUnit = 1
	case "GB":
		mbPerUnit = 1024
	case "TB":
		mbPerUnit = 1024 * 1024
	}
	return amount * mbPerUnit
}
// parseMemorySpeed returns the leading integer of a speed string such as
// "5600 MT/s" or "4400 MHz". The unit, if any, is ignored. It returns 0 when
// the string is empty or does not start with an integer.
func parseMemorySpeed(speedStr string) int {
	if fields := strings.Fields(speedStr); len(fields) > 0 {
		if value, err := strconv.Atoi(fields[0]); err == nil {
			return value
		}
	}
	return 0
}
// formatMemorySummary creates a summary string for memory modules, e.g.
// "Hynix: 32 modules, Total: 2048 GB".
//
// Modules are counted per manufacturer and listed in the order each
// manufacturer first appears in modules, so the same input always produces the
// same text. When no module names a manufacturer, the plain module count is
// reported instead. totalGB is printed as given. An empty list yields an empty
// string.
func formatMemorySummary(modules []models.MemoryDIMM, totalGB int) string {
	if len(modules) == 0 {
		return ""
	}

	// Count modules per manufacturer. Map iteration order is random, so the
	// first-seen order is tracked separately to keep the output stable.
	manufacturerCount := make(map[string]int)
	var manufacturers []string
	for _, mem := range modules {
		if mem.Manufacturer == "" {
			continue
		}
		if _, seen := manufacturerCount[mem.Manufacturer]; !seen {
			manufacturers = append(manufacturers, mem.Manufacturer)
		}
		manufacturerCount[mem.Manufacturer]++
	}

	var summary strings.Builder
	for i, mfr := range manufacturers {
		if i > 0 {
			summary.WriteString(", ")
		}
		summary.WriteString(mfr)
		summary.WriteString(": ")
		summary.WriteString(strconv.Itoa(manufacturerCount[mfr]))
		summary.WriteString(" modules")
	}

	if len(manufacturers) == 0 {
		summary.WriteString(strconv.Itoa(len(modules)))
		summary.WriteString(" modules")
	}

	summary.WriteString(", Total: ")
	summary.WriteString(strconv.Itoa(totalGB))
	summary.WriteString(" GB")
	return summary.String()
}

View File

@@ -0,0 +1,160 @@
package nvidia_bug_report
import (
"bufio"
"regexp"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// Patterns used by parseNetworkAdapters, compiled once instead of on every
// scanned line.
var (
	// netPCIDeviceLineRe matches the first line of an lspci device entry and
	// captures its address (domain:bus:device.function).
	netPCIDeviceLineRe = regexp.MustCompile(`^([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.[\da-f])\s+`)
	// netVendorDeviceRe captures the device description and the trailing
	// [vendorID:deviceID] pair of a device line.
	netVendorDeviceRe = regexp.MustCompile(`:\s+(.+?)\s+\[([0-9a-f]{4}):([0-9a-f]{4})\]`)
	// netVPDFieldRe matches a VPD field line: "[TAG] Description: Value".
	netVPDFieldRe = regexp.MustCompile(`^\[([A-Z0-9]+)\]\s+([^:]+):\s+(.+)`)
)

// networkPortCount derives the number of ports from an adapter product name
// ("Dual-port", "Single-port", "Quad-port"). It returns 0 when the name does
// not say.
func networkPortCount(model string) int {
	switch {
	case strings.Contains(model, "Dual-port"):
		return 2
	case strings.Contains(model, "Single-port"):
		return 1
	case strings.Contains(model, "Quad-port"):
		return 4
	}
	return 0
}

// networkPortType derives the port form factor from an adapter product name.
// It returns "" when the name mentions none.
func networkPortType(model string) string {
	// The most specific names are checked first: every QSFP variant contains
	// "SFP", so testing the generic names early would mislabel them.
	for _, formFactor := range []string{
		"QSFP-DD", "QSFP112", "QSFP56", "QSFP28", "QSFP+", "QSFP",
		"OSFP",
		"SFP56", "SFP28",
	} {
		if strings.Contains(model, formFactor) {
			return formFactor
		}
	}
	if strings.Contains(model, "SFP") {
		return "SFP+"
	}
	return ""
}

// parseNetworkAdapters extracts network adapter information from lspci output.
//
// Every device line naming an Ethernet, Network or Infiniband controller
// starts a new adapter; the indented lines that follow provide the physical
// slot and, from the Vital Product Data capability, the product name, part
// number, serial number and engineering-change level. Port count and port type
// are derived from the product name. An adapter is appended to
// result.Hardware.NetworkAdapters only if a product name was found.
func parseNetworkAdapters(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	var currentAdapter *models.NetworkAdapter
	inVPD := false

	// saveAdapter stores the adapter collected so far (if it has a model) and
	// clears the current adapter.
	saveAdapter := func() {
		if currentAdapter != nil && currentAdapter.Model != "" {
			result.Hardware.NetworkAdapters = append(result.Hardware.NetworkAdapters, *currentAdapter)
		}
		currentAdapter = nil
	}

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)

		// A new PCI device line closes whatever adapter was being collected.
		// Example: "0000:1f:00.0 Ethernet controller [0200]: Mellanox Technologies..."
		if header := netPCIDeviceLineRe.FindStringSubmatch(line); header != nil {
			saveAdapter()
			inVPD = false

			if !strings.Contains(line, "Ethernet controller") &&
				!strings.Contains(line, "Network controller") &&
				!strings.Contains(line, "Infiniband controller") {
				// Some other kind of device: ignore it and its details
				continue
			}

			currentAdapter = &models.NetworkAdapter{
				Location: header[1],
				Present:  true,
			}

			// Format: "<class> [<class id>]: <vendor> <device> [vendorID:deviceID]"
			if ids := netVendorDeviceRe.FindStringSubmatch(line); len(ids) > 3 {
				// The vendor is the first word of the description, or the
				// whole description when it is a single word.
				description := ids[1]
				if space := strings.Index(description, " "); space > 0 {
					currentAdapter.Vendor = description[:space]
				} else {
					currentAdapter.Vendor = description
				}

				if vendorID, err := strconv.ParseInt(ids[2], 16, 32); err == nil {
					currentAdapter.VendorID = int(vendorID)
				}
				if deviceID, err := strconv.ParseInt(ids[3], 16, 32); err == nil {
					currentAdapter.DeviceID = int(deviceID)
				}
			}
			continue
		}

		// Skip if not processing an adapter
		if currentAdapter == nil {
			continue
		}

		switch {
		case strings.HasPrefix(trimmed, "Physical Slot:"):
			currentAdapter.Slot = strings.TrimSpace(strings.TrimPrefix(trimmed, "Physical Slot:"))

		case strings.Contains(trimmed, "Vital Product Data"):
			// Start of Vital Product Data section
			inVPD = true

		case inVPD && (trimmed == "End" || strings.HasPrefix(trimmed, "Capabilities:")):
			// The VPD section ends at its "End" marker. The next capability
			// also ends it, so that a VPD block without an "End" line (for
			// example an unreadable one) does not swallow the rest of the
			// device entry.
			inVPD = false

		case inVPD && strings.HasPrefix(trimmed, "Product Name:"):
			model := strings.TrimSpace(strings.TrimPrefix(trimmed, "Product Name:"))
			currentAdapter.Model = model

			// Port details are only overwritten when the name provides them
			if count := networkPortCount(model); count > 0 {
				currentAdapter.PortCount = count
			}
			if portType := networkPortType(model); portType != "" {
				currentAdapter.PortType = portType
			}

		case inVPD && strings.HasPrefix(trimmed, "["):
			field := netVPDFieldRe.FindStringSubmatch(trimmed)
			if len(field) <= 3 {
				continue
			}
			value := strings.TrimSpace(field[3])

			switch field[1] {
			case "PN":
				// Part number
				currentAdapter.PartNumber = value
			case "SN":
				// Serial number
				currentAdapter.SerialNumber = value
			case "EC":
				// Engineering changes, kept as the firmware/revision unless
				// one is already set
				if currentAdapter.Firmware == "" {
					currentAdapter.Firmware = value
				}
			}
		}
	}

	// Save last adapter if exists
	saveAdapter()
}

View File

@@ -0,0 +1,107 @@
// Package nvidia_bug_report provides parser for NVIDIA bug report files
// Generated by nvidia-bug-report.sh script
package nvidia_bug_report
import (
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// parserVersion is the version of this parser module, as reported by
// Parser.Version.
const parserVersion = "1.0.0"

// init registers the parser with the parser package when this package is
// loaded.
func init() {
	parser.Register(&Parser{})
}

// Parser implements VendorParser for NVIDIA bug reports. It holds no state.
type Parser struct{}

// Name returns the human-readable parser name.
func (p *Parser) Name() string {
	return "NVIDIA Bug Report Parser"
}

// Vendor returns the vendor identifier.
func (p *Parser) Vendor() string {
	return "nvidia_bug_report"
}

// Version returns the parser version (the parserVersion constant).
func (p *Parser) Version() string {
	return parserVersion
}
// Detect reports how confident the parser is that files is an NVIDIA bug
// report, on a scale of 0-100.
//
// It returns 85 only when there is exactly one file, its path contains
// "nvidia-bug-report" (case-insensitively) and its content contains both
// markers written by the nvidia-bug-report.sh script. In every other case it
// returns 0.
func (p *Parser) Detect(files []parser.ExtractedFile) int {
	// A bug report is always a single log file
	if len(files) != 1 {
		return 0
	}
	candidate := files[0]

	if !strings.Contains(strings.ToLower(candidate.Path), "nvidia-bug-report") {
		return 0
	}

	// Every content marker has to be present
	body := string(candidate.Content)
	for _, marker := range []string{"nvidia-bug-report.sh", "NVIDIA bug report log file"} {
		if !strings.Contains(body, marker) {
			return 0
		}
	}

	// High confidence for nvidia-bug-report files
	return 85
}
// Parse builds an analysis result from an NVIDIA bug report.
//
// Only the first file is read. Its content is handed to each section parser in
// turn, and every parser adds what it finds to the shared result. With no
// files, an initialised but empty result is returned. The returned error is
// always nil.
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
	result := &models.AnalysisResult{
		Events:  make([]models.Event, 0),
		FRU:     make([]models.FRUInfo, 0),
		Sensors: make([]models.SensorReading, 0),
		Hardware: &models.HardwareConfig{
			CPUs:        make([]models.CPU, 0),
			Memory:      make([]models.MemoryDIMM, 0),
			GPUs:        make([]models.GPU, 0),
			PowerSupply: make([]models.PSU, 0),
		},
	}

	if len(files) == 0 {
		return result, nil
	}

	content := string(files[0].Content)

	// The steps run in this order, which is also the order in which their
	// events end up in the result.
	steps := []func(string, *models.AnalysisResult){
		parseSystemInfo,      // system information
		parseCPUInfo,         // CPU information
		parseMemoryModules,   // memory modules
		parsePSUInfo,         // power supplies
		parseGPUInfo,         // GPU information
		parseNetworkAdapters, // network adapters
		parseDriverVersion,   // driver version
	}
	for _, step := range steps {
		step(content, result)
	}

	return result, nil
}

View File

@@ -0,0 +1,116 @@
package nvidia_bug_report
import (
"bufio"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// parsePSUInfo extracts Power Supply information from the dmidecode "System
// Power Supply" sections found in content.
//
// A section starts at a line containing "System Power Supply" and ends at the
// next empty line or "Handle " line. A power supply is appended to
// result.Hardware.PowerSupply only if it has a location and its status reports
// it as present.
func parsePSUInfo(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	// currentPSU is non-nil exactly while a System Power Supply section is
	// being read.
	var currentPSU *models.PSU

	// isPresent reports whether a status such as "Present, OK" describes an
	// installed unit. "Not Present" has to be excluded explicitly because it
	// contains the word "present" as well.
	isPresent := func(status string) bool {
		status = strings.ToLower(status)
		return strings.Contains(status, "present") && !strings.Contains(status, "not present")
	}

	// flush stores the PSU collected so far (if it is located and present) and
	// leaves the current section.
	flush := func() {
		if currentPSU != nil && currentPSU.Slot != "" && isPresent(currentPSU.Status) {
			result.Hardware.PowerSupply = append(result.Hardware.PowerSupply, *currentPSU)
		}
		currentPSU = nil
	}

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)

		// Start of System Power Supply section
		if strings.Contains(trimmed, "System Power Supply") {
			// Keep a PSU that is still open when sections follow each other
			// without a separating blank or "Handle" line.
			flush()
			currentPSU = &models.PSU{}
			continue
		}

		if currentPSU == nil {
			continue
		}

		// End of current section (empty line or new section with Handle)
		if trimmed == "" || strings.HasPrefix(trimmed, "Handle ") {
			flush()
			continue
		}

		// Parse "Field: value" lines within the section
		parts := strings.SplitN(trimmed, ":", 2)
		if len(parts) != 2 {
			continue
		}

		field := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		// Placeholder values carry no information
		if value == "" || value == "Not Specified" || value == "Unknown" || value == "UNKNOWN" {
			continue
		}

		switch field {
		case "Location":
			currentPSU.Slot = value
		case "Name":
			// Name is only a fallback for the model
			if currentPSU.Model == "" {
				currentPSU.Model = value
			}
		case "Manufacturer":
			currentPSU.Vendor = value
		case "Serial Number":
			currentPSU.SerialNumber = value
		case "Model Part Number":
			// Model Part Number is the primary model identifier and replaces
			// a model taken from Name
			currentPSU.Model = value
		case "Revision":
			currentPSU.Firmware = value
		case "Max Power Capacity":
			// Wattage like "2700 W"
			if wattage := parsePowerWattage(value); wattage > 0 {
				currentPSU.WattageW = wattage
			}
		case "Status":
			currentPSU.Status = value
		case "Type", "Plugged", "Hot Replaceable":
			// Recognised but not stored
		}
	}

	// Save last PSU if the content ended inside a section
	flush()
}
// parsePowerWattage returns the leading integer of a power capacity string
// such as "2700 W" or "1200 Watts". The unit is not inspected; the value is
// assumed to be in watts. It returns 0 when the string is empty or does not
// start with an integer.
func parsePowerWattage(powerStr string) int {
	fields := strings.Fields(powerStr)
	if len(fields) == 0 {
		return 0
	}

	if watts, err := strconv.Atoi(fields[0]); err == nil {
		return watts
	}
	return 0
}

View File

@@ -0,0 +1,61 @@
package nvidia_bug_report
import (
"bufio"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
)
// parseSystemInfo extracts System Information from dmidecode output.
//
// A section starts at a line that is exactly "System Information" (ignoring
// surrounding whitespace) and ends at the next empty line or "Handle " line.
// The manufacturer, product name, version, serial number and UUID found inside
// are written to result.Hardware.BoardInfo; if the section occurs more than
// once, later values overwrite earlier ones.
func parseSystemInfo(content string, result *models.AnalysisResult) {
	scanner := bufio.NewScanner(strings.NewReader(content))
	// By default bufio.Scanner stops (silently, since this function cannot
	// report an error) at the first line longer than 64 KiB, which would hide
	// everything after it. No line can be longer than the content itself, so
	// this limit guarantees the whole report is scanned.
	scanner.Buffer(make([]byte, 0, 64*1024), len(content)+1)

	inSystemInfo := false

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)

		// Start of System Information section
		if trimmed == "System Information" {
			inSystemInfo = true
			continue
		}

		if !inSystemInfo {
			continue
		}

		// End of section (empty line or new Handle)
		if trimmed == "" || strings.HasPrefix(trimmed, "Handle ") {
			inSystemInfo = false
			continue
		}

		// Parse "Field: value" lines within the section
		parts := strings.SplitN(trimmed, ":", 2)
		if len(parts) != 2 {
			continue
		}

		field := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		// Skip empty, NULL, or "Not specified" values
		if value == "" || value == "NULL" || value == "Not specified" || value == "Not Specified" {
			continue
		}

		board := &result.Hardware.BoardInfo
		switch field {
		case "Manufacturer":
			board.Manufacturer = value
		case "Product Name":
			board.ProductName = value
		case "Version":
			board.Version = value
		case "Serial Number":
			board.SerialNumber = value
		case "UUID":
			board.UUID = value
		}
	}
}

View File

@@ -0,0 +1,133 @@
# SMC Crash Dump Parser
Парсер для архивов Supermicro (SMC) BMC Crash Dump.
## Поддерживаемые серверы
- Supermicro SYS-821GE-TNHR
- Другие серверы Supermicro с поддержкой функции BMC Crashdump
## Формат архива
Парсер работает с архивами в формате:
- `.tgz` / `.tar.gz` (сжатый tar)
- `.tar` (несжатый tar)
## Распознаваемые файлы
### Основные файлы
1. **CDump.txt** - JSON файл с данными crashdump
- Metadata (BMC, BIOS, ME версии firmware)
- CPU информация (CPUID, количество ядер, microcode версия, PPIN)
- MCA (Machine Check Architecture) данные - ошибки процессоров
## Извлекаемые данные
### Hardware Configuration
#### CPUs
```json
{
"slot": "CPU0",
"model": "CPUID: 0xc06f2",
"cores": 56,
"manufacturer": "Intel",
"firmware": "Microcode: 0x210002b3"
}
```
### FRU Information
- BMC Firmware Version
- BIOS Version
- ME Firmware Version
- CPU PPIN (Protected Processor Inventory Number)
### Events
События создаются для:
- **Crashdump collection** - когда был собран crashdump
- **MCA Errors** - ошибки Machine Check Architecture
- Corrected errors (Warning severity)
- Uncorrected errors (Critical severity)
Уровни severity:
- `info` - информационные события (crashdump по запросу)
- `warning` - предупреждения (corrected MCA errors, reset detected)
- `critical` - критические ошибки (uncorrected MCA errors)
## Пример использования
```bash
# Запуск веб-интерфейса
./logpile --file /path/to/CDump_090859_01302026.tgz
# Веб-интерфейс будет доступен на http://localhost:8082
```
## Автоопределение
Парсер автоматически определяет архивы SMC Crash Dump по наличию:
- `CDump.txt` с маркерами "crash_data", "METADATA", "bmc_fw_ver"
Confidence score:
- `CDump.txt` с маркерами crashdump: +80
## Версионирование
**Текущая версия парсера:** 1.0.0
При модификации логики парсера необходимо увеличивать версию в константе `parserVersion` в файле `parser.go`.
## Примеры данных
### Пример CDump.txt (metadata)
```json
{
"crash_data": {
"METADATA": {
"cpu0": {
"cpuid": "0xc06f2",
"core_count": "0x38",
"ppin": "0xa3ccbe7d45026592",
"ucode_patch_ver": "0x210002b3"
},
"bmc_fw_ver": "01.03.18",
"bios_id": "BIOS Date: 08/04/2025 Rev 2.7",
"me_fw_ver": "6.1.4.204",
"timestamp": "2026-01-30T09:06:52Z",
"trigger_type": "On-Demand"
}
}
}
```
### MCA Error Detection
Парсер проверяет регистры MCA status на наличие ошибок:
- Bit 63 (Valid) - индикатор валидной ошибки
- Bit 61 (UC) - uncorrected error
- Bit 60 (EN) - error reporting enabled (в текущей версии парсер этот бит не проверяет)
## Известные ограничения
1. Парсер фокусируется на данных из `CDump.txt`
2. Детальный анализ MCA errors пока упрощен (только проверка status регистров)
3. TOR dump и другие расширенные данные пока не парсятся
## Разработка
### Добавление новых полей
1. Изучите структуру JSON в CDump.txt
2. Добавьте поля в структуры `Metadata`, `CPUMetadata`, или `MCAData`
3. Обновите функции парсинга
4. Увеличьте версию парсера
### Расширение MCA анализа
Для более детального анализа MCA ошибок можно:
1. Добавить декодирование MCA error codes
2. Парсить MISC и ADDR регистры
3. Добавить корреляцию ошибок между банками

View File

@@ -0,0 +1,261 @@
package supermicro
import (
	"encoding/json"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"git.mchus.pro/mchus/logpile/internal/models"
)
// CrashDumpData mirrors the top-level JSON document stored in CDump.txt.
// All decoded content lives under the "crash_data" key.
//
// NOTE: parseMCAErrors spells out the same anonymous struct type (including
// the struct tags) in its signature; the two declarations must stay identical.
type CrashDumpData struct {
CrashData struct {
// METADATA holds firmware versions, per-CPU identification and
// details about how the dump was collected.
METADATA Metadata `json:"METADATA"`
// PROCESSORS holds the per-CPU crash data (MCA banks).
PROCESSORS ProcessorsData `json:"PROCESSORS"`
} `json:"crash_data"`
}
// ProcessorsData is the "PROCESSORS" section of the crashdump: a format
// version string plus one entry per CPU. Only the "cpu0" and "cpu1" keys are
// mapped; any other keys in the JSON are ignored by the decoder.
type ProcessorsData struct {
Version string `json:"_version"`
CPU0 Processors `json:"cpu0"`
CPU1 Processors `json:"cpu1"`
}
// Metadata is the "METADATA" section of the crashdump. All values are kept
// as the raw strings found in the JSON and interpreted by the parse helpers.
type Metadata struct {
// Per-CPU identification data; an entry with an empty CPUID is skipped
// by parseCPUInfo.
CPU0 CPUMetadata `json:"cpu0"`
CPU1 CPUMetadata `json:"cpu1"`
// Firmware versions, reported as firmware entries "BMC", "BIOS" and "ME".
BMCFWVer string `json:"bmc_fw_ver"`
BIOSId string `json:"bios_id"`
MEFWVer string `json:"me_fw_ver"`
// Timestamp is parsed as RFC 3339; when empty or unparsable the current
// time is used for generated events instead.
Timestamp string `json:"timestamp"`
// TriggerType is echoed in the crashdump event description
// ("Unknown" when empty).
TriggerType string `json:"trigger_type"`
PlatformName string `json:"platform_name"`
// CrashdumpVer is included in the crashdump event's raw data.
CrashdumpVer string `json:"crashdump_ver"`
// ResetDetected raises the crashdump event to warning severity when it
// is neither empty nor "NONE".
ResetDetected string `json:"_reset_detected"`
}
// CPUMetadata is the per-CPU part of the crashdump metadata. Values are the
// raw strings from the JSON.
type CPUMetadata struct {
// CPUID identifies the processor; parseCPUInfo treats an empty value as
// "CPU not present".
CPUID string `json:"cpuid"`
CoreMask string `json:"core_mask"`
CHACount string `json:"cha_count"`
// CoreCount is parsed by parseCPUInfo as a hexadecimal number with an
// optional "0x" prefix (e.g. "0x38").
CoreCount string `json:"core_count"`
// PPIN is copied to the CPU record unless it is empty or "0x0".
PPIN string `json:"ppin"`
// UcodePatchVer is reported as the "CPU<n> Microcode" firmware entry.
UcodePatchVer string `json:"ucode_patch_ver"`
}
// Processors is the crash data of a single CPU. Only its "MCA" section is
// decoded.
type Processors struct {
MCA MCAData `json:"MCA"`
}
// MCAData contains Machine Check Architecture data of one CPU.
type MCAData struct {
// Uncore maps an MCA bank name to that bank's data. parseMCAErrors
// expects each value to be a JSON object holding a hex string under the
// key "<lower-cased bank name>_status"; entries of any other shape are
// skipped.
Uncore map[string]interface{} `json:"uncore"`
}
// ParseCrashDump decodes the JSON content of a CDump.txt file and merges the
// extracted information into result:
//
//   - firmware versions and the crashdump collection event (parseMetadata),
//   - CPU records and microcode versions (parseCPUInfo),
//   - Machine Check events (parseMCAErrors).
//
// result.Hardware is allocated when the caller did not provide one, so the
// function can be used on a freshly created AnalysisResult. result is left
// untouched when the content is not valid JSON.
//
// It returns an error when result is nil or when content cannot be decoded.
func ParseCrashDump(content []byte, result *models.AnalysisResult) error {
	if result == nil {
		return fmt.Errorf("failed to parse CDump.txt: nil analysis result")
	}

	var data CrashDumpData
	if err := json.Unmarshal(content, &data); err != nil {
		return fmt.Errorf("failed to parse CDump.txt: %w", err)
	}

	// Guard against callers that did not pre-initialise the hardware section;
	// the helpers below dereference it unconditionally.
	if result.Hardware == nil {
		result.Hardware = &models.HardwareConfig{}
	}
	// Keep the firmware list non-nil even when no firmware entry is added.
	if result.Hardware.Firmware == nil {
		result.Hardware.Firmware = make([]models.FirmwareInfo, 0)
	}

	parseMetadata(&data.CrashData.METADATA, result)
	parseCPUInfo(&data.CrashData.METADATA, result)
	parseMCAErrors(&data.CrashData, result)

	return nil
}
// parseMetadata records the firmware versions found in the crashdump metadata
// and appends one "System Crashdump" event describing the collection.
//
// The event carries the metadata timestamp when it parses as RFC 3339 and the
// current time otherwise. Its severity is info, or warning when a reset was
// reported (any _reset_detected value other than "" and "NONE").
func parseMetadata(metadata *Metadata, result *models.AnalysisResult) {
	// Firmware components in the order they are reported.
	firmware := [...]struct{ device, version string }{
		{"BMC", metadata.BMCFWVer},
		{"BIOS", metadata.BIOSId},
		{"ME", metadata.MEFWVer},
	}
	for _, fw := range firmware {
		if fw.version == "" {
			continue
		}
		result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
			DeviceName: fw.device,
			Version:    fw.version,
		})
	}

	// Prefer the collection time stored in the dump over "now".
	collectedAt := time.Now()
	if raw := metadata.Timestamp; raw != "" {
		if parsed, err := time.Parse(time.RFC3339, raw); err == nil {
			collectedAt = parsed
		}
	}

	trigger := metadata.TriggerType
	if trigger == "" {
		trigger = "Unknown"
	}

	// A detected reset makes the dump more than purely informational.
	severity := models.SeverityWarning
	switch metadata.ResetDetected {
	case "", "NONE":
		severity = models.SeverityInfo
	}

	event := models.Event{
		Timestamp:   collectedAt,
		Source:      "Crashdump",
		EventType:   "System Crashdump",
		Description: fmt.Sprintf("Crashdump collected (%s)", trigger),
		Severity:    severity,
		RawData:     fmt.Sprintf("Version: %s, Reset: %s", metadata.CrashdumpVer, metadata.ResetDetected),
	}
	result.Events = append(result.Events, event)
}
// parseCPUInfo adds one CPU record per populated socket (cpu0, cpu1) to
// result.Hardware.CPUs and reports each CPU's microcode version as a
// "CPU<n> Microcode" firmware entry.
//
// A socket is considered populated when its CPUID is non-empty. The core
// count is read as hexadecimal (optional "0x" prefix) and left at 0 when it
// cannot be parsed. A PPIN of "" or "0x0" is not recorded.
func parseCPUInfo(metadata *Metadata, result *models.AnalysisResult) {
	// The slice index doubles as the socket number.
	sockets := [...]CPUMetadata{metadata.CPU0, metadata.CPU1}

	for socket, meta := range sockets {
		if meta.CPUID == "" {
			continue
		}

		cores := 0
		if meta.CoreCount != "" {
			hexDigits := strings.TrimPrefix(meta.CoreCount, "0x")
			if n, err := strconv.ParseInt(hexDigits, 16, 64); err == nil {
				cores = int(n)
			}
		}

		entry := models.CPU{
			Socket: socket,
			Model:  fmt.Sprintf("Intel CPU (CPUID: %s)", meta.CPUID),
			Cores:  cores,
		}
		if ppin := meta.PPIN; ppin != "" && ppin != "0x0" {
			entry.PPIN = ppin
		}
		result.Hardware.CPUs = append(result.Hardware.CPUs, entry)

		if meta.UcodePatchVer == "" {
			continue
		}
		result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
			DeviceName: fmt.Sprintf("CPU%d Microcode", socket),
			Version:    meta.UcodePatchVer,
		})
	}
}
// parseMCAErrors scans the uncore MCA banks of cpu0 and cpu1 and appends one
// "Machine Check" event for every bank whose status register reports a valid
// error.
//
// For each bank the status is read from the key
// "<lower-cased bank name>_status" and parsed as hexadecimal (optional "0x"
// prefix). Banks without such a key, or with a value that is not a parsable
// hex string, are skipped. An error is reported when bit 63 (Valid) is set;
// bit 61 (UC) distinguishes uncorrected (critical) from corrected (warning)
// errors.
//
// Banks are visited in sorted name order so that the same dump always yields
// the events in the same order (Go map iteration order is randomized).
//
// Events carry the metadata timestamp when it parses as RFC 3339 and the
// current time otherwise.
func parseMCAErrors(crashData *struct {
	METADATA   Metadata       `json:"METADATA"`
	PROCESSORS ProcessorsData `json:"PROCESSORS"`
}, result *models.AnalysisResult) {
	const (
		mcaStatusValid       = uint64(1) << 63 // Valid: register holds an error
		mcaStatusUncorrected = uint64(1) << 61 // UC: error was not corrected
	)

	timestamp := time.Now()
	if crashData.METADATA.Timestamp != "" {
		if t, err := time.Parse(time.RFC3339, crashData.METADATA.Timestamp); err == nil {
			timestamp = t
		}
	}

	cpuProcs := []struct {
		name string
		data Processors
	}{
		{"cpu0", crashData.PROCESSORS.CPU0},
		{"cpu1", crashData.PROCESSORS.CPU1},
	}

	for _, cpu := range cpuProcs {
		banks := cpu.data.MCA.Uncore
		if len(banks) == 0 {
			continue
		}

		// Fix the visiting order; ranging over the map directly would emit
		// the events in a different order on every run.
		bankNames := make([]string, 0, len(banks))
		for name := range banks {
			bankNames = append(bankNames, name)
		}
		sort.Strings(bankNames)

		for _, bankName := range bankNames {
			bankData, ok := banks[bankName].(map[string]interface{})
			if !ok {
				continue
			}

			statusStr, ok := bankData[strings.ToLower(bankName)+"_status"].(string)
			if !ok {
				continue
			}

			status, err := strconv.ParseUint(strings.TrimPrefix(statusStr, "0x"), 16, 64)
			if err != nil {
				continue
			}
			if status&mcaStatusValid == 0 {
				// Register does not hold a valid error.
				continue
			}

			severity := models.SeverityWarning
			description := fmt.Sprintf("MCA Error in %s bank %s", cpu.name, bankName)
			if status&mcaStatusUncorrected != 0 {
				severity = models.SeverityCritical
				description += " (Uncorrected)"
			} else {
				description += " (Corrected)"
			}

			result.Events = append(result.Events, models.Event{
				Timestamp:   timestamp,
				Source:      "MCA",
				EventType:   "Machine Check",
				Description: description,
				Severity:    severity,
				RawData:     fmt.Sprintf("Status: %s, CPU: %s, Bank: %s", statusStr, cpu.name, bankName),
			})
		}
	}
}

View File

@@ -0,0 +1,98 @@
// Package supermicro provides parser for Supermicro BMC crashdump archives
// Tested with: Supermicro SYS-821GE-TNHR (Crashdump format)
//
// IMPORTANT: Increment parserVersion when modifying parser logic!
// This helps track which version was used to parse specific logs.
package supermicro
import (
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// parserVersion is the version of this parser module, as returned by
// Parser.Version.
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.0.0"
// init registers the Supermicro parser with the parser package when this
// package is imported.
func init() {
parser.Register(&Parser{})
}
// Parser implements VendorParser for Supermicro servers.
// It carries no state.
type Parser struct{}
// Name returns the human-readable parser name.
func (p *Parser) Name() string {
return "SMC Crash Dump Parser"
}
// Vendor returns the vendor identifier, "supermicro".
func (p *Parser) Vendor() string {
return "supermicro"
}
// Version returns the parser version (the parserVersion constant).
// IMPORTANT: Update parserVersion constant when modifying parser logic!
func (p *Parser) Version() string {
return parserVersion
}
// Detect reports how confident the parser is that the given files come from
// a Supermicro crashdump archive, on a scale of 0-100.
//
// Every file whose path ends in "cdump.txt" (case-insensitive) and whose
// content carries the crashdump markers adds 80 points; the total is capped
// at 100.
func (p *Parser) Detect(files []parser.ExtractedFile) int {
	score := 0
	for i := range files {
		file := &files[i]

		if !strings.HasSuffix(strings.ToLower(file.Path), "cdump.txt") {
			continue
		}
		// The name alone is not enough; the content must look like a
		// Supermicro crashdump.
		if !containsCrashdumpMarkers(file.Content) {
			continue
		}

		score += 80
		if score >= 100 {
			return 100
		}
	}
	return score
}
// containsCrashdumpMarkers reports whether content looks like a Supermicro
// crashdump: it must mention both "crash_data" and "METADATA", plus at least
// one of "bmc_fw_ver" or "crashdump_ver".
func containsCrashdumpMarkers(content []byte) bool {
	text := string(content)

	// Both structural markers are mandatory.
	for _, required := range []string{"crash_data", "METADATA"} {
		if !strings.Contains(text, required) {
			return false
		}
	}

	// Any one of the metadata fields is sufficient.
	for _, marker := range []string{"bmc_fw_ver", "crashdump_ver"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
// Parse builds an AnalysisResult from the files of a Supermicro crashdump
// archive.
//
// The only file interpreted is CDump.txt (looked up via
// parser.FindFileByName). When it is absent an empty, fully initialised
// result is returned. When it is present but cannot be parsed, the error from
// ParseCrashDump is returned instead of silently reporting an empty analysis:
// CDump.txt is the sole data source, so nothing useful would be left in the
// result anyway.
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
	result := &models.AnalysisResult{
		Events:  make([]models.Event, 0),
		FRU:     make([]models.FRUInfo, 0),
		Sensors: make([]models.SensorReading, 0),
	}

	// Initialize hardware config
	result.Hardware = &models.HardwareConfig{
		CPUs: make([]models.CPU, 0),
	}

	// Parse CDump.txt (JSON crashdump)
	if f := parser.FindFileByName(files, "CDump.txt"); f != nil {
		if err := ParseCrashDump(f.Content, result); err != nil {
			return nil, err
		}
	}

	return result, nil
}

View File

@@ -5,9 +5,14 @@ package vendors
import (
// Import vendor modules to trigger their init() registration
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/inspur"
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/nvidia"
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/nvidia_bug_report"
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/supermicro"
// Generic fallback parser (must be last for lowest priority)
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/generic"
// Future vendors:
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/supermicro"
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/dell"
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/hpe"
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/lenovo"