v1.3.0: Add multiple vendor parsers and enhanced hardware detection
New parsers: - NVIDIA Field Diagnostics parser with dmidecode output support - NVIDIA Bug Report parser with comprehensive hardware extraction - Supermicro crashdump (CDump.txt) parser - Generic fallback parser for unrecognized text files Enhanced GPU parsing (nvidia-bug-report): - Model and manufacturer detection (NVIDIA H100 80GB HBM3) - UUID, Video BIOS version, IRQ information - Bus location (BDF), DMA size/mask, device minor - PCIe bus type details New hardware detection (nvidia-bug-report): - System Information: server S/N, UUID, manufacturer, product name - CPU: model, S/N, cores, threads, frequencies from dmidecode - Memory: P/N, S/N, manufacturer, speed for all DIMMs - Power Supplies: manufacturer, model, S/N, wattage, status - Network Adapters: Ethernet/InfiniBand controllers with VPD data - Model, P/N, S/N from lspci Vital Product Data - Port count/type detection (QSFP56, OSFP, etc.) - Support for ConnectX-6/7 adapters Archive handling improvements: - Plain .gz file support (not just tar.gz) - Increased size limit for plain gzip files (50MB) - Better error handling for mixed archive formats Web interface enhancements: - Display parser name and filename badges - Improved file info section with visual indicators Co-Authored-By: Claude (qwen3-coder:480b) <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ package parser
|
||||
import (
|
||||
"archive/tar"
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -24,6 +25,8 @@ func ExtractArchive(archivePath string) ([]ExtractedFile, error) {
|
||||
switch ext {
|
||||
case ".gz", ".tgz":
|
||||
return extractTarGz(archivePath)
|
||||
case ".tar":
|
||||
return extractTar(archivePath)
|
||||
case ".zip":
|
||||
return extractZip(archivePath)
|
||||
default:
|
||||
@@ -37,7 +40,9 @@ func ExtractArchiveFromReader(r io.Reader, filename string) ([]ExtractedFile, er
|
||||
|
||||
switch ext {
|
||||
case ".gz", ".tgz":
|
||||
return extractTarGzFromReader(r)
|
||||
return extractTarGzFromReader(r, filename)
|
||||
case ".tar":
|
||||
return extractTarFromReader(r)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported archive format: %s", ext)
|
||||
}
|
||||
@@ -50,17 +55,21 @@ func extractTarGz(archivePath string) ([]ExtractedFile, error) {
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return extractTarGzFromReader(f)
|
||||
return extractTarGzFromReader(f, filepath.Base(archivePath))
|
||||
}
|
||||
|
||||
func extractTarGzFromReader(r io.Reader) ([]ExtractedFile, error) {
|
||||
gzr, err := gzip.NewReader(r)
|
||||
func extractTar(archivePath string) ([]ExtractedFile, error) {
|
||||
f, err := os.Open(archivePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gzip reader: %w", err)
|
||||
return nil, fmt.Errorf("open archive: %w", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
defer f.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
return extractTarFromReader(f)
|
||||
}
|
||||
|
||||
func extractTarFromReader(r io.Reader) ([]ExtractedFile, error) {
|
||||
tr := tar.NewReader(r)
|
||||
var files []ExtractedFile
|
||||
|
||||
for {
|
||||
@@ -96,6 +105,75 @@ func extractTarGzFromReader(r io.Reader) ([]ExtractedFile, error) {
|
||||
return files, nil
|
||||
}
|
||||
|
||||
func extractTarGzFromReader(r io.Reader, filename string) ([]ExtractedFile, error) {
|
||||
gzr, err := gzip.NewReader(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gzip reader: %w", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
// Read all decompressed content into buffer
|
||||
// Limit to 50MB for plain gzip files, 10MB per file for tar.gz
|
||||
decompressed, err := io.ReadAll(io.LimitReader(gzr, 50*1024*1024))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read gzip content: %w", err)
|
||||
}
|
||||
|
||||
// Try to read as tar archive
|
||||
tr := tar.NewReader(bytes.NewReader(decompressed))
|
||||
var files []ExtractedFile
|
||||
|
||||
header, err := tr.Next()
|
||||
if err != nil {
|
||||
// Not a tar archive - treat as a single gzipped file
|
||||
if strings.Contains(err.Error(), "invalid tar header") || err == io.EOF {
|
||||
// Get base filename without .gz extension
|
||||
baseName := strings.TrimSuffix(filename, ".gz")
|
||||
if gzr.Name != "" {
|
||||
baseName = gzr.Name
|
||||
}
|
||||
|
||||
return []ExtractedFile{
|
||||
{
|
||||
Path: baseName,
|
||||
Content: decompressed,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("tar read: %w", err)
|
||||
}
|
||||
|
||||
// It's a valid tar archive, process it
|
||||
for {
|
||||
// Skip directories
|
||||
if header.Typeflag != tar.TypeDir {
|
||||
// Skip large files (>10MB)
|
||||
if header.Size <= 10*1024*1024 {
|
||||
content, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read file %s: %w", header.Name, err)
|
||||
}
|
||||
|
||||
files = append(files, ExtractedFile{
|
||||
Path: header.Name,
|
||||
Content: content,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Read next header
|
||||
header, err = tr.Next()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("tar read: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return files, nil
|
||||
}
|
||||
|
||||
func extractZip(archivePath string) ([]ExtractedFile, error) {
|
||||
r, err := zip.OpenReader(archivePath)
|
||||
if err != nil {
|
||||
|
||||
72
internal/parser/vendors/generic/README.md
vendored
Normal file
72
internal/parser/vendors/generic/README.md
vendored
Normal file
@@ -0,0 +1,72 @@
|
||||
# Generic Text File Parser
|
||||
|
||||
Fallback парсер для текстовых файлов, которые не распознаны другими парсерами.
|
||||
|
||||
## Назначение
|
||||
|
||||
Этот парсер обрабатывает любые текстовые файлы, которые:
|
||||
- Не являются архивами специфичных вендоров
|
||||
- Содержат текстовую информацию (не бинарные данные)
|
||||
- Представляют собой одиночные .gz файлы или простые текстовые файлы
|
||||
|
||||
## Приоритет
|
||||
|
||||
**Confidence score: 15** (низкий приоритет)
|
||||
|
||||
Этот парсер срабатывает только если ни один другой парсер не подошел с более высоким confidence.
|
||||
|
||||
## Поддерживаемые файлы
|
||||
|
||||
### Автоматически распознаваемые типы
|
||||
|
||||
1. **NVIDIA Bug Report** (`nvidia-bug-report-*.log.gz`)
|
||||
- Извлекает информацию о драйвере NVIDIA
|
||||
- Находит GPU устройства
|
||||
- Показывает версию драйвера
|
||||
|
||||
2. **Любые текстовые файлы**
|
||||
- Проверяет, что содержимое - текст (не бинарные данные)
|
||||
- Показывает базовую информацию о файле
|
||||
|
||||
## Извлекаемые данные
|
||||
|
||||
### Events
|
||||
|
||||
- **Text File**: Базовая информация о загруженном файле
|
||||
- **Driver Info**: Информация о NVIDIA драйвере (для nvidia-bug-report)
|
||||
- **GPU Device**: Обнаруженные GPU устройства (для nvidia-bug-report)
|
||||
|
||||
## Пример использования
|
||||
|
||||
```bash
|
||||
# Запуск с nvidia-bug-report
|
||||
./logpile --file nvidia-bug-report-*.log.gz
|
||||
|
||||
# Запуск с любым текстовым файлом
|
||||
./logpile --file system.log.gz
|
||||
```
|
||||
|
||||
## Версионирование
|
||||
|
||||
**Текущая версия парсера:** 1.0.0
|
||||
|
||||
## Ограничения
|
||||
|
||||
1. Этот парсер предоставляет только базовую информацию
|
||||
2. Не выполняет глубокий анализ содержимого
|
||||
3. Для детального анализа специфичных логов рекомендуется создать отдельный (dedicated) парсер
|
||||
|
||||
## Расширение
|
||||
|
||||
Чтобы добавить поддержку нового типа файлов:
|
||||
|
||||
1. Добавьте проверку в функцию `Parse()`
|
||||
2. Создайте функцию `parseXXX()` для извлечения специфичной информации
|
||||
3. Увеличьте версию парсера
|
||||
|
||||
Пример:
|
||||
```go
|
||||
if strings.Contains(strings.ToLower(file.Path), "custom-log") {
|
||||
parseCustomLog(content, result)
|
||||
}
|
||||
```
|
||||
147
internal/parser/vendors/generic/parser.go
vendored
Normal file
147
internal/parser/vendors/generic/parser.go
vendored
Normal file
@@ -0,0 +1,147 @@
|
||||
// Package generic provides a fallback parser for unrecognized text files
|
||||
package generic
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
const parserVersion = "1.0.0"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
}
|
||||
|
||||
// Parser implements VendorParser for generic text files
|
||||
type Parser struct{}
|
||||
|
||||
// Name returns human-readable parser name
|
||||
func (p *Parser) Name() string {
|
||||
return "Generic Text File Parser"
|
||||
}
|
||||
|
||||
// Vendor returns vendor identifier
|
||||
func (p *Parser) Vendor() string {
|
||||
return "generic"
|
||||
}
|
||||
|
||||
// Version returns parser version
|
||||
func (p *Parser) Version() string {
|
||||
return parserVersion
|
||||
}
|
||||
|
||||
// Detect checks if this is a text file (fallback with low confidence)
|
||||
// Returns confidence 0-100
|
||||
func (p *Parser) Detect(files []parser.ExtractedFile) int {
|
||||
// Only detect if there's exactly one file (plain .gz or single file)
|
||||
if len(files) != 1 {
|
||||
return 0
|
||||
}
|
||||
|
||||
file := files[0]
|
||||
|
||||
// Check if content looks like text (not binary)
|
||||
if !isLikelyText(file.Content) {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Return low confidence so other parsers have priority
|
||||
return 15
|
||||
}
|
||||
|
||||
// isLikelyText reports whether content looks like text rather than binary
// data.
//
// Only the first 512 bytes are inspected. Every control byte other than
// newline, carriage return and tab counts as one binary indicator, and a NUL
// byte counts ten more on top of that. The content is considered text when
// the indicator total stays below 5% of the inspected length. Empty content
// is never considered text.
func isLikelyText(content []byte) bool {
	const sampleSize = 512

	sample := content
	if len(sample) > sampleSize {
		sample = sample[:sampleSize]
	}

	binaryCount := 0
	for _, b := range sample {
		// Control characters other than common whitespace.
		if b < 32 && b != '\n' && b != '\r' && b != '\t' {
			binaryCount++
		}
		// A NUL byte is a strong indicator of binary data.
		if b == 0 {
			binaryCount += 10
		}
	}

	// Multiply instead of dividing: len(sample)/20 rounds down to zero for
	// samples shorter than 20 bytes, which made every short text file look
	// binary.
	return binaryCount*20 < len(sample)
}
|
||||
|
||||
// Parse parses generic text file
|
||||
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
|
||||
result := &models.AnalysisResult{
|
||||
Events: make([]models.Event, 0),
|
||||
FRU: make([]models.FRUInfo, 0),
|
||||
Sensors: make([]models.SensorReading, 0),
|
||||
}
|
||||
|
||||
// Initialize hardware config
|
||||
result.Hardware = &models.HardwareConfig{}
|
||||
|
||||
if len(files) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
file := files[0]
|
||||
content := string(file.Content)
|
||||
|
||||
// Create a single event with file info
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "File",
|
||||
EventType: "Text File",
|
||||
Description: "Generic text file loaded",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: "Filename: " + file.Path,
|
||||
})
|
||||
|
||||
// Try to extract some basic info from common file types
|
||||
if strings.Contains(strings.ToLower(file.Path), "nvidia-bug-report") {
|
||||
parseNvidiaBugReport(content, result)
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// parseNvidiaBugReport extracts info from nvidia-bug-report files
|
||||
func parseNvidiaBugReport(content string, result *models.AnalysisResult) {
|
||||
lines := strings.Split(content, "\n")
|
||||
|
||||
// Look for GPU information
|
||||
for i, line := range lines {
|
||||
// Find NVIDIA driver version
|
||||
if strings.Contains(line, "NVRM version:") || strings.Contains(line, "nvidia-smi") {
|
||||
if i+5 < len(lines) {
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "NVIDIA Driver",
|
||||
EventType: "Driver Info",
|
||||
Description: "NVIDIA driver information found",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: strings.TrimSpace(line),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Find GPU devices
|
||||
if strings.Contains(line, "/proc/driver/nvidia/gpus/") && strings.Contains(line, "***") {
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "GPU",
|
||||
EventType: "GPU Device",
|
||||
Description: "GPU device detected",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: strings.TrimSpace(line),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
175
internal/parser/vendors/nvidia/README.md
vendored
Normal file
175
internal/parser/vendors/nvidia/README.md
vendored
Normal file
@@ -0,0 +1,175 @@
|
||||
# NVIDIA Field Diagnostics Parser
|
||||
|
||||
Парсер для диагностических архивов NVIDIA HGX Field Diagnostics.
|
||||
Универсальный парсер, не привязанный к конкретному производителю серверов.
|
||||
|
||||
## Поддерживаемые архивы
|
||||
|
||||
- NVIDIA HGX Field Diag (работает с любыми серверами: Supermicro, Dell, HPE и т. д.)
|
||||
- Архивы с результатами GPU диагностики NVIDIA
|
||||
|
||||
## Формат архива
|
||||
|
||||
Парсер работает с архивами в формате:
|
||||
- `.tar` (несжатый tar)
|
||||
- `.tar.gz` (сжатый gzip)
|
||||
|
||||
## Распознаваемые файлы
|
||||
|
||||
### Основные файлы
|
||||
|
||||
1. **output.log** - вывод dmidecode с информацией о системе
|
||||
- Производитель сервера (Manufacturer)
|
||||
- Модель сервера (Product Name) - например, SYS-821GE-TNHR
|
||||
- Серийный номер сервера (Serial Number) - например, A514359X5A07900
|
||||
- UUID, SKU Number, Family
|
||||
|
||||
2. **unified_summary.json** - детальная информация о системе и компонентах
|
||||
- Информация о GPU (модель, производитель, VBIOS, PCI адреса)
|
||||
- Информация о NVSwitch (VendorID, DeviceID, Link speed/width)
|
||||
- Информация о производителе и модели сервера
|
||||
|
||||
3. **summary.json** - результаты тестов диагностики
|
||||
- Результаты тестов GPU (inforom, checkinforom, gpumem, gpustress, pcie, nvlink, nvswitch, power)
|
||||
- Коды ошибок и статусы тестов
|
||||
|
||||
4. **summary.csv** - альтернативный формат результатов тестов
|
||||
|
||||
### Дополнительные файлы
|
||||
|
||||
- `gpu_fieldiag/*.log` - детальные логи диагностики каждого GPU
|
||||
- `inventory/*.json` - дополнительная информация о конфигурации
|
||||
|
||||
## Извлекаемые данные
|
||||
|
||||
### Hardware Configuration
|
||||
|
||||
#### GPUs
|
||||
```json
|
||||
{
|
||||
"slot": "GPUSXM1",
|
||||
"model": "NVIDIA Device 2335",
|
||||
"manufacturer": "NVIDIA Corporation",
|
||||
"firmware": "96.00.D0.00.03",
|
||||
"bdf": "0000:3a:00.0"
|
||||
}
|
||||
```
|
||||
|
||||
#### NVSwitch (как PCIe устройства)
|
||||
```json
|
||||
{
|
||||
"slot": "NVSWITCHNVSWITCH0",
|
||||
"device_class": "NVSwitch",
|
||||
"manufacturer": "NVIDIA Corporation",
|
||||
"vendor_id": 4318,
|
||||
"device_id": 8867,
|
||||
"bdf": "0000:05:00.0",
|
||||
"link_speed": "16GT/s",
|
||||
"link_width": 2
|
||||
}
|
||||
```
|
||||
|
||||
### Events
|
||||
|
||||
События создаются для:
|
||||
- **Предупреждений и ошибок** тестов диагностики
|
||||
- Примеры событий:
|
||||
- `Row remapping failed` - ошибка памяти GPU (Warning)
|
||||
- Различные тесты: connectivity, gpumem, gpustress, pcie, nvlink, nvswitch, power
|
||||
|
||||
Уровни severity:
|
||||
- `info` - информационные события (тесты прошли успешно)
|
||||
- `warning` - предупреждения (например, Row remapping failed)
|
||||
- `critical` - критические ошибки (коды ошибок 300+)
|
||||
|
||||
## Пример использования
|
||||
|
||||
```bash
|
||||
# Запуск веб-интерфейса
|
||||
./logpile --file /path/to/A514359X5A07900_logs-20260122-074208.tar
|
||||
|
||||
# Веб-интерфейс будет доступен на http://localhost:8082
|
||||
```
|
||||
|
||||
## Автоопределение
|
||||
|
||||
Парсер автоматически определяет архивы NVIDIA Field Diag по наличию:
|
||||
- `unified_summary.json` с маркером "HGX Field Diag"
|
||||
- `summary.json` и `summary.csv` с результатами тестов
|
||||
- Директории `gpu_fieldiag/`
|
||||
|
||||
Confidence score:
|
||||
- `unified_summary.json` с маркером "HGX Field Diag": +40
|
||||
- `summary.json`: +20
|
||||
- `summary.csv`: +15
|
||||
- `gpu_fieldiag/` directory: +15
|
||||
- `output.log` с выводом dmidecode: +10
|
||||
|
||||
## Версионирование
|
||||
|
||||
**Текущая версия парсера:** 1.1.0
|
||||
|
||||
При модификации логики парсера необходимо увеличивать версию в константе `parserVersion` в файле `parser.go`.
|
||||
|
||||
### История версий
|
||||
|
||||
- **1.1.0** - Добавлен парсинг output.log (dmidecode) для извлечения модели и серийного номера сервера
|
||||
- **1.0.0** - Первоначальная версия с парсингом unified_summary.json и summary.json/csv
|
||||
|
||||
## Примеры данных
|
||||
|
||||
### Пример unified_summary.json
|
||||
```json
|
||||
{
|
||||
"runInfo": {
|
||||
"diagVersion": "24287-XXXX-FLD-42658",
|
||||
"diagName": "HGX Field Diag",
|
||||
"finalResult": "FAIL",
|
||||
"errorCode": 363
|
||||
},
|
||||
"tests": [{
|
||||
"virtualId": "inventory",
|
||||
"components": [{
|
||||
"componentId": "GPUSXM1",
|
||||
"properties": [
|
||||
{"id": "Manufacturer", "value": "Any Server Vendor"},
|
||||
{"id": "VendorID", "value": "10de"},
|
||||
{"id": "DeviceID", "value": "2335"}
|
||||
]
|
||||
}]
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
### Пример summary.json
|
||||
```json
|
||||
[
|
||||
{
|
||||
"Error Code": "005-000-1-000000000363",
|
||||
"Test": "gpumem",
|
||||
"Component ID": "SXM5_SN_1653925025497",
|
||||
"Notes": "Row remapping failed",
|
||||
"Virtual ID": "gpumem"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Известные ограничения
|
||||
|
||||
1. Парсер фокусируется на данных из `unified_summary.json` и `summary.json`
|
||||
2. Детальные логи из `gpu_fieldiag/*.log` пока не парсятся
|
||||
3. Информация о CPU, памяти и дисках не извлекается (в архиве отсутствует)
|
||||
|
||||
## Разработка
|
||||
|
||||
### Добавление новых полей
|
||||
|
||||
1. Изучите структуру JSON в архиве
|
||||
2. Добавьте поля в структуры `Component` или `Property`
|
||||
3. Обновите функции `parseGPUComponent` или `parseNVSwitchComponent`
|
||||
4. Увеличьте версию парсера
|
||||
|
||||
### Добавление новых типов файлов
|
||||
|
||||
1. Создайте новый файл с парсером (например, `gpu_logs.go`)
|
||||
2. Добавьте парсинг в функцию `Parse()` в `parser.go`
|
||||
3. Обновите документацию
|
||||
68
internal/parser/vendors/nvidia/output_log.go
vendored
Normal file
68
internal/parser/vendors/nvidia/output_log.go
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// ParseOutputLog parses output.log file which contains dmidecode output
|
||||
func ParseOutputLog(content []byte, result *models.AnalysisResult) error {
|
||||
scanner := bufio.NewScanner(strings.NewReader(string(content)))
|
||||
|
||||
inSystemInfo := false
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Detect "System Information" section
|
||||
if strings.Contains(trimmed, "System Information") {
|
||||
inSystemInfo = true
|
||||
continue
|
||||
}
|
||||
|
||||
// Exit section when we hit another Handle or empty section
|
||||
if inSystemInfo && strings.HasPrefix(trimmed, "Handle ") {
|
||||
inSystemInfo = false
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse fields in System Information section
|
||||
if inSystemInfo && strings.Contains(line, ":") {
|
||||
parts := strings.SplitN(trimmed, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Manufacturer":
|
||||
result.Hardware.BoardInfo.Manufacturer = value
|
||||
case "Product Name":
|
||||
result.Hardware.BoardInfo.ProductName = value
|
||||
case "Serial Number":
|
||||
result.Hardware.BoardInfo.SerialNumber = value
|
||||
case "Version":
|
||||
// Store version in part number if needed
|
||||
if result.Hardware.BoardInfo.PartNumber == "" {
|
||||
result.Hardware.BoardInfo.PartNumber = value
|
||||
}
|
||||
case "UUID":
|
||||
// Store UUID somewhere if needed (we don't have a field for it yet)
|
||||
// Could add to FRU or as a custom field
|
||||
case "Family":
|
||||
// Could store family info if needed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return scanner.Err()
|
||||
}
|
||||
166
internal/parser/vendors/nvidia/parser.go
vendored
Normal file
166
internal/parser/vendors/nvidia/parser.go
vendored
Normal file
@@ -0,0 +1,166 @@
|
||||
// Package nvidia provides parser for NVIDIA Field Diagnostics archives
|
||||
// Tested with: HGX Field Diag (works with various server vendors)
|
||||
//
|
||||
// IMPORTANT: Increment parserVersion when modifying parser logic!
|
||||
// This helps track which version was used to parse specific logs.
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
// IMPORTANT: Increment this version when making changes to parser logic!
|
||||
const parserVersion = "1.1.0"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
}
|
||||
|
||||
// Parser implements VendorParser for NVIDIA Field Diagnostics
|
||||
type Parser struct{}
|
||||
|
||||
// Name returns human-readable parser name
|
||||
func (p *Parser) Name() string {
|
||||
return "NVIDIA Field Diagnostics Parser"
|
||||
}
|
||||
|
||||
// Vendor returns vendor identifier
|
||||
func (p *Parser) Vendor() string {
|
||||
return "nvidia"
|
||||
}
|
||||
|
||||
// Version returns parser version
|
||||
// IMPORTANT: Update parserVersion constant when modifying parser logic!
|
||||
func (p *Parser) Version() string {
|
||||
return parserVersion
|
||||
}
|
||||
|
||||
// Detect checks if archive matches NVIDIA Field Diagnostics format
|
||||
// Returns confidence 0-100
|
||||
func (p *Parser) Detect(files []parser.ExtractedFile) int {
|
||||
confidence := 0
|
||||
|
||||
for _, f := range files {
|
||||
path := strings.ToLower(f.Path)
|
||||
|
||||
// Strong indicators for NVIDIA Field Diagnostics format
|
||||
if strings.HasSuffix(path, "unified_summary.json") {
|
||||
// Check if it's really NVIDIA Field Diag format
|
||||
if containsNvidiaFieldDiagMarkers(f.Content) {
|
||||
confidence += 40
|
||||
}
|
||||
}
|
||||
|
||||
if strings.HasSuffix(path, "summary.json") && !strings.Contains(path, "unified_") {
|
||||
confidence += 20
|
||||
}
|
||||
|
||||
if strings.HasSuffix(path, "summary.csv") {
|
||||
confidence += 15
|
||||
}
|
||||
|
||||
if strings.Contains(path, "gpu_fieldiag/") {
|
||||
confidence += 15
|
||||
}
|
||||
|
||||
if strings.HasSuffix(path, "output.log") {
|
||||
// Check if it contains dmidecode output
|
||||
if strings.Contains(string(f.Content), "dmidecode") ||
|
||||
strings.Contains(string(f.Content), "System Information") {
|
||||
confidence += 10
|
||||
}
|
||||
}
|
||||
|
||||
// Cap at 100
|
||||
if confidence >= 100 {
|
||||
return 100
|
||||
}
|
||||
}
|
||||
|
||||
return confidence
|
||||
}
|
||||
|
||||
// containsNvidiaFieldDiagMarkers reports whether content contains all of the
// substrings "runInfo", "diagVersion" and "HGX Field Diag".
func containsNvidiaFieldDiagMarkers(content []byte) bool {
	text := string(content)
	for _, marker := range []string{"runInfo", "diagVersion", "HGX Field Diag"} {
		if !strings.Contains(text, marker) {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Parse parses NVIDIA Field Diagnostics archive
|
||||
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
|
||||
result := &models.AnalysisResult{
|
||||
Events: make([]models.Event, 0),
|
||||
FRU: make([]models.FRUInfo, 0),
|
||||
Sensors: make([]models.SensorReading, 0),
|
||||
}
|
||||
|
||||
// Initialize hardware config
|
||||
result.Hardware = &models.HardwareConfig{
|
||||
GPUs: make([]models.GPU, 0),
|
||||
}
|
||||
|
||||
// Parse output.log first (contains dmidecode system info)
|
||||
// Find the output.log file that contains dmidecode output
|
||||
outputLogFile := findDmidecodeOutputLog(files)
|
||||
if outputLogFile != nil {
|
||||
if err := ParseOutputLog(outputLogFile.Content, result); err != nil {
|
||||
// Log error but continue parsing other files
|
||||
_ = err // Ignore error for now
|
||||
}
|
||||
}
|
||||
|
||||
// Parse unified_summary.json (contains detailed component info)
|
||||
if f := parser.FindFileByName(files, "unified_summary.json"); f != nil {
|
||||
if err := ParseUnifiedSummary(f.Content, result); err != nil {
|
||||
// Log error but continue parsing other files
|
||||
_ = err // Ignore error for now
|
||||
}
|
||||
}
|
||||
|
||||
// Parse summary.json (test results summary)
|
||||
if f := parser.FindFileByName(files, "summary.json"); f != nil {
|
||||
events := ParseSummaryJSON(f.Content)
|
||||
result.Events = append(result.Events, events...)
|
||||
}
|
||||
|
||||
// Parse summary.csv (alternative format)
|
||||
if f := parser.FindFileByName(files, "summary.csv"); f != nil {
|
||||
csvEvents := ParseSummaryCSV(f.Content)
|
||||
result.Events = append(result.Events, csvEvents...)
|
||||
}
|
||||
|
||||
// Parse GPU field diagnostics logs
|
||||
gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")
|
||||
for _, f := range gpuFieldiagFiles {
|
||||
// Parse individual GPU diagnostic logs if needed
|
||||
// For now, we focus on summary files
|
||||
_ = f
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// findDmidecodeOutputLog finds the output.log file that contains dmidecode output
|
||||
func findDmidecodeOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
|
||||
for _, f := range files {
|
||||
// Look for output.log files
|
||||
if !strings.HasSuffix(strings.ToLower(f.Path), "output.log") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if it contains dmidecode output
|
||||
content := string(f.Content)
|
||||
if strings.Contains(content, "dmidecode") &&
|
||||
strings.Contains(content, "System Information") {
|
||||
return &f
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
152
internal/parser/vendors/nvidia/summary.go
vendored
Normal file
152
internal/parser/vendors/nvidia/summary.go
vendored
Normal file
@@ -0,0 +1,152 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// SummaryEntry is one test result entry as decoded from summary.json. The
// JSON keys are the human-readable column names given in the struct tags.
type SummaryEntry struct {
	ErrorCode   string `json:"Error Code"`   // e.g. "005-000-1-000000000363"
	Test        string `json:"Test"`         // test name, e.g. "gpumem"
	ComponentID string `json:"Component ID"` // component the entry refers to
	Notes       string `json:"Notes"`        // "OK" or a failure note
	VirtualID   string `json:"Virtual ID"`   // used as component name when ComponentID is empty
	IgnoreError string `json:"Ignore Error"` // decoded but not used by the parser code shown here
}
|
||||
|
||||
// ParseSummaryJSON parses summary.json file and returns events
|
||||
func ParseSummaryJSON(content []byte) []models.Event {
|
||||
var entries []SummaryEntry
|
||||
if err := json.Unmarshal(content, &entries); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
events := make([]models.Event, 0)
|
||||
timestamp := time.Now() // Use current time as we don't have exact timestamps in summary
|
||||
|
||||
for _, entry := range entries {
|
||||
// Only create events for failures or warnings
|
||||
if entry.Notes != "OK" || entry.ErrorCode != "001-000-1-000000000000" {
|
||||
event := models.Event{
|
||||
Timestamp: timestamp,
|
||||
Source: "GPU Field Diagnostics",
|
||||
EventType: entry.Test,
|
||||
Description: formatSummaryDescription(entry),
|
||||
Severity: getSeverityFromErrorCode(entry.ErrorCode, entry.Notes),
|
||||
RawData: fmt.Sprintf("Test: %s, Component: %s, Error: %s", entry.Test, entry.ComponentID, entry.ErrorCode),
|
||||
}
|
||||
events = append(events, event)
|
||||
}
|
||||
}
|
||||
|
||||
return events
|
||||
}
|
||||
|
||||
// ParseSummaryCSV parses summary.csv file and returns events
|
||||
func ParseSummaryCSV(content []byte) []models.Event {
|
||||
reader := csv.NewReader(strings.NewReader(string(content)))
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
events := make([]models.Event, 0)
|
||||
timestamp := time.Now()
|
||||
|
||||
// Skip header row
|
||||
for i, record := range records {
|
||||
if i == 0 {
|
||||
continue // Skip header
|
||||
}
|
||||
|
||||
// CSV format: ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
|
||||
if len(record) < 7 {
|
||||
continue
|
||||
}
|
||||
|
||||
errorCode := record[0]
|
||||
test := record[1]
|
||||
componentID := record[5]
|
||||
notes := record[6]
|
||||
|
||||
// Only create events for failures or warnings
|
||||
if notes != "OK" || (errorCode != "0" && !strings.HasPrefix(errorCode, "048-000-0") && !strings.HasPrefix(errorCode, "001-000-1")) {
|
||||
event := models.Event{
|
||||
Timestamp: timestamp,
|
||||
Source: "GPU Field Diagnostics",
|
||||
EventType: test,
|
||||
Description: formatCSVDescription(test, componentID, notes, errorCode),
|
||||
Severity: getSeverityFromErrorCode(errorCode, notes),
|
||||
RawData: fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
|
||||
}
|
||||
events = append(events, event)
|
||||
}
|
||||
}
|
||||
|
||||
return events
|
||||
}
|
||||
|
||||
// formatSummaryDescription creates a human-readable description from summary entry
|
||||
func formatSummaryDescription(entry SummaryEntry) string {
|
||||
component := entry.ComponentID
|
||||
if component == "" {
|
||||
component = entry.VirtualID
|
||||
}
|
||||
|
||||
if entry.Notes == "OK" {
|
||||
return fmt.Sprintf("%s test passed for %s", entry.Test, component)
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, component, entry.Notes, entry.ErrorCode)
|
||||
}
|
||||
|
||||
// formatCSVDescription renders the fields of a CSV record as a human-readable
// sentence. Records with notes "OK" are described as passed; all others as
// failed, including the notes and the error code.
func formatCSVDescription(test, component, notes, errorCode string) string {
	switch notes {
	case "OK":
		return fmt.Sprintf("%s test passed for %s", test, component)
	default:
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode)
	}
}
|
||||
|
||||
// getSeverityFromErrorCode determines severity based on error code and notes
|
||||
func getSeverityFromErrorCode(errorCode, notes string) models.Severity {
|
||||
// Parse error code format: XXX-YYY-Z-ZZZZZZZZZZZZ
|
||||
// First digit indicates severity in some cases
|
||||
|
||||
if notes == "OK" {
|
||||
return models.SeverityInfo
|
||||
}
|
||||
|
||||
// Row remapping failed is a warning
|
||||
if strings.Contains(notes, "Row remapping failed") {
|
||||
return models.SeverityWarning
|
||||
}
|
||||
|
||||
// Check error code
|
||||
if errorCode == "" || errorCode == "0" {
|
||||
return models.SeverityInfo
|
||||
}
|
||||
|
||||
// Codes starting with 0 are typically informational
|
||||
if strings.HasPrefix(errorCode, "001-000-1") || strings.HasPrefix(errorCode, "048-000-0") {
|
||||
return models.SeverityInfo
|
||||
}
|
||||
|
||||
// Non-zero error codes are typically warnings or errors
|
||||
// If code is in 300+ range, it's likely an error
|
||||
if len(errorCode) > 2 {
|
||||
firstDigits := errorCode[:3]
|
||||
if firstDigits >= "300" {
|
||||
return models.SeverityCritical
|
||||
}
|
||||
}
|
||||
|
||||
return models.SeverityWarning
|
||||
}
|
||||
281
internal/parser/vendors/nvidia/unified_summary.go
vendored
Normal file
281
internal/parser/vendors/nvidia/unified_summary.go
vendored
Normal file
@@ -0,0 +1,281 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// UnifiedSummaryData represents the structure of unified_summary.json
|
||||
type UnifiedSummaryData struct {
|
||||
RunInfo RunInfo `json:"runInfo"`
|
||||
Tests []Test `json:"tests"`
|
||||
}
|
||||
|
||||
// RunInfo holds the "runInfo" object of unified_summary.json, i.e. the
// metadata describing one diagnostic run.
type RunInfo struct {
	// TimeInfo is decoded from the nested "timeInfo" object; all of its
	// values are kept as the raw strings found in the file.
	TimeInfo struct {
		StartTime     string `json:"startTime"`
		EndTime       string `json:"endTime"`
		TotalDuration string `json:"totalDuration"`
	} `json:"timeInfo"`

	DiagVersion string `json:"diagVersion"`
	BaseVersion string `json:"baseVersion"`
	FinalResult string `json:"finalResult"`
	ErrorCode   int    `json:"errorCode"` // numeric in the JSON source
	DiagName    string `json:"diagName"`
	RunLevel    string `json:"runLevel"`
}
|
||||
|
||||
// Test represents a diagnostic test
|
||||
type Test struct {
|
||||
VirtualID string `json:"virtualId"`
|
||||
Action string `json:"action"`
|
||||
StartTime string `json:"startTime"`
|
||||
EndTime string `json:"endTime"`
|
||||
Components []Component `json:"components"`
|
||||
}
|
||||
|
||||
// Component represents a hardware component
|
||||
type Component struct {
|
||||
ComponentID string `json:"componentId"`
|
||||
ErrorCode string `json:"errorCode"`
|
||||
Notes string `json:"notes"`
|
||||
Result string `json:"result"`
|
||||
Properties []Property `json:"properties"`
|
||||
}
|
||||
|
||||
// Property is a single id/value pair attached to a Component.
type Property struct {
	ID string `json:"id"`
	// Value is either a string or a number in the JSON source. Numbers
	// decoded by encoding/json into an interface value arrive as float64.
	Value interface{} `json:"value"`
}

// GetValueAsString renders the property value as text:
//
//   - a nil value (absent or JSON null) yields "", so callers that skip
//     empty values also skip nulls instead of seeing the text "<nil>";
//   - a string is returned unchanged;
//   - a float64 holding a whole number is printed without a decimal point
//     or exponent; any other float64 keeps its fractional part instead of
//     being rounded to the nearest integer;
//   - an int is printed in decimal;
//   - any other type is formatted with %v.
func (p *Property) GetValueAsString() string {
	switch v := p.Value.(type) {
	case nil:
		return ""
	case string:
		return v
	case float64:
		// Every float64 with magnitude >= 2^53 is a whole number; below
		// that bound the round trip through int64 is exact and safe.
		const wholeNumberLimit = 1 << 53
		if v >= wholeNumberLimit || v <= -wholeNumberLimit || v == float64(int64(v)) {
			return fmt.Sprintf("%.0f", v)
		}
		return fmt.Sprintf("%v", v)
	case int:
		return fmt.Sprintf("%d", v)
	default:
		return fmt.Sprintf("%v", v)
	}
}
|
||||
|
||||
// ParseUnifiedSummary parses unified_summary.json file
|
||||
func ParseUnifiedSummary(content []byte, result *models.AnalysisResult) error {
|
||||
var data UnifiedSummaryData
|
||||
if err := json.Unmarshal(content, &data); err != nil {
|
||||
return fmt.Errorf("failed to parse unified_summary.json: %w", err)
|
||||
}
|
||||
|
||||
// Set default board info only if not already set (from output.log)
|
||||
if result.Hardware.BoardInfo.ProductName == "" {
|
||||
result.Hardware.BoardInfo.ProductName = "GPU Server (Field Diag)"
|
||||
}
|
||||
|
||||
// Parse inventory test for hardware details
|
||||
for _, test := range data.Tests {
|
||||
if test.VirtualID == "inventory" || test.Action == "inventory" {
|
||||
parseInventoryComponents(test.Components, result)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseInventoryComponents extracts hardware info from inventory test
|
||||
func parseInventoryComponents(components []Component, result *models.AnalysisResult) {
|
||||
for _, comp := range components {
|
||||
// Parse system/board information
|
||||
if parseSystemInfo(comp, result) {
|
||||
// System info was found and parsed
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse GPU components
|
||||
if strings.HasPrefix(comp.ComponentID, "GPUSXM") {
|
||||
gpu := parseGPUComponent(comp)
|
||||
if gpu != nil {
|
||||
result.Hardware.GPUs = append(result.Hardware.GPUs, *gpu)
|
||||
}
|
||||
}
|
||||
|
||||
// Parse NVSwitch components
|
||||
if strings.HasPrefix(comp.ComponentID, "NVSWITCHNVSWITCH") {
|
||||
nvswitch := parseNVSwitchComponent(comp)
|
||||
if nvswitch != nil {
|
||||
// Add as PCIe device for now
|
||||
result.Hardware.PCIeDevices = append(result.Hardware.PCIeDevices, *nvswitch)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseSystemInfo extracts system/board information from a component
|
||||
// Returns true if this component contains system info
|
||||
func parseSystemInfo(comp Component, result *models.AnalysisResult) bool {
|
||||
compID := strings.ToUpper(comp.ComponentID)
|
||||
|
||||
// Check if this is a system/board component
|
||||
isSystemComponent := strings.Contains(compID, "BASEBOARD") ||
|
||||
strings.Contains(compID, "SYSTEM") ||
|
||||
strings.Contains(compID, "MOTHERBOARD") ||
|
||||
strings.Contains(compID, "BOARD") ||
|
||||
comp.ComponentID == "Inventory"
|
||||
|
||||
if !isSystemComponent {
|
||||
return false
|
||||
}
|
||||
|
||||
// Extract system properties
|
||||
for _, prop := range comp.Properties {
|
||||
propID := prop.ID
|
||||
value := prop.GetValueAsString()
|
||||
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch propID {
|
||||
case "Manufacturer", "BoardManufacturer", "SystemManufacturer":
|
||||
// Only set if not already populated (e.g., from output.log)
|
||||
if result.Hardware.BoardInfo.Manufacturer == "" {
|
||||
result.Hardware.BoardInfo.Manufacturer = value
|
||||
}
|
||||
case "ProductName", "Product", "Model", "ModelName", "BoardProduct", "SystemProduct":
|
||||
// Don't overwrite real data from output.log with generic data
|
||||
// Only set if empty or still has the default placeholder value
|
||||
if result.Hardware.BoardInfo.ProductName == "" ||
|
||||
result.Hardware.BoardInfo.ProductName == "GPU Server (Field Diag)" {
|
||||
result.Hardware.BoardInfo.ProductName = value
|
||||
}
|
||||
case "SerialNumber", "Serial", "BoardSerial", "SystemSerial":
|
||||
// Only set if not already populated (e.g., from output.log)
|
||||
if result.Hardware.BoardInfo.SerialNumber == "" {
|
||||
result.Hardware.BoardInfo.SerialNumber = value
|
||||
}
|
||||
case "PartNumber", "BoardPartNumber":
|
||||
// Only set if not already populated
|
||||
if result.Hardware.BoardInfo.PartNumber == "" {
|
||||
result.Hardware.BoardInfo.PartNumber = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// parseGPUComponent parses GPU component information
|
||||
func parseGPUComponent(comp Component) *models.GPU {
|
||||
gpu := &models.GPU{
|
||||
Slot: comp.ComponentID, // e.g., "GPUSXM1"
|
||||
}
|
||||
|
||||
var deviceID, vbios, pciID string
|
||||
|
||||
for _, prop := range comp.Properties {
|
||||
switch prop.ID {
|
||||
case "DeviceID":
|
||||
deviceID = prop.GetValueAsString()
|
||||
case "Vendor":
|
||||
gpu.Manufacturer = prop.GetValueAsString()
|
||||
case "DeviceName":
|
||||
gpu.Model = prop.GetValueAsString()
|
||||
case "VBIOS_version":
|
||||
vbios = prop.GetValueAsString()
|
||||
case "PCIID":
|
||||
pciID = prop.GetValueAsString()
|
||||
}
|
||||
}
|
||||
|
||||
// Build model string from vendor/device IDs
|
||||
if gpu.Model == "" || strings.Contains(gpu.Model, "Device") {
|
||||
if deviceID != "" {
|
||||
gpu.Model = fmt.Sprintf("NVIDIA Device %s", strings.ToUpper(deviceID))
|
||||
}
|
||||
}
|
||||
|
||||
// Add firmware info
|
||||
if vbios != "" {
|
||||
gpu.Firmware = vbios
|
||||
}
|
||||
|
||||
// Add PCI info
|
||||
if pciID != "" {
|
||||
gpu.BDF = pciID
|
||||
}
|
||||
|
||||
return gpu
|
||||
}
|
||||
|
||||
// parseNVSwitchComponent parses NVSwitch component information
|
||||
func parseNVSwitchComponent(comp Component) *models.PCIeDevice {
|
||||
device := &models.PCIeDevice{
|
||||
Slot: comp.ComponentID, // e.g., "NVSWITCHNVSWITCH0"
|
||||
}
|
||||
|
||||
var vendorIDStr, deviceIDStr, vbios, pciID string
|
||||
var pciSpeedStr, pciWidthStr string
|
||||
var vendor string
|
||||
|
||||
for _, prop := range comp.Properties {
|
||||
switch prop.ID {
|
||||
case "VendorID":
|
||||
vendorIDStr = prop.GetValueAsString()
|
||||
case "DeviceID":
|
||||
deviceIDStr = prop.GetValueAsString()
|
||||
case "Vendor":
|
||||
vendor = prop.GetValueAsString()
|
||||
case "VBIOS_version":
|
||||
vbios = prop.GetValueAsString()
|
||||
case "InfoROM_version":
|
||||
// Store in part number field as we don't have a better place
|
||||
case "PCIID":
|
||||
pciID = prop.GetValueAsString()
|
||||
device.BDF = pciID
|
||||
case "PCISpeed":
|
||||
pciSpeedStr = prop.GetValueAsString()
|
||||
device.LinkSpeed = pciSpeedStr
|
||||
device.MaxLinkSpeed = pciSpeedStr
|
||||
case "PCIWidth":
|
||||
pciWidthStr = prop.GetValueAsString()
|
||||
}
|
||||
}
|
||||
|
||||
// Parse vendor ID
|
||||
if vendorIDStr != "" {
|
||||
fmt.Sscanf(vendorIDStr, "%x", &device.VendorID)
|
||||
}
|
||||
|
||||
// Parse device ID
|
||||
if deviceIDStr != "" {
|
||||
fmt.Sscanf(deviceIDStr, "%x", &device.DeviceID)
|
||||
}
|
||||
|
||||
// Set manufacturer
|
||||
if vendor != "" {
|
||||
device.Manufacturer = vendor
|
||||
}
|
||||
|
||||
// Set device class
|
||||
device.DeviceClass = "NVSwitch"
|
||||
|
||||
// Parse link width
|
||||
if pciWidthStr != "" {
|
||||
fmt.Sscanf(pciWidthStr, "x%d", &device.LinkWidth)
|
||||
device.MaxLinkWidth = device.LinkWidth
|
||||
}
|
||||
|
||||
// Store part number (use for firmware version)
|
||||
if vbios != "" {
|
||||
device.PartNumber = vbios
|
||||
}
|
||||
|
||||
return device
|
||||
}
|
||||
275
internal/parser/vendors/nvidia_bug_report/README.md
vendored
Normal file
275
internal/parser/vendors/nvidia_bug_report/README.md
vendored
Normal file
@@ -0,0 +1,275 @@
|
||||
# NVIDIA Bug Report Parser
|
||||
|
||||
Парсер для файлов nvidia-bug-report, генерируемых скриптом `nvidia-bug-report.sh`.
|
||||
|
||||
## Назначение
|
||||
|
||||
Этот парсер обрабатывает диагностические логи драйверов NVIDIA и извлекает:

- Информацию о системе, процессорах, модулях памяти и блоках питания (из dmidecode)

- Информацию о сетевых адаптерах (из lspci)

- Информацию о GPU-устройствах

- Версию драйвера NVIDIA
|
||||
|
||||
## Формат файла
|
||||
|
||||
- Имя файла: `nvidia-bug-report-*.log.gz`
|
||||
- Формат: Gzip-сжатый текстовый файл
|
||||
- Генерируется: скриптом `nvidia-bug-report.sh`
|
||||
|
||||
## Confidence Score
|
||||
|
||||
**85** - высокий приоритет для файлов nvidia-bug-report
|
||||
|
||||
## Извлекаемые данные
|
||||
|
||||
### 1. System Information (из dmidecode)
|
||||
|
||||
Информация о сервере:
|
||||
- **Serial Number**: Серийный номер сервера (например, 2KD501412)
|
||||
- **UUID**: Уникальный идентификатор системы (например, 2e4054bc-1dd2-11b2-0284-6b0a21737950)
|
||||
- **Manufacturer**: Производитель сервера
|
||||
- **Product Name**: Модель сервера
|
||||
- **Version**: Версия системы
|
||||
|
||||
### 2. CPU Information (из dmidecode)
|
||||
|
||||
Для каждого процессора извлекается:
|
||||
- **Model**: Модель процессора (например, Intel(R) Xeon(R) Platinum 8480+)
|
||||
- **Serial Number**: Серийный номер (например, 5DB0D6C0DD30ABD8)
|
||||
- **Core Count**: Количество ядер (например, 56)
|
||||
- **Thread Count**: Количество потоков (например, 112)
|
||||
- **Max Speed**: Максимальная частота (например, 3800 MHz)
|
||||
- **Current Speed**: Текущая частота (например, 2000 MHz)
|
||||
|
||||
Пример:
|
||||
```
|
||||
Socket 0: Intel(R) Xeon(R) Platinum 8480+
|
||||
Serial Number: 5DB0D6C0DD30ABD8
|
||||
Cores: 56, Threads: 112
|
||||
Frequency: 2000 MHz (Max: 3800 MHz)
|
||||
```
|
||||
|
||||
### 3. Memory Modules (из dmidecode)
|
||||
|
||||
Для каждого модуля памяти извлекается:
|
||||
- **Slot/Location**: Например, CPU0_C0D0
|
||||
- **Size**: Размер в GB (например, 64 GB)
|
||||
- **Type**: Тип памяти (DDR5, DDR4, etc.)
|
||||
- **Manufacturer**: Производитель (Hynix, Samsung, Micron, etc.)
|
||||
- **Part Number**: P/N модуля (например, HMCG94AGBRA179N)
|
||||
- **Serial Number**: S/N модуля (например, 80AD0224322B3834E6)
|
||||
- **Speed**: Максимальная/текущая скорость (например, 5600/4400 MT/s)
|
||||
- **Ranks**: Количество рангов
|
||||
|
||||
Пример:
|
||||
```
|
||||
Slot: CPU0_C0D0
|
||||
Size: 64 GB
|
||||
Type: DDR5
|
||||
Manufacturer: Hynix
|
||||
Part Number: HMCG94AGBRA179N
|
||||
Serial Number: 80AD0224322B3834E6
|
||||
Speed: 5600 MT/s (configured: 4400 MT/s)
|
||||
Ranks: 2
|
||||
```
|
||||
|
||||
### 4. Power Supplies (из dmidecode)
|
||||
|
||||
Для каждого блока питания извлекается:
|
||||
- **Location**: Позиция (например, PSU0, PSU1)
|
||||
- **Manufacturer**: Производитель (например, DELTA, Great Wall)
|
||||
- **Model Part Number**: Модель БП (например, V0310DT000000000)
|
||||
- **Serial Number**: Серийный номер (например, DGPLV251500LZ)
|
||||
- **Max Power Capacity**: Максимальная мощность (например, 2700 W)
|
||||
- **Revision**: Версия прошивки (например, 00.01.04)
|
||||
- **Status**: Статус (например, Present, OK)
|
||||
|
||||
Пример:
|
||||
```
|
||||
PSU0: V0310DT000000000 (DELTA)
|
||||
Serial Number: DGPLV251500LZ
|
||||
Power: 2700 W, Revision: 00.01.04
|
||||
Status: Present, OK
|
||||
```
|
||||
|
||||
### 5. Network Adapters (из lspci)
|
||||
|
||||
Для каждого сетевого адаптера (Ethernet, Network, InfiniBand) извлекается:
|
||||
- **Model**: Полное название модели из VPD (например, "NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP, PCIe 5.0 x16")
|
||||
- **Location**: PCI BDF адрес (например, 0000:0e:00.0)
|
||||
- **Slot**: Физический слот (например, 108)
|
||||
- **Part Number**: P/N адаптера (например, MCX75310AAS-NEAT)
|
||||
- **Serial Number**: S/N адаптера (например, MT2430600249)
|
||||
- **Vendor**: Производитель (Mellanox, NVIDIA)
|
||||
- **Vendor ID / Device ID**: PCI идентификаторы (например, 15b3:1021)
|
||||
- **Port Count**: Количество портов (определяется из модели: Dual-port = 2, Single-port = 1)
|
||||
- **Port Type**: Тип портов (QSFP56, OSFP, SFP+)
|
||||
|
||||
Пример:
|
||||
```
|
||||
0000:0e:00.0: NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP
|
||||
Slot: 108
|
||||
P/N: MCX75310AAS-NEAT
|
||||
S/N: MT2430600249
|
||||
Ports: 1 x OSFP
|
||||
```
|
||||
|
||||
### 6. GPU Devices
|
||||
|
||||
Для каждого GPU извлекается:
|
||||
- **Model**: Модель GPU (например, NVIDIA H100 80GB HBM3)
|
||||
- **BDF (Bus:Device.Function)**: PCI адрес (например, 0000:0f:00.0)
|
||||
- **UUID**: Уникальный идентификатор GPU (например, GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3)
|
||||
- **Video BIOS**: Версия BIOS видеокарты (например, 96.00.99.00.01)
|
||||
- **IRQ**: Прерывание (например, 17)
|
||||
- **Bus Type**: Тип шины (PCIe)
|
||||
- **DMA Size**: Размер DMA (например, 52 bits)
|
||||
- **DMA Mask**: Маска DMA (например, 0xfffffffffffff)
|
||||
- **Device Minor**: Номер устройства (например, 0)
|
||||
- **Manufacturer**: NVIDIA
|
||||
|
||||
Пример:
|
||||
```
|
||||
0000:0f:00.0: NVIDIA H100 80GB HBM3
|
||||
UUID: GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3
|
||||
Video BIOS: 96.00.99.00.01
|
||||
IRQ: 17
|
||||
```
|
||||
|
||||
### 7. Events
|
||||
|
||||
- **Memory Configuration**: Сводка по модулям памяти (количество, производители, общий размер)
|
||||
- **GPU Detection**: Обнаруженные GPU устройства
|
||||
- **Driver Version**: Версия NVIDIA драйвера
|
||||
|
||||
## Пример использования
|
||||
|
||||
```bash
|
||||
# Запуск с nvidia-bug-report файлом
|
||||
./logpile --file nvidia-bug-report-2KD501412.log.gz
|
||||
|
||||
# Веб-интерфейс будет доступен на http://localhost:8082
|
||||
```
|
||||
|
||||
## Пример вывода
|
||||
|
||||
```
|
||||
✓ Detected vendor: NVIDIA Bug Report Parser
|
||||
✓ CPUs: 2
|
||||
✓ Memory: 32 modules
|
||||
✓ Power Supplies: 8
|
||||
✓ GPUs: 8
|
||||
✓ Network Adapters: 12
|
||||
|
||||
System Information:
|
||||
Serial Number: 2KD501412
|
||||
UUID: 2e4054bc-1dd2-11b2-0284-6b0a21737950
|
||||
Version: 0
|
||||
|
||||
CPU Information:
|
||||
Socket 0: Intel(R) Xeon(R) Platinum 8480+
|
||||
S/N: 5DB0D6C0DD30ABD8, Cores: 56, Threads: 112
|
||||
Socket 1: Intel(R) Xeon(R) Platinum 8480+
|
||||
S/N: 5DB017C05685B3ED, Cores: 56, Threads: 112
|
||||
|
||||
Power Supplies:
|
||||
PSU0: V0310DT000000000 (DELTA)
|
||||
S/N: DGPLV251500LZ
|
||||
Power: 2700 W, Revision: 00.01.04
|
||||
Status: Present, OK
|
||||
PSU1: V0310DT000000000 (DELTA)
|
||||
S/N: DGPLV251500GY
|
||||
Power: 2700 W, Revision: 00.01.04
|
||||
Status: Present, OK
|
||||
[... 6 more PSUs ...]
|
||||
|
||||
Memory Modules:
|
||||
CPU0_C0D0: 64 GB, Hynix
|
||||
P/N: HMCG94AGBRA179N, S/N: 80AD0224322B3834E6
|
||||
Type: DDR5, Speed: 4400/5600 MHz
|
||||
[... 31 more modules ...]
|
||||
|
||||
Network Adapters: 12 devices
|
||||
0000:0e:00.0: NVIDIA ConnectX-7 HHHL Adapter card, 400GbE / NDR IB (default mode), Single-port OSFP
|
||||
Slot: 108
|
||||
P/N: MCX75310AAS-NEAT
|
||||
S/N: MT2430600249
|
||||
Ports: 1 x OSFP
|
||||
0000:1f:00.0: ConnectX-6 Dx EN adapter card, 100GbE, Dual-port QSFP56
|
||||
Slot: 12
|
||||
P/N: MCX623106AN-CDAT
|
||||
S/N: MT2434J00PCD
|
||||
Ports: 2 x QSFP56
|
||||
[... 10 more adapters ...]
|
||||
|
||||
GPUs: 8 devices
|
||||
0000:0f:00.0: NVIDIA H100 80GB HBM3
|
||||
UUID: GPU-64674e47-e036-c12a-3e8d-55a2a9ac8db3
|
||||
Video BIOS: 96.00.99.00.01
|
||||
IRQ: 17
|
||||
0000:34:00.0: NVIDIA H100 80GB HBM3
|
||||
UUID: GPU-fa796345-c23a-54aa-1b67-709ac2542852
|
||||
Video BIOS: 96.00.99.00.01
|
||||
IRQ: 16
|
||||
[... 6 more GPUs ...]
|
||||
```
|
||||
|
||||
## Версионирование
|
||||
|
||||
**Текущая версия парсера:** 1.0.0
|
||||
|
||||
### История версий
|
||||
|
||||
- **1.0.0** - Первоначальная версия с парсингом System Info, CPU, Memory, PSU, GPU, Network Adapters и Driver
|
||||
|
||||
## Структура данных
|
||||
|
||||
Парсер использует следующие секции в bug report:
|
||||
1. **dmidecode output (System Information)** - для извлечения информации о сервере
|
||||
2. **dmidecode output (Processor Information)** - для извлечения информации о CPU
|
||||
3. **dmidecode output (Memory Device)** - для извлечения информации о памяти
|
||||
4. **dmidecode output (System Power Supply)** - для извлечения информации о блоках питания
|
||||
5. **lspci -vvv output (Ethernet/Network/Infiniband controller)** - для извлечения информации о сетевых адаптерах
|
||||
6. **lspci VPD (Vital Product Data)** - для извлечения P/N, S/N и модели сетевых адаптеров
|
||||
7. **/proc/driver/nvidia/gpus/.../information** - для детальной информации о GPU
|
||||
8. **NVRM version** - для версии драйвера
|
||||
|
||||
## Известные ограничения
|
||||
|
||||
1. Ошибки и предупреждения из логов пока не извлекаются
|
||||
2. Некоторые специфичные характеристики GPU (температура, утилизация) не парсятся
|
||||
3. Информация о производительности и метрики GPU требуют парсинга других секций
|
||||
|
||||
## Расширение
|
||||
|
||||
Для добавления новых возможностей:
|
||||
|
||||
1. **Ошибки драйвера**: Парсить секции с ошибками NVIDIA драйвера
|
||||
2. **nvidia-smi output**: Извлекать детальную информацию из вывода nvidia-smi (температура, утилизация)
|
||||
3. **GPU производительность**: Парсить метрики производительности и использования памяти GPU
|
||||
4. **PCIe информация**: Извлекать детали о PCIe конфигурации (скорость линка, ширина)
|
||||
|
||||
## Пример структуры файла
|
||||
|
||||
```
|
||||
Start of NVIDIA bug report log file
|
||||
nvidia-bug-report.sh Version: 34275561
|
||||
Date: Thu Jul 17 18:18:18 EDT 2025
|
||||
|
||||
[... system info ...]
|
||||
|
||||
Memory Device
|
||||
Data Width: 64 bits
|
||||
Size: 64 GB
|
||||
Form Factor: DIMM
|
||||
Locator: CPU0_C0D0
|
||||
Type: DDR5
|
||||
Speed: 5600 MT/s
|
||||
Manufacturer: Hynix
|
||||
Serial Number: 80AD0224322B3834E6
|
||||
Part Number: HMCG94AGBRA179N
|
||||
|
||||
[... more memory modules ...]
|
||||
|
||||
*** /proc/driver/nvidia/./gpus/0000:0f:00.0/power
|
||||
[... GPU info ...]
|
||||
```
|
||||
140
internal/parser/vendors/nvidia_bug_report/cpu.go
vendored
Normal file
140
internal/parser/vendors/nvidia_bug_report/cpu.go
vendored
Normal file
@@ -0,0 +1,140 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parseCPUInfo extracts CPU information from dmidecode output
|
||||
func parseCPUInfo(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
var currentCPU *models.CPU
|
||||
inProcessorInfo := false
|
||||
cpuSocket := 0
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Start of Processor Information section
|
||||
if strings.Contains(trimmed, "Processor Information") {
|
||||
inProcessorInfo = true
|
||||
currentCPU = &models.CPU{
|
||||
Socket: cpuSocket,
|
||||
}
|
||||
cpuSocket++
|
||||
continue
|
||||
}
|
||||
|
||||
// End of current section (empty line or new section with Handle)
|
||||
if inProcessorInfo && (trimmed == "" || strings.HasPrefix(trimmed, "Handle ")) {
|
||||
// Save CPU if it has valid data
|
||||
if currentCPU != nil && currentCPU.Model != "" {
|
||||
result.Hardware.CPUs = append(result.Hardware.CPUs, *currentCPU)
|
||||
}
|
||||
inProcessorInfo = false
|
||||
currentCPU = nil
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse fields within Processor Information section
|
||||
if inProcessorInfo && currentCPU != nil && strings.Contains(line, ":") {
|
||||
parts := strings.SplitN(trimmed, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
if value == "" || value == "Not Specified" || value == "Unknown" || value == "UNKNOWN" || value == "<OUT OF SPEC>" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Version":
|
||||
// CPU model name
|
||||
currentCPU.Model = value
|
||||
case "Serial Number":
|
||||
currentCPU.SerialNumber = value
|
||||
case "Part Number":
|
||||
// Store part number if available
|
||||
// Could be stored in a custom field if needed
|
||||
case "Core Count":
|
||||
if cores, err := strconv.Atoi(value); err == nil {
|
||||
currentCPU.Cores = cores
|
||||
}
|
||||
case "Core Enabled":
|
||||
// Could store this if needed
|
||||
case "Thread Count":
|
||||
if threads, err := strconv.Atoi(value); err == nil {
|
||||
currentCPU.Threads = threads
|
||||
}
|
||||
case "Max Speed":
|
||||
// Parse speed like "3800 MHz"
|
||||
if speed := parseCPUSpeed(value); speed > 0 {
|
||||
currentCPU.MaxFreqMHz = speed
|
||||
}
|
||||
case "Current Speed":
|
||||
// Parse current speed like "2000 MHz"
|
||||
if speed := parseCPUSpeed(value); speed > 0 {
|
||||
currentCPU.FrequencyMHz = speed
|
||||
}
|
||||
case "Voltage":
|
||||
// Could parse voltage if needed (e.g., "1.6 V")
|
||||
case "Status":
|
||||
// Status like "Populated, Enabled"
|
||||
// Check if CPU is enabled
|
||||
if !strings.Contains(value, "Populated") {
|
||||
// Skip unpopulated CPUs
|
||||
currentCPU = nil
|
||||
inProcessorInfo = false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save last CPU if exists
|
||||
if currentCPU != nil && currentCPU.Model != "" {
|
||||
result.Hardware.CPUs = append(result.Hardware.CPUs, *currentCPU)
|
||||
}
|
||||
}
|
||||
|
||||
// parseCPUSpeed converts a speed string such as "3800 MHz" or "2.0 GHz" into
// whole megahertz.
//
// The string must consist of a number followed by a unit; the unit is matched
// case-insensitively and must be MHz or GHz. Fractional results are rounded
// to the nearest integer rather than truncated, so that binary floating-point
// error cannot turn e.g. "2.01 GHz" into 2009. The function returns 0 when the
// unit is missing or unknown or when the number cannot be parsed.
func parseCPUSpeed(speedStr string) int {
	parts := strings.Fields(speedStr)
	if len(parts) < 2 {
		return 0
	}

	// Numbers without a decimal point are parsed strictly as integers so that
	// forms such as "1e3" or "Inf" keep being rejected.
	var (
		speed float64
		err   error
	)
	if number := parts[0]; strings.Contains(number, ".") {
		speed, err = strconv.ParseFloat(number, 64)
	} else {
		var whole int
		whole, err = strconv.Atoi(number)
		speed = float64(whole)
	}
	if err != nil {
		return 0
	}

	switch strings.ToUpper(parts[1]) {
	case "MHZ":
		// already in MHz
	case "GHZ":
		speed *= 1000
	default:
		return 0
	}

	// Round half away from zero.
	if speed < 0 {
		return int(speed - 0.5)
	}
	return int(speed + 0.5)
}
|
||||
170
internal/parser/vendors/nvidia_bug_report/gpu.go
vendored
Normal file
170
internal/parser/vendors/nvidia_bug_report/gpu.go
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parseGPUInfo extracts GPU information from the bug report
|
||||
func parseGPUInfo(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
var currentGPU *models.GPU
|
||||
inGPUInfo := false
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Look for GPU information section markers (but skip ls listings)
|
||||
if strings.Contains(line, "/proc/driver/nvidia") && strings.Contains(line, "/gpus/") &&
|
||||
strings.Contains(line, "/information") && !strings.Contains(line, "ls:") {
|
||||
// Extract PCI address
|
||||
re := regexp.MustCompile(`/gpus/([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.[\da-f])`)
|
||||
matches := re.FindStringSubmatch(line)
|
||||
if len(matches) > 1 {
|
||||
pciAddr := matches[1]
|
||||
|
||||
// Save previous GPU if exists
|
||||
if currentGPU != nil {
|
||||
result.Hardware.GPUs = append(result.Hardware.GPUs, *currentGPU)
|
||||
}
|
||||
|
||||
// Start new GPU entry
|
||||
currentGPU = &models.GPU{
|
||||
BDF: pciAddr,
|
||||
Manufacturer: "NVIDIA",
|
||||
}
|
||||
inGPUInfo = true
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// End of GPU info section (separator line or new section, but not ls lines)
|
||||
if inGPUInfo && (strings.HasPrefix(line, "___") || (strings.HasPrefix(line, "***") && !strings.Contains(line, "ls:"))) {
|
||||
inGPUInfo = false
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse GPU fields within information section
|
||||
if inGPUInfo && currentGPU != nil && strings.Contains(line, ":") {
|
||||
// Split on first colon and trim whitespace/tabs
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Model":
|
||||
currentGPU.Model = value
|
||||
case "IRQ":
|
||||
if irq, err := strconv.Atoi(value); err == nil {
|
||||
currentGPU.IRQ = irq
|
||||
}
|
||||
case "GPU UUID":
|
||||
currentGPU.UUID = value
|
||||
case "Video BIOS":
|
||||
currentGPU.VideoBIOS = value
|
||||
case "Bus Type":
|
||||
currentGPU.BusType = value
|
||||
case "DMA Size":
|
||||
currentGPU.DMASize = value
|
||||
case "DMA Mask":
|
||||
currentGPU.DMAMask = value
|
||||
case "Bus Location":
|
||||
// BDF already set from path, but verify consistency
|
||||
if currentGPU.BDF != value {
|
||||
// Use the value from the information section as it's more explicit
|
||||
currentGPU.BDF = value
|
||||
}
|
||||
case "Device Minor":
|
||||
if minor, err := strconv.Atoi(value); err == nil {
|
||||
currentGPU.DeviceMinor = minor
|
||||
}
|
||||
case "GPU Excluded":
|
||||
// Store as status if "Yes"
|
||||
if strings.ToLower(value) == "yes" {
|
||||
currentGPU.Status = "Excluded"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save last GPU if exists
|
||||
if currentGPU != nil {
|
||||
result.Hardware.GPUs = append(result.Hardware.GPUs, *currentGPU)
|
||||
}
|
||||
|
||||
// Create event for GPU summary
|
||||
if len(result.Hardware.GPUs) > 0 {
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "NVIDIA Driver",
|
||||
EventType: "GPU Detection",
|
||||
Description: "NVIDIA GPUs detected",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: formatGPUSummary(result.Hardware.GPUs),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// parseDriverVersion extracts NVIDIA driver version
|
||||
func parseDriverVersion(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Look for NVRM version line
|
||||
if strings.Contains(line, "NVRM version:") {
|
||||
// Extract version info
|
||||
parts := strings.Split(line, "NVRM version:")
|
||||
if len(parts) > 1 {
|
||||
version := strings.TrimSpace(parts[1])
|
||||
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "NVIDIA Driver",
|
||||
EventType: "Driver Version",
|
||||
Description: "NVIDIA driver version detected",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: version,
|
||||
})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// formatGPUSummary creates a summary string for GPUs
|
||||
func formatGPUSummary(gpus []models.GPU) string {
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
var summary strings.Builder
|
||||
for i, gpu := range gpus {
|
||||
if i > 0 {
|
||||
summary.WriteString("; ")
|
||||
}
|
||||
summary.WriteString(gpu.BDF)
|
||||
if gpu.Model != "" {
|
||||
summary.WriteString(" (")
|
||||
summary.WriteString(gpu.Model)
|
||||
summary.WriteString(")")
|
||||
}
|
||||
}
|
||||
|
||||
return summary.String()
|
||||
}
|
||||
183
internal/parser/vendors/nvidia_bug_report/memory.go
vendored
Normal file
183
internal/parser/vendors/nvidia_bug_report/memory.go
vendored
Normal file
@@ -0,0 +1,183 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parseMemoryModules extracts memory module information from dmidecode output
|
||||
func parseMemoryModules(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
var currentModule *models.MemoryDIMM
|
||||
inMemoryDevice := false
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Start of Memory Device section
|
||||
if strings.Contains(trimmed, "Memory Device") && !strings.Contains(trimmed, "Array") {
|
||||
inMemoryDevice = true
|
||||
currentModule = &models.MemoryDIMM{
|
||||
Present: true,
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// End of current section (empty line or new section)
|
||||
if inMemoryDevice && (trimmed == "" || strings.HasPrefix(trimmed, "Handle ")) {
|
||||
// Save module if it has valid data
|
||||
if currentModule != nil && currentModule.Slot != "" && currentModule.SizeMB > 0 {
|
||||
result.Hardware.Memory = append(result.Hardware.Memory, *currentModule)
|
||||
}
|
||||
inMemoryDevice = false
|
||||
currentModule = nil
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse fields within Memory Device section
|
||||
if inMemoryDevice && currentModule != nil && strings.Contains(line, ":") {
|
||||
parts := strings.SplitN(trimmed, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
if value == "" || value == "Not Specified" || value == "Unknown" || value == "NO DIMM" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Size":
|
||||
// Parse size like "64 GB" or "32768 MB"
|
||||
currentModule.SizeMB = parseMemorySize(value)
|
||||
case "Locator":
|
||||
currentModule.Slot = value
|
||||
currentModule.Location = value
|
||||
case "Bank Locator":
|
||||
// Store in location if slot is empty
|
||||
if currentModule.Location == "" {
|
||||
currentModule.Location = value
|
||||
}
|
||||
case "Type":
|
||||
currentModule.Type = value
|
||||
case "Type Detail":
|
||||
currentModule.Technology = value
|
||||
case "Speed":
|
||||
// Parse speed like "5600 MT/s"
|
||||
currentModule.MaxSpeedMHz = parseMemorySpeed(value)
|
||||
case "Configured Memory Speed":
|
||||
currentModule.CurrentSpeedMHz = parseMemorySpeed(value)
|
||||
case "Manufacturer":
|
||||
currentModule.Manufacturer = value
|
||||
case "Serial Number":
|
||||
currentModule.SerialNumber = value
|
||||
case "Part Number":
|
||||
currentModule.PartNumber = strings.TrimSpace(value)
|
||||
case "Rank":
|
||||
// Parse rank
|
||||
if rank, err := strconv.Atoi(value); err == nil {
|
||||
currentModule.Ranks = rank
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save last module if exists
|
||||
if currentModule != nil && currentModule.Slot != "" && currentModule.SizeMB > 0 {
|
||||
result.Hardware.Memory = append(result.Hardware.Memory, *currentModule)
|
||||
}
|
||||
|
||||
// Create event for memory summary
|
||||
if len(result.Hardware.Memory) > 0 {
|
||||
totalMemoryGB := 0
|
||||
for _, mem := range result.Hardware.Memory {
|
||||
totalMemoryGB += mem.SizeMB / 1024
|
||||
}
|
||||
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: time.Now(),
|
||||
Source: "DMI",
|
||||
EventType: "Memory Configuration",
|
||||
Description: "Memory modules detected",
|
||||
Severity: models.SeverityInfo,
|
||||
RawData: formatMemorySummary(result.Hardware.Memory, totalMemoryGB),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// parseMemorySize converts a dmidecode-style size string such as "64 GB",
// "32768 MB", "2 TB" or "2048 kB" into megabytes.
//
// The string must be an integer followed by a unit; the unit is matched
// case-insensitively. Kilobyte values are converted with integer division,
// so anything below 1024 kB yields 0. Input that has no unit, a non-integer
// number, or an unrecognized unit (for example "No Module Installed")
// yields 0.
func parseMemorySize(sizeStr string) int {
	fields := strings.Fields(sizeStr)
	if len(fields) < 2 {
		return 0
	}

	size, err := strconv.Atoi(fields[0])
	if err != nil {
		return 0
	}

	switch strings.ToUpper(fields[1]) {
	case "KB":
		return size / 1024
	case "MB":
		return size
	case "GB":
		return size * 1024
	case "TB":
		return size * 1024 * 1024
	default:
		return 0
	}
}
|
||||
|
||||
// parseMemorySpeed returns the leading integer of a speed string such as
// "5600 MT/s" or "4400 MHz". The unit token is not inspected. An empty
// string, or a first token that is not an integer, yields 0.
func parseMemorySpeed(speedStr string) int {
	tokens := strings.Fields(speedStr)
	if len(tokens) == 0 {
		return 0
	}
	if speed, err := strconv.Atoi(tokens[0]); err == nil {
		return speed
	}
	return 0
}
|
||||
|
||||
// formatMemorySummary creates a summary string for memory modules
|
||||
func formatMemorySummary(modules []models.MemoryDIMM, totalGB int) string {
|
||||
if len(modules) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Group by manufacturer
|
||||
manufacturerCount := make(map[string]int)
|
||||
for _, mem := range modules {
|
||||
if mem.Manufacturer != "" {
|
||||
manufacturerCount[mem.Manufacturer]++
|
||||
}
|
||||
}
|
||||
|
||||
summary := ""
|
||||
for mfr, count := range manufacturerCount {
|
||||
if summary != "" {
|
||||
summary += ", "
|
||||
}
|
||||
summary += mfr + ": " + strconv.Itoa(count) + " modules"
|
||||
}
|
||||
|
||||
if summary == "" {
|
||||
summary = strconv.Itoa(len(modules)) + " modules"
|
||||
}
|
||||
|
||||
return summary + ", Total: " + strconv.Itoa(totalGB) + " GB"
|
||||
}
|
||||
160
internal/parser/vendors/nvidia_bug_report/network_adapter.go
vendored
Normal file
160
internal/parser/vendors/nvidia_bug_report/network_adapter.go
vendored
Normal file
@@ -0,0 +1,160 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parseNetworkAdapters extracts network adapter information from lspci output
|
||||
func parseNetworkAdapters(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
var currentAdapter *models.NetworkAdapter
|
||||
inVPD := false
|
||||
currentBDF := ""
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Check if this is a new PCI device line
|
||||
re := regexp.MustCompile(`^([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.[\da-f])\s+`)
|
||||
matches := re.FindStringSubmatch(line)
|
||||
|
||||
if len(matches) > 0 {
|
||||
// Save previous adapter if exists before processing new device
|
||||
if currentAdapter != nil && currentAdapter.Model != "" {
|
||||
result.Hardware.NetworkAdapters = append(result.Hardware.NetworkAdapters, *currentAdapter)
|
||||
}
|
||||
currentAdapter = nil
|
||||
inVPD = false
|
||||
}
|
||||
|
||||
// Match PCI device line: "0000:1f:00.0 Ethernet controller [0200]: Mellanox Technologies..."
|
||||
if strings.Contains(line, "Ethernet controller") || strings.Contains(line, "Network controller") || strings.Contains(line, "Infiniband controller") {
|
||||
// Extract BDF (Bus:Device.Function)
|
||||
if len(matches) > 1 {
|
||||
currentBDF = matches[1]
|
||||
currentAdapter = &models.NetworkAdapter{
|
||||
Location: currentBDF,
|
||||
Present: true,
|
||||
}
|
||||
|
||||
// Extract vendor and device info
|
||||
// Format: "Vendor description [DeviceClass]: Vendor Name Device Name [VendorID:DeviceID]"
|
||||
re2 := regexp.MustCompile(`:\s+(.+?)\s+\[([0-9a-f]{4}):([0-9a-f]{4})\]`)
|
||||
matches2 := re2.FindStringSubmatch(line)
|
||||
if len(matches2) > 3 {
|
||||
// Parse vendor name from description
|
||||
vendorDesc := matches2[1]
|
||||
if idx := strings.Index(vendorDesc, " "); idx > 0 {
|
||||
currentAdapter.Vendor = strings.Split(vendorDesc, " ")[0]
|
||||
}
|
||||
|
||||
// Parse vendor ID and device ID
|
||||
if vendorID, err := strconv.ParseInt(matches2[2], 16, 32); err == nil {
|
||||
currentAdapter.VendorID = int(vendorID)
|
||||
}
|
||||
if deviceID, err := strconv.ParseInt(matches2[3], 16, 32); err == nil {
|
||||
currentAdapter.DeviceID = int(deviceID)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Skip if not processing an adapter
|
||||
if currentAdapter == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse Physical Slot
|
||||
if strings.HasPrefix(trimmed, "Physical Slot:") {
|
||||
slotStr := strings.TrimPrefix(trimmed, "Physical Slot:")
|
||||
currentAdapter.Slot = strings.TrimSpace(slotStr)
|
||||
continue
|
||||
}
|
||||
|
||||
// Start of Vital Product Data section
|
||||
if strings.Contains(trimmed, "Vital Product Data") {
|
||||
inVPD = true
|
||||
continue
|
||||
}
|
||||
|
||||
// End of VPD section
|
||||
if inVPD && (trimmed == "End" || strings.HasPrefix(trimmed, "Capabilities:")) {
|
||||
if trimmed == "End" {
|
||||
inVPD = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse Product Name in VPD
|
||||
if inVPD && strings.HasPrefix(trimmed, "Product Name:") {
|
||||
productName := strings.TrimPrefix(trimmed, "Product Name:")
|
||||
currentAdapter.Model = strings.TrimSpace(productName)
|
||||
|
||||
// Extract port count from model name
|
||||
if strings.Contains(currentAdapter.Model, "Dual-port") {
|
||||
currentAdapter.PortCount = 2
|
||||
} else if strings.Contains(currentAdapter.Model, "Single-port") {
|
||||
currentAdapter.PortCount = 1
|
||||
} else if strings.Contains(currentAdapter.Model, "Quad-port") {
|
||||
currentAdapter.PortCount = 4
|
||||
}
|
||||
|
||||
// Extract port type from model name
|
||||
if strings.Contains(currentAdapter.Model, "QSFP56") {
|
||||
currentAdapter.PortType = "QSFP56"
|
||||
} else if strings.Contains(currentAdapter.Model, "QSFP28") {
|
||||
currentAdapter.PortType = "QSFP28"
|
||||
} else if strings.Contains(currentAdapter.Model, "OSFP") {
|
||||
currentAdapter.PortType = "OSFP"
|
||||
} else if strings.Contains(currentAdapter.Model, "SFP") {
|
||||
currentAdapter.PortType = "SFP+"
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse VPD fields
|
||||
if inVPD && strings.HasPrefix(trimmed, "[") {
|
||||
// Match pattern: [TAG] Description: Value
|
||||
re := regexp.MustCompile(`^\[([A-Z0-9]+)\]\s+([^:]+):\s+(.+)`)
|
||||
matches := re.FindStringSubmatch(trimmed)
|
||||
if len(matches) > 3 {
|
||||
tag := matches[1]
|
||||
value := strings.TrimSpace(matches[3])
|
||||
|
||||
switch tag {
|
||||
case "PN":
|
||||
// Part number
|
||||
currentAdapter.PartNumber = value
|
||||
case "SN":
|
||||
// Serial number
|
||||
currentAdapter.SerialNumber = value
|
||||
case "EC":
|
||||
// Engineering changes - could be stored as firmware/revision
|
||||
if currentAdapter.Firmware == "" {
|
||||
currentAdapter.Firmware = value
|
||||
}
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// End of current device section (empty line followed by hex dump or new device)
|
||||
if currentAdapter != nil && trimmed == "" {
|
||||
// Check if next lines are hex dump (config space)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Save last adapter if exists
|
||||
if currentAdapter != nil && currentAdapter.Model != "" {
|
||||
result.Hardware.NetworkAdapters = append(result.Hardware.NetworkAdapters, *currentAdapter)
|
||||
}
|
||||
}
|
||||
107
internal/parser/vendors/nvidia_bug_report/parser.go
vendored
Normal file
107
internal/parser/vendors/nvidia_bug_report/parser.go
vendored
Normal file
@@ -0,0 +1,107 @@
|
||||
// Package nvidia_bug_report provides parser for NVIDIA bug report files
|
||||
// Generated by nvidia-bug-report.sh script
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
const parserVersion = "1.0.0"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
}
|
||||
|
||||
// Parser implements VendorParser for NVIDIA bug reports
|
||||
type Parser struct{}
|
||||
|
||||
// Name returns human-readable parser name
|
||||
func (p *Parser) Name() string {
|
||||
return "NVIDIA Bug Report Parser"
|
||||
}
|
||||
|
||||
// Vendor returns vendor identifier
|
||||
func (p *Parser) Vendor() string {
|
||||
return "nvidia_bug_report"
|
||||
}
|
||||
|
||||
// Version returns parser version
|
||||
func (p *Parser) Version() string {
|
||||
return parserVersion
|
||||
}
|
||||
|
||||
// Detect checks if this is an NVIDIA bug report
|
||||
// Returns confidence 0-100
|
||||
func (p *Parser) Detect(files []parser.ExtractedFile) int {
|
||||
// Only detect if there's exactly one file
|
||||
if len(files) != 1 {
|
||||
return 0
|
||||
}
|
||||
|
||||
file := files[0]
|
||||
|
||||
// Check filename
|
||||
if !strings.Contains(strings.ToLower(file.Path), "nvidia-bug-report") {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Check content markers
|
||||
content := string(file.Content)
|
||||
if !strings.Contains(content, "nvidia-bug-report.sh") ||
|
||||
!strings.Contains(content, "NVIDIA bug report log file") {
|
||||
return 0
|
||||
}
|
||||
|
||||
// High confidence for nvidia-bug-report files
|
||||
return 85
|
||||
}
|
||||
|
||||
// Parse parses NVIDIA bug report file
|
||||
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
|
||||
result := &models.AnalysisResult{
|
||||
Events: make([]models.Event, 0),
|
||||
FRU: make([]models.FRUInfo, 0),
|
||||
Sensors: make([]models.SensorReading, 0),
|
||||
}
|
||||
|
||||
// Initialize hardware config
|
||||
result.Hardware = &models.HardwareConfig{
|
||||
CPUs: make([]models.CPU, 0),
|
||||
Memory: make([]models.MemoryDIMM, 0),
|
||||
GPUs: make([]models.GPU, 0),
|
||||
PowerSupply: make([]models.PSU, 0),
|
||||
}
|
||||
|
||||
if len(files) == 0 {
|
||||
return result, nil
|
||||
}
|
||||
|
||||
content := string(files[0].Content)
|
||||
|
||||
// Parse system information
|
||||
parseSystemInfo(content, result)
|
||||
|
||||
// Parse CPU information
|
||||
parseCPUInfo(content, result)
|
||||
|
||||
// Parse memory modules
|
||||
parseMemoryModules(content, result)
|
||||
|
||||
// Parse power supplies
|
||||
parsePSUInfo(content, result)
|
||||
|
||||
// Parse GPU information
|
||||
parseGPUInfo(content, result)
|
||||
|
||||
// Parse network adapters
|
||||
parseNetworkAdapters(content, result)
|
||||
|
||||
// Parse driver version
|
||||
parseDriverVersion(content, result)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
116
internal/parser/vendors/nvidia_bug_report/psu.go
vendored
Normal file
116
internal/parser/vendors/nvidia_bug_report/psu.go
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parsePSUInfo extracts Power Supply information from dmidecode output
|
||||
func parsePSUInfo(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
var currentPSU *models.PSU
|
||||
inPowerSupply := false
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Start of System Power Supply section
|
||||
if strings.Contains(trimmed, "System Power Supply") {
|
||||
inPowerSupply = true
|
||||
currentPSU = &models.PSU{}
|
||||
continue
|
||||
}
|
||||
|
||||
// End of current section (empty line or new section with Handle)
|
||||
if inPowerSupply && (trimmed == "" || strings.HasPrefix(trimmed, "Handle ")) {
|
||||
// Save PSU if it has valid data
|
||||
if currentPSU != nil && currentPSU.Slot != "" {
|
||||
// Only add if PSU is present
|
||||
if strings.Contains(strings.ToLower(currentPSU.Status), "present") {
|
||||
result.Hardware.PowerSupply = append(result.Hardware.PowerSupply, *currentPSU)
|
||||
}
|
||||
}
|
||||
inPowerSupply = false
|
||||
currentPSU = nil
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse fields within System Power Supply section
|
||||
if inPowerSupply && currentPSU != nil && strings.Contains(line, ":") {
|
||||
parts := strings.SplitN(trimmed, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
if value == "" || value == "Not Specified" || value == "Unknown" || value == "UNKNOWN" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Location":
|
||||
currentPSU.Slot = value
|
||||
case "Name":
|
||||
// Use Name as Model if Model is not set later
|
||||
if currentPSU.Model == "" {
|
||||
currentPSU.Model = value
|
||||
}
|
||||
case "Manufacturer":
|
||||
currentPSU.Vendor = value
|
||||
case "Serial Number":
|
||||
currentPSU.SerialNumber = value
|
||||
case "Model Part Number":
|
||||
// Use Model Part Number as the primary model identifier
|
||||
currentPSU.Model = value
|
||||
case "Revision":
|
||||
currentPSU.Firmware = value
|
||||
case "Max Power Capacity":
|
||||
// Parse wattage like "2700 W"
|
||||
if wattage := parsePowerWattage(value); wattage > 0 {
|
||||
currentPSU.WattageW = wattage
|
||||
}
|
||||
case "Status":
|
||||
currentPSU.Status = value
|
||||
case "Type":
|
||||
// Could store PSU type if needed (e.g., "Switching")
|
||||
case "Plugged":
|
||||
// Could track if PSU is plugged
|
||||
case "Hot Replaceable":
|
||||
// Could track if hot-swappable
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Save last PSU if exists
|
||||
if currentPSU != nil && currentPSU.Slot != "" {
|
||||
if strings.Contains(strings.ToLower(currentPSU.Status), "present") {
|
||||
result.Hardware.PowerSupply = append(result.Hardware.PowerSupply, *currentPSU)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parsePowerWattage reads the leading integer of a power capacity string
// such as "2700 W" or "1200 Watts" and returns it. The unit token is not
// inspected; the number is taken to be watts as-is. Empty input, or a first
// token that is not an integer, yields 0.
func parsePowerWattage(powerStr string) int {
	tokens := strings.Fields(powerStr)
	if len(tokens) == 0 {
		return 0
	}
	if watts, err := strconv.Atoi(tokens[0]); err == nil {
		return watts
	}
	return 0
}
|
||||
61
internal/parser/vendors/nvidia_bug_report/system_info.go
vendored
Normal file
61
internal/parser/vendors/nvidia_bug_report/system_info.go
vendored
Normal file
@@ -0,0 +1,61 @@
|
||||
package nvidia_bug_report
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
// parseSystemInfo extracts System Information from dmidecode output
|
||||
func parseSystemInfo(content string, result *models.AnalysisResult) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(content))
|
||||
|
||||
inSystemInfo := false
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
trimmed := strings.TrimSpace(line)
|
||||
|
||||
// Start of System Information section
|
||||
if trimmed == "System Information" {
|
||||
inSystemInfo = true
|
||||
continue
|
||||
}
|
||||
|
||||
// End of section (empty line or new Handle)
|
||||
if inSystemInfo && (trimmed == "" || strings.HasPrefix(trimmed, "Handle ")) {
|
||||
inSystemInfo = false
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse fields within System Information section
|
||||
if inSystemInfo && strings.Contains(line, ":") {
|
||||
parts := strings.SplitN(trimmed, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
field := strings.TrimSpace(parts[0])
|
||||
value := strings.TrimSpace(parts[1])
|
||||
|
||||
// Skip empty, NULL, or "Not specified" values
|
||||
if value == "" || value == "NULL" || value == "Not specified" || value == "Not Specified" {
|
||||
continue
|
||||
}
|
||||
|
||||
switch field {
|
||||
case "Manufacturer":
|
||||
result.Hardware.BoardInfo.Manufacturer = value
|
||||
case "Product Name":
|
||||
result.Hardware.BoardInfo.ProductName = value
|
||||
case "Version":
|
||||
result.Hardware.BoardInfo.Version = value
|
||||
case "Serial Number":
|
||||
result.Hardware.BoardInfo.SerialNumber = value
|
||||
case "UUID":
|
||||
result.Hardware.BoardInfo.UUID = value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
133
internal/parser/vendors/supermicro/README.md
vendored
Normal file
133
internal/parser/vendors/supermicro/README.md
vendored
Normal file
@@ -0,0 +1,133 @@
|
||||
# SMC Crash Dump Parser
|
||||
|
||||
Парсер для архивов Supermicro (SMC) BMC Crash Dump.
|
||||
|
||||
## Поддерживаемые серверы
|
||||
|
||||
- Supermicro SYS-821GE-TNHR
|
||||
- Другие серверы Supermicro с BMC Crashdump функциональностью
|
||||
|
||||
## Формат архива
|
||||
|
||||
Парсер работает с архивами в формате:
|
||||
- `.tgz` / `.tar.gz` (сжатый tar)
|
||||
- `.tar` (несжатый tar)
|
||||
|
||||
## Распознаваемые файлы
|
||||
|
||||
### Основные файлы
|
||||
|
||||
1. **CDump.txt** - JSON файл с данными crashdump
|
||||
- Metadata (BMC, BIOS, ME версии firmware)
|
||||
- CPU информация (CPUID, количество ядер, microcode версия, PPIN)
|
||||
- MCA (Machine Check Architecture) данные - ошибки процессоров
|
||||
|
||||
## Извлекаемые данные
|
||||
|
||||
### Hardware Configuration
|
||||
|
||||
#### CPUs
|
||||
```json
|
||||
{
|
||||
"slot": "CPU0",
|
||||
"model": "CPUID: 0xc06f2",
|
||||
"cores": 56,
|
||||
"manufacturer": "Intel",
|
||||
"firmware": "Microcode: 0x210002b3"
|
||||
}
|
||||
```
|
||||
|
||||
### FRU Information
|
||||
|
||||
- BMC Firmware Version
|
||||
- BIOS Version
|
||||
- ME Firmware Version
|
||||
- CPU PPIN (Protected Processor Inventory Number)
|
||||
|
||||
### Events
|
||||
|
||||
События создаются для:
|
||||
- **Crashdump collection** - когда был собран crashdump
|
||||
- **MCA Errors** - ошибки Machine Check Architecture
|
||||
- Corrected errors (Warning severity)
|
||||
- Uncorrected errors (Critical severity)
|
||||
|
||||
Уровни severity:
|
||||
- `info` - информационные события (crashdump по запросу)
|
||||
- `warning` - предупреждения (corrected MCA errors, reset detected)
|
||||
- `critical` - критические ошибки (uncorrected MCA errors)
|
||||
|
||||
## Пример использования
|
||||
|
||||
```bash
|
||||
# Запуск веб-интерфейса
|
||||
./logpile --file /path/to/CDump_090859_01302026.tgz
|
||||
|
||||
# Веб-интерфейс будет доступен на http://localhost:8082
|
||||
```
|
||||
|
||||
## Автоопределение
|
||||
|
||||
Парсер автоматически определяет архивы SMC Crash Dump по наличию:
|
||||
- `CDump.txt` с маркерами "crash_data", "METADATA", "bmc_fw_ver"
|
||||
|
||||
Confidence score:
|
||||
- `CDump.txt` с маркерами crashdump: +80
|
||||
|
||||
## Версионирование
|
||||
|
||||
**Текущая версия парсера:** 1.0.0
|
||||
|
||||
При модификации логики парсера необходимо увеличивать версию в константе `parserVersion` в файле `parser.go`.
|
||||
|
||||
## Примеры данных
|
||||
|
||||
### Пример CDump.txt (metadata)
|
||||
```json
|
||||
{
|
||||
"crash_data": {
|
||||
"METADATA": {
|
||||
"cpu0": {
|
||||
"cpuid": "0xc06f2",
|
||||
"core_count": "0x38",
|
||||
"ppin": "0xa3ccbe7d45026592",
|
||||
"ucode_patch_ver": "0x210002b3"
|
||||
},
|
||||
"bmc_fw_ver": "01.03.18",
|
||||
"bios_id": "BIOS Date: 08/04/2025 Rev 2.7",
|
||||
"me_fw_ver": "6.1.4.204",
|
||||
"timestamp": "2026-01-30T09:06:52Z",
|
||||
"trigger_type": "On-Demand"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### MCA Error Detection
|
||||
|
||||
Парсер проверяет регистры MCA status на наличие ошибок:
|
||||
- Bit 63 (Valid) - индикатор валидной ошибки
|
||||
- Bit 61 (UC) - uncorrected error
|
||||
- Bit 60 (EN) - error enabled (в текущей версии парсер этот бит не проверяет)
|
||||
|
||||
## Известные ограничения
|
||||
|
||||
1. Парсер фокусируется на данных из `CDump.txt`
|
||||
2. Детальный анализ MCA errors пока упрощен (только проверка status регистров)
|
||||
3. TOR dump и другие расширенные данные пока не парсятся
|
||||
|
||||
## Разработка
|
||||
|
||||
### Добавление новых полей
|
||||
|
||||
1. Изучите структуру JSON в CDump.txt
|
||||
2. Добавьте поля в структуры `Metadata`, `CPUMetadata`, или `MCAData`
|
||||
3. Обновите функции парсинга
|
||||
4. Увеличьте версию парсера
|
||||
|
||||
### Расширение MCA анализа
|
||||
|
||||
Для более детального анализа MCA ошибок можно:
|
||||
1. Добавить декодирование MCA error codes
|
||||
2. Парсить MISC и ADDR регистры
|
||||
3. Добавить корреляцию ошибок между банками
|
||||
261
internal/parser/vendors/supermicro/crashdump.go
vendored
Normal file
261
internal/parser/vendors/supermicro/crashdump.go
vendored
Normal file
@@ -0,0 +1,261 @@
|
||||
package supermicro
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
type (
	// CrashDumpData mirrors the top-level JSON document stored in CDump.txt.
	CrashDumpData struct {
		CrashData struct {
			METADATA   Metadata       `json:"METADATA"`
			PROCESSORS ProcessorsData `json:"PROCESSORS"`
		} `json:"crash_data"`
	}

	// ProcessorsData is the "PROCESSORS" object: a version string plus
	// per-socket crash data for cpu0 and cpu1.
	ProcessorsData struct {
		Version string     `json:"_version"`
		CPU0    Processors `json:"cpu0"`
		CPU1    Processors `json:"cpu1"`
	}

	// Metadata is the "METADATA" object: per-socket CPU details, firmware
	// versions and information about how the crashdump was collected.
	// All values are kept as the raw strings found in the JSON.
	Metadata struct {
		CPU0          CPUMetadata `json:"cpu0"`
		CPU1          CPUMetadata `json:"cpu1"`
		BMCFWVer      string      `json:"bmc_fw_ver"`
		BIOSId        string      `json:"bios_id"`
		MEFWVer       string      `json:"me_fw_ver"`
		Timestamp     string      `json:"timestamp"`
		TriggerType   string      `json:"trigger_type"`
		PlatformName  string      `json:"platform_name"`
		CrashdumpVer  string      `json:"crashdump_ver"`
		ResetDetected string      `json:"_reset_detected"`
	}

	// CPUMetadata describes one CPU socket inside Metadata.
	CPUMetadata struct {
		CPUID         string `json:"cpuid"`
		CoreMask      string `json:"core_mask"`
		CHACount      string `json:"cha_count"`
		CoreCount     string `json:"core_count"`
		PPIN          string `json:"ppin"`
		UcodePatchVer string `json:"ucode_patch_ver"`
	}

	// Processors is the crash data recorded for one CPU socket.
	Processors struct {
		MCA MCAData `json:"MCA"`
	}

	// MCAData holds Machine Check Architecture data. Uncore is left
	// undecoded as a generic JSON object keyed by bank name.
	MCAData struct {
		Uncore map[string]interface{} `json:"uncore"`
	}
)
|
||||
|
||||
// ParseCrashDump parses CDump.txt file
|
||||
func ParseCrashDump(content []byte, result *models.AnalysisResult) error {
|
||||
var data CrashDumpData
|
||||
if err := json.Unmarshal(content, &data); err != nil {
|
||||
return fmt.Errorf("failed to parse CDump.txt: %w", err)
|
||||
}
|
||||
|
||||
// Initialize Hardware.Firmware slice if nil
|
||||
if result.Hardware.Firmware == nil {
|
||||
result.Hardware.Firmware = make([]models.FirmwareInfo, 0)
|
||||
}
|
||||
|
||||
// Parse metadata
|
||||
parseMetadata(&data.CrashData.METADATA, result)
|
||||
|
||||
// Parse CPU information
|
||||
parseCPUInfo(&data.CrashData.METADATA, result)
|
||||
|
||||
// Parse MCA errors
|
||||
parseMCAErrors(&data.CrashData, result)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseMetadata extracts metadata information
|
||||
func parseMetadata(metadata *Metadata, result *models.AnalysisResult) {
|
||||
// Store firmware versions in HardwareConfig.Firmware
|
||||
if metadata.BMCFWVer != "" {
|
||||
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
||||
DeviceName: "BMC",
|
||||
Version: metadata.BMCFWVer,
|
||||
})
|
||||
}
|
||||
|
||||
if metadata.BIOSId != "" {
|
||||
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
||||
DeviceName: "BIOS",
|
||||
Version: metadata.BIOSId,
|
||||
})
|
||||
}
|
||||
|
||||
if metadata.MEFWVer != "" {
|
||||
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
||||
DeviceName: "ME",
|
||||
Version: metadata.MEFWVer,
|
||||
})
|
||||
}
|
||||
|
||||
// Create event for crashdump trigger
|
||||
timestamp := time.Now()
|
||||
if metadata.Timestamp != "" {
|
||||
if t, err := time.Parse(time.RFC3339, metadata.Timestamp); err == nil {
|
||||
timestamp = t
|
||||
}
|
||||
}
|
||||
|
||||
triggerType := metadata.TriggerType
|
||||
if triggerType == "" {
|
||||
triggerType = "Unknown"
|
||||
}
|
||||
|
||||
severity := models.SeverityInfo
|
||||
if metadata.ResetDetected != "" && metadata.ResetDetected != "NONE" {
|
||||
severity = models.SeverityWarning
|
||||
}
|
||||
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: timestamp,
|
||||
Source: "Crashdump",
|
||||
EventType: "System Crashdump",
|
||||
Description: fmt.Sprintf("Crashdump collected (%s)", triggerType),
|
||||
Severity: severity,
|
||||
RawData: fmt.Sprintf("Version: %s, Reset: %s", metadata.CrashdumpVer, metadata.ResetDetected),
|
||||
})
|
||||
}
|
||||
|
||||
// parseCPUInfo extracts CPU information
|
||||
func parseCPUInfo(metadata *Metadata, result *models.AnalysisResult) {
|
||||
cpus := []struct {
|
||||
socket int
|
||||
data CPUMetadata
|
||||
}{
|
||||
{0, metadata.CPU0},
|
||||
{1, metadata.CPU1},
|
||||
}
|
||||
|
||||
for _, cpu := range cpus {
|
||||
if cpu.data.CPUID == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse core count
|
||||
coreCount := 0
|
||||
if cpu.data.CoreCount != "" {
|
||||
if count, err := strconv.ParseInt(strings.TrimPrefix(cpu.data.CoreCount, "0x"), 16, 64); err == nil {
|
||||
coreCount = int(count)
|
||||
}
|
||||
}
|
||||
|
||||
cpuModel := models.CPU{
|
||||
Socket: cpu.socket,
|
||||
Model: fmt.Sprintf("Intel CPU (CPUID: %s)", cpu.data.CPUID),
|
||||
Cores: coreCount,
|
||||
}
|
||||
|
||||
// Add PPIN
|
||||
if cpu.data.PPIN != "" && cpu.data.PPIN != "0x0" {
|
||||
cpuModel.PPIN = cpu.data.PPIN
|
||||
}
|
||||
|
||||
result.Hardware.CPUs = append(result.Hardware.CPUs, cpuModel)
|
||||
|
||||
// Add microcode version to firmware list
|
||||
if cpu.data.UcodePatchVer != "" {
|
||||
result.Hardware.Firmware = append(result.Hardware.Firmware, models.FirmwareInfo{
|
||||
DeviceName: fmt.Sprintf("CPU%d Microcode", cpu.socket),
|
||||
Version: cpu.data.UcodePatchVer,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseMCAErrors extracts Machine Check Architecture errors
|
||||
func parseMCAErrors(crashData *struct {
|
||||
METADATA Metadata `json:"METADATA"`
|
||||
PROCESSORS ProcessorsData `json:"PROCESSORS"`
|
||||
}, result *models.AnalysisResult) {
|
||||
timestamp := time.Now()
|
||||
if crashData.METADATA.Timestamp != "" {
|
||||
if t, err := time.Parse(time.RFC3339, crashData.METADATA.Timestamp); err == nil {
|
||||
timestamp = t
|
||||
}
|
||||
}
|
||||
|
||||
// Parse each CPU's MCA data
|
||||
cpuProcs := []struct {
|
||||
name string
|
||||
data Processors
|
||||
}{
|
||||
{"cpu0", crashData.PROCESSORS.CPU0},
|
||||
{"cpu1", crashData.PROCESSORS.CPU1},
|
||||
}
|
||||
|
||||
for _, cpu := range cpuProcs {
|
||||
if cpu.data.MCA.Uncore == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check each MCA bank for errors
|
||||
for bankName, bankDataRaw := range cpu.data.MCA.Uncore {
|
||||
bankData, ok := bankDataRaw.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for status register
|
||||
statusKey := strings.ToLower(bankName) + "_status"
|
||||
statusRaw, ok := bankData[statusKey]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr, ok := statusRaw.(string)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse status value
|
||||
status, err := strconv.ParseUint(strings.TrimPrefix(statusStr, "0x"), 16, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if MCA error is valid (bit 63 = Valid)
|
||||
if status&(1<<63) != 0 {
|
||||
// MCA error detected
|
||||
severity := models.SeverityWarning
|
||||
if status&(1<<61) != 0 { // UC bit = uncorrected error
|
||||
severity = models.SeverityCritical
|
||||
}
|
||||
|
||||
description := fmt.Sprintf("MCA Error in %s bank %s", cpu.name, bankName)
|
||||
if status&(1<<61) != 0 {
|
||||
description += " (Uncorrected)"
|
||||
} else {
|
||||
description += " (Corrected)"
|
||||
}
|
||||
|
||||
result.Events = append(result.Events, models.Event{
|
||||
Timestamp: timestamp,
|
||||
Source: "MCA",
|
||||
EventType: "Machine Check",
|
||||
Description: description,
|
||||
Severity: severity,
|
||||
RawData: fmt.Sprintf("Status: %s, CPU: %s, Bank: %s", statusStr, cpu.name, bankName),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
98
internal/parser/vendors/supermicro/parser.go
vendored
Normal file
98
internal/parser/vendors/supermicro/parser.go
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
// Package supermicro provides parser for Supermicro BMC crashdump archives
|
||||
// Tested with: Supermicro SYS-821GE-TNHR (Crashdump format)
|
||||
//
|
||||
// IMPORTANT: Increment parserVersion when modifying parser logic!
|
||||
// This helps track which version was used to parse specific logs.
|
||||
package supermicro
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
// IMPORTANT: Increment this version when making changes to parser logic!
|
||||
const parserVersion = "1.0.0"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
}
|
||||
|
||||
// Parser implements VendorParser for Supermicro servers
|
||||
type Parser struct{}
|
||||
|
||||
// Name returns human-readable parser name
|
||||
func (p *Parser) Name() string {
|
||||
return "SMC Crash Dump Parser"
|
||||
}
|
||||
|
||||
// Vendor returns vendor identifier
|
||||
func (p *Parser) Vendor() string {
|
||||
return "supermicro"
|
||||
}
|
||||
|
||||
// Version returns parser version
|
||||
// IMPORTANT: Update parserVersion constant when modifying parser logic!
|
||||
func (p *Parser) Version() string {
|
||||
return parserVersion
|
||||
}
|
||||
|
||||
// Detect checks if archive matches Supermicro crashdump format
|
||||
// Returns confidence 0-100
|
||||
func (p *Parser) Detect(files []parser.ExtractedFile) int {
|
||||
confidence := 0
|
||||
|
||||
for _, f := range files {
|
||||
path := strings.ToLower(f.Path)
|
||||
|
||||
// Strong indicator for Supermicro Crashdump format
|
||||
if strings.HasSuffix(path, "cdump.txt") {
|
||||
// Check if it's really Supermicro crashdump format
|
||||
if containsCrashdumpMarkers(f.Content) {
|
||||
confidence += 80
|
||||
}
|
||||
}
|
||||
|
||||
// Cap at 100
|
||||
if confidence >= 100 {
|
||||
return 100
|
||||
}
|
||||
}
|
||||
|
||||
return confidence
|
||||
}
|
||||
|
||||
// containsCrashdumpMarkers reports whether content carries the markers of a
// Supermicro crashdump: it must contain both "crash_data" and "METADATA",
// plus at least one of "bmc_fw_ver" or "crashdump_ver". Matching is
// case-sensitive. Nil or empty content yields false.
//
// The search runs directly on the byte slice so that a large dump is not
// copied into a string just to be scanned.
func containsCrashdumpMarkers(content []byte) bool {
	// Both structural markers are mandatory.
	if !bytes.Contains(content, []byte("crash_data")) ||
		!bytes.Contains(content, []byte("METADATA")) {
		return false
	}

	// Either version field is sufficient.
	return bytes.Contains(content, []byte("bmc_fw_ver")) ||
		bytes.Contains(content, []byte("crashdump_ver"))
}
|
||||
|
||||
// Parse parses Supermicro crashdump archive
|
||||
func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, error) {
|
||||
result := &models.AnalysisResult{
|
||||
Events: make([]models.Event, 0),
|
||||
FRU: make([]models.FRUInfo, 0),
|
||||
Sensors: make([]models.SensorReading, 0),
|
||||
}
|
||||
|
||||
// Initialize hardware config
|
||||
result.Hardware = &models.HardwareConfig{
|
||||
CPUs: make([]models.CPU, 0),
|
||||
}
|
||||
|
||||
// Parse CDump.txt (JSON crashdump)
|
||||
if f := parser.FindFileByName(files, "CDump.txt"); f != nil {
|
||||
if err := ParseCrashDump(f.Content, result); err != nil {
|
||||
// Log error but continue parsing other files
|
||||
_ = err // Ignore error for now
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
7
internal/parser/vendors/vendors.go
vendored
7
internal/parser/vendors/vendors.go
vendored
@@ -5,9 +5,14 @@ package vendors
|
||||
import (
|
||||
// Import vendor modules to trigger their init() registration
|
||||
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/inspur"
|
||||
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/nvidia"
|
||||
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/nvidia_bug_report"
|
||||
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/supermicro"
|
||||
|
||||
// Generic fallback parser (must be last for lowest priority)
|
||||
_ "git.mchus.pro/mchus/logpile/internal/parser/vendors/generic"
|
||||
|
||||
// Future vendors:
|
||||
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/supermicro"
|
||||
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/dell"
|
||||
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/hpe"
|
||||
// _ "git.mchus.pro/mchus/logpile/internal/parser/vendors/lenovo"
|
||||
|
||||
Reference in New Issue
Block a user