Files
logpile/internal/parser/vendors/nvidia/summary.go

274 lines
7.6 KiB
Go

package nvidia
import (
"encoding/csv"
"encoding/json"
"fmt"
"regexp"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
)
// SummaryEntry represents a single test result entry decoded from summary.json.
// The JSON keys contain spaces, so every field carries an explicit json tag.
type SummaryEntry struct {
// ErrorCode is the diagnostic result code, e.g. "001-000-1-000000000000".
ErrorCode string `json:"Error Code"`
// Test is the name of the diagnostic test; it is used as the event type.
Test string `json:"Test"`
// ComponentID identifies the tested component; GPU entries look like
// "SXM5_SN_1653925025497".
ComponentID string `json:"Component ID"`
// Notes holds the result text; the literal "OK" marks a passing entry.
Notes string `json:"Notes"`
// VirtualID is used in descriptions when ComponentID is empty.
VirtualID string `json:"Virtual ID"`
// IgnoreError is decoded but not read by the functions in this file.
IgnoreError string `json:"Ignore Error"`
}
// gpuComponentIDRegex matches GPU component IDs of the form
// "SXM<index>_SN_<serial>" (e.g. "SXM5_SN_1653925025497"). Capture group 1 is
// the numeric SXM index and capture group 2 is everything after "_SN_", which
// ApplyGPUStatuses treats as the GPU serial number.
var gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
// ParseSummaryJSON decodes the contents of a summary.json file and converts
// its noteworthy entries into events.
//
// An entry is skipped only when its notes are exactly "OK" and its error code
// is exactly "001-000-1-000000000000"; every other entry produces one event.
// The result is nil when content is not a valid JSON array of entries, and a
// non-nil (possibly empty) slice otherwise.
func ParseSummaryJSON(content []byte) []models.Event {
	var parsed []SummaryEntry
	if json.Unmarshal(content, &parsed) != nil {
		return nil
	}

	// The summary carries no per-test timestamps, so all events share the
	// moment of parsing.
	now := time.Now()
	out := []models.Event{}
	for _, rec := range parsed {
		if rec.Notes == "OK" && rec.ErrorCode == "001-000-1-000000000000" {
			continue // clean pass, nothing to report
		}
		out = append(out, models.Event{
			Timestamp:   now,
			Source:      "GPU Field Diagnostics",
			EventType:   rec.Test,
			Description: formatSummaryDescription(rec),
			Severity:    getSeverityFromErrorCode(rec.ErrorCode, rec.Notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", rec.Test, rec.ComponentID, rec.ErrorCode),
		})
	}
	return out
}
// ParseSummaryCSV parses the contents of a summary.csv file and converts its
// noteworthy rows into events.
//
// The first row is treated as a header and skipped. Rows with fewer than
// seven fields are ignored. A row is skipped as a clean pass when its notes
// are exactly "OK" and its error code is "0" or starts with "048-000-0" or
// "001-000-1"; every other row produces one event.
//
// The result is nil when content is not parseable as CSV, and a non-nil
// (possibly empty) slice otherwise.
func ParseSummaryCSV(content []byte) []models.Event {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Allow rows with differing field counts. With the default setting the
	// reader adopts the header's field count and ReadAll fails on the first
	// row that deviates, which would discard the entire file. Short rows are
	// filtered individually below instead.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	// The CSV carries no per-test timestamps, so all events share the moment
	// of parsing.
	timestamp := time.Now()

	for i, record := range records {
		// Row 0 is the header.
		// CSV format: ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
		if i == 0 || len(record) < 7 {
			continue
		}

		errorCode := record[0]
		test := record[1]
		componentID := record[5]
		notes := record[6]

		benignCode := errorCode == "0" ||
			strings.HasPrefix(errorCode, "048-000-0") ||
			strings.HasPrefix(errorCode, "001-000-1")
		if notes == "OK" && benignCode {
			continue // clean pass, nothing to report
		}

		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   test,
			Description: formatCSVDescription(test, componentID, notes, errorCode),
			Severity:    getSeverityFromErrorCode(errorCode, notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
		})
	}
	return events
}
// CollectGPUStatusesFromSummaryJSON aggregates a PASS/FAIL verdict per GPU
// from the contents of a summary.json file.
//
// Only entries whose whitespace-trimmed component ID matches
// gpuComponentIDRegex are considered. The returned map is keyed by that
// component ID (e.g. "SXM5_SN_1653925025497"); a GPU is "FAIL" as soon as any
// of its entries is not passing. The result is nil when content cannot be
// decoded.
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
	var parsed []SummaryEntry
	if json.Unmarshal(content, &parsed) != nil {
		return nil
	}

	verdicts := map[string]string{}
	for _, rec := range parsed {
		id := strings.TrimSpace(rec.ComponentID)
		if id != "" && gpuComponentIDRegex.MatchString(id) {
			outcome := "FAIL"
			if isSummaryJSONRecordPassing(rec.ErrorCode, rec.Notes) {
				outcome = "PASS"
			}
			verdicts[id] = mergeGPUStatus(verdicts[id], outcome)
		}
	}
	return verdicts
}
// CollectGPUStatusesFromSummaryCSV aggregates a PASS/FAIL verdict per GPU from
// the contents of a summary.csv file.
//
// The first row is treated as a header and skipped, as are rows with fewer
// than seven fields. Only rows whose whitespace-trimmed component ID (column
// 6) matches gpuComponentIDRegex are considered. The returned map is keyed by
// that component ID (e.g. "SXM5_SN_1653925025497"); a GPU is "FAIL" as soon as
// any of its rows is not passing. The result is nil when content is not
// parseable as CSV.
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Allow rows with differing field counts. With the default setting the
	// reader adopts the header's field count and ReadAll fails on the first
	// row that deviates, which would discard every status in the file. Short
	// rows are filtered individually below instead.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	statuses := make(map[string]string)
	for i, record := range records {
		// Row 0 is the header; columns 0, 5 and 6 are read below.
		if i == 0 || len(record) < 7 {
			continue
		}

		component := strings.TrimSpace(record[5])
		if component == "" || !gpuComponentIDRegex.MatchString(component) {
			continue
		}

		errorCode := strings.TrimSpace(record[0])
		notes := strings.TrimSpace(record[6])

		next := "PASS"
		if !isSummaryCSVRecordPassing(errorCode, notes) {
			next = "FAIL"
		}
		statuses[component] = mergeGPUStatus(statuses[component], next)
	}
	return statuses
}
// isSummaryJSONRecordPassing reports whether a summary.json record counts as
// passed. Only notes decides the outcome: the record passes when notes, with
// surrounding whitespace removed, equals "OK". errorCode is accepted for
// signature symmetry but is not consulted.
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
	switch strings.TrimSpace(notes) {
	case "OK":
		return true
	default:
		return false
	}
}
// isSummaryCSVRecordPassing reports whether a summary.csv row counts as
// passed. Only notes decides the outcome: the row passes when notes, with
// surrounding whitespace removed, equals "OK". errorCode is accepted for
// signature symmetry but is not consulted.
func isSummaryCSVRecordPassing(errorCode, notes string) bool {
	switch strings.TrimSpace(notes) {
	case "OK":
		return true
	default:
		return false
	}
}
// mergeGPUStatus combines two status values into one. "FAIL" outranks "PASS":
// if either input is "FAIL" the result is "FAIL", otherwise if either is
// "PASS" the result is "PASS". Any other combination yields "".
func mergeGPUStatus(current, next string) string {
	// Candidates are listed from highest to lowest priority.
	for _, verdict := range [...]string{"FAIL", "PASS"} {
		if current == verdict || next == verdict {
			return verdict
		}
	}
	return ""
}
// ApplyGPUStatuses writes the aggregated PASS/FAIL verdicts collected from a
// summary file onto the GPUs in result.
//
// componentStatuses is keyed by component ID; keys that do not match
// gpuComponentIDRegex are ignored. Each GPU is matched first by its trimmed
// serial number and, if that yields nothing, by its trimmed slot name, which
// is compared against "GPUSXM<index>". GPUs without a match keep their
// current status. The call is a no-op when result or its hardware section is
// nil, or when there are no GPUs or no statuses.
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) {
	if result == nil || result.Hardware == nil {
		return
	}
	if len(result.Hardware.GPUs) == 0 || len(componentStatuses) == 0 {
		return
	}

	// Index the verdicts two ways so GPUs can be resolved by serial or slot.
	bySlot := map[string]string{}   // key: GPUSXM<idx>
	bySerial := map[string]string{} // key: GPU serial
	for id, verdict := range componentStatuses {
		parts := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(id))
		if len(parts) != 3 {
			continue
		}
		slot := "GPUSXM" + parts[1]
		bySlot[slot] = mergeGPUStatus(bySlot[slot], verdict)
		if serial := strings.TrimSpace(parts[2]); serial != "" {
			bySerial[serial] = mergeGPUStatus(bySerial[serial], verdict)
		}
	}

	for idx := range result.Hardware.GPUs {
		gpu := &result.Hardware.GPUs[idx]

		var resolved string
		if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
			resolved = bySerial[serial]
		}
		if resolved == "" {
			// Fall back to the slot name when the serial is unknown.
			resolved = bySlot[strings.TrimSpace(gpu.Slot)]
		}
		if resolved == "" {
			continue
		}
		gpu.Status = resolved
	}
}
// formatSummaryDescription renders a human-readable sentence for a
// summary.json entry. The component is named by ComponentID, or by VirtualID
// when ComponentID is empty. Entries whose notes are exactly "OK" are
// described as passed; all others as failed, with notes and error code
// included.
func formatSummaryDescription(entry SummaryEntry) string {
	target := entry.VirtualID
	if entry.ComponentID != "" {
		target = entry.ComponentID
	}

	switch entry.Notes {
	case "OK":
		return fmt.Sprintf("%s test passed for %s", entry.Test, target)
	default:
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, target, entry.Notes, entry.ErrorCode)
	}
}
// formatCSVDescription renders a human-readable sentence for a summary.csv
// row. Rows whose notes are exactly "OK" are described as passed; all others
// as failed, with notes and error code included.
func formatCSVDescription(test, component, notes, errorCode string) string {
	if notes != "OK" {
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode)
	}
	return fmt.Sprintf("%s test passed for %s", test, component)
}
// getSeverityFromErrorCode maps a test result to an event severity. Rules are
// evaluated in order and the first match wins:
//
//  1. notes is exactly "OK"                               -> info
//  2. notes contains "Row remapping failed"               -> warning
//  3. errorCode is empty or "0"                           -> info
//  4. errorCode starts with "001-000-1" or "048-000-0"    -> info
//  5. the first three characters of errorCode are >= "300" -> critical
//  6. anything else                                       -> warning
//
// Rule 5 is a lexicographic string comparison, not a numeric one, so a code
// whose first three characters are not digits can also satisfy it.
func getSeverityFromErrorCode(errorCode, notes string) models.Severity {
	switch {
	case notes == "OK":
		return models.SeverityInfo
	case strings.Contains(notes, "Row remapping failed"):
		return models.SeverityWarning
	case errorCode == "", errorCode == "0":
		return models.SeverityInfo
	case strings.HasPrefix(errorCode, "001-000-1"), strings.HasPrefix(errorCode, "048-000-0"):
		return models.SeverityInfo
	case len(errorCode) >= 3 && errorCode[:3] >= "300":
		return models.SeverityCritical
	default:
		return models.SeverityWarning
	}
}