package nvidia

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"regexp"
	"strings"
	"time"

	"git.mchus.pro/mchus/logpile/internal/models"
)

// SummaryEntry represents a single test result entry as stored in summary.json.
type SummaryEntry struct {
	ErrorCode   string `json:"Error Code"`
	Test        string `json:"Test"`
	ComponentID string `json:"Component ID"`
	Notes       string `json:"Notes"`
	VirtualID   string `json:"Virtual ID"`
	IgnoreError string `json:"Ignore Error"`
}

// gpuComponentIDRegex matches component IDs of the form "SXM<digits>_SN_<serial>"
// and captures the digits and the serial.
var gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)

// ParseSummaryJSON parses a summary.json file and returns one event per entry
// that is not a plain success.
//
// It returns nil when content is not valid JSON for a list of SummaryEntry and
// a non-nil (possibly empty) slice otherwise.
func ParseSummaryJSON(content []byte) []models.Event {
	var entries []SummaryEntry
	if err := json.Unmarshal(content, &entries); err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	// Use current time as we don't have exact timestamps in summary.
	timestamp := time.Now()

	for _, entry := range entries {
		// Normalize the fields used for classification so that padded values
		// are treated the same way as in isSummaryJSONRecordPassing.
		entry.Notes = strings.TrimSpace(entry.Notes)
		entry.ErrorCode = strings.TrimSpace(entry.ErrorCode)

		// Only create events for failures or warnings.
		if entry.Notes == "OK" && entry.ErrorCode == "001-000-1-000000000000" {
			continue
		}

		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   entry.Test,
			Description: formatSummaryDescription(entry),
			Severity:    getSeverityFromErrorCode(entry.ErrorCode, entry.Notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", entry.Test, entry.ComponentID, entry.ErrorCode),
		})
	}

	return events
}

// ParseSummaryCSV parses a summary.csv file and returns one event per record
// that is not a plain success.
//
// The first record is treated as the header and skipped. Records with fewer
// than seven fields are skipped. It returns nil when the content cannot be
// parsed as CSV and a non-nil (possibly empty) slice otherwise.
func ParseSummaryCSV(content []byte) []models.Event {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Disable the per-record field-count check: without this a single ragged
	// row makes ReadAll fail and the whole file is discarded. Short rows are
	// filtered individually below instead.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	timestamp := time.Now()

	for i, record := range records {
		if i == 0 {
			continue // Skip header row.
		}

		// CSV format: ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
		if len(record) < 7 {
			continue
		}

		// Trim the fields used for classification, matching what
		// CollectGPUStatusesFromSummaryCSV does for the same columns.
		errorCode := strings.TrimSpace(record[0])
		test := record[1]
		componentID := record[5]
		notes := strings.TrimSpace(record[6])

		// Only create events for failures or warnings.
		benignCode := errorCode == "0" ||
			strings.HasPrefix(errorCode, "048-000-0") ||
			strings.HasPrefix(errorCode, "001-000-1")
		if notes == "OK" && benignCode {
			continue
		}

		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   test,
			Description: formatCSVDescription(test, componentID, notes, errorCode),
			Severity:    getSeverityFromErrorCode(errorCode, notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
		})
	}

	return events
}

// CollectGPUStatusesFromSummaryJSON extracts per-GPU PASS/FAIL status from summary.json.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
// Entries whose component ID is empty or does not match gpuComponentIDRegex are
// ignored. It returns nil when content is not valid JSON.
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
	var entries []SummaryEntry
	if err := json.Unmarshal(content, &entries); err != nil {
		return nil
	}

	statuses := make(map[string]string)
	for _, entry := range entries {
		component := strings.TrimSpace(entry.ComponentID)
		if component == "" || !gpuComponentIDRegex.MatchString(component) {
			continue
		}

		next := "PASS"
		if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
			next = "FAIL"
		}
		// A component may appear in several entries; merge so FAIL wins.
		statuses[component] = mergeGPUStatus(statuses[component], next)
	}

	return statuses
}

// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string { reader := csv.NewReader(strings.NewReader(string(content))) records, err := reader.ReadAll() if err != nil { return nil } statuses := make(map[string]string) for i, record := range records { if i == 0 || len(record) < 7 { continue } component := strings.TrimSpace(record[5]) if component == "" || !gpuComponentIDRegex.MatchString(component) { continue } errorCode := strings.TrimSpace(record[0]) notes := strings.TrimSpace(record[6]) current := statuses[component] next := "PASS" if !isSummaryCSVRecordPassing(errorCode, notes) { next = "FAIL" } statuses[component] = mergeGPUStatus(current, next) } return statuses } func isSummaryJSONRecordPassing(errorCode, notes string) bool { _ = errorCode return strings.TrimSpace(notes) == "OK" } func isSummaryCSVRecordPassing(errorCode, notes string) bool { _ = errorCode return strings.TrimSpace(notes) == "OK" } func mergeGPUStatus(current, next string) string { // FAIL has highest priority. if current == "FAIL" || next == "FAIL" { return "FAIL" } if current == "PASS" || next == "PASS" { return "PASS" } return "" } // ApplyGPUStatuses applies aggregated PASS/FAIL statuses from summary components to parsed GPUs. 
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) { if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentStatuses) == 0 { return } slotStatus := make(map[string]string) // key: GPUSXM serialStatus := make(map[string]string) // key: GPU serial for componentID, status := range componentStatuses { matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID)) if len(matches) != 3 { continue } slotKey := "GPUSXM" + matches[1] serialKey := strings.TrimSpace(matches[2]) slotStatus[slotKey] = mergeGPUStatus(slotStatus[slotKey], status) if serialKey != "" { serialStatus[serialKey] = mergeGPUStatus(serialStatus[serialKey], status) } } for i := range result.Hardware.GPUs { gpu := &result.Hardware.GPUs[i] next := "" if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" { next = serialStatus[serial] } if next == "" { next = slotStatus[strings.TrimSpace(gpu.Slot)] } if next != "" { gpu.Status = next } } } // formatSummaryDescription creates a human-readable description from summary entry func formatSummaryDescription(entry SummaryEntry) string { component := entry.ComponentID if component == "" { component = entry.VirtualID } if entry.Notes == "OK" { return fmt.Sprintf("%s test passed for %s", entry.Test, component) } return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, component, entry.Notes, entry.ErrorCode) } // formatCSVDescription creates a human-readable description from CSV record func formatCSVDescription(test, component, notes, errorCode string) string { if notes == "OK" { return fmt.Sprintf("%s test passed for %s", test, component) } return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode) } // getSeverityFromErrorCode determines severity based on error code and notes func getSeverityFromErrorCode(errorCode, notes string) models.Severity { // Parse error code format: XXX-YYY-Z-ZZZZZZZZZZZZ // First digit 
indicates severity in some cases if notes == "OK" { return models.SeverityInfo } // Row remapping failed is a warning if strings.Contains(notes, "Row remapping failed") { return models.SeverityWarning } // Check error code if errorCode == "" || errorCode == "0" { return models.SeverityInfo } // Codes starting with 0 are typically informational if strings.HasPrefix(errorCode, "001-000-1") || strings.HasPrefix(errorCode, "048-000-0") { return models.SeverityInfo } // Non-zero error codes are typically warnings or errors // If code is in 300+ range, it's likely an error if len(errorCode) > 2 { firstDigits := errorCode[:3] if firstDigits >= "300" { return models.SeverityCritical } } return models.SeverityWarning }