package nvidia

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"regexp"
	"strings"
	"time"

	"git.mchus.pro/mchus/logpile/internal/models"
)

// SummaryEntry represents a single test result entry decoded from summary.json.
type SummaryEntry struct {
	ErrorCode   string `json:"Error Code"`
	Test        string `json:"Test"`
	ComponentID string `json:"Component ID"`
	Notes       string `json:"Notes"`
	VirtualID   string `json:"Virtual ID"`
	IgnoreError string `json:"Ignore Error"`
}

var (
	// gpuComponentIDRegex matches GPU component IDs such as
	// "SXM5_SN_1653925025497"; group 1 is the SXM index, group 2 the serial.
	gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)

	// nvswitchInventoryComponentRegex matches NVSwitch component IDs; group 1
	// is the switch slot (e.g. "NVSWITCH0").
	nvswitchInventoryComponentRegex = regexp.MustCompile(`^NVSWITCH_(NVSWITCH\d+)_`)
)

// ParseSummaryJSON parses a summary.json file and returns one event for every
// entry that is not a clean pass. It returns nil when content cannot be decoded
// as a JSON array of entries, and an empty (non-nil) slice when nothing needs
// reporting.
func ParseSummaryJSON(content []byte) []models.Event {
	var entries []SummaryEntry
	if err := json.Unmarshal(content, &entries); err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	// The summary carries no per-entry timestamps, so every event shares the
	// time of parsing.
	timestamp := time.Now()

	for _, entry := range entries {
		// Skip only entries that are both "OK" and carry the exact passing
		// code; everything else is reported.
		if entry.Notes == "OK" && entry.ErrorCode == "001-000-1-000000000000" {
			continue
		}
		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   entry.Test,
			Description: formatSummaryDescription(entry),
			Severity:    getSeverityFromErrorCode(entry.ErrorCode, entry.Notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", entry.Test, entry.ComponentID, entry.ErrorCode),
		})
	}

	return events
}

// ParseSummaryCSV parses a summary.csv file and returns one event for every
// data row that is not a clean pass. The first row is treated as a header and
// rows with fewer than 7 fields are skipped. It returns nil when content is
// not parseable as CSV.
//
// CSV format: ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
func ParseSummaryCSV(content []byte) []models.Event {
	reader := csv.NewReader(strings.NewReader(string(content)))
	// Rows are length-checked individually below. With the default setting
	// the reader rejects the whole file as soon as one row has a different
	// field count than the header, which would discard every valid row.
	reader.FieldsPerRecord = -1

	records, err := reader.ReadAll()
	if err != nil {
		return nil
	}

	events := make([]models.Event, 0)
	timestamp := time.Now()

	for i, record := range records {
		if i == 0 {
			continue // header row
		}
		if len(record) < 7 {
			continue
		}

		errorCode := record[0]
		test := record[1]
		componentID := record[5]
		notes := record[6]

		// A row is a clean pass only when the notes say "OK" and the error
		// code is one of the known passing codes.
		passingCode := errorCode == "0" ||
			strings.HasPrefix(errorCode, "048-000-0") ||
			strings.HasPrefix(errorCode, "001-000-1")
		if notes == "OK" && passingCode {
			continue
		}

		events = append(events, models.Event{
			Timestamp:   timestamp,
			Source:      "GPU Field Diagnostics",
			EventType:   test,
			Description: formatCSVDescription(test, componentID, notes, errorCode),
			Severity:    getSeverityFromErrorCode(errorCode, notes),
			RawData:     fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
		})
	}

	return events
}

// CollectGPUStatusesFromSummaryJSON extracts per-GPU PASS/FAIL status from summary.json.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
// A component is reported as "FAIL" if any of its entries is not passing.
// A nil map is returned when content cannot be decoded.
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
	var entries []SummaryEntry
	if err := json.Unmarshal(content, &entries); err != nil {
		return nil
	}

	statuses := make(map[string]string)
	for _, entry := range entries {
		component := strings.TrimSpace(entry.ComponentID)
		if component == "" || !gpuComponentIDRegex.MatchString(component) {
			continue
		}

		next := "PASS"
		if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
			next = "FAIL"
		}
		statuses[component] = mergeGPUStatus(statuses[component], next)
	}

	return statuses
}

// CollectGPUFailureDetailsFromSummaryJSON extracts per-GPU failure details from summary.json.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
func CollectGPUFailureDetailsFromSummaryJSON(content []byte) map[string]string { var entries []SummaryEntry if err := json.Unmarshal(content, &entries); err != nil { return nil } details := make(map[string]string) for _, entry := range entries { component := strings.TrimSpace(entry.ComponentID) if component == "" || !gpuComponentIDRegex.MatchString(component) { continue } if isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) { continue } note := strings.TrimSpace(entry.Notes) if note == "" || strings.EqualFold(note, "OK") { note = strings.TrimSpace(entry.ErrorCode) } if note == "" { continue } // Keep first non-empty detail to avoid noisy overrides. if _, exists := details[component]; !exists { details[component] = note } } return details } // CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv. // Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497"). func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string { reader := csv.NewReader(strings.NewReader(string(content))) records, err := reader.ReadAll() if err != nil { return nil } statuses := make(map[string]string) for i, record := range records { if i == 0 || len(record) < 7 { continue } component := strings.TrimSpace(record[5]) if component == "" || !gpuComponentIDRegex.MatchString(component) { continue } errorCode := strings.TrimSpace(record[0]) notes := strings.TrimSpace(record[6]) current := statuses[component] next := "PASS" if !isSummaryCSVRecordPassing(errorCode, notes) { next = "FAIL" } statuses[component] = mergeGPUStatus(current, next) } return statuses } // CollectNVSwitchStatusesFromSummaryJSON extracts per-NVSwitch PASS/FAIL status from summary.json. // Key format in returned map is normalized switch slot (e.g. "NVSWITCH0"). 
func CollectNVSwitchStatusesFromSummaryJSON(content []byte) map[string]string { var entries []SummaryEntry if err := json.Unmarshal(content, &entries); err != nil { return nil } statuses := make(map[string]string) for _, entry := range entries { component := strings.TrimSpace(entry.ComponentID) matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component) if len(matches) != 2 { continue } slot := strings.TrimSpace(matches[1]) if slot == "" { continue } current := statuses[slot] next := "PASS" if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) { next = "FAIL" } statuses[slot] = mergeGPUStatus(current, next) } return statuses } // CollectNVSwitchStatusesFromSummaryCSV extracts per-NVSwitch PASS/FAIL status from summary.csv. // Key format in returned map is normalized switch slot (e.g. "NVSWITCH0"). func CollectNVSwitchStatusesFromSummaryCSV(content []byte) map[string]string { reader := csv.NewReader(strings.NewReader(string(content))) records, err := reader.ReadAll() if err != nil { return nil } statuses := make(map[string]string) for i, record := range records { if i == 0 || len(record) < 7 { continue } component := strings.TrimSpace(record[5]) matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component) if len(matches) != 2 { continue } slot := strings.TrimSpace(matches[1]) if slot == "" { continue } errorCode := strings.TrimSpace(record[0]) notes := strings.TrimSpace(record[6]) current := statuses[slot] next := "PASS" if !isSummaryCSVRecordPassing(errorCode, notes) { next = "FAIL" } statuses[slot] = mergeGPUStatus(current, next) } return statuses } // CollectGPUFailureDetailsFromSummaryCSV extracts per-GPU failure details from summary.csv. // Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497"). 
func CollectGPUFailureDetailsFromSummaryCSV(content []byte) map[string]string { reader := csv.NewReader(strings.NewReader(string(content))) records, err := reader.ReadAll() if err != nil { return nil } details := make(map[string]string) for i, record := range records { if i == 0 || len(record) < 7 { continue } component := strings.TrimSpace(record[5]) if component == "" || !gpuComponentIDRegex.MatchString(component) { continue } errorCode := strings.TrimSpace(record[0]) notes := strings.TrimSpace(record[6]) if isSummaryCSVRecordPassing(errorCode, notes) { continue } note := notes if note == "" || strings.EqualFold(note, "OK") { note = errorCode } if note == "" { continue } if _, exists := details[component]; !exists { details[component] = note } } return details } func isSummaryJSONRecordPassing(errorCode, notes string) bool { _ = errorCode return strings.TrimSpace(notes) == "OK" } func isSummaryCSVRecordPassing(errorCode, notes string) bool { _ = errorCode return strings.TrimSpace(notes) == "OK" } func mergeGPUStatus(current, next string) string { // FAIL has highest priority. if current == "FAIL" || next == "FAIL" { return "FAIL" } if current == "PASS" || next == "PASS" { return "PASS" } return "" } // ApplyGPUStatuses applies aggregated PASS/FAIL statuses from summary components to parsed GPUs. 
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) { if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentStatuses) == 0 { return } slotStatus := make(map[string]string) // key: GPUSXM serialStatus := make(map[string]string) // key: GPU serial for componentID, status := range componentStatuses { matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID)) if len(matches) != 3 { continue } slotKey := "GPUSXM" + matches[1] serialKey := strings.TrimSpace(matches[2]) slotStatus[slotKey] = mergeGPUStatus(slotStatus[slotKey], status) if serialKey != "" { serialStatus[serialKey] = mergeGPUStatus(serialStatus[serialKey], status) } } for i := range result.Hardware.GPUs { gpu := &result.Hardware.GPUs[i] next := "" if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" { next = serialStatus[serial] } if next == "" { next = slotStatus[strings.TrimSpace(gpu.Slot)] } if next != "" { gpu.Status = next } } } // ApplyNVSwitchStatuses applies aggregated PASS/FAIL statuses from summary components to parsed NVSwitch devices. func ApplyNVSwitchStatuses(result *models.AnalysisResult, switchStatuses map[string]string) { if result == nil || result.Hardware == nil || len(result.Hardware.PCIeDevices) == 0 || len(switchStatuses) == 0 { return } for i := range result.Hardware.PCIeDevices { dev := &result.Hardware.PCIeDevices[i] slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot)) if slot == "" { continue } if !strings.HasPrefix(strings.ToUpper(slot), "NVSWITCH") { continue } if st := switchStatuses[slot]; st != "" { dev.Status = st } } } // ApplyGPUFailureDetails maps parsed failure details from summary components to GPUs. 
func ApplyGPUFailureDetails(result *models.AnalysisResult, componentDetails map[string]string) { if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentDetails) == 0 { return } slotDetails := make(map[string]string) // key: GPUSXM serialDetails := make(map[string]string) // key: GPU serial for componentID, detail := range componentDetails { matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID)) if len(matches) != 3 { continue } detail = strings.TrimSpace(detail) if detail == "" { continue } slotKey := "GPUSXM" + matches[1] serialKey := strings.TrimSpace(matches[2]) if _, exists := slotDetails[slotKey]; !exists { slotDetails[slotKey] = detail } if serialKey != "" { if _, exists := serialDetails[serialKey]; !exists { serialDetails[serialKey] = detail } } } for i := range result.Hardware.GPUs { gpu := &result.Hardware.GPUs[i] detail := "" if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" { detail = serialDetails[serial] } if detail == "" { detail = slotDetails[strings.TrimSpace(gpu.Slot)] } if detail != "" { gpu.ErrorDescription = detail } } } // formatSummaryDescription creates a human-readable description from summary entry func formatSummaryDescription(entry SummaryEntry) string { component := entry.ComponentID if component == "" { component = entry.VirtualID } if entry.Notes == "OK" { return fmt.Sprintf("%s test passed for %s", entry.Test, component) } return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, component, entry.Notes, entry.ErrorCode) } // formatCSVDescription creates a human-readable description from CSV record func formatCSVDescription(test, component, notes, errorCode string) string { if notes == "OK" { return fmt.Sprintf("%s test passed for %s", test, component) } return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode) } // getSeverityFromErrorCode determines severity based on error code and notes func 
getSeverityFromErrorCode(errorCode, notes string) models.Severity { // Parse error code format: XXX-YYY-Z-ZZZZZZZZZZZZ // First digit indicates severity in some cases if notes == "OK" { return models.SeverityInfo } // Row remapping failed is a warning if strings.Contains(notes, "Row remapping failed") { return models.SeverityWarning } // Check error code if errorCode == "" || errorCode == "0" { return models.SeverityInfo } // Codes starting with 0 are typically informational if strings.HasPrefix(errorCode, "001-000-1") || strings.HasPrefix(errorCode, "048-000-0") { return models.SeverityInfo } // Non-zero error codes are typically warnings or errors // If code is in 300+ range, it's likely an error if len(errorCode) > 2 { firstDigits := errorCode[:3] if firstDigits >= "300" { return models.SeverityCritical } } return models.SeverityWarning }