491 lines
14 KiB
Go
491 lines
14 KiB
Go
package nvidia
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"encoding/json"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
)
|
|
|
|
// SummaryEntry is one element of the summary.json array. Every field is
// decoded as a string from the JSON key named in its struct tag.
type SummaryEntry struct {
	// ErrorCode holds the "Error Code" value.
	ErrorCode string `json:"Error Code"`
	// Test holds the "Test" value; it is used as the event type.
	Test string `json:"Test"`
	// ComponentID holds the "Component ID" value.
	ComponentID string `json:"Component ID"`
	// Notes holds the "Notes" value; "OK" is treated as a passing record.
	Notes string `json:"Notes"`
	// VirtualID holds the "Virtual ID" value.
	VirtualID string `json:"Virtual ID"`
	// IgnoreError holds the "Ignore Error" value.
	IgnoreError string `json:"Ignore Error"`
}
|
|
|
|
var (
	// gpuComponentIDRegex matches a whole component ID of the form
	// "SXM<digits>_SN_<serial>", capturing the digits and the serial.
	gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)

	// nvswitchInventoryComponentRegex matches component IDs starting with
	// "NVSWITCH_NVSWITCH<digits>_", capturing "NVSWITCH<digits>".
	nvswitchInventoryComponentRegex = regexp.MustCompile(`^NVSWITCH_(NVSWITCH\d+)_`)
)
|
|
|
|
// ParseSummaryJSON parses summary.json file and returns events
|
|
func ParseSummaryJSON(content []byte) []models.Event {
|
|
var entries []SummaryEntry
|
|
if err := json.Unmarshal(content, &entries); err != nil {
|
|
return nil
|
|
}
|
|
|
|
events := make([]models.Event, 0)
|
|
timestamp := time.Now() // Use current time as we don't have exact timestamps in summary
|
|
|
|
for _, entry := range entries {
|
|
// Only create events for failures or warnings
|
|
if entry.Notes != "OK" || entry.ErrorCode != "001-000-1-000000000000" {
|
|
event := models.Event{
|
|
Timestamp: timestamp,
|
|
Source: "GPU Field Diagnostics",
|
|
EventType: entry.Test,
|
|
Description: formatSummaryDescription(entry),
|
|
Severity: getSeverityFromErrorCode(entry.ErrorCode, entry.Notes),
|
|
RawData: fmt.Sprintf("Test: %s, Component: %s, Error: %s", entry.Test, entry.ComponentID, entry.ErrorCode),
|
|
}
|
|
events = append(events, event)
|
|
}
|
|
}
|
|
|
|
return events
|
|
}
|
|
|
|
// ParseSummaryCSV parses summary.csv file and returns events
|
|
func ParseSummaryCSV(content []byte) []models.Event {
|
|
reader := csv.NewReader(strings.NewReader(string(content)))
|
|
records, err := reader.ReadAll()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
events := make([]models.Event, 0)
|
|
timestamp := time.Now()
|
|
|
|
// Skip header row
|
|
for i, record := range records {
|
|
if i == 0 {
|
|
continue // Skip header
|
|
}
|
|
|
|
// CSV format: ErrorCode,Test,VirtualID,SubTest,Type,ComponentID,Notes,Level,,,IgnoreError
|
|
if len(record) < 7 {
|
|
continue
|
|
}
|
|
|
|
errorCode := record[0]
|
|
test := record[1]
|
|
componentID := record[5]
|
|
notes := record[6]
|
|
|
|
// Only create events for failures or warnings
|
|
if notes != "OK" || (errorCode != "0" && !strings.HasPrefix(errorCode, "048-000-0") && !strings.HasPrefix(errorCode, "001-000-1")) {
|
|
event := models.Event{
|
|
Timestamp: timestamp,
|
|
Source: "GPU Field Diagnostics",
|
|
EventType: test,
|
|
Description: formatCSVDescription(test, componentID, notes, errorCode),
|
|
Severity: getSeverityFromErrorCode(errorCode, notes),
|
|
RawData: fmt.Sprintf("Test: %s, Component: %s, Error: %s", test, componentID, errorCode),
|
|
}
|
|
events = append(events, event)
|
|
}
|
|
}
|
|
|
|
return events
|
|
}
|
|
|
|
// CollectGPUStatusesFromSummaryJSON extracts per-GPU PASS/FAIL status from summary.json.
|
|
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
|
func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
|
|
var entries []SummaryEntry
|
|
if err := json.Unmarshal(content, &entries); err != nil {
|
|
return nil
|
|
}
|
|
|
|
statuses := make(map[string]string)
|
|
for _, entry := range entries {
|
|
component := strings.TrimSpace(entry.ComponentID)
|
|
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
|
continue
|
|
}
|
|
|
|
current := statuses[component]
|
|
next := "PASS"
|
|
if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
|
|
next = "FAIL"
|
|
}
|
|
statuses[component] = mergeGPUStatus(current, next)
|
|
}
|
|
|
|
return statuses
|
|
}
|
|
|
|
// CollectGPUFailureDetailsFromSummaryJSON extracts per-GPU failure details from summary.json.
|
|
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
|
func CollectGPUFailureDetailsFromSummaryJSON(content []byte) map[string]string {
|
|
var entries []SummaryEntry
|
|
if err := json.Unmarshal(content, &entries); err != nil {
|
|
return nil
|
|
}
|
|
|
|
details := make(map[string]string)
|
|
for _, entry := range entries {
|
|
component := strings.TrimSpace(entry.ComponentID)
|
|
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
|
continue
|
|
}
|
|
if isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
|
|
continue
|
|
}
|
|
|
|
note := strings.TrimSpace(entry.Notes)
|
|
if note == "" || strings.EqualFold(note, "OK") {
|
|
note = strings.TrimSpace(entry.ErrorCode)
|
|
}
|
|
if note == "" {
|
|
continue
|
|
}
|
|
|
|
// Keep first non-empty detail to avoid noisy overrides.
|
|
if _, exists := details[component]; !exists {
|
|
details[component] = note
|
|
}
|
|
}
|
|
|
|
return details
|
|
}
|
|
|
|
// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
|
|
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
|
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
|
|
reader := csv.NewReader(strings.NewReader(string(content)))
|
|
records, err := reader.ReadAll()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
statuses := make(map[string]string)
|
|
for i, record := range records {
|
|
if i == 0 || len(record) < 7 {
|
|
continue
|
|
}
|
|
|
|
component := strings.TrimSpace(record[5])
|
|
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
|
continue
|
|
}
|
|
|
|
errorCode := strings.TrimSpace(record[0])
|
|
notes := strings.TrimSpace(record[6])
|
|
|
|
current := statuses[component]
|
|
next := "PASS"
|
|
if !isSummaryCSVRecordPassing(errorCode, notes) {
|
|
next = "FAIL"
|
|
}
|
|
statuses[component] = mergeGPUStatus(current, next)
|
|
}
|
|
|
|
return statuses
|
|
}
|
|
|
|
// CollectNVSwitchStatusesFromSummaryJSON extracts per-NVSwitch PASS/FAIL status from summary.json.
|
|
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
|
|
func CollectNVSwitchStatusesFromSummaryJSON(content []byte) map[string]string {
|
|
var entries []SummaryEntry
|
|
if err := json.Unmarshal(content, &entries); err != nil {
|
|
return nil
|
|
}
|
|
|
|
statuses := make(map[string]string)
|
|
for _, entry := range entries {
|
|
component := strings.TrimSpace(entry.ComponentID)
|
|
matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component)
|
|
if len(matches) != 2 {
|
|
continue
|
|
}
|
|
|
|
slot := strings.TrimSpace(matches[1])
|
|
if slot == "" {
|
|
continue
|
|
}
|
|
|
|
current := statuses[slot]
|
|
next := "PASS"
|
|
if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
|
|
next = "FAIL"
|
|
}
|
|
statuses[slot] = mergeGPUStatus(current, next)
|
|
}
|
|
|
|
return statuses
|
|
}
|
|
|
|
// CollectNVSwitchStatusesFromSummaryCSV extracts per-NVSwitch PASS/FAIL status from summary.csv.
|
|
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
|
|
func CollectNVSwitchStatusesFromSummaryCSV(content []byte) map[string]string {
|
|
reader := csv.NewReader(strings.NewReader(string(content)))
|
|
records, err := reader.ReadAll()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
statuses := make(map[string]string)
|
|
for i, record := range records {
|
|
if i == 0 || len(record) < 7 {
|
|
continue
|
|
}
|
|
|
|
component := strings.TrimSpace(record[5])
|
|
matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component)
|
|
if len(matches) != 2 {
|
|
continue
|
|
}
|
|
|
|
slot := strings.TrimSpace(matches[1])
|
|
if slot == "" {
|
|
continue
|
|
}
|
|
|
|
errorCode := strings.TrimSpace(record[0])
|
|
notes := strings.TrimSpace(record[6])
|
|
|
|
current := statuses[slot]
|
|
next := "PASS"
|
|
if !isSummaryCSVRecordPassing(errorCode, notes) {
|
|
next = "FAIL"
|
|
}
|
|
statuses[slot] = mergeGPUStatus(current, next)
|
|
}
|
|
|
|
return statuses
|
|
}
|
|
|
|
// CollectGPUFailureDetailsFromSummaryCSV extracts per-GPU failure details from summary.csv.
|
|
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
|
func CollectGPUFailureDetailsFromSummaryCSV(content []byte) map[string]string {
|
|
reader := csv.NewReader(strings.NewReader(string(content)))
|
|
records, err := reader.ReadAll()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
details := make(map[string]string)
|
|
for i, record := range records {
|
|
if i == 0 || len(record) < 7 {
|
|
continue
|
|
}
|
|
|
|
component := strings.TrimSpace(record[5])
|
|
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
|
continue
|
|
}
|
|
|
|
errorCode := strings.TrimSpace(record[0])
|
|
notes := strings.TrimSpace(record[6])
|
|
if isSummaryCSVRecordPassing(errorCode, notes) {
|
|
continue
|
|
}
|
|
|
|
note := notes
|
|
if note == "" || strings.EqualFold(note, "OK") {
|
|
note = errorCode
|
|
}
|
|
if note == "" {
|
|
continue
|
|
}
|
|
|
|
if _, exists := details[component]; !exists {
|
|
details[component] = note
|
|
}
|
|
}
|
|
|
|
return details
|
|
}
|
|
|
|
// isSummaryJSONRecordPassing reports whether a summary.json record counts as
// passing: its notes, with surrounding whitespace removed, must be exactly
// "OK". The errorCode argument is currently not consulted.
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
	_ = errorCode
	trimmed := strings.TrimSpace(notes)
	return trimmed == "OK"
}
|
|
|
|
// isSummaryCSVRecordPassing reports whether a summary.csv record counts as
// passing: its notes, with surrounding whitespace removed, must be exactly
// "OK". The errorCode argument is currently not consulted.
func isSummaryCSVRecordPassing(errorCode, notes string) bool {
	_ = errorCode
	trimmed := strings.TrimSpace(notes)
	return trimmed == "OK"
}
|
|
|
|
// mergeGPUStatus combines two status strings into one. "FAIL" on either side
// wins; otherwise "PASS" on either side wins; any other combination yields the
// empty string.
func mergeGPUStatus(current, next string) string {
	switch {
	case current == "FAIL", next == "FAIL":
		return "FAIL"
	case current == "PASS", next == "PASS":
		return "PASS"
	default:
		return ""
	}
}
|
|
|
|
// ApplyGPUStatuses applies aggregated PASS/FAIL statuses from summary components to parsed GPUs.
|
|
func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[string]string) {
|
|
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentStatuses) == 0 {
|
|
return
|
|
}
|
|
|
|
slotStatus := make(map[string]string) // key: GPUSXM<idx>
|
|
serialStatus := make(map[string]string) // key: GPU serial
|
|
|
|
for componentID, status := range componentStatuses {
|
|
matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID))
|
|
if len(matches) != 3 {
|
|
continue
|
|
}
|
|
slotKey := "GPUSXM" + matches[1]
|
|
serialKey := strings.TrimSpace(matches[2])
|
|
slotStatus[slotKey] = mergeGPUStatus(slotStatus[slotKey], status)
|
|
if serialKey != "" {
|
|
serialStatus[serialKey] = mergeGPUStatus(serialStatus[serialKey], status)
|
|
}
|
|
}
|
|
|
|
for i := range result.Hardware.GPUs {
|
|
gpu := &result.Hardware.GPUs[i]
|
|
next := ""
|
|
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
|
|
next = serialStatus[serial]
|
|
}
|
|
if next == "" {
|
|
next = slotStatus[strings.TrimSpace(gpu.Slot)]
|
|
}
|
|
if next != "" {
|
|
gpu.Status = next
|
|
}
|
|
}
|
|
}
|
|
|
|
// ApplyNVSwitchStatuses applies aggregated PASS/FAIL statuses from summary components to parsed NVSwitch devices.
|
|
func ApplyNVSwitchStatuses(result *models.AnalysisResult, switchStatuses map[string]string) {
|
|
if result == nil || result.Hardware == nil || len(result.Hardware.PCIeDevices) == 0 || len(switchStatuses) == 0 {
|
|
return
|
|
}
|
|
|
|
for i := range result.Hardware.PCIeDevices {
|
|
dev := &result.Hardware.PCIeDevices[i]
|
|
slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
|
|
if slot == "" {
|
|
continue
|
|
}
|
|
if !strings.HasPrefix(strings.ToUpper(slot), "NVSWITCH") {
|
|
continue
|
|
}
|
|
if st := switchStatuses[slot]; st != "" {
|
|
dev.Status = st
|
|
}
|
|
}
|
|
}
|
|
|
|
// ApplyGPUFailureDetails maps parsed failure details from summary components to GPUs.
|
|
func ApplyGPUFailureDetails(result *models.AnalysisResult, componentDetails map[string]string) {
|
|
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentDetails) == 0 {
|
|
return
|
|
}
|
|
|
|
slotDetails := make(map[string]string) // key: GPUSXM<idx>
|
|
serialDetails := make(map[string]string) // key: GPU serial
|
|
|
|
for componentID, detail := range componentDetails {
|
|
matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID))
|
|
if len(matches) != 3 {
|
|
continue
|
|
}
|
|
detail = strings.TrimSpace(detail)
|
|
if detail == "" {
|
|
continue
|
|
}
|
|
|
|
slotKey := "GPUSXM" + matches[1]
|
|
serialKey := strings.TrimSpace(matches[2])
|
|
if _, exists := slotDetails[slotKey]; !exists {
|
|
slotDetails[slotKey] = detail
|
|
}
|
|
if serialKey != "" {
|
|
if _, exists := serialDetails[serialKey]; !exists {
|
|
serialDetails[serialKey] = detail
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := range result.Hardware.GPUs {
|
|
gpu := &result.Hardware.GPUs[i]
|
|
detail := ""
|
|
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
|
|
detail = serialDetails[serial]
|
|
}
|
|
if detail == "" {
|
|
detail = slotDetails[strings.TrimSpace(gpu.Slot)]
|
|
}
|
|
if detail != "" {
|
|
gpu.ErrorDescription = detail
|
|
}
|
|
}
|
|
}
|
|
|
|
// formatSummaryDescription creates a human-readable description from summary entry
|
|
func formatSummaryDescription(entry SummaryEntry) string {
|
|
component := entry.ComponentID
|
|
if component == "" {
|
|
component = entry.VirtualID
|
|
}
|
|
|
|
if entry.Notes == "OK" {
|
|
return fmt.Sprintf("%s test passed for %s", entry.Test, component)
|
|
}
|
|
|
|
return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", entry.Test, component, entry.Notes, entry.ErrorCode)
|
|
}
|
|
|
|
// formatCSVDescription builds a human-readable sentence for a CSV record.
// Records whose notes are exactly "OK" are described as passed; all others as
// failed, including the notes and the error code.
func formatCSVDescription(test, component, notes, errorCode string) string {
	if notes != "OK" {
		return fmt.Sprintf("%s test failed for %s: %s (Error: %s)", test, component, notes, errorCode)
	}
	return fmt.Sprintf("%s test passed for %s", test, component)
}
|
|
|
|
// getSeverityFromErrorCode determines severity based on error code and notes
|
|
func getSeverityFromErrorCode(errorCode, notes string) models.Severity {
|
|
// Parse error code format: XXX-YYY-Z-ZZZZZZZZZZZZ
|
|
// First digit indicates severity in some cases
|
|
|
|
if notes == "OK" {
|
|
return models.SeverityInfo
|
|
}
|
|
|
|
// Row remapping failed is a warning
|
|
if strings.Contains(notes, "Row remapping failed") {
|
|
return models.SeverityWarning
|
|
}
|
|
|
|
// Check error code
|
|
if errorCode == "" || errorCode == "0" {
|
|
return models.SeverityInfo
|
|
}
|
|
|
|
// Codes starting with 0 are typically informational
|
|
if strings.HasPrefix(errorCode, "001-000-1") || strings.HasPrefix(errorCode, "048-000-0") {
|
|
return models.SeverityInfo
|
|
}
|
|
|
|
// Non-zero error codes are typically warnings or errors
|
|
// If code is in 300+ range, it's likely an error
|
|
if len(errorCode) > 2 {
|
|
firstDigits := errorCode[:3]
|
|
if firstDigits >= "300" {
|
|
return models.SeverityCritical
|
|
}
|
|
}
|
|
|
|
return models.SeverityWarning
|
|
}
|