nvidia: improve component mapping, firmware, statuses and check times
This commit is contained in:
217
internal/parser/vendors/nvidia/summary.go
vendored
217
internal/parser/vendors/nvidia/summary.go
vendored
@@ -22,6 +22,7 @@ type SummaryEntry struct {
|
||||
}
|
||||
|
||||
// Patterns for the component IDs found in summary.json / summary.csv records.
var (
	// gpuComponentIDRegex matches GPU component IDs such as
	// "SXM5_SN_1653925025497". Group 1 is the SXM index, group 2 the serial.
	gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)

	// nvswitchInventoryComponentRegex matches component IDs that begin with
	// "NVSWITCH_NVSWITCH<n>_". Group 1 is the switch slot (e.g. "NVSWITCH0").
	nvswitchInventoryComponentRegex = regexp.MustCompile(`^NVSWITCH_(NVSWITCH\d+)_`)
)
|
||||
|
||||
// ParseSummaryJSON parses summary.json file and returns events
|
||||
func ParseSummaryJSON(content []byte) []models.Event {
|
||||
@@ -121,6 +122,41 @@ func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
|
||||
return statuses
|
||||
}
|
||||
|
||||
// CollectGPUFailureDetailsFromSummaryJSON extracts per-GPU failure details from summary.json.
|
||||
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
||||
func CollectGPUFailureDetailsFromSummaryJSON(content []byte) map[string]string {
|
||||
var entries []SummaryEntry
|
||||
if err := json.Unmarshal(content, &entries); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
details := make(map[string]string)
|
||||
for _, entry := range entries {
|
||||
component := strings.TrimSpace(entry.ComponentID)
|
||||
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
||||
continue
|
||||
}
|
||||
if isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
|
||||
continue
|
||||
}
|
||||
|
||||
note := strings.TrimSpace(entry.Notes)
|
||||
if note == "" || strings.EqualFold(note, "OK") {
|
||||
note = strings.TrimSpace(entry.ErrorCode)
|
||||
}
|
||||
if note == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Keep first non-empty detail to avoid noisy overrides.
|
||||
if _, exists := details[component]; !exists {
|
||||
details[component] = note
|
||||
}
|
||||
}
|
||||
|
||||
return details
|
||||
}
|
||||
|
||||
// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
|
||||
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
||||
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
|
||||
@@ -155,6 +191,120 @@ func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
|
||||
return statuses
|
||||
}
|
||||
|
||||
// CollectNVSwitchStatusesFromSummaryJSON extracts per-NVSwitch PASS/FAIL status from summary.json.
|
||||
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
|
||||
func CollectNVSwitchStatusesFromSummaryJSON(content []byte) map[string]string {
|
||||
var entries []SummaryEntry
|
||||
if err := json.Unmarshal(content, &entries); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
statuses := make(map[string]string)
|
||||
for _, entry := range entries {
|
||||
component := strings.TrimSpace(entry.ComponentID)
|
||||
matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component)
|
||||
if len(matches) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
slot := strings.TrimSpace(matches[1])
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
current := statuses[slot]
|
||||
next := "PASS"
|
||||
if !isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
|
||||
next = "FAIL"
|
||||
}
|
||||
statuses[slot] = mergeGPUStatus(current, next)
|
||||
}
|
||||
|
||||
return statuses
|
||||
}
|
||||
|
||||
// CollectNVSwitchStatusesFromSummaryCSV extracts per-NVSwitch PASS/FAIL status from summary.csv.
|
||||
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
|
||||
func CollectNVSwitchStatusesFromSummaryCSV(content []byte) map[string]string {
|
||||
reader := csv.NewReader(strings.NewReader(string(content)))
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
statuses := make(map[string]string)
|
||||
for i, record := range records {
|
||||
if i == 0 || len(record) < 7 {
|
||||
continue
|
||||
}
|
||||
|
||||
component := strings.TrimSpace(record[5])
|
||||
matches := nvswitchInventoryComponentRegex.FindStringSubmatch(component)
|
||||
if len(matches) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
slot := strings.TrimSpace(matches[1])
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
errorCode := strings.TrimSpace(record[0])
|
||||
notes := strings.TrimSpace(record[6])
|
||||
|
||||
current := statuses[slot]
|
||||
next := "PASS"
|
||||
if !isSummaryCSVRecordPassing(errorCode, notes) {
|
||||
next = "FAIL"
|
||||
}
|
||||
statuses[slot] = mergeGPUStatus(current, next)
|
||||
}
|
||||
|
||||
return statuses
|
||||
}
|
||||
|
||||
// CollectGPUFailureDetailsFromSummaryCSV extracts per-GPU failure details from summary.csv.
|
||||
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
|
||||
func CollectGPUFailureDetailsFromSummaryCSV(content []byte) map[string]string {
|
||||
reader := csv.NewReader(strings.NewReader(string(content)))
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
details := make(map[string]string)
|
||||
for i, record := range records {
|
||||
if i == 0 || len(record) < 7 {
|
||||
continue
|
||||
}
|
||||
|
||||
component := strings.TrimSpace(record[5])
|
||||
if component == "" || !gpuComponentIDRegex.MatchString(component) {
|
||||
continue
|
||||
}
|
||||
|
||||
errorCode := strings.TrimSpace(record[0])
|
||||
notes := strings.TrimSpace(record[6])
|
||||
if isSummaryCSVRecordPassing(errorCode, notes) {
|
||||
continue
|
||||
}
|
||||
|
||||
note := notes
|
||||
if note == "" || strings.EqualFold(note, "OK") {
|
||||
note = errorCode
|
||||
}
|
||||
if note == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, exists := details[component]; !exists {
|
||||
details[component] = note
|
||||
}
|
||||
}
|
||||
|
||||
return details
|
||||
}
|
||||
|
||||
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
|
||||
_ = errorCode
|
||||
return strings.TrimSpace(notes) == "OK"
|
||||
@@ -213,6 +363,73 @@ func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[strin
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyNVSwitchStatuses applies aggregated PASS/FAIL statuses from summary components to parsed NVSwitch devices.
|
||||
func ApplyNVSwitchStatuses(result *models.AnalysisResult, switchStatuses map[string]string) {
|
||||
if result == nil || result.Hardware == nil || len(result.Hardware.PCIeDevices) == 0 || len(switchStatuses) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for i := range result.Hardware.PCIeDevices {
|
||||
dev := &result.Hardware.PCIeDevices[i]
|
||||
slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
if !strings.HasPrefix(strings.ToUpper(slot), "NVSWITCH") {
|
||||
continue
|
||||
}
|
||||
if st := switchStatuses[slot]; st != "" {
|
||||
dev.Status = st
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyGPUFailureDetails maps parsed failure details from summary components to GPUs.
|
||||
func ApplyGPUFailureDetails(result *models.AnalysisResult, componentDetails map[string]string) {
|
||||
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentDetails) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
slotDetails := make(map[string]string) // key: GPUSXM<idx>
|
||||
serialDetails := make(map[string]string) // key: GPU serial
|
||||
|
||||
for componentID, detail := range componentDetails {
|
||||
matches := gpuComponentIDRegex.FindStringSubmatch(strings.TrimSpace(componentID))
|
||||
if len(matches) != 3 {
|
||||
continue
|
||||
}
|
||||
detail = strings.TrimSpace(detail)
|
||||
if detail == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
slotKey := "GPUSXM" + matches[1]
|
||||
serialKey := strings.TrimSpace(matches[2])
|
||||
if _, exists := slotDetails[slotKey]; !exists {
|
||||
slotDetails[slotKey] = detail
|
||||
}
|
||||
if serialKey != "" {
|
||||
if _, exists := serialDetails[serialKey]; !exists {
|
||||
serialDetails[serialKey] = detail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i := range result.Hardware.GPUs {
|
||||
gpu := &result.Hardware.GPUs[i]
|
||||
detail := ""
|
||||
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
|
||||
detail = serialDetails[serial]
|
||||
}
|
||||
if detail == "" {
|
||||
detail = slotDetails[strings.TrimSpace(gpu.Slot)]
|
||||
}
|
||||
if detail != "" {
|
||||
gpu.ErrorDescription = detail
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// formatSummaryDescription creates a human-readable description from summary entry
|
||||
func formatSummaryDescription(entry SummaryEntry) string {
|
||||
component := entry.ComponentID
|
||||
|
||||
Reference in New Issue
Block a user