Fix NVIDIA GPU serial number format extraction
Extract decimal serial numbers from the fieldiag devname parameters (e.g., "SXM5_SN_1653925027099" yields serial "1653925027099") instead of the hex PCIe Device Serial Numbers parsed from lspci output. This provides the correct GPU serial numbers as they appear in NVIDIA diagnostics tooling.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
95
internal/parser/vendors/nvidia/inventory_log.go
vendored
95
internal/parser/vendors/nvidia/inventory_log.go
vendored
@@ -10,22 +10,13 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
// Regex to extract GPU serial numbers from lspci output
|
|
||||||
// Example: " Capabilities: [2f0 v1] Device Serial Number 14-17-dc-65-77-2d-b0-48"
|
|
||||||
gpuSerialRegex = regexp.MustCompile(`Device Serial Number\s+([\da-fA-F-]+)`)
|
|
||||||
|
|
||||||
// Regex to extract PCI BDF from lspci header
|
|
||||||
// Example: "2a:00.0 3D controller: NVIDIA Corporation Device 2335 (rev a1)"
|
|
||||||
// Note: lspci format is bus:device.function (e.g., "2a:00.0")
|
|
||||||
pciBDFRegex = regexp.MustCompile(`^([0-9a-fA-F]{2,4}:[0-9a-fA-F]{2}\.[0-9])\s+3D controller.*NVIDIA`)
|
|
||||||
|
|
||||||
// Regex to extract devname mappings from fieldiag command line
|
// Regex to extract devname mappings from fieldiag command line
|
||||||
// Example: "devname=0000:ba:00.0,SXM5_SN_1653925027099"
|
// Example: "devname=0000:ba:00.0,SXM5_SN_1653925027099"
|
||||||
devnameRegex = regexp.MustCompile(`devname=([\da-fA-F:\.]+),(\w+)`)
|
devnameRegex = regexp.MustCompile(`devname=([\da-fA-F:\.]+),(\w+)`)
|
||||||
)
|
)
|
||||||
|
|
||||||
// ParseInventoryLog parses inventory/output.log to extract GPU serial numbers
|
// ParseInventoryLog parses inventory/output.log to extract GPU serial numbers
|
||||||
// from lspci output and map them to slots
|
// from fieldiag devname parameters (e.g., "SXM5_SN_1653925027099")
|
||||||
func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
||||||
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
|
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
|
||||||
// No GPUs to update
|
// No GPUs to update
|
||||||
@@ -34,8 +25,9 @@ func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
|||||||
|
|
||||||
scanner := bufio.NewScanner(strings.NewReader(string(content)))
|
scanner := bufio.NewScanner(strings.NewReader(string(content)))
|
||||||
|
|
||||||
// First pass: build mapping of PCI BDF -> Slot name from fieldiag command line
|
// First pass: build mapping of PCI BDF -> Slot name and serial number from fieldiag command line
|
||||||
pciToSlot := make(map[string]string)
|
pciToSlot := make(map[string]string)
|
||||||
|
pciToSerial := make(map[string]string)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
// Look for fieldiag command with devname parameters
|
// Look for fieldiag command with devname parameters
|
||||||
@@ -45,8 +37,7 @@ func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
|||||||
if len(match) == 3 {
|
if len(match) == 3 {
|
||||||
pciBDF := match[1]
|
pciBDF := match[1]
|
||||||
slotName := match[2]
|
slotName := match[2]
|
||||||
// Extract slot number from name like "SXM5_SN_1653925027099"
|
// Extract slot number and serial from name like "SXM5_SN_1653925027099"
|
||||||
// We want to map to slot like "GPUSXM5"
|
|
||||||
if strings.HasPrefix(slotName, "SXM") {
|
if strings.HasPrefix(slotName, "SXM") {
|
||||||
parts := strings.Split(slotName, "_")
|
parts := strings.Split(slotName, "_")
|
||||||
if len(parts) >= 1 {
|
if len(parts) >= 1 {
|
||||||
@@ -54,81 +45,39 @@ func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
|||||||
slot := "GPU" + parts[0]
|
slot := "GPU" + parts[0]
|
||||||
pciToSlot[pciBDF] = slot
|
pciToSlot[pciBDF] = slot
|
||||||
}
|
}
|
||||||
|
// Extract serial number from "SXM5_SN_1653925027099"
|
||||||
|
if len(parts) == 3 && parts[1] == "SN" {
|
||||||
|
serial := parts[2]
|
||||||
|
pciToSerial[pciBDF] = serial
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second pass: extract GPU serial numbers from lspci output
|
// Second pass: assign serial numbers to GPUs based on slot mapping
|
||||||
scanner = bufio.NewScanner(strings.NewReader(string(content)))
|
for i := range result.Hardware.GPUs {
|
||||||
var currentPCIBDF string
|
slot := result.Hardware.GPUs[i].Slot
|
||||||
var currentSlot string
|
// Find the PCI BDF for this slot
|
||||||
|
var foundSerial string
|
||||||
for scanner.Scan() {
|
for pciBDF, mappedSlot := range pciToSlot {
|
||||||
line := scanner.Text()
|
if mappedSlot == slot {
|
||||||
|
// Found matching slot, get serial number
|
||||||
// Check if this is a new GPU device header
|
if serial, ok := pciToSerial[pciBDF]; ok {
|
||||||
if match := pciBDFRegex.FindStringSubmatch(line); len(match) > 1 {
|
foundSerial = serial
|
||||||
currentPCIBDF = match[1]
|
|
||||||
// Normalize BDF format - lspci uses short format (bus:device.function)
|
|
||||||
// but fieldiag uses full format (domain:bus:device.function)
|
|
||||||
// Convert "2a:00.0" to "0000:2a:00.0"
|
|
||||||
normalizedBDF := currentPCIBDF
|
|
||||||
if len(strings.Split(currentPCIBDF, ":")) == 2 {
|
|
||||||
// Short format without domain, add 0000:
|
|
||||||
normalizedBDF = "0000:" + currentPCIBDF
|
|
||||||
}
|
|
||||||
|
|
||||||
// Map to slot name if we have it
|
|
||||||
if slot, ok := pciToSlot[normalizedBDF]; ok {
|
|
||||||
currentSlot = slot
|
|
||||||
} else if slot, ok := pciToSlot[currentPCIBDF]; ok {
|
|
||||||
currentSlot = slot
|
|
||||||
} else {
|
|
||||||
currentSlot = ""
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for Device Serial Number in capabilities
|
|
||||||
if match := gpuSerialRegex.FindStringSubmatch(line); len(match) > 1 && currentSlot != "" {
|
|
||||||
serialNumber := match[1]
|
|
||||||
// Format: 14-17-dc-65-77-2d-b0-48
|
|
||||||
// Convert to more readable format: 48:b0:2d:77:65:dc:17:14 (reversed)
|
|
||||||
serialFormatted := formatGPUSerial(serialNumber)
|
|
||||||
|
|
||||||
// Find the GPU in our results and update its serial number
|
|
||||||
for i := range result.Hardware.GPUs {
|
|
||||||
if result.Hardware.GPUs[i].Slot == currentSlot {
|
|
||||||
result.Hardware.GPUs[i].SerialNumber = serialFormatted
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if foundSerial != "" {
|
||||||
|
result.Hardware.GPUs[i].SerialNumber = foundSerial
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return scanner.Err()
|
return scanner.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
// formatGPUSerial formats GPU serial number from PCIe format to human-readable
|
|
||||||
// Input: "14-17-dc-65-77-2d-b0-48" (little-endian from PCIe)
|
|
||||||
// Output: "48:b0:2d:77:65:dc:17:14" (reversed to match GPU label)
|
|
||||||
func formatGPUSerial(serial string) string {
|
|
||||||
parts := strings.Split(serial, "-")
|
|
||||||
if len(parts) != 8 {
|
|
||||||
return serial // Return as-is if unexpected format
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reverse the bytes (PCIe reports in little-endian)
|
|
||||||
reversed := make([]string, len(parts))
|
|
||||||
for i := range parts {
|
|
||||||
reversed[len(parts)-1-i] = strings.ToUpper(parts[i])
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.Join(reversed, ":")
|
|
||||||
}
|
|
||||||
|
|
||||||
// findInventoryOutputLog finds the inventory/output.log file
|
// findInventoryOutputLog finds the inventory/output.log file
|
||||||
func findInventoryOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
|
func findInventoryOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
|
||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
|
|||||||
@@ -39,9 +39,10 @@ func TestParseInventoryLog(t *testing.T) {
|
|||||||
|
|
||||||
content := string(inventoryLog.Content)
|
content := string(inventoryLog.Content)
|
||||||
|
|
||||||
// Test devname regex
|
// Test devname regex - this extracts both slot mapping and serial numbers
|
||||||
t.Log("Testing devname extraction:")
|
t.Log("Testing devname extraction:")
|
||||||
lines := strings.Split(content, "\n")
|
lines := strings.Split(content, "\n")
|
||||||
|
serialCount := 0
|
||||||
for i, line := range lines {
|
for i, line := range lines {
|
||||||
if strings.Contains(line, "devname=") && strings.Contains(line, "fieldiag") {
|
if strings.Contains(line, "devname=") && strings.Contains(line, "fieldiag") {
|
||||||
t.Logf("Line %d: Found fieldiag command", i)
|
t.Logf("Line %d: Found fieldiag command", i)
|
||||||
@@ -49,34 +50,29 @@ func TestParseInventoryLog(t *testing.T) {
|
|||||||
t.Logf(" Found %d devname matches", len(matches))
|
t.Logf(" Found %d devname matches", len(matches))
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
if len(match) == 3 {
|
if len(match) == 3 {
|
||||||
t.Logf(" PCI: %s -> Slot: %s", match[1], match[2])
|
pciBDF := match[1]
|
||||||
|
slotName := match[2]
|
||||||
|
t.Logf(" PCI: %s -> Slot: %s", pciBDF, slotName)
|
||||||
|
|
||||||
|
// Extract serial number from slot name
|
||||||
|
if strings.HasPrefix(slotName, "SXM") {
|
||||||
|
parts := strings.Split(slotName, "_")
|
||||||
|
if len(parts) == 3 && parts[1] == "SN" {
|
||||||
|
serial := parts[2]
|
||||||
|
t.Logf(" Serial: %s", serial)
|
||||||
|
serialCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
t.Logf("\nTotal GPU serials extracted: %d", serialCount)
|
||||||
|
|
||||||
// Test lspci regex
|
if serialCount == 0 {
|
||||||
t.Log("\nTesting lspci BDF extraction:")
|
t.Error("Expected to find GPU serial numbers, but found none")
|
||||||
serialCount := 0
|
|
||||||
bdfCount := 0
|
|
||||||
for i, line := range lines {
|
|
||||||
// Check for lines that look like lspci headers
|
|
||||||
if strings.Contains(line, "3D controller") && strings.Contains(line, "NVIDIA") {
|
|
||||||
t.Logf("Line %d: Potential lspci line: %q (starts with: %q)", i, line[:min(80, len(line))], line[:min(10, len(line))])
|
|
||||||
if match := pciBDFRegex.FindStringSubmatch(line); len(match) > 1 {
|
|
||||||
bdfCount++
|
|
||||||
t.Logf(" -> Matched BDF: %s", match[1])
|
|
||||||
} else {
|
|
||||||
t.Logf(" -> NO MATCH")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if match := gpuSerialRegex.FindStringSubmatch(line); len(match) > 1 {
|
|
||||||
serialCount++
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
t.Logf("\nTotal BDFs found: %d", bdfCount)
|
|
||||||
t.Logf("Total serials found: %d", serialCount)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func min(a, b int) int {
|
func min(a, b int) int {
|
||||||
|
|||||||
24
internal/parser/vendors/nvidia/parser_test.go
vendored
24
internal/parser/vendors/nvidia/parser_test.go
vendored
@@ -119,27 +119,3 @@ func findSubstring(s, substr string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFormatGPUSerial(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
input string
|
|
||||||
expected string
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
input: "14-17-dc-65-77-2d-b0-48",
|
|
||||||
expected: "48:B0:2D:77:65:DC:17:14",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
input: "f2-fd-85-e0-2f-2d-b0-48",
|
|
||||||
expected: "48:B0:2D:2F:E0:85:FD:F2",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.input, func(t *testing.T) {
|
|
||||||
result := formatGPUSerial(tt.input)
|
|
||||||
if result != tt.expected {
|
|
||||||
t.Errorf("formatGPUSerial(%s) = %s, want %s", tt.input, result, tt.expected)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user