Add GPU serial number extraction for NVIDIA diagnostics
Parse inventory/output.log to extract GPU serial numbers from lspci output, expose them via serials API, and add GPU category to web UI. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
143
internal/parser/vendors/nvidia/inventory_log.go
vendored
Normal file
143
internal/parser/vendors/nvidia/inventory_log.go
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
var (
	// gpuSerialRegex captures the value of the PCIe "Device Serial Number"
	// capability as printed by verbose lspci output.
	// Example: " Capabilities: [2f0 v1] Device Serial Number 14-17-dc-65-77-2d-b0-48"
	gpuSerialRegex = regexp.MustCompile(`Device Serial Number\s+([\da-fA-F-]+)`)

	// pciBDFRegex captures the PCI address from the lspci header line of an
	// NVIDIA 3D controller. The header must start at column 0.
	// Both address forms are accepted:
	//   short (bus:device.function):         "2a:00.0 3D controller: NVIDIA Corporation Device 2335 (rev a1)"
	//   domain-qualified (as with lspci -D): "0000:2a:00.0 3D controller: NVIDIA Corporation Device 2335 (rev a1)"
	pciBDFRegex = regexp.MustCompile(`^((?:[0-9a-fA-F]{4}:)?[0-9a-fA-F]{2,4}:[0-9a-fA-F]{2}\.[0-9])\s+3D controller.*NVIDIA`)

	// devnameRegex captures the (PCI address, device name) pairs passed to
	// fieldiag on its command line.
	// Example: "devname=0000:ba:00.0,SXM5_SN_1653925027099"
	devnameRegex = regexp.MustCompile(`devname=([\da-fA-F:\.]+),(\w+)`)
)
|
||||
|
||||
// ParseInventoryLog parses inventory/output.log to extract GPU serial numbers
|
||||
// from lspci output and map them to slots
|
||||
func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
|
||||
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
|
||||
// No GPUs to update
|
||||
return nil
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(strings.NewReader(string(content)))
|
||||
|
||||
// First pass: build mapping of PCI BDF -> Slot name from fieldiag command line
|
||||
pciToSlot := make(map[string]string)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
// Look for fieldiag command with devname parameters
|
||||
if strings.Contains(line, "devname=") && strings.Contains(line, "fieldiag") {
|
||||
matches := devnameRegex.FindAllStringSubmatch(line, -1)
|
||||
for _, match := range matches {
|
||||
if len(match) == 3 {
|
||||
pciBDF := match[1]
|
||||
slotName := match[2]
|
||||
// Extract slot number from name like "SXM5_SN_1653925027099"
|
||||
// We want to map to slot like "GPUSXM5"
|
||||
if strings.HasPrefix(slotName, "SXM") {
|
||||
parts := strings.Split(slotName, "_")
|
||||
if len(parts) >= 1 {
|
||||
// Convert "SXM5" to "GPUSXM5"
|
||||
slot := "GPU" + parts[0]
|
||||
pciToSlot[pciBDF] = slot
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: extract GPU serial numbers from lspci output
|
||||
scanner = bufio.NewScanner(strings.NewReader(string(content)))
|
||||
var currentPCIBDF string
|
||||
var currentSlot string
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Check if this is a new GPU device header
|
||||
if match := pciBDFRegex.FindStringSubmatch(line); len(match) > 1 {
|
||||
currentPCIBDF = match[1]
|
||||
// Normalize BDF format - lspci uses short format (bus:device.function)
|
||||
// but fieldiag uses full format (domain:bus:device.function)
|
||||
// Convert "2a:00.0" to "0000:2a:00.0"
|
||||
normalizedBDF := currentPCIBDF
|
||||
if len(strings.Split(currentPCIBDF, ":")) == 2 {
|
||||
// Short format without domain, add 0000:
|
||||
normalizedBDF = "0000:" + currentPCIBDF
|
||||
}
|
||||
|
||||
// Map to slot name if we have it
|
||||
if slot, ok := pciToSlot[normalizedBDF]; ok {
|
||||
currentSlot = slot
|
||||
} else if slot, ok := pciToSlot[currentPCIBDF]; ok {
|
||||
currentSlot = slot
|
||||
} else {
|
||||
currentSlot = ""
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for Device Serial Number in capabilities
|
||||
if match := gpuSerialRegex.FindStringSubmatch(line); len(match) > 1 && currentSlot != "" {
|
||||
serialNumber := match[1]
|
||||
// Format: 14-17-dc-65-77-2d-b0-48
|
||||
// Convert to more readable format: 48:b0:2d:77:65:dc:17:14 (reversed)
|
||||
serialFormatted := formatGPUSerial(serialNumber)
|
||||
|
||||
// Find the GPU in our results and update its serial number
|
||||
for i := range result.Hardware.GPUs {
|
||||
if result.Hardware.GPUs[i].Slot == currentSlot {
|
||||
result.Hardware.GPUs[i].SerialNumber = serialFormatted
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return scanner.Err()
|
||||
}
|
||||
|
||||
// formatGPUSerial converts a GPU serial number from the form printed by lspci
// into the form used for display.
//
// Input:  "14-17-dc-65-77-2d-b0-48" (eight dash-separated hex bytes)
// Output: "48:B0:2D:77:65:DC:17:14" (byte order reversed, upper-case,
// colon-separated)
//
// Any value that is not exactly eight two-digit hex groups is returned
// unchanged.
func formatGPUSerial(serial string) string {
	groups := strings.Split(serial, "-")
	if len(groups) != 8 {
		return serial // Unexpected format, return as-is.
	}
	for _, group := range groups {
		if !isHexPair(group) {
			return serial // Empty or malformed byte, return as-is.
		}
	}

	// Reverse the byte order in place.
	for lo, hi := 0, len(groups)-1; lo < hi; lo, hi = lo+1, hi-1 {
		groups[lo], groups[hi] = groups[hi], groups[lo]
	}

	return strings.ToUpper(strings.Join(groups, ":"))
}

// isHexPair reports whether s consists of exactly two hexadecimal digits.
func isHexPair(s string) bool {
	if len(s) != 2 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		isDigit := c >= '0' && c <= '9'
		isLower := c >= 'a' && c <= 'f'
		isUpper := c >= 'A' && c <= 'F'
		if !isDigit && !isLower && !isUpper {
			return false
		}
	}
	return true
}
|
||||
|
||||
// findInventoryOutputLog finds the inventory/output.log file
|
||||
func findInventoryOutputLog(files []parser.ExtractedFile) *parser.ExtractedFile {
|
||||
for _, f := range files {
|
||||
// Look for inventory/output.log
|
||||
path := strings.ToLower(f.Path)
|
||||
if strings.Contains(path, "inventory/output.log") ||
|
||||
strings.Contains(path, "inventory\\output.log") {
|
||||
return &f
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
87
internal/parser/vendors/nvidia/inventory_log_test.go
vendored
Normal file
87
internal/parser/vendors/nvidia/inventory_log_test.go
vendored
Normal file
@@ -0,0 +1,87 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
func TestParseInventoryLog(t *testing.T) {
|
||||
// Test with the real archive
|
||||
archivePath := filepath.Join("../../../../example", "A514359X5A09844_logs-20260115-151707.tar")
|
||||
|
||||
// Check if file exists
|
||||
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
|
||||
t.Skip("Test archive not found, skipping test")
|
||||
}
|
||||
|
||||
// Extract files from archive
|
||||
files, err := parser.ExtractArchive(archivePath)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to extract archive: %v", err)
|
||||
}
|
||||
|
||||
// Find inventory/output.log
|
||||
var inventoryLog *parser.ExtractedFile
|
||||
for _, f := range files {
|
||||
if strings.Contains(f.Path, "inventory/output.log") {
|
||||
inventoryLog = &f
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if inventoryLog == nil {
|
||||
t.Fatal("inventory/output.log not found")
|
||||
}
|
||||
|
||||
content := string(inventoryLog.Content)
|
||||
|
||||
// Test devname regex
|
||||
t.Log("Testing devname extraction:")
|
||||
lines := strings.Split(content, "\n")
|
||||
for i, line := range lines {
|
||||
if strings.Contains(line, "devname=") && strings.Contains(line, "fieldiag") {
|
||||
t.Logf("Line %d: Found fieldiag command", i)
|
||||
matches := devnameRegex.FindAllStringSubmatch(line, -1)
|
||||
t.Logf(" Found %d devname matches", len(matches))
|
||||
for _, match := range matches {
|
||||
if len(match) == 3 {
|
||||
t.Logf(" PCI: %s -> Slot: %s", match[1], match[2])
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Test lspci regex
|
||||
t.Log("\nTesting lspci BDF extraction:")
|
||||
serialCount := 0
|
||||
bdfCount := 0
|
||||
for i, line := range lines {
|
||||
// Check for lines that look like lspci headers
|
||||
if strings.Contains(line, "3D controller") && strings.Contains(line, "NVIDIA") {
|
||||
t.Logf("Line %d: Potential lspci line: %q (starts with: %q)", i, line[:min(80, len(line))], line[:min(10, len(line))])
|
||||
if match := pciBDFRegex.FindStringSubmatch(line); len(match) > 1 {
|
||||
bdfCount++
|
||||
t.Logf(" -> Matched BDF: %s", match[1])
|
||||
} else {
|
||||
t.Logf(" -> NO MATCH")
|
||||
}
|
||||
}
|
||||
if match := gpuSerialRegex.FindStringSubmatch(line); len(match) > 1 {
|
||||
serialCount++
|
||||
}
|
||||
}
|
||||
t.Logf("\nTotal BDFs found: %d", bdfCount)
|
||||
t.Logf("Total serials found: %d", serialCount)
|
||||
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||
11
internal/parser/vendors/nvidia/parser.go
vendored
11
internal/parser/vendors/nvidia/parser.go
vendored
@@ -14,7 +14,7 @@ import (
|
||||
|
||||
// parserVersion - version of this parser module
|
||||
// IMPORTANT: Increment this version when making changes to parser logic!
|
||||
const parserVersion = "1.1.0"
|
||||
const parserVersion = "1.2.0"
|
||||
|
||||
func init() {
|
||||
parser.Register(&Parser{})
|
||||
@@ -124,6 +124,15 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
|
||||
}
|
||||
}
|
||||
|
||||
// Parse inventory/output.log (contains GPU serial numbers from lspci)
|
||||
inventoryLogFile := findInventoryOutputLog(files)
|
||||
if inventoryLogFile != nil {
|
||||
if err := ParseInventoryLog(inventoryLogFile.Content, result); err != nil {
|
||||
// Log error but continue parsing other files
|
||||
_ = err // Ignore error for now
|
||||
}
|
||||
}
|
||||
|
||||
// Parse summary.json (test results summary)
|
||||
if f := parser.FindFileByName(files, "summary.json"); f != nil {
|
||||
events := ParseSummaryJSON(f.Content)
|
||||
|
||||
145
internal/parser/vendors/nvidia/parser_test.go
vendored
Normal file
145
internal/parser/vendors/nvidia/parser_test.go
vendored
Normal file
@@ -0,0 +1,145 @@
|
||||
package nvidia
|
||||
|
||||
import (
	"os"
	"path/filepath"
	"strings"
	"testing"

	"git.mchus.pro/mchus/logpile/internal/parser"
)
|
||||
|
||||
func TestNVIDIAParser_RealArchive(t *testing.T) {
|
||||
// Test with the real archive that was reported as problematic
|
||||
archivePath := filepath.Join("../../../../example", "A514359X5A09844_logs-20260115-151707.tar")
|
||||
|
||||
// Check if file exists
|
||||
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
|
||||
t.Skip("Test archive not found, skipping test")
|
||||
}
|
||||
|
||||
// Extract files from archive
|
||||
files, err := parser.ExtractArchive(archivePath)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to extract archive: %v", err)
|
||||
}
|
||||
|
||||
// Check if inventory/output.log exists
|
||||
hasInventoryLog := false
|
||||
for _, f := range files {
|
||||
if filepath.Base(f.Path) == "output.log" {
|
||||
t.Logf("Found file: %s", f.Path)
|
||||
}
|
||||
if f.Path == "./inventory/output.log" || f.Path == "inventory/output.log" {
|
||||
hasInventoryLog = true
|
||||
t.Logf("Found inventory/output.log with %d bytes", len(f.Content))
|
||||
}
|
||||
}
|
||||
if !hasInventoryLog {
|
||||
t.Error("inventory/output.log not found in extracted files")
|
||||
}
|
||||
|
||||
// Create parser and parse
|
||||
p := &Parser{}
|
||||
result, err := p.Parse(files)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to parse archive: %v", err)
|
||||
}
|
||||
|
||||
// Verify basic system info
|
||||
if result.Hardware.BoardInfo.Manufacturer == "" {
|
||||
t.Error("Expected Manufacturer to be set")
|
||||
}
|
||||
if result.Hardware.BoardInfo.ProductName == "" {
|
||||
t.Error("Expected ProductName to be set")
|
||||
}
|
||||
if result.Hardware.BoardInfo.SerialNumber == "" {
|
||||
t.Error("Expected SerialNumber to be set")
|
||||
}
|
||||
|
||||
t.Logf("System Info:")
|
||||
t.Logf(" Manufacturer: %s", result.Hardware.BoardInfo.Manufacturer)
|
||||
t.Logf(" Product: %s", result.Hardware.BoardInfo.ProductName)
|
||||
t.Logf(" Serial: %s", result.Hardware.BoardInfo.SerialNumber)
|
||||
|
||||
// Verify GPUs were found
|
||||
if len(result.Hardware.GPUs) == 0 {
|
||||
t.Error("Expected to find GPUs")
|
||||
}
|
||||
|
||||
t.Logf("\nFound %d GPUs:", len(result.Hardware.GPUs))
|
||||
|
||||
gpusWithSerials := 0
|
||||
for _, gpu := range result.Hardware.GPUs {
|
||||
t.Logf(" %s: %s (Firmware: %s, Serial: %s, BDF: %s)",
|
||||
gpu.Slot, gpu.Model, gpu.Firmware, gpu.SerialNumber, gpu.BDF)
|
||||
|
||||
if gpu.SerialNumber != "" {
|
||||
gpusWithSerials++
|
||||
}
|
||||
}
|
||||
|
||||
// Verify that GPU serial numbers were extracted
|
||||
if gpusWithSerials == 0 {
|
||||
t.Error("Expected at least some GPUs to have serial numbers")
|
||||
}
|
||||
|
||||
t.Logf("\nGPUs with serial numbers: %d/%d", gpusWithSerials, len(result.Hardware.GPUs))
|
||||
|
||||
// Check events for SXM2 failures
|
||||
t.Logf("\nTotal events: %d", len(result.Events))
|
||||
|
||||
// Look for the specific serial or SXM2
|
||||
sxm2Events := 0
|
||||
for _, event := range result.Events {
|
||||
desc := event.Description + " " + event.RawData + " " + event.EventType
|
||||
if contains(desc, "SXM2") || contains(desc, "1653925025827") {
|
||||
t.Logf(" SXM2 Event: [%s] %s (Severity: %s)", event.EventType, event.Description, event.Severity)
|
||||
sxm2Events++
|
||||
}
|
||||
}
|
||||
|
||||
if sxm2Events == 0 {
|
||||
t.Error("Expected to find events for SXM2 (faulty GPU 1653925025827)")
|
||||
}
|
||||
t.Logf("\nSXM2 failure events: %d", sxm2Events)
|
||||
}
|
||||
|
||||
// contains reports whether substr occurs within s. An empty substr is
// contained in every string.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}

// findSubstring reports whether substr occurs within s. It is kept for
// existing callers and behaves exactly like contains.
func findSubstring(s, substr string) bool {
	return strings.Contains(s, substr)
}
|
||||
|
||||
func TestFormatGPUSerial(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
input: "14-17-dc-65-77-2d-b0-48",
|
||||
expected: "48:B0:2D:77:65:DC:17:14",
|
||||
},
|
||||
{
|
||||
input: "f2-fd-85-e0-2f-2d-b0-48",
|
||||
expected: "48:B0:2D:2F:E0:85:FD:F2",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
result := formatGPUSerial(tt.input)
|
||||
if result != tt.expected {
|
||||
t.Errorf("formatGPUSerial(%s) = %s, want %s", tt.input, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -391,6 +391,24 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
|
||||
})
|
||||
}
|
||||
|
||||
// GPUs
|
||||
for _, gpu := range result.Hardware.GPUs {
|
||||
if gpu.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := gpu.Model
|
||||
if model == "" {
|
||||
model = "GPU"
|
||||
}
|
||||
serials = append(serials, SerialEntry{
|
||||
Component: model,
|
||||
Location: gpu.Slot,
|
||||
SerialNumber: gpu.SerialNumber,
|
||||
Manufacturer: gpu.Manufacturer,
|
||||
Category: "GPU",
|
||||
})
|
||||
}
|
||||
|
||||
// PCIe devices
|
||||
for _, pcie := range result.Hardware.PCIeDevices {
|
||||
if pcie.SerialNumber == "" {
|
||||
|
||||
132
internal/server/handlers_gpu_test.go
Normal file
132
internal/server/handlers_gpu_test.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
)
|
||||
|
||||
func TestHandleGetSerials_WithGPUs(t *testing.T) {
|
||||
// Create test server with GPU data
|
||||
srv := &Server{}
|
||||
|
||||
testResult := &models.AnalysisResult{
|
||||
Hardware: &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
{
|
||||
Slot: "GPUSXM1",
|
||||
Model: "NVIDIA Device 2335",
|
||||
Manufacturer: "NVIDIA Corporation",
|
||||
SerialNumber: "48:B0:2D:BB:8E:51:9E:E5",
|
||||
Firmware: "96.00.D0.00.03",
|
||||
BDF: "0000:3a:00.0",
|
||||
},
|
||||
{
|
||||
Slot: "GPUSXM2",
|
||||
Model: "NVIDIA Device 2335",
|
||||
Manufacturer: "NVIDIA Corporation",
|
||||
SerialNumber: "48:B0:2D:EE:DA:27:CF:78",
|
||||
Firmware: "96.00.D0.00.03",
|
||||
BDF: "0000:18:00.0",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
srv.SetResult(testResult)
|
||||
|
||||
// Create request
|
||||
req := httptest.NewRequest("GET", "/api/serials", nil)
|
||||
w := httptest.NewRecorder()
|
||||
|
||||
// Call handler
|
||||
srv.handleGetSerials(w, req)
|
||||
|
||||
// Check response
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("Expected status 200, got %d", w.Code)
|
||||
}
|
||||
|
||||
// Parse response
|
||||
var serials []struct {
|
||||
Component string `json:"component"`
|
||||
Location string `json:"location,omitempty"`
|
||||
SerialNumber string `json:"serial_number"`
|
||||
Manufacturer string `json:"manufacturer,omitempty"`
|
||||
Category string `json:"category"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(w.Body).Decode(&serials); err != nil {
|
||||
t.Fatalf("Failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
// Check that we have GPU entries
|
||||
gpuCount := 0
|
||||
for _, s := range serials {
|
||||
if s.Category == "GPU" {
|
||||
gpuCount++
|
||||
t.Logf("Found GPU: %s (%s) S/N: %s", s.Component, s.Location, s.SerialNumber)
|
||||
|
||||
// Verify fields are set
|
||||
if s.SerialNumber == "" {
|
||||
t.Errorf("GPU serial number is empty")
|
||||
}
|
||||
if s.Location == "" {
|
||||
t.Errorf("GPU location is empty")
|
||||
}
|
||||
if s.Manufacturer == "" {
|
||||
t.Errorf("GPU manufacturer is empty")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if gpuCount != 2 {
|
||||
t.Errorf("Expected 2 GPUs in serials, got %d", gpuCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleGetSerials_WithoutGPUSerials(t *testing.T) {
|
||||
// Create test server with GPUs but no serial numbers
|
||||
srv := &Server{}
|
||||
|
||||
testResult := &models.AnalysisResult{
|
||||
Hardware: &models.HardwareConfig{
|
||||
GPUs: []models.GPU{
|
||||
{
|
||||
Slot: "GPU0",
|
||||
Model: "Some GPU",
|
||||
Manufacturer: "Vendor",
|
||||
SerialNumber: "", // No serial number
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
srv.SetResult(testResult)
|
||||
|
||||
// Create request
|
||||
req := httptest.NewRequest("GET", "/api/serials", nil)
|
||||
w := httptest.NewRecorder()
|
||||
|
||||
// Call handler
|
||||
srv.handleGetSerials(w, req)
|
||||
|
||||
// Parse response
|
||||
var serials []struct {
|
||||
Category string `json:"category"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(w.Body).Decode(&serials); err != nil {
|
||||
t.Fatalf("Failed to decode response: %v", err)
|
||||
}
|
||||
|
||||
// Check that GPUs without serial numbers are not included
|
||||
for _, s := range serials {
|
||||
if s.Category == "GPU" {
|
||||
t.Error("GPU without serial number should not be included in serials list")
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user