nvidia: improve component mapping, firmware, statuses and check times

This commit is contained in:
2026-02-16 23:17:13 +03:00
parent 514da76ddb
commit b33cca5fcc
19 changed files with 2051 additions and 65 deletions

View File

@@ -11,6 +11,8 @@ import (
"git.mchus.pro/mchus/logpile/internal/models"
)
// cpuMicrocodeFirmwareRegex matches firmware device names of the form
// "CPU<n> Microcode": case-insensitive, anchored to the whole string, with
// one or more whitespace characters between the two words.
var cpuMicrocodeFirmwareRegex = regexp.MustCompile(`(?i)^cpu\d+\s+microcode$`)
// ConvertToReanimator converts AnalysisResult to Reanimator export format
func ConvertToReanimator(result *models.AnalysisResult) (*ReanimatorExport, error) {
if result == nil {
@@ -77,14 +79,39 @@ func convertFirmware(firmware []models.FirmwareInfo) []ReanimatorFirmware {
result := make([]ReanimatorFirmware, 0, len(firmware))
for _, fw := range firmware {
if isDeviceBoundFirmwareName(fw.DeviceName) {
continue
}
result = append(result, ReanimatorFirmware{
DeviceName: fw.DeviceName,
Version: fw.Version,
})
}
if len(result) == 0 {
return nil
}
return result
}
// isDeviceBoundFirmwareName reports whether a firmware entry name belongs to
// an individual device rather than to the machine as a whole.
//
// A name is device-bound when, ignoring case and surrounding whitespace, it
// starts with one of the known device prefixes (GPU, NVSwitch, NIC, HDD, SSD,
// NVMe, PSU) or matches the "CPU<n> Microcode" pattern. Empty names are never
// device-bound.
func isDeviceBoundFirmwareName(name string) bool {
	lowered := strings.TrimSpace(strings.ToLower(name))
	if lowered == "" {
		return false
	}

	// "psu" intentionally has no trailing space, so names such as "PSU1" match.
	deviceBoundPrefixes := [...]string{
		"gpu ",
		"nvswitch ",
		"nic ",
		"hdd ",
		"ssd ",
		"nvme ",
		"psu",
	}
	for _, prefix := range deviceBoundPrefixes {
		if strings.HasPrefix(lowered, prefix) {
			return true
		}
	}

	// The regex is case-insensitive itself, so it is applied to the
	// original (trimmed) name.
	return cpuMicrocodeFirmwareRegex.MatchString(strings.TrimSpace(name))
}
// convertCPUs converts CPU information to Reanimator format
func convertCPUs(cpus []models.CPU, collectedAt string) []ReanimatorCPU {
if len(cpus) == 0 {
@@ -229,6 +256,7 @@ func convertStorage(storage []models.Storage, collectedAt string) []ReanimatorSt
func convertPCIeDevices(hw *models.HardwareConfig, collectedAt string) []ReanimatorPCIe {
result := make([]ReanimatorPCIe, 0)
gpuSlots := make(map[string]struct{}, len(hw.GPUs))
nvswitchFirmwareBySlot := buildNVSwitchFirmwareBySlot(hw.Firmware)
for _, gpu := range hw.GPUs {
slot := strings.ToLower(strings.TrimSpace(gpu.Slot))
if slot != "" {
@@ -254,6 +282,10 @@ func convertPCIeDevices(hw *models.HardwareConfig, collectedAt string) []Reanima
}
status := normalizeStatus(pcie.Status, false)
firmware := ""
if isNVSwitchPCIeDevice(pcie) {
firmware = nvswitchFirmwareBySlot[normalizeNVSwitchSlotForLookup(pcie.Slot)]
}
meta := buildStatusMeta(
status,
pcie.StatusCheckedAt,
@@ -277,7 +309,7 @@ func convertPCIeDevices(hw *models.HardwareConfig, collectedAt string) []Reanima
MaxLinkWidth: pcie.MaxLinkWidth,
MaxLinkSpeed: pcie.MaxLinkSpeed,
SerialNumber: serialNumber,
Firmware: "", // PCIeDevice doesn't have firmware in models
Firmware: firmware,
Status: status,
StatusCheckedAt: meta.StatusCheckedAt,
StatusChangedAt: meta.StatusChangedAt,
@@ -373,6 +405,57 @@ func convertPCIeDevices(hw *models.HardwareConfig, collectedAt string) []Reanima
return result
}
// isNVSwitchPCIeDevice reports whether a PCIe device is an NVSwitch, either
// because its device class equals "NVSwitch" (case-insensitive) or because its
// normalized slot name starts with "NVSWITCH".
func isNVSwitchPCIeDevice(pcie models.PCIeDevice) bool {
	classMatches := strings.EqualFold(strings.TrimSpace(pcie.DeviceClass), "NVSwitch")
	return classMatches ||
		strings.HasPrefix(normalizeNVSwitchSlotForLookup(pcie.Slot), "NVSWITCH")
}
// buildNVSwitchFirmwareBySlot indexes NVSwitch firmware versions by normalized
// slot name.
//
// Only entries whose device name starts with "NVSwitch " (case-insensitive)
// are considered. The slot is the text after that prefix, up to an optional
// " (" suffix, e.g. "NVSwitch NVSWITCH1 (965-...)" yields slot "NVSWITCH1".
// Entries with an empty slot or empty version are ignored, and the first
// non-empty version seen for a slot wins.
func buildNVSwitchFirmwareBySlot(firmware []models.FirmwareInfo) map[string]string {
	// Same byte length as "NVSwitch ", so it can be used both for the
	// case-insensitive prefix test and for slicing the original name.
	const namePrefix = "NVSWITCH "

	bySlot := make(map[string]string)
	for _, entry := range firmware {
		deviceName := strings.TrimSpace(entry.DeviceName)
		if !strings.HasPrefix(strings.ToUpper(deviceName), namePrefix) {
			continue
		}

		remainder := strings.TrimSpace(deviceName[len(namePrefix):])
		if remainder == "" {
			continue
		}

		// Drop a trailing " (part number)" annotation when present.
		slotPart := remainder
		if cut := strings.Index(remainder, " ("); cut > 0 {
			slotPart = strings.TrimSpace(remainder[:cut])
		}

		key := normalizeNVSwitchSlotForLookup(slotPart)
		if key == "" {
			continue
		}
		if _, seen := bySlot[key]; seen {
			continue
		}
		if version := strings.TrimSpace(entry.Version); version != "" {
			bySlot[key] = version
		}
	}
	return bySlot
}
// normalizeNVSwitchSlotForLookup returns the slot name trimmed and
// upper-cased, collapsing one leading "NVSWITCHNVSWITCH" duplication into a
// single "NVSWITCH" prefix so that lookups use a consistent key.
func normalizeNVSwitchSlotForLookup(slot string) string {
	const doubledPrefix = "NVSWITCHNVSWITCH"

	upper := strings.ToUpper(strings.TrimSpace(slot))
	if !strings.HasPrefix(upper, doubledPrefix) {
		return upper
	}
	return "NVSWITCH" + upper[len(doubledPrefix):]
}
func isDisplayClass(deviceClass string) bool {
class := strings.ToLower(strings.TrimSpace(deviceClass))
return strings.Contains(class, "display") ||

View File

@@ -359,6 +359,12 @@ func TestConvertPCIeDevices(t *testing.T) {
func TestConvertPCIeDevices_NVSwitchWithoutSerialRemainsEmpty(t *testing.T) {
hw := &models.HardwareConfig{
Firmware: []models.FirmwareInfo{
{
DeviceName: "NVSwitch NVSWITCH1 (965-25612-0002-000)",
Version: "96.10.6D.00.01",
},
},
PCIeDevices: []models.PCIeDevice{
{
Slot: "NVSWITCH1",
@@ -378,6 +384,9 @@ func TestConvertPCIeDevices_NVSwitchWithoutSerialRemainsEmpty(t *testing.T) {
if result[0].SerialNumber != "" {
t.Fatalf("expected empty NVSwitch serial, got %q", result[0].SerialNumber)
}
if result[0].Firmware != "96.10.6D.00.01" {
t.Fatalf("expected NVSwitch firmware 96.10.6D.00.01, got %q", result[0].Firmware)
}
}
func TestConvertPCIeDevices_SkipsDisplayControllerDuplicates(t *testing.T) {
@@ -646,3 +655,47 @@ func TestConvertToReanimator_DeduplicatesAllSections(t *testing.T) {
t.Fatalf("expected single #GPU0 record, got %d", gpuCount)
}
}
// TestConvertToReanimator_FirmwareExcludesDeviceBoundEntries checks that only
// machine-level firmware (BIOS, BMC) is exported in hardware.firmware, while
// GPU, NVSwitch, NIC and CPU microcode entries are filtered out.
func TestConvertToReanimator_FirmwareExcludesDeviceBoundEntries(t *testing.T) {
	firmware := []models.FirmwareInfo{
		{DeviceName: "BIOS", Version: "1.0.0"},
		{DeviceName: "BMC", Version: "2.0.0"},
		{DeviceName: "GPU GPUSXM1 (692-2G520-0280-501)", Version: "96.00.D0.00.03"},
		{DeviceName: "NVSwitch NVSWITCH0 (965-25612-0002-000)", Version: "96.10.6D.00.01"},
		{DeviceName: "NIC #CPU1_PCIE9 (MCX512A-ACAT)", Version: "28.38.1900"},
		{DeviceName: "CPU0 Microcode", Version: "0x2b000643"},
	}
	input := &models.AnalysisResult{
		Filename: "fw-filter-test.json",
		Hardware: &models.HardwareConfig{
			BoardInfo: models.BoardInfo{SerialNumber: "BOARD-001"},
			Firmware:  firmware,
		},
	}

	out, err := ConvertToReanimator(input)
	if err != nil {
		t.Fatalf("ConvertToReanimator() failed: %v", err)
	}

	exported := out.Hardware.Firmware
	if len(exported) != 2 {
		t.Fatalf("expected only machine-level firmware entries, got %d", len(exported))
	}

	versionByName := make(map[string]string, len(exported))
	for _, fw := range exported {
		versionByName[fw.DeviceName] = fw.Version
	}

	// Machine-level entries must survive with their versions intact.
	if versionByName["BIOS"] != "1.0.0" {
		t.Fatalf("expected BIOS firmware to be kept")
	}
	if versionByName["BMC"] != "2.0.0" {
		t.Fatalf("expected BMC firmware to be kept")
	}

	// Device-bound entries must not appear at all.
	if _, found := versionByName["GPU GPUSXM1 (692-2G520-0280-501)"]; found {
		t.Fatalf("expected GPU firmware to be excluded from hardware.firmware")
	}
	if _, found := versionByName["NVSwitch NVSWITCH0 (965-25612-0002-000)"]; found {
		t.Fatalf("expected NVSwitch firmware to be excluded from hardware.firmware")
	}
}

View File

@@ -97,6 +97,7 @@ type HardwareConfig struct {
// FirmwareInfo represents firmware version information for a named device.
// DeviceName and Version are always present in the JSON encoding; Description
// and BuildTime are optional and omitted from JSON when empty.
type FirmwareInfo struct {
DeviceName string `json:"device_name"`
Description string `json:"description,omitempty"`
Version string `json:"version"`
BuildTime string `json:"build_time,omitempty"`
}
@@ -105,6 +106,7 @@ type FirmwareInfo struct {
type BoardInfo struct {
Manufacturer string `json:"manufacturer,omitempty"`
ProductName string `json:"product_name,omitempty"`
Description string `json:"description,omitempty"`
SerialNumber string `json:"serial_number,omitempty"`
PartNumber string `json:"part_number,omitempty"`
Version string `json:"version,omitempty"`
@@ -115,6 +117,7 @@ type BoardInfo struct {
type CPU struct {
Socket int `json:"socket"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
Cores int `json:"cores"`
Threads int `json:"threads"`
FrequencyMHz int `json:"frequency_mhz"`
@@ -138,6 +141,7 @@ type CPU struct {
type MemoryDIMM struct {
Slot string `json:"slot"`
Location string `json:"location"`
Description string `json:"description,omitempty"`
Present bool `json:"present"`
SizeMB int `json:"size_mb"`
Type string `json:"type"`
@@ -162,6 +166,7 @@ type Storage struct {
Slot string `json:"slot"`
Type string `json:"type"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
SizeGB int `json:"size_gb"`
SerialNumber string `json:"serial_number,omitempty"`
Manufacturer string `json:"manufacturer,omitempty"`
@@ -182,6 +187,7 @@ type Storage struct {
// PCIeDevice represents a PCIe device
type PCIeDevice struct {
Slot string `json:"slot"`
Description string `json:"description,omitempty"`
VendorID int `json:"vendor_id"`
DeviceID int `json:"device_id"`
BDF string `json:"bdf"`
@@ -207,6 +213,7 @@ type PCIeDevice struct {
type NIC struct {
Name string `json:"name"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
MACAddress string `json:"mac_address"`
SpeedMbps int `json:"speed_mbps,omitempty"`
SerialNumber string `json:"serial_number,omitempty"`
@@ -217,6 +224,7 @@ type PSU struct {
Slot string `json:"slot"`
Present bool `json:"present"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
Vendor string `json:"vendor,omitempty"`
WattageW int `json:"wattage_w,omitempty"`
SerialNumber string `json:"serial_number,omitempty"`
@@ -242,6 +250,7 @@ type GPU struct {
Slot string `json:"slot"`
Location string `json:"location,omitempty"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
Manufacturer string `json:"manufacturer,omitempty"`
VendorID int `json:"vendor_id,omitempty"`
DeviceID int `json:"device_id,omitempty"`
@@ -280,6 +289,7 @@ type NetworkAdapter struct {
Location string `json:"location"`
Present bool `json:"present"`
Model string `json:"model"`
Description string `json:"description,omitempty"`
Vendor string `json:"vendor,omitempty"`
VendorID int `json:"vendor_id,omitempty"`
DeviceID int `json:"device_id,omitempty"`

View File

@@ -0,0 +1,274 @@
package nvidia
import (
"regexp"
"strconv"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// verboseRunTestingLineRegex matches a whole verbose_run.log line of the form
// "<YYYY-MM-DD HH:MM:SS>,<digits> - Testing <name>" and captures the timestamp
// (without the part after the comma) and the test name.
var verboseRunTestingLineRegex = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+\s+-\s+Testing\s+([a-zA-Z0-9_]+)\s*$`)
// runLogStartTimeRegex matches a whole run.log line of the form
// "Start time <Thu, 22 Jan 2026 07:42:26>" and captures the date/time text.
var runLogStartTimeRegex = regexp.MustCompile(`^Start time\s+([A-Za-z]{3}, \d{2} [A-Za-z]{3} \d{4} \d{2}:\d{2}:\d{2})\s*$`)
// runLogTestDurationRegex matches a whole run.log line of the form
// "Testing <name> <status> [ <min>:<ss>s ]" and captures the test name,
// the minutes and the two-digit seconds.
var runLogTestDurationRegex = regexp.MustCompile(`^Testing\s+([a-zA-Z0-9_]+)\s+\S+\s+\[\s*([0-9]+):([0-9]{2})s\s*\]\s*$`)
// modsStartLineRegex matches a "MODS start: <Thu Jan 22 09:45:36 2026>" line
// anywhere in multi-line content and captures the date/time text.
var modsStartLineRegex = regexp.MustCompile(`(?m)^MODS start:\s+([A-Za-z]{3}\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4})\s*$`)
// gpuFieldiagOutputPathRegex matches paths ending in
// gpu_fieldiag/sxm<idx>_sn_<serial>/output.log (case-insensitive, "/" or "\"
// separators) and captures the SXM index and the serial.
var gpuFieldiagOutputPathRegex = regexp.MustCompile(`(?i)gpu_fieldiag[\\/]+sxm(\d+)_sn_([^\\/]+)[\\/]+output\.log$`)
// nvswitchDevnameRegex matches "devname=<address>,NVSWITCH<idx>" arguments and
// captures the NVSWITCH<idx> name.
var nvswitchDevnameRegex = regexp.MustCompile(`devname=[^,\s]+,(NVSWITCH\d+)`)
// componentCheckTimes holds the check timestamps produced by
// CollectGPUAndNVSwitchCheckTimes. The per-component maps are consulted first
// by ApplyGPUAndNVSwitchCheckTimes; the *Default values are the fallback when
// no per-component entry exists. A zero time means "unknown".
type componentCheckTimes struct {
GPUDefault time.Time
NVSwitchDefault time.Time
GPUBySerial map[string]time.Time // key: GPU serial
GPUBySlot map[string]time.Time // key: GPUSXM<idx>
NVSwitchBySlot map[string]time.Time // key: NVSWITCH<idx>
}
// CollectGPUAndNVSwitchCheckTimes extracts GPU/NVSwitch check timestamps from NVIDIA logs.
//
// Per-component timestamps come from the "MODS start:" line of:
//   - gpu_fieldiag/SXM<idx>_SN_<serial>/output.log (stored under GPUSXM<idx>
//     and under the serial taken from the directory name)
//   - nvswitch/output.log (stored under every NVSWITCH<idx> named in a
//     devname= argument of that file)
//
// Default timestamps come from per-test start times, in priority order:
//  1. verbose_run.log "Testing <test>" timestamps
//  2. run.log start time + cumulative durations (consulted only when
//     verbose_run.log yielded no test start times at all)
//
// Paths are matched case-insensitively and with either "/" or "\" as the
// separator, for both the gpu_fieldiag and the nvswitch output logs.
func CollectGPUAndNVSwitchCheckTimes(files []parser.ExtractedFile) componentCheckTimes {
	gpuBySerial := make(map[string]time.Time)
	gpuBySlot := make(map[string]time.Time)
	nvsBySlot := make(map[string]time.Time)

	for _, f := range files {
		path := strings.TrimSpace(f.Path)
		// Normalize separators so Windows-style paths are classified the
		// same way gpuFieldiagOutputPathRegex already accepts them.
		pathLower := strings.ReplaceAll(strings.ToLower(path), "\\", "/")

		isGPUOutput := strings.HasSuffix(pathLower, "output.log") &&
			strings.Contains(pathLower, "gpu_fieldiag/")
		isNVSwitchOutput := strings.HasSuffix(pathLower, "nvswitch/output.log")
		if !isGPUOutput && !isNVSwitchOutput {
			continue
		}

		// Both log kinds carry the same "MODS start:" header; without it
		// the file contributes nothing.
		ts := parseModsStartTime(f.Content)
		if ts.IsZero() {
			continue
		}

		// Per-GPU timestamp from gpu_fieldiag/<SXMx_SN_serial>/output.log
		if isGPUOutput {
			if matches := gpuFieldiagOutputPathRegex.FindStringSubmatch(path); len(matches) == 3 {
				gpuBySlot["GPUSXM"+strings.TrimSpace(matches[1])] = ts
				if serial := strings.TrimSpace(matches[2]); serial != "" {
					gpuBySerial[serial] = ts
				}
			}
		}

		// Per-NVSwitch timestamp and slot list from nvswitch/output.log
		if isNVSwitchOutput {
			for _, slot := range parseNVSwitchSlotsFromOutput(f.Content) {
				nvsBySlot[slot] = ts
			}
		}
	}

	testStarts := make(map[string]time.Time)
	if f := parser.FindFileByName(files, "verbose_run.log"); f != nil {
		for testName, ts := range parseVerboseRunTestStartTimes(f.Content) {
			testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
		}
	}
	if len(testStarts) == 0 {
		if f := parser.FindFileByName(files, "run.log"); f != nil {
			for testName, ts := range parseRunLogTestStartTimes(f.Content) {
				testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
			}
		}
	}

	return componentCheckTimes{
		GPUDefault:      pickFirstTestTime(testStarts, "gpu_fieldiag", "gpumem", "gpustress", "pcie", "inventory"),
		NVSwitchDefault: pickFirstTestTime(testStarts, "nvswitch", "inventory"),
		GPUBySerial:     gpuBySerial,
		GPUBySlot:       gpuBySlot,
		NVSwitchBySlot:  nvsBySlot,
	}
}
// pickFirstTestTime returns the start time of the first test in names that has
// a non-zero entry in testStarts. Names are trimmed and lower-cased before the
// lookup. The zero time is returned when none of the names is present.
func pickFirstTestTime(testStarts map[string]time.Time, names ...string) time.Time {
	for i := 0; i < len(names); i++ {
		key := strings.ToLower(strings.TrimSpace(names[i]))
		ts, ok := testStarts[key]
		if ok && !ts.IsZero() {
			return ts
		}
	}
	return time.Time{}
}
// parseVerboseRunTestStartTimes scans verbose_run.log content for
// "<timestamp>,<ms> - Testing <name>" lines and returns the first timestamp
// seen for each lower-cased test name. Timestamps are interpreted as UTC;
// lines that do not match or fail to parse are ignored. The returned map is
// never nil.
func parseVerboseRunTestStartTimes(content []byte) map[string]time.Time {
	const timestampLayout = "2006-01-02 15:04:05"

	starts := make(map[string]time.Time)
	for _, raw := range strings.Split(string(content), "\n") {
		groups := verboseRunTestingLineRegex.FindStringSubmatch(strings.TrimSpace(raw))
		if len(groups) != 3 {
			continue
		}
		name := strings.ToLower(strings.TrimSpace(groups[2]))
		if name == "" {
			continue
		}
		// Only the first occurrence of a test counts as its start.
		if _, seen := starts[name]; seen {
			continue
		}
		parsed, err := time.ParseInLocation(timestampLayout, strings.TrimSpace(groups[1]), time.UTC)
		if err != nil {
			continue
		}
		starts[name] = parsed
	}
	return starts
}
// parseRunLogTestStartTimes derives per-test start times from run.log content.
//
// The first parsable "Start time <date>" line (interpreted as UTC) gives the
// start of the run. Each "Testing <name> <status> [ M:SSs ]" line is then
// assumed to follow the previous one back to back: a test starts where the
// running cursor stands, and its duration advances the cursor. The first
// occurrence of a test name wins. Returns nil when no start time is found.
func parseRunLogTestStartTimes(content []byte) map[string]time.Time {
	const startLayout = "Mon, 02 Jan 2006 15:04:05"

	lines := strings.Split(string(content), "\n")

	var runStart time.Time
	for _, raw := range lines {
		groups := runLogStartTimeRegex.FindStringSubmatch(strings.TrimSpace(raw))
		if len(groups) != 2 {
			continue
		}
		if parsed, err := time.ParseInLocation(startLayout, strings.TrimSpace(groups[1]), time.UTC); err == nil {
			runStart = parsed
			break
		}
	}
	if runStart.IsZero() {
		return nil
	}

	starts := make(map[string]time.Time)
	cursor := runStart
	for _, raw := range lines {
		groups := runLogTestDurationRegex.FindStringSubmatch(strings.TrimSpace(raw))
		if len(groups) != 4 {
			continue
		}
		name := strings.ToLower(strings.TrimSpace(groups[1]))
		mins, minErr := strconv.Atoi(strings.TrimSpace(groups[2]))
		secs, secErr := strconv.Atoi(strings.TrimSpace(groups[3]))
		if minErr != nil || secErr != nil {
			continue
		}
		if _, seen := starts[name]; !seen {
			starts[name] = cursor
		}
		// Repeated tests still consume wall-clock time.
		elapsed := time.Duration(mins)*time.Minute + time.Duration(secs)*time.Second
		cursor = cursor.Add(elapsed)
	}
	return starts
}
// parseModsStartTime returns the timestamp of the first
// "MODS start: <Thu Jan 22 09:45:36 2026>" line in content, interpreted as
// UTC. The zero time is returned when no such line exists or it cannot be
// parsed.
func parseModsStartTime(content []byte) time.Time {
	const modsLayout = "Mon Jan 2 15:04:05 2006"

	var none time.Time

	groups := modsStartLineRegex.FindSubmatch(content)
	if len(groups) != 2 {
		return none
	}
	raw := strings.TrimSpace(string(groups[1]))
	if raw == "" {
		return none
	}
	if parsed, err := time.ParseInLocation(modsLayout, raw, time.UTC); err == nil {
		return parsed
	}
	return none
}
// parseNVSwitchSlotsFromOutput returns the distinct NVSWITCH<idx> names found
// in "devname=<address>,NVSWITCH<idx>" arguments of content, upper-cased and
// in order of first appearance. It returns nil when there is no match.
func parseNVSwitchSlotsFromOutput(content []byte) []string {
	found := nvswitchDevnameRegex.FindAllSubmatch(content, -1)
	if len(found) == 0 {
		return nil
	}

	slots := make([]string, 0, len(found))
	known := make(map[string]struct{})
	for _, groups := range found {
		if len(groups) != 2 {
			continue
		}
		name := strings.ToUpper(strings.TrimSpace(string(groups[1])))
		if name == "" {
			continue
		}
		if _, dup := known[name]; !dup {
			known[name] = struct{}{}
			slots = append(slots, name)
		}
	}
	return slots
}
// ApplyGPUAndNVSwitchCheckTimes writes parsed check timestamps to component status metadata.
//
// For every GPU the timestamp is looked up by serial number, then by
// upper-cased slot, then taken from times.GPUDefault. For every PCIe device
// that has a non-empty slot and is an NVSwitch (by device class or by slot
// prefix) the timestamp is looked up by slot, then taken from
// times.NVSwitchDefault. Components with no timestamp are left untouched;
// otherwise StatusCheckedAt and StatusAtCollect are set, using "Unknown" when
// the component has no status text. A nil result or nil Hardware is a no-op.
func ApplyGPUAndNVSwitchCheckTimes(result *models.AnalysisResult, times componentCheckTimes) {
	if result == nil || result.Hardware == nil {
		return
	}
	hw := result.Hardware

	for i := range hw.GPUs {
		gpu := &hw.GPUs[i]

		var checkedAt time.Time
		if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
			checkedAt = times.GPUBySerial[serial]
		}
		if checkedAt.IsZero() {
			checkedAt = times.GPUBySlot[strings.ToUpper(strings.TrimSpace(gpu.Slot))]
		}
		if checkedAt.IsZero() {
			checkedAt = times.GPUDefault
		}
		if checkedAt.IsZero() {
			continue
		}

		statusText := strings.TrimSpace(gpu.Status)
		if statusText == "" {
			statusText = "Unknown"
		}
		gpu.StatusCheckedAt = checkedAt
		gpu.StatusAtCollect = &models.StatusAtCollection{
			Status: statusText,
			At:     checkedAt,
		}
	}

	for i := range hw.PCIeDevices {
		dev := &hw.PCIeDevices[i]

		slotKey := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
		if slotKey == "" {
			continue
		}
		slotKey = strings.ToUpper(slotKey)

		isNVSwitch := strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") ||
			strings.HasPrefix(slotKey, "NVSWITCH")
		if !isNVSwitch {
			continue
		}

		checkedAt := times.NVSwitchBySlot[slotKey]
		if checkedAt.IsZero() {
			checkedAt = times.NVSwitchDefault
		}
		if checkedAt.IsZero() {
			continue
		}

		statusText := strings.TrimSpace(dev.Status)
		if statusText == "" {
			statusText = "Unknown"
		}
		dev.StatusCheckedAt = checkedAt
		dev.StatusAtCollect = &models.StatusAtCollection{
			Status: statusText,
			At:     checkedAt,
		}
	}
}

View File

@@ -0,0 +1,143 @@
package nvidia
import (
"testing"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
// TestParseVerboseRunTestStartTimes checks that "Testing <name>" lines from
// verbose_run.log are turned into UTC start times keyed by test name.
func TestParseVerboseRunTestStartTimes(t *testing.T) {
	input := []byte(`
2026-01-22 09:11:32,458 - Testing nvswitch
2026-01-22 09:45:36,016 - Testing gpu_fieldiag
`)

	starts := parseVerboseRunTestStartTimes(input)

	nvswitchStart := starts["nvswitch"]
	if nvswitchStart.IsZero() {
		t.Fatalf("expected nvswitch timestamp")
	}
	gpuStart := starts["gpu_fieldiag"]
	if gpuStart.IsZero() {
		t.Fatalf("expected gpu_fieldiag timestamp")
	}

	if formatted := nvswitchStart.Format(time.RFC3339); formatted != "2026-01-22T09:11:32Z" {
		t.Fatalf("unexpected nvswitch timestamp: %s", formatted)
	}
	if formatted := gpuStart.Format(time.RFC3339); formatted != "2026-01-22T09:45:36Z" {
		t.Fatalf("unexpected gpu_fieldiag timestamp: %s", formatted)
	}
}
// TestParseRunLogTestStartTimes checks that test start times are derived from
// the run.log start time plus the cumulative durations of preceding tests.
func TestParseRunLogTestStartTimes(t *testing.T) {
	input := []byte(`
Start time Thu, 22 Jan 2026 07:42:26
Testing gpumem FAILED [ 26:12s ]
Testing gpustress OK [ 7:10s ]
Testing nvswitch OK [ 9:25s ]
`)

	starts := parseRunLogTestStartTimes(input)

	// First test starts at the run start.
	if formatted := starts["gpumem"].Format(time.RFC3339); formatted != "2026-01-22T07:42:26Z" {
		t.Fatalf("unexpected gpumem start: %s", formatted)
	}
	// +26:12 after gpumem.
	if formatted := starts["gpustress"].Format(time.RFC3339); formatted != "2026-01-22T08:08:38Z" {
		t.Fatalf("unexpected gpustress start: %s", formatted)
	}
	// +7:10 after gpustress.
	if formatted := starts["nvswitch"].Format(time.RFC3339); formatted != "2026-01-22T08:15:48Z" {
		t.Fatalf("unexpected nvswitch start: %s", formatted)
	}
}
// TestApplyGPUAndNVSwitchCheckTimes checks that slot-keyed timestamps are
// written to the matching GPU and NVSwitch, and that other PCIe devices are
// left untouched.
func TestApplyGPUAndNVSwitchCheckTimes(t *testing.T) {
	gpuTs := time.Date(2026, 1, 22, 9, 45, 36, 0, time.UTC)
	nvsTs := time.Date(2026, 1, 22, 9, 11, 32, 0, time.UTC)

	hw := &models.HardwareConfig{
		GPUs: []models.GPU{
			{Slot: "GPUSXM5", Status: "FAIL"},
		},
		PCIeDevices: []models.PCIeDevice{
			{Slot: "NVSWITCH0", DeviceClass: "NVSwitch", Status: "PASS"},
			{Slot: "NIC0", DeviceClass: "NetworkController", Status: "PASS"},
		},
	}
	result := &models.AnalysisResult{Hardware: hw}

	checkTimes := componentCheckTimes{
		GPUBySlot:      map[string]time.Time{"GPUSXM5": gpuTs},
		NVSwitchBySlot: map[string]time.Time{"NVSWITCH0": nvsTs},
	}
	ApplyGPUAndNVSwitchCheckTimes(result, checkTimes)

	gpu := result.Hardware.GPUs[0]
	if got := gpu.StatusCheckedAt; !got.Equal(gpuTs) {
		t.Fatalf("expected gpu status_checked_at %s, got %s", gpuTs.Format(time.RFC3339), got.Format(time.RFC3339))
	}
	if gpu.StatusAtCollect == nil || !gpu.StatusAtCollect.At.Equal(gpuTs) {
		t.Fatalf("expected gpu status_at_collection.at %s", gpuTs.Format(time.RFC3339))
	}

	nvswitch := result.Hardware.PCIeDevices[0]
	if got := nvswitch.StatusCheckedAt; !got.Equal(nvsTs) {
		t.Fatalf("expected nvswitch status_checked_at %s, got %s", nvsTs.Format(time.RFC3339), got.Format(time.RFC3339))
	}
	if nvswitch.StatusAtCollect == nil || !nvswitch.StatusAtCollect.At.Equal(nvsTs) {
		t.Fatalf("expected nvswitch status_at_collection.at %s", nvsTs.Format(time.RFC3339))
	}

	nic := result.Hardware.PCIeDevices[1]
	if !nic.StatusCheckedAt.IsZero() {
		t.Fatalf("expected non-nvswitch device status_checked_at to stay zero")
	}
}
// TestCollectGPUAndNVSwitchCheckTimes_FromVerboseRun checks that the default
// GPU and NVSwitch check times come from verbose_run.log test start lines.
func TestCollectGPUAndNVSwitchCheckTimes_FromVerboseRun(t *testing.T) {
	verboseRun := parser.ExtractedFile{
		Path: "verbose_run.log",
		Content: []byte(`
2026-01-22 09:11:32,458 - Testing nvswitch
2026-01-22 09:45:36,016 - Testing gpu_fieldiag
`),
	}

	got := CollectGPUAndNVSwitchCheckTimes([]parser.ExtractedFile{verboseRun})

	if formatted := got.GPUDefault.Format(time.RFC3339); formatted != "2026-01-22T09:45:36Z" {
		t.Fatalf("unexpected GPU check time: %s", formatted)
	}
	if formatted := got.NVSwitchDefault.Format(time.RFC3339); formatted != "2026-01-22T09:11:32Z" {
		t.Fatalf("unexpected NVSwitch check time: %s", formatted)
	}
}
// TestCollectGPUAndNVSwitchCheckTimes_FromComponentOutputLogs checks that
// per-GPU and per-NVSwitch check times are taken from the "MODS start:" line
// of the component output logs.
func TestCollectGPUAndNVSwitchCheckTimes_FromComponentOutputLogs(t *testing.T) {
	gpuOutput := parser.ExtractedFile{
		Path: "gpu_fieldiag/SXM5_SN_1653925025497/output.log",
		Content: []byte(`
$ some command
MODS start: Thu Jan 22 09:45:36 2026
`),
	}
	nvswitchOutput := parser.ExtractedFile{
		Path: "nvswitch/output.log",
		Content: []byte(`
$ cmd devname=0000:08:00.0,NVSWITCH3 devname=0000:07:00.0,NVSWITCH2 devname=0000:06:00.0,NVSWITCH1 devname=0000:05:00.0,NVSWITCH0
MODS start: Thu Jan 22 09:11:32 2026
`),
	}

	got := CollectGPUAndNVSwitchCheckTimes([]parser.ExtractedFile{gpuOutput, nvswitchOutput})

	if formatted := got.GPUBySerial["1653925025497"].Format(time.RFC3339); formatted != "2026-01-22T09:45:36Z" {
		t.Fatalf("unexpected GPU serial check time: %s", formatted)
	}
	if formatted := got.GPUBySlot["GPUSXM5"].Format(time.RFC3339); formatted != "2026-01-22T09:45:36Z" {
		t.Fatalf("unexpected GPU slot check time: %s", formatted)
	}
	if formatted := got.NVSwitchBySlot["NVSWITCH0"].Format(time.RFC3339); formatted != "2026-01-22T09:11:32Z" {
		t.Fatalf("unexpected NVSwitch0 check time: %s", formatted)
	}
	if formatted := got.NVSwitchBySlot["NVSWITCH3"].Format(time.RFC3339); formatted != "2026-01-22T09:11:32Z" {
		t.Fatalf("unexpected NVSwitch3 check time: %s", formatted)
	}
}

View File

@@ -2,8 +2,8 @@ package nvidia
import (
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
@@ -13,8 +13,11 @@ import (
var (
gpuNameWithSerialRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
gpuNameSlotOnlyRegex = regexp.MustCompile(`^SXM(\d+)$`)
skuModelRegex = regexp.MustCompile(`sku_hgx-([a-z0-9]+)-\d+-gpu`)
skuCodeRegex = regexp.MustCompile(`^(G\d{3})[.-](\d{4})`)
skuCodeInsideRegex = regexp.MustCompile(`(?:^|[^A-Z0-9])(?:\d)?(G\d{3})[.-](\d{4})(?:[^A-Z0-9]|$)`)
inforomPathRegex = regexp.MustCompile(`(?i)(?:^|[\\/])(checkinforom|inforom)[\\/](SXM(\d+))(?:_SN_([^\\/]+))?[\\/]fieldiag\.jso$`)
inforomProductPNRegex = regexp.MustCompile(`"product_part_num"\s*:\s*"([^"]+)"`)
inforomSerialRegex = regexp.MustCompile(`"serial_number"\s*:\s*"([^"]+)"`)
)
type testSpecData struct {
@@ -22,6 +25,7 @@ type testSpecData struct {
VirtualID string `json:"virtual_id"`
Args struct {
SKUToFile map[string]string `json:"sku_to_sku_json_file_map"`
ModsMapping map[string]json.RawMessage `json:"mods_mapping"`
} `json:"args"`
} `json:"actions"`
}
@@ -35,49 +39,111 @@ type inventoryFieldDiagSummary struct {
} `json:"ModsRuns"`
}
// hardcodedSKUToFileMap maps GPU SKU codes (G<ddd>-<dddd>) to the
// sku_hgx-*.json file name associated with that SKU. parseSKUToFileMap seeds
// its result from this table, and entries here take priority over the
// sku_to_sku_json_file_map found in testspec.json.
var hardcodedSKUToFileMap = map[string]string{
"G520-0200": "sku_hgx-h100-8-gpu_80g_aircooled_field.json",
"G520-0201": "sku_hgx-h100-8-gpu_80g_aircooled_field.json",
"G520-0202": "sku_hgx-h100-8-gpu_80g_tpol_field.json",
"G520-0203": "sku_hgx-h100-8-gpu_80g_tpol_field.json",
"G520-0205": "sku_hgx-h800-8-gpu_80g_aircooled_field.json",
"G520-0207": "sku_hgx-h800-8-gpu_80g_tpol_field.json",
"G520-0221": "sku_hgx-h100-8-gpu_96g_aircooled_field.json",
"G520-0236": "sku_hgx-h20-8-gpu_96g_aircooled_field.json",
"G520-0238": "sku_hgx-h20-8-gpu_96g_tpol_field.json",
"G520-0266": "sku_hgx-h20-8-gpu_141g_aircooled_field.json",
"G520-0280": "sku_hgx-h200-8-gpu_141g_aircooled_field.json",
"G520-0282": "sku_hgx-h200-8-gpu_141g_tpol_field.json",
"G520-0292": "sku_hgx-h100-8-gpu_sku_292_field.json",
}
// ApplyGPUModelsFromSKU updates GPU model names using SKU mapping from testspec.json.
// Mapping source:
// - inventory/fieldiag_summary.json: GPUName -> BoardInfo(SKU)
// - testspec.json: SKU -> sku_hgx-... filename
// - hardcoded SKU mapping
// - testspec.json: SKU -> sku_hgx-... filename (fallback for unknown hardcoded SKU)
// - inforom/*/fieldiag.jso: product_part_num (full P/N with embedded SKU)
// - testspec.json gpu_fieldiag.mods_mapping: DeviceID -> GPU generation (last fallback for description)
func ApplyGPUModelsFromSKU(files []parser.ExtractedFile, result *models.AnalysisResult) {
if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
return
}
skuToFile := parseSKUToFileMap(files)
if len(skuToFile) == 0 {
return
}
generationByDeviceID := parseGenerationByDeviceID(files)
serialToSKU, slotToSKU := parseGPUSKUMapping(files)
if len(serialToSKU) == 0 && len(slotToSKU) == 0 {
return
}
serialToSKU, slotToSKU, serialToPartNumber, slotToPartNumber := parseGPUSKUMapping(files)
for i := range result.Hardware.GPUs {
gpu := &result.Hardware.GPUs[i]
sku := ""
slot := strings.TrimSpace(gpu.Slot)
serial := strings.TrimSpace(gpu.SerialNumber)
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
if gpu.PartNumber == "" && serial != "" {
if pn := strings.TrimSpace(serialToPartNumber[serial]); pn != "" {
gpu.PartNumber = pn
}
}
if gpu.PartNumber == "" {
if pn := strings.TrimSpace(slotToPartNumber[slot]); pn != "" {
gpu.PartNumber = pn
}
}
if partNumber := strings.TrimSpace(gpu.PartNumber); partNumber != "" {
gpu.Model = partNumber
}
sku := extractSKUFromPartNumber(gpu.PartNumber)
if sku == "" && serial != "" {
sku = serialToSKU[serial]
}
if sku == "" {
sku = slotToSKU[strings.TrimSpace(gpu.Slot)]
sku = slotToSKU[slot]
}
if sku == "" {
if sku != "" {
if desc := resolveDescriptionFromSKU(sku, skuToFile); desc != "" {
gpu.Description = desc
continue
}
model := resolveModelFromSKU(sku, skuToFile)
if model == "" {
continue
}
gpu.Model = model
if gen := resolveGenerationDescription(gpu.DeviceID, generationByDeviceID); gen != "" {
gpu.Description = gen
}
}
}
// parseSKUToFileMap builds the SKU -> sku_hgx-*.json file name mapping.
//
// The result always contains the hardcoded table. When testspec.json is
// present and parses, its sku_to_sku_json_file_map entries are added for SKUs
// that are not already known; hardcoded entries are never overridden. Keys
// are normalized with normalizeSKUCode and file names are trimmed.
func parseSKUToFileMap(files []parser.ExtractedFile) map[string]string {
	skuToFile := make(map[string]string, len(hardcodedSKUToFileMap))
	for code, name := range hardcodedSKUToFileMap {
		skuToFile[normalizeSKUCode(code)] = strings.TrimSpace(name)
	}

	specFile := parser.FindFileByName(files, "testspec.json")
	if specFile == nil {
		return skuToFile
	}
	var spec testSpecData
	if json.Unmarshal(specFile.Content, &spec) != nil {
		// Unreadable testspec: fall back to the hardcoded table only.
		return skuToFile
	}

	for _, action := range spec.Actions {
		for code, name := range action.Args.SKUToFile {
			key := normalizeSKUCode(code)
			if key == "" {
				continue
			}
			// Priority: hardcoded mapping wins, testspec extends unknown SKU list.
			if _, known := skuToFile[key]; known {
				continue
			}
			skuToFile[key] = strings.TrimSpace(name)
		}
	}
	return skuToFile
}
func parseGenerationByDeviceID(files []parser.ExtractedFile) map[string]string {
specFile := parser.FindFileByName(files, "testspec.json")
if specFile == nil {
return nil
@@ -88,20 +154,61 @@ func parseSKUToFileMap(files []parser.ExtractedFile) map[string]string {
return nil
}
result := make(map[string]string)
familyToGeneration := make(map[string]string)
deviceToGeneration := make(map[string]string)
for _, action := range spec.Actions {
for sku, file := range action.Args.SKUToFile {
normSKU := normalizeSKUCode(sku)
if normSKU == "" {
if strings.TrimSpace(strings.ToLower(action.VirtualID)) != "gpu_fieldiag" {
continue
}
result[normSKU] = strings.TrimSpace(file)
for key, raw := range action.Args.ModsMapping {
if strings.HasPrefix(key, "#mods.") {
family := strings.TrimSpace(strings.TrimPrefix(key, "#mods."))
if family == "" {
continue
}
var generation string
if err := json.Unmarshal(raw, &generation); err == nil {
generation = strings.TrimSpace(generation)
if generation != "" {
familyToGeneration[family] = generation
}
}
}
return result
}
func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[string]string) {
for key, raw := range action.Args.ModsMapping {
family := strings.TrimSpace(key)
if family == "" || strings.HasPrefix(family, "#") {
continue
}
generation := strings.TrimSpace(familyToGeneration[family])
if generation == "" {
continue
}
var deviceIDs []string
if err := json.Unmarshal(raw, &deviceIDs); err != nil {
continue
}
for _, id := range deviceIDs {
norm := normalizeDeviceIDHex(id)
if norm != "" {
deviceToGeneration[norm] = generation
}
}
}
}
return deviceToGeneration
}
func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[string]string, map[string]string, map[string]string) {
serialToSKU := make(map[string]string)
slotToSKU := make(map[string]string)
serialToPartNumber := make(map[string]string)
slotToPartNumber := make(map[string]string)
// 1) inventory/fieldiag_summary.json mapping (GPUName/BoardInfo).
var summaryFile *parser.ExtractedFile
for _, f := range files {
path := strings.ToLower(f.Path)
@@ -112,17 +219,67 @@ func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[st
}
}
if summaryFile == nil {
return nil, nil
// Continue: inforom may still contain usable part numbers.
} else {
var summaries []inventoryFieldDiagSummary
if err := json.Unmarshal(summaryFile.Content, &summaries); err == nil {
for _, summary := range summaries {
addSummaryMapping(summary, serialToSKU, slotToSKU)
}
} else {
var summary inventoryFieldDiagSummary
if err := json.Unmarshal(summaryFile.Content, &summary); err != nil {
return nil, nil
if err := json.Unmarshal(summaryFile.Content, &summary); err == nil {
addSummaryMapping(summary, serialToSKU, slotToSKU)
}
}
}
serialToSKU := make(map[string]string)
slotToSKU := make(map[string]string)
// 2) inforom/checkinforom fieldiag.jso mapping (full product_part_num).
for _, f := range files {
path := strings.TrimSpace(f.Path)
m := inforomPathRegex.FindStringSubmatch(path)
if len(m) == 0 {
continue
}
slot := "GPU" + strings.ToUpper(strings.TrimSpace(m[2])) // SXM7 -> GPUSXM7
serialFromPath := strings.TrimSpace(m[4])
productPNMatch := inforomProductPNRegex.FindSubmatch(f.Content)
if len(productPNMatch) == 2 {
partNumber := strings.TrimSpace(string(productPNMatch[1]))
if partNumber != "" {
slotToPartNumber[slot] = partNumber
if serialFromPath != "" {
serialToPartNumber[serialFromPath] = partNumber
}
if sku := extractSKUFromPartNumber(partNumber); sku != "" {
slotToSKU[slot] = sku
if serialFromPath != "" {
serialToSKU[serialFromPath] = sku
}
}
}
}
serialMatch := inforomSerialRegex.FindSubmatch(f.Content)
if len(serialMatch) == 2 {
serial := strings.TrimSpace(string(serialMatch[1]))
if serial != "" {
if sku := slotToSKU[slot]; sku != "" {
serialToSKU[serial] = sku
}
if pn := slotToPartNumber[slot]; pn != "" {
serialToPartNumber[serial] = pn
}
}
}
}
return serialToSKU, slotToSKU, serialToPartNumber, slotToPartNumber
}
func addSummaryMapping(summary inventoryFieldDiagSummary, serialToSKU map[string]string, slotToSKU map[string]string) {
for _, run := range summary.ModsRuns {
for _, h := range run.ModsHeader {
sku := normalizeSKUCode(h.BoardInfo)
@@ -141,27 +298,15 @@ func parseGPUSKUMapping(files []parser.ExtractedFile) (map[string]string, map[st
}
}
}
return serialToSKU, slotToSKU
}
func resolveModelFromSKU(sku string, skuToFile map[string]string) string {
func resolveDescriptionFromSKU(sku string, skuToFile map[string]string) string {
file := strings.ToLower(strings.TrimSpace(skuToFile[normalizeSKUCode(sku)]))
if file == "" {
return ""
}
m := skuModelRegex.FindStringSubmatch(file)
if len(m) != 2 {
return ""
}
gpuFamily := strings.ToUpper(strings.TrimSpace(m[1]))
if gpuFamily == "" {
return ""
}
return fmt.Sprintf("NVIDIA %s SXM", gpuFamily)
return skuFilenameToDescription(file)
}
func normalizeSKUCode(v string) string {
@@ -176,3 +321,54 @@ func normalizeSKUCode(v string) string {
return s
}
// extractSKUFromPartNumber returns the SKU code embedded in a part number in
// the canonical "G<ddd>-<dddd>" form, e.g. "692-2G520-0280-501" yields
// "G520-0280". Matching is case-insensitive; the empty string is returned
// when the part number is blank or contains no SKU code.
func extractSKUFromPartNumber(partNumber string) string {
	normalized := strings.TrimSpace(strings.ToUpper(partNumber))
	if normalized == "" {
		return ""
	}
	groups := skuCodeInsideRegex.FindStringSubmatch(normalized)
	if len(groups) != 3 {
		return ""
	}
	return groups[1] + "-" + groups[2]
}
// skuFilenameToDescription turns a SKU JSON file name into a plain description.
//
// The name is lower-cased and trimmed, the ".json" suffix, the "_field" suffix
// and the "sku_" prefix are removed (in that order), and every "-" or "_" is
// replaced by a space. Runs of whitespace collapse to a single space, e.g.
// "sku_hgx-h200-8-gpu_141g_aircooled_field.json" -> "hgx h200 8 gpu 141g aircooled".
// Blank input yields "".
func skuFilenameToDescription(file string) string {
	name := strings.TrimSpace(strings.ToLower(file))
	if name == "" {
		return ""
	}

	name = strings.TrimSuffix(name, ".json")
	name = strings.TrimSuffix(name, "_field")
	name = strings.TrimPrefix(name, "sku_")

	// Treat both separators as word boundaries.
	spaced := strings.NewReplacer("-", " ", "_", " ").Replace(name)

	// Fields drops leading/trailing whitespace and collapses inner runs.
	return strings.Join(strings.Fields(spaced), " ")
}
// resolveGenerationDescription looks up a description for a PCI device ID in
// deviceToGeneration. The ID is rendered as hex and passed through
// normalizeDeviceIDHex to form the lookup key. Non-positive IDs and an empty
// map yield ""; the value found is returned trimmed.
func resolveGenerationDescription(deviceID int, deviceToGeneration map[string]string) string {
	if deviceID <= 0 {
		return ""
	}
	if len(deviceToGeneration) == 0 {
		return ""
	}
	hexID := strconv.FormatInt(int64(deviceID), 16)
	generation := deviceToGeneration[normalizeDeviceIDHex(hexID)]
	return strings.TrimSpace(generation)
}
// normalizeDeviceIDHex canonicalises a hexadecimal device ID string to the form
// "0x<lower-case hex without leading zeros>".
//
// Surrounding whitespace, letter case and an optional "0x" prefix are ignored.
// Input that is empty, is not valid hex, or does not fit in 32 bits yields "".
func normalizeDeviceIDHex(v string) string {
	digits := strings.TrimPrefix(strings.TrimSpace(strings.ToLower(v)), "0x")
	if digits == "" {
		return ""
	}
	value, err := strconv.ParseUint(digits, 16, 32)
	if err != nil {
		return ""
	}
	// FormatUint already emits lower-case digits for base 16.
	return "0x" + strconv.FormatUint(value, 16)
}

View File

@@ -50,7 +50,158 @@ func TestApplyGPUModelsFromSKU(t *testing.T) {
ApplyGPUModelsFromSKU(files, result)
if got := result.Hardware.GPUs[0].Model; got != "NVIDIA H200 SXM" {
t.Fatalf("expected model NVIDIA H200 SXM, got %q", got)
if got := result.Hardware.GPUs[0].Model; got != "NVIDIA Device 2335" {
t.Fatalf("expected model NVIDIA Device 2335, got %q", got)
}
if got := result.Hardware.GPUs[0].Description; got != "hgx h200 8 gpu 141g aircooled" {
t.Fatalf("expected description hgx h200 8 gpu 141g aircooled, got %q", got)
}
}
// TestApplyGPUModelsFromSKU_FromPartNumber checks the inforom path of
// ApplyGPUModelsFromSKU: given an inforom record carrying product_part_num and a
// testspec.json SKU-to-file map, the GPU's Model and PartNumber must both become
// the part number and Description must be derived from the mapped SKU file name.
func TestApplyGPUModelsFromSKU_FromPartNumber(t *testing.T) {
files := []parser.ExtractedFile{
{
Path: "inforom/SXM5/fieldiag.jso",
Content: []byte(`[
[
{
"__tag__":"inforom",
"serial_number":"1653925025497",
"product_part_num":"692-2G520-0280-501"
}
]
]`),
},
{
Path: "testspec.json",
Content: []byte(`{
"actions":[
{
"virtual_id":"inventory",
"args":{
"sku_to_sku_json_file_map":{
"G520-0280":"sku_hgx-h200-8-gpu_141g_aircooled_field.json"
}
}
}
]
}`),
},
}
// The GPU starts with a generic model name and the serial from the inforom record.
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
SerialNumber: "1653925025497",
Model: "NVIDIA Device 2335",
},
},
},
}
ApplyGPUModelsFromSKU(files, result)
if got := result.Hardware.GPUs[0].Model; got != "692-2G520-0280-501" {
t.Fatalf("expected model 692-2G520-0280-501, got %q", got)
}
if got := result.Hardware.GPUs[0].PartNumber; got != "692-2G520-0280-501" {
t.Fatalf("expected part number 692-2G520-0280-501, got %q", got)
}
if got := result.Hardware.GPUs[0].Description; got != "hgx h200 8 gpu 141g aircooled" {
t.Fatalf("expected description hgx h200 8 gpu 141g aircooled, got %q", got)
}
}
// TestApplyGPUModelsFromSKU_FieldDiagSummaryArrayFormat feeds a
// fieldiag_summary.json whose top level is a JSON array and checks that the
// BoardInfo SKU is still resolved: Model stays unchanged (no part number is
// available) while Description is derived from the mapped SKU file name.
func TestApplyGPUModelsFromSKU_FieldDiagSummaryArrayFormat(t *testing.T) {
files := []parser.ExtractedFile{
{
Path: "inventory/fieldiag_summary.json",
Content: []byte(`[
{
"ModsRuns":[
{"ModsHeader":[
{"GpuName":"SXM5_SN_1653925025497","BoardInfo":"G520-0280"}
]}
]
}
]`),
},
{
Path: "testspec.json",
Content: []byte(`{
"actions":[
{
"virtual_id":"inventory",
"args":{
"sku_to_sku_json_file_map":{
"G520-0280":"sku_hgx-h200-8-gpu_141g_aircooled_field.json"
}
}
}
]
}`),
},
}
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
SerialNumber: "1653925025497",
Model: "NVIDIA Device 2335",
},
},
},
}
ApplyGPUModelsFromSKU(files, result)
// Without a part number the original model string must be preserved.
if got := result.Hardware.GPUs[0].Model; got != "NVIDIA Device 2335" {
t.Fatalf("expected model NVIDIA Device 2335, got %q", got)
}
if got := result.Hardware.GPUs[0].Description; got != "hgx h200 8 gpu 141g aircooled" {
t.Fatalf("expected description hgx h200 8 gpu 141g aircooled, got %q", got)
}
}
// TestApplyGPUModelsFromSKU_FallbackToGenerationFromModsMapping covers the case
// where no SKU information exists: the only input is a testspec.json
// mods_mapping, and the GPU (device ID 0x2335) is expected to receive the
// description "Hopper" from it.
func TestApplyGPUModelsFromSKU_FallbackToGenerationFromModsMapping(t *testing.T) {
files := []parser.ExtractedFile{
{
Path: "testspec.json",
Content: []byte(`{
"actions":[
{
"virtual_id":"gpu_fieldiag",
"args":{
"mods_mapping":{
"#mods.525":"Hopper",
"525":["0x2335"]
}
}
}
]
}`),
},
}
// No serial number is set, so only the device ID can link the GPU to the mapping.
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
Model: "NVIDIA Device 2335",
DeviceID: 0x2335,
},
},
},
}
ApplyGPUModelsFromSKU(files, result)
if got := result.Hardware.GPUs[0].Description; got != "Hopper" {
t.Fatalf("expected description Hopper, got %q", got)
}
}

View File

@@ -13,6 +13,11 @@ var (
// Regex to extract devname mappings from fieldiag command line
// Example: "devname=0000:ba:00.0,SXM5_SN_1653925027099"
devnameRegex = regexp.MustCompile(`devname=([\da-fA-F:\.]+),(\w+)`)
// Regex to capture BDF from commands like:
// "$ lspci -vvvs 0000:05:00.0" or "$ lspci -vvs 0000:05:00.0"
lspciBDFRegex = regexp.MustCompile(`^\$\s+lspci\s+-[^\s]*\s+([0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-7])\s*$`)
// Example: "Capabilities: [2f0 v1] Device Serial Number 99-d3-61-c8-ac-2d-b0-48"
deviceSerialRegex = regexp.MustCompile(`Device Serial Number\s+([0-9a-fA-F\-:]+)`)
)
// ParseInventoryLog parses inventory/output.log to extract GPU serial numbers
@@ -75,6 +80,64 @@ func ParseInventoryLog(content []byte, result *models.AnalysisResult) error {
}
}
// Third pass: parse lspci "Device Serial Number" by BDF (useful for NVSwitch serials).
bdfToDeviceSerial := make(map[string]string)
currentBDF := ""
scanner = bufio.NewScanner(strings.NewReader(string(content)))
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if line == "" {
continue
}
if m := lspciBDFRegex.FindStringSubmatch(line); len(m) == 2 {
currentBDF = strings.ToLower(strings.TrimSpace(m[1]))
continue
}
if currentBDF == "" {
continue
}
if m := deviceSerialRegex.FindStringSubmatch(line); len(m) == 2 {
serial := strings.TrimSpace(m[1])
if serial != "" {
bdfToDeviceSerial[currentBDF] = serial
}
currentBDF = ""
}
}
// Apply to PCIe devices first (includes NVSwitch).
for i := range result.Hardware.PCIeDevices {
dev := &result.Hardware.PCIeDevices[i]
if strings.TrimSpace(dev.SerialNumber) != "" {
continue
}
bdf := strings.ToLower(strings.TrimSpace(dev.BDF))
if bdf == "" {
continue
}
if serial := bdfToDeviceSerial[bdf]; serial != "" {
dev.SerialNumber = serial
}
}
// Apply to GPUs only if GPU serial is still empty (do not overwrite prod serial from devname).
for i := range result.Hardware.GPUs {
gpu := &result.Hardware.GPUs[i]
if strings.TrimSpace(gpu.SerialNumber) != "" {
continue
}
bdf := strings.ToLower(strings.TrimSpace(gpu.BDF))
if bdf == "" {
continue
}
if serial := bdfToDeviceSerial[bdf]; serial != "" {
gpu.SerialNumber = serial
}
}
return scanner.Err()
}

View File

@@ -6,6 +6,7 @@ import (
"strings"
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
@@ -81,3 +82,45 @@ func min(a, b int) int {
}
return b
}
// TestParseInventoryLog_AssignsNVSwitchSerialByBDF verifies two serial sources
// in one log: the lspci "Device Serial Number" line must be assigned to the PCIe
// device with the matching BDF, while the GPU serial must come from the
// fieldiag devname mapping.
func TestParseInventoryLog_AssignsNVSwitchSerialByBDF(t *testing.T) {
content := []byte(`
$ lspci -vvvs 0000:05:00.0
05:00.0 Bridge: NVIDIA Corporation Device 22a3 (rev a1)
Capabilities: [2f0 v1] Device Serial Number 99-d3-61-c8-ac-2d-b0-48
/tmp/fieldiag devname=0000:ba:00.0,SXM5_SN_1653925025497 fieldiag
`)
// Both devices start without a serial so the parser has to fill them in.
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
BDF: "0000:ba:00.0",
SerialNumber: "",
},
},
PCIeDevices: []models.PCIeDevice{
{
Slot: "NVSWITCH0",
BDF: "0000:05:00.0",
SerialNumber: "",
},
},
},
}
if err := ParseInventoryLog(content, result); err != nil {
t.Fatalf("ParseInventoryLog failed: %v", err)
}
if got := result.Hardware.PCIeDevices[0].SerialNumber; got != "99-d3-61-c8-ac-2d-b0-48" {
t.Fatalf("expected NVSwitch serial 99-d3-61-c8-ac-2d-b0-48, got %q", got)
}
// GPU serial should come from fieldiag devname mapping.
if got := result.Hardware.GPUs[0].SerialNumber; got != "1653925025497" {
t.Fatalf("expected GPU serial 1653925025497, got %q", got)
}
}

View File

@@ -0,0 +1,370 @@
package nvidia
import (
"bufio"
"fmt"
"regexp"
"strconv"
"strings"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
var (
	// nvflashAdapterRegex matches an nvflash "Adapter:" header line, e.g.
	//
	//	Adapter: Graphics Device (10DE,2335,10DE,18BE) S:00,B:BA,D:00,F:00
	//
	// Groups 1-4 are the four hex IDs in parentheses (vendor, device, subsystem
	// vendor, subsystem device); groups 5-8 are the S/B/D/F fields.
	//
	// The F field is printed zero-padded ("F:00"), so an optional leading zero
	// is skipped before capturing: group 8 is always the single significant
	// function digit ("F:01" -> "1", "F:0" -> "0"). Capturing the first
	// character instead would report function 0 for every padded value.
	nvflashAdapterRegex = regexp.MustCompile(`^Adapter:\s+.+\(([\da-fA-F]+),([\da-fA-F]+),([\da-fA-F]+),([\da-fA-F]+)\)\s+S:([0-9A-Fa-f]{2}),B:([0-9A-Fa-f]{2}),D:([0-9A-Fa-f]{2}),F:0?([0-9A-Fa-f])`)
	// gpuPCIIDRegex matches inventory.log lines such as
	// "GPU_SXM5_PCIID: 0000:ba:00.0" (group 1: SXM index, group 2: PCI address).
	gpuPCIIDRegex = regexp.MustCompile(`^GPU_SXM(\d+)_PCIID:\s*(\S+)$`)
	// nvsPCIIDRegex matches inventory.log lines such as
	// "NVSWITCH_NVSWITCH2_PCIID: 0000:07:00.0" (group 1: switch index, group 2: PCI address).
	nvsPCIIDRegex = regexp.MustCompile(`^NVSWITCH_NVSWITCH(\d+)_PCIID:\s*(\S+)$`)
)
// nvswitchProjectToPartNumber maps the "Project" value reported by nvflash for
// an NVSwitch to its part number. mapNVSwitchPartNumberByProject lower-cases the
// project before the lookup, so keys added here must be lower-case.
var nvswitchProjectToPartNumber = map[string]string{
"5612-0002": "965-25612-0002-000",
}
// nvflashDeviceRecord is one adapter section of nvflash_verbose.log as collected
// by parseNVFlashRecords.
type nvflashDeviceRecord struct {
// BDF is built from the adapter line's B/D/F fields as "0000:bb:dd.f" (lower-case).
BDF string
// VendorID and DeviceID come from the adapter line and are overwritten by the
// "Vendor ID" / "Device ID" lines when those parse as hex.
VendorID int
DeviceID int
// SSVendorID and SSDeviceID are the third and fourth IDs on the adapter line.
SSVendorID int
SSDeviceID int
// Version is the value of the "Version" line; records without it are discarded.
Version string
// BoardID is the "Board ID" value with a leading "0x" removed, lower-cased.
BoardID string
// HierarchyID, ChipSKU and Project hold the raw trimmed values of the
// "Hierarchy ID", "Chip SKU" and "Project" lines.
HierarchyID string
ChipSKU string
Project string
}
// ParseNVFlashVerboseLog reads the text of inventory/nvflash_verbose.log and
// enriches devices already present in result.Hardware, matching each device to
// an nvflash adapter record by normalized PCI BDF.
//
// A record is ignored for a device when both sides carry a non-zero device ID
// (or vendor ID) and the two values differ. For GPUs the record's version
// becomes gpu.Firmware. For PCIe devices that look like an NVSwitch (device
// class "NVSwitch" or slot starting with "NVSWITCH") the part number is taken
// from the project-to-part-number table. Any matched PCIe device that still has
// no part number receives the record's version string. Finally firmware list
// entries are appended through appendNVFlashFirmwareEntries.
//
// A nil result, nil hardware, or a log without usable records is a no-op.
// The function currently always returns nil.
func ParseNVFlashVerboseLog(content []byte, result *models.AnalysisResult) error {
	if result == nil || result.Hardware == nil {
		return nil
	}

	records := parseNVFlashRecords(content)
	if len(records) == 0 {
		return nil
	}

	hw := result.Hardware

	// idsAgree treats zero as "unknown": only two known, different IDs conflict.
	idsAgree := func(have, want int) bool {
		return have == 0 || want == 0 || have == want
	}

	for idx := range hw.GPUs {
		gpu := &hw.GPUs[idx]
		key := normalizePCIBDF(gpu.BDF)
		if key == "" {
			continue
		}
		rec, found := records[key]
		if !found || !idsAgree(gpu.DeviceID, rec.DeviceID) || !idsAgree(gpu.VendorID, rec.VendorID) {
			continue
		}
		if fw := strings.TrimSpace(rec.Version); fw != "" {
			gpu.Firmware = fw
		}
	}

	for idx := range hw.PCIeDevices {
		dev := &hw.PCIeDevices[idx]
		key := normalizePCIBDF(dev.BDF)
		if key == "" {
			continue
		}
		rec, found := records[key]
		if !found || !idsAgree(dev.DeviceID, rec.DeviceID) || !idsAgree(dev.VendorID, rec.VendorID) {
			continue
		}

		looksLikeNVSwitch := strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") ||
			strings.HasPrefix(strings.ToUpper(strings.TrimSpace(dev.Slot)), "NVSWITCH")
		if looksLikeNVSwitch {
			if pn := mapNVSwitchPartNumberByProject(rec.Project); pn != "" {
				dev.PartNumber = pn
			}
		}

		// Fallback for devices whose part number is still unknown.
		// NOTE(review): this stores the firmware version in PartNumber — confirm
		// that consumers expect that.
		fw := strings.TrimSpace(rec.Version)
		if fw != "" && strings.TrimSpace(dev.PartNumber) == "" {
			dev.PartNumber = fw
		}
	}

	appendNVFlashFirmwareEntries(result, records)
	return nil
}
// ApplyInventoryPCIIDs fills in missing PCI BDF addresses on GPUs and PCIe
// devices from the slot-to-BDF pairs found in inventory/inventory.log.
//
// Devices that already have a non-blank BDF are left untouched. GPUs are looked
// up by their trimmed slot name; PCIe devices by the trimmed slot name passed
// through normalizeNVSwitchSlot. A nil result, nil hardware, or a log without
// PCIID lines is a no-op. The function currently always returns nil.
func ApplyInventoryPCIIDs(content []byte, result *models.AnalysisResult) error {
	if result == nil || result.Hardware == nil {
		return nil
	}

	bdfBySlot := parseInventoryPCIIDs(content)
	if len(bdfBySlot) == 0 {
		return nil
	}

	hw := result.Hardware

	for idx := range hw.GPUs {
		gpu := &hw.GPUs[idx]
		if strings.TrimSpace(gpu.BDF) != "" {
			continue
		}
		slot := strings.TrimSpace(gpu.Slot)
		if bdf, known := bdfBySlot[slot]; known && bdf != "" {
			gpu.BDF = bdf
		}
	}

	for idx := range hw.PCIeDevices {
		dev := &hw.PCIeDevices[idx]
		if strings.TrimSpace(dev.BDF) != "" {
			continue
		}
		slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
		if bdf, known := bdfBySlot[slot]; known && bdf != "" {
			dev.BDF = bdf
		}
	}

	return nil
}
// parseNVFlashRecords splits nvflash verbose output into per-adapter records
// keyed by PCI BDF ("0000:bb:dd.f", lower-case).
//
// A line matching nvflashAdapterRegex starts a new record; the "Key : Value"
// lines that follow populate it until the next adapter line. A record is kept
// only when it has a BDF and a non-blank Version. When two adapter sections
// share a BDF the later one replaces the earlier. Scanner errors are not
// reported; whatever was parsed up to that point is returned.
func parseNVFlashRecords(content []byte) map[string]nvflashDeviceRecord {
	records := make(map[string]nvflashDeviceRecord)

	var (
		current nvflashDeviceRecord
		open    bool // true once the first adapter line has been seen
	)

	// flush stores the record being built, if it is complete enough to be useful.
	flush := func() {
		if !open || current.BDF == "" || strings.TrimSpace(current.Version) == "" {
			return
		}
		records[current.BDF] = current
	}

	lines := bufio.NewScanner(strings.NewReader(string(content)))
	for lines.Scan() {
		text := strings.TrimSpace(lines.Text())
		if text == "" {
			continue
		}

		if m := nvflashAdapterRegex.FindStringSubmatch(text); len(m) == 9 {
			flush()

			// IDs that fail to parse are left at zero, which later means "unknown".
			vendor, _ := parseHexInt(m[1])
			device, _ := parseHexInt(m[2])
			subVendor, _ := parseHexInt(m[3])
			subDevice, _ := parseHexInt(m[4])

			// The S: field (m[5]) is not used; the domain is fixed to 0000.
			current = nvflashDeviceRecord{
				BDF:        fmt.Sprintf("0000:%s:%s.%s", strings.ToLower(m[6]), strings.ToLower(m[7]), strings.ToLower(m[8])),
				VendorID:   vendor,
				DeviceID:   device,
				SSVendorID: subVendor,
				SSDeviceID: subDevice,
			}
			open = true
			continue
		}

		if !open {
			continue
		}

		sep := strings.IndexByte(text, ':')
		if sep < 0 {
			continue
		}
		key := strings.TrimSpace(text[:sep])
		val := strings.TrimSpace(text[sep+1:])
		if key == "" || val == "" {
			continue
		}

		switch key {
		case "Version":
			current.Version = val
		case "Board ID":
			current.BoardID = strings.ToLower(strings.TrimPrefix(val, "0x"))
		case "Vendor ID":
			if id, err := parseHexInt(val); err == nil {
				current.VendorID = id
			}
		case "Device ID":
			if id, err := parseHexInt(val); err == nil {
				current.DeviceID = id
			}
		case "Hierarchy ID":
			current.HierarchyID = val
		case "Chip SKU":
			current.ChipSKU = val
		case "Project":
			current.Project = val
		}
	}

	flush()
	return records
}
// parseInventoryPCIIDs scans inventory.log text for "<component>_PCIID:" lines
// and returns a map from slot name to normalized PCI BDF.
//
// GPU lines produce keys of the form "GPUSXM<n>", NVSwitch lines keys of the
// form "NVSWITCH<n>". A later line for the same slot overwrites an earlier one.
func parseInventoryPCIIDs(content []byte) map[string]string {
	// Each pattern captures (index, address); prefix + index forms the slot key.
	patterns := []struct {
		re     *regexp.Regexp
		prefix string
	}{
		{gpuPCIIDRegex, "GPUSXM"},
		{nvsPCIIDRegex, "NVSWITCH"},
	}

	bdfBySlot := make(map[string]string)
	lines := bufio.NewScanner(strings.NewReader(string(content)))
	for lines.Scan() {
		text := strings.TrimSpace(lines.Text())
		if text == "" {
			continue
		}
		for _, p := range patterns {
			m := p.re.FindStringSubmatch(text)
			if len(m) != 3 {
				continue
			}
			bdfBySlot[p.prefix+m[1]] = normalizePCIBDF(m[2])
			break
		}
	}
	return bdfBySlot
}
// pciBDFShortRegex matches a lower-case PCI address without a domain
// ("bb:dd.f"). It is compiled once at package initialisation because
// normalizePCIBDF is called for every device inside several loops.
var pciBDFShortRegex = regexp.MustCompile(`^[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]$`)

// normalizePCIBDF canonicalises a PCI address string.
//
// The input is trimmed and lower-cased. A short "bb:dd.f" address gets the
// default "0000:" domain prepended. Anything else — including an address that
// already carries a domain, and text that is not a PCI address at all — is
// returned in its trimmed, lower-cased form. Blank input yields "".
func normalizePCIBDF(v string) string {
	s := strings.TrimSpace(strings.ToLower(v))
	if s == "" {
		return ""
	}
	// bus:device.func -> 0000:bus:device.func
	if pciBDFShortRegex.MatchString(s) {
		return "0000:" + s
	}
	return s
}
// parseHexInt converts a hexadecimal string to an int.
//
// Surrounding whitespace, letter case and an optional "0x" prefix are ignored.
// An empty value yields the error "empty hex value"; otherwise any error from
// strconv.ParseInt (base 16, 32-bit range) is returned unchanged. On error the
// returned int is 0.
func parseHexInt(v string) (int, error) {
	digits := strings.TrimPrefix(strings.TrimSpace(strings.ToLower(v)), "0x")
	if digits == "" {
		return 0, fmt.Errorf("empty hex value")
	}

	parsed, err := strconv.ParseInt(digits, 16, 32)
	if err != nil {
		return 0, err
	}
	return int(parsed), nil
}
// findNVFlashVerboseLog returns the first extracted file whose path contains
// "inventory/nvflash_verbose.log" (case-insensitive, either path separator
// between the two components), or nil when there is none. The returned pointer
// refers to a copy of the slice element.
func findNVFlashVerboseLog(files []parser.ExtractedFile) *parser.ExtractedFile {
	const target = "inventory/nvflash_verbose.log"
	for idx := range files {
		// Fold case and separators so one comparison covers both path styles.
		normalized := strings.ReplaceAll(strings.ToLower(files[idx].Path), "\\", "/")
		if !strings.Contains(normalized, target) {
			continue
		}
		match := files[idx]
		return &match
	}
	return nil
}
// findInventoryInfoLog returns the first extracted file whose path contains
// "inventory/inventory.log" (case-insensitive, either path separator between
// the two components), or nil when there is none. The returned pointer refers
// to a copy of the slice element.
func findInventoryInfoLog(files []parser.ExtractedFile) *parser.ExtractedFile {
	const target = "inventory/inventory.log"
	for idx := range files {
		// Fold case and separators so one comparison covers both path styles.
		normalized := strings.ReplaceAll(strings.ToLower(files[idx].Path), "\\", "/")
		if !strings.Contains(normalized, target) {
			continue
		}
		match := files[idx]
		return &match
	}
	return nil
}
// appendNVFlashFirmwareEntries adds firmware list entries for GPUs and
// NVSwitches to result.Hardware.Firmware.
//
// GPUs contribute "GPU <slot> (<label>)" with their Firmware value, where the
// label is the first non-blank of part number, model and slot. PCIe devices
// contribute "NVSwitch <slot> (<label>)" with the version of the nvflash record
// for their BDF, but only when they look like an NVSwitch (device class
// "NVSwitch" or slot starting with "NVSWITCH"); the label is the part number or,
// failing that, the slot. Entries whose lower-cased name and version are
// already present are not added again. A nil Firmware slice is replaced by an
// empty one even when nothing is appended.
func appendNVFlashFirmwareEntries(result *models.AnalysisResult, records map[string]nvflashDeviceRecord) {
	if result == nil || result.Hardware == nil {
		return
	}
	hw := result.Hardware
	if hw.Firmware == nil {
		hw.Firmware = make([]models.FirmwareInfo, 0)
	}

	// Index what is already listed so repeated runs do not duplicate entries.
	seen := make(map[string]struct{})
	for idx := range hw.Firmware {
		existing := &hw.Firmware[idx]
		seen[strings.ToLower(strings.TrimSpace(existing.DeviceName))+"|"+strings.TrimSpace(existing.Version)] = struct{}{}
	}

	// addEntry appends one firmware entry unless an equal one was seen before.
	addEntry := func(deviceName, version string) {
		key := strings.ToLower(deviceName) + "|" + version
		if _, dup := seen[key]; dup {
			return
		}
		seen[key] = struct{}{}
		hw.Firmware = append(hw.Firmware, models.FirmwareInfo{
			DeviceName: deviceName,
			Version:    version,
		})
	}

	for idx := range hw.GPUs {
		gpu := &hw.GPUs[idx]
		version := strings.TrimSpace(gpu.Firmware)
		if version == "" {
			continue
		}
		slot := strings.TrimSpace(gpu.Slot)
		label := strings.TrimSpace(gpu.PartNumber)
		if label == "" {
			label = strings.TrimSpace(gpu.Model)
		}
		if label == "" {
			label = slot
		}
		addEntry(fmt.Sprintf("GPU %s (%s)", slot, label), version)
	}

	for idx := range hw.PCIeDevices {
		dev := &hw.PCIeDevices[idx]
		rec, found := records[normalizePCIBDF(dev.BDF)]
		if !found {
			continue
		}
		version := strings.TrimSpace(rec.Version)
		if version == "" {
			continue
		}
		slot := strings.TrimSpace(dev.Slot)
		isNVSwitch := strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") ||
			strings.HasPrefix(strings.ToUpper(slot), "NVSWITCH")
		if !isNVSwitch {
			continue
		}
		label := strings.TrimSpace(dev.PartNumber)
		if label == "" {
			label = slot
		}
		addEntry(fmt.Sprintf("NVSwitch %s (%s)", slot, label), version)
	}
}
// mapNVSwitchPartNumberByProject returns the part number registered in
// nvswitchProjectToPartNumber for the given nvflash project value. The project
// is lower-cased and trimmed before the lookup; blank or unknown projects
// yield "".
func mapNVSwitchPartNumberByProject(project string) string {
	if lookup := strings.TrimSpace(strings.ToLower(project)); lookup != "" {
		return strings.TrimSpace(nvswitchProjectToPartNumber[lookup])
	}
	return ""
}

View File

@@ -0,0 +1,93 @@
package nvidia
import (
"testing"
"git.mchus.pro/mchus/logpile/internal/models"
)
// TestApplyInventoryPCIIDsAndNVFlashFirmware runs the two enrichment steps in
// sequence: ApplyInventoryPCIIDs must assign BDFs from inventory.log, and
// ParseNVFlashVerboseLog must then use those BDFs to set the GPU firmware, the
// NVSwitch part number (via the project mapping) and the firmware list entries.
func TestApplyInventoryPCIIDsAndNVFlashFirmware(t *testing.T) {
// Devices start without BDFs; only slot and device ID are known.
result := &models.AnalysisResult{
Hardware: &models.HardwareConfig{
GPUs: []models.GPU{
{
Slot: "GPUSXM5",
DeviceID: 0x2335,
},
},
PCIeDevices: []models.PCIeDevice{
{
Slot: "NVSWITCHNVSWITCH2",
DeviceID: 0x22a3,
},
},
},
}
inventoryLog := []byte(`
GPU_SXM5_PCIID: 0000:ba:00.0
NVSWITCH_NVSWITCH2_PCIID: 0000:07:00.0
`)
nvflashLog := []byte(`
Adapter: Graphics Device (10DE,2335,10DE,18BE) S:00,B:BA,D:00,F:00
Version : 96.00.D0.00.03
Board ID : 0x053C
Vendor ID : 0x10DE
Device ID : 0x2335
Hierarchy ID : Normal Board
Chip SKU : 895-0
Project : G520-0280
Adapter: Graphics Device (10DE,22A3,10DE,1796) S:00,B:07,D:00,F:00
Version : 96.10.6D.00.01
Board ID : 0x03B7
Vendor ID : 0x10DE
Device ID : 0x22A3
Hierarchy ID : Normal Board
Chip SKU : 890-0
Project : 5612-0002
`)
if err := ApplyInventoryPCIIDs(inventoryLog, result); err != nil {
t.Fatalf("ApplyInventoryPCIIDs failed: %v", err)
}
if err := ParseNVFlashVerboseLog(nvflashLog, result); err != nil {
t.Fatalf("ParseNVFlashVerboseLog failed: %v", err)
}
if got := result.Hardware.GPUs[0].BDF; got != "0000:ba:00.0" {
t.Fatalf("expected GPU BDF 0000:ba:00.0, got %q", got)
}
if got := result.Hardware.GPUs[0].Firmware; got != "96.00.D0.00.03" {
t.Fatalf("expected GPU firmware 96.00.D0.00.03, got %q", got)
}
if got := result.Hardware.PCIeDevices[0].BDF; got != "0000:07:00.0" {
t.Fatalf("expected NVSwitch BDF 0000:07:00.0, got %q", got)
}
if got := result.Hardware.PCIeDevices[0].PartNumber; got != "965-25612-0002-000" {
t.Fatalf("expected NVSwitch part number 965-25612-0002-000, got %q", got)
}
if len(result.Hardware.Firmware) == 0 {
t.Fatalf("expected firmware entries to be populated from nvflash log")
}
// Entries are identified by version only; device names are not asserted here.
hasGPUFW := false
hasNVSwitchFW := false
for _, fw := range result.Hardware.Firmware {
if fw.Version == "96.00.D0.00.03" {
hasGPUFW = true
}
if fw.Version == "96.10.6D.00.01" {
hasNVSwitchFW = true
}
}
if !hasGPUFW {
t.Fatalf("expected GPU firmware version 96.00.D0.00.03 in hardware firmware list")
}
if !hasNVSwitchFW {
t.Fatalf("expected NVSwitch firmware version 96.10.6D.00.01 in hardware firmware list")
}
}

View File

@@ -14,7 +14,7 @@ import (
// parserVersion - version of this parser module
// IMPORTANT: Increment this version when making changes to parser logic!
const parserVersion = "1.2.4"
const parserVersion = "1.3.0"
func init() {
parser.Register(&Parser{})
@@ -106,6 +106,8 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
GPUs: make([]models.GPU, 0),
}
gpuStatuses := make(map[string]string)
gpuFailureDetails := make(map[string]string)
nvswitchStatuses := make(map[string]string)
// Parse output.log first (contains dmidecode system info)
// Find the output.log file that contains dmidecode output
@@ -134,9 +136,26 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
}
}
// Parse inventory/inventory.log to enrich PCI BDF mapping for components.
inventoryInfoLog := findInventoryInfoLog(files)
if inventoryInfoLog != nil {
if err := ApplyInventoryPCIIDs(inventoryInfoLog.Content, result); err != nil {
_ = err
}
}
// Enhance GPU model names using SKU mapping from testspec + inventory summary.
ApplyGPUModelsFromSKU(files, result)
// Parse inventory/nvflash_verbose.log and apply firmware versions by BDF + IDs.
// This runs after GPU model/part-number enrichment so firmware tab uses final model labels.
nvflashVerbose := findNVFlashVerboseLog(files)
if nvflashVerbose != nil {
if err := ParseNVFlashVerboseLog(nvflashVerbose.Content, result); err != nil {
_ = err
}
}
// Parse summary.json (test results summary)
if f := parser.FindFileByName(files, "summary.json"); f != nil {
events := ParseSummaryJSON(f.Content)
@@ -144,6 +163,14 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
for componentID, status := range CollectGPUStatusesFromSummaryJSON(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
for slot, status := range CollectNVSwitchStatusesFromSummaryJSON(f.Content) {
nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
}
for componentID, detail := range CollectGPUFailureDetailsFromSummaryJSON(f.Content) {
if _, exists := gpuFailureDetails[componentID]; !exists && strings.TrimSpace(detail) != "" {
gpuFailureDetails[componentID] = strings.TrimSpace(detail)
}
}
}
// Parse summary.csv (alternative format)
@@ -153,10 +180,21 @@ func (p *Parser) Parse(files []parser.ExtractedFile) (*models.AnalysisResult, er
for componentID, status := range CollectGPUStatusesFromSummaryCSV(f.Content) {
gpuStatuses[componentID] = mergeGPUStatus(gpuStatuses[componentID], status)
}
for slot, status := range CollectNVSwitchStatusesFromSummaryCSV(f.Content) {
nvswitchStatuses[slot] = mergeGPUStatus(nvswitchStatuses[slot], status)
}
for componentID, detail := range CollectGPUFailureDetailsFromSummaryCSV(f.Content) {
if _, exists := gpuFailureDetails[componentID]; !exists && strings.TrimSpace(detail) != "" {
gpuFailureDetails[componentID] = strings.TrimSpace(detail)
}
}
}
// Apply per-GPU PASS/FAIL status derived from summary files.
ApplyGPUStatuses(result, gpuStatuses)
ApplyGPUFailureDetails(result, gpuFailureDetails)
ApplyNVSwitchStatuses(result, nvswitchStatuses)
ApplyGPUAndNVSwitchCheckTimes(result, CollectGPUAndNVSwitchCheckTimes(files))
// Parse GPU field diagnostics logs
gpuFieldiagFiles := parser.FindFileByPattern(files, "gpu_fieldiag/", ".log")

View File

@@ -4,6 +4,7 @@ import (
"os"
"path/filepath"
"testing"
"time"
"git.mchus.pro/mchus/logpile/internal/parser"
)
@@ -146,6 +147,39 @@ func TestNVIDIAParser_GPUStatusFromSummary_RealArchive07900(t *testing.T) {
}
}
// TestNVIDIAParser_GPUErrorDetailsFromSummary_RealArchive07900 parses a real
// log archive end to end and checks that the GPU with serial 1653925025497
// carries the error description "Row remapping failed". The test is skipped
// when the example archive is not present on disk.
func TestNVIDIAParser_GPUErrorDetailsFromSummary_RealArchive07900(t *testing.T) {
archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
t.Skip("Test archive not found, skipping test")
}
files, err := parser.ExtractArchive(archivePath)
if err != nil {
t.Fatalf("Failed to extract archive: %v", err)
}
p := &Parser{}
result, err := p.Parse(files)
if err != nil {
t.Fatalf("Failed to parse archive: %v", err)
}
if result.Hardware == nil || len(result.Hardware.GPUs) == 0 {
t.Fatalf("expected GPUs in parsed result")
}
// Index error descriptions by serial so the assertion does not depend on GPU order.
errBySerial := make(map[string]string, len(result.Hardware.GPUs))
for _, gpu := range result.Hardware.GPUs {
if gpu.SerialNumber != "" {
errBySerial[gpu.SerialNumber] = gpu.ErrorDescription
}
}
if got := errBySerial["1653925025497"]; got != "Row remapping failed" {
t.Fatalf("expected GPU serial 1653925025497 error Row remapping failed, got %q", got)
}
}
func TestNVIDIAParser_GPUModelFromSKU_RealArchive07900(t *testing.T) {
archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
@@ -169,14 +203,75 @@ func TestNVIDIAParser_GPUModelFromSKU_RealArchive07900(t *testing.T) {
found := false
for _, gpu := range result.Hardware.GPUs {
if gpu.Model == "NVIDIA H200 SXM" {
if gpu.Model == "692-2G520-0280-501" && gpu.Description == "hgx h200 8 gpu 141g aircooled" {
found = true
break
}
}
if !found {
t.Fatalf("expected at least one GPU model NVIDIA H200 SXM")
t.Fatalf("expected at least one GPU with model 692-2G520-0280-501 and description hgx h200 8 gpu 141g aircooled")
}
}
// TestNVIDIAParser_ComponentCheckTimes_RealArchive07900 parses a real log
// archive end to end and checks that every GPU and every NVSwitch carries the
// expected status check time, both in StatusCheckedAt and in StatusAtCollect.At.
// The test is skipped when the example archive is not present on disk.
func TestNVIDIAParser_ComponentCheckTimes_RealArchive07900(t *testing.T) {
archivePath := filepath.Join("../../../../example", "A514359X5A07900_logs-20260122-074208.tar")
if _, err := os.Stat(archivePath); os.IsNotExist(err) {
t.Skip("Test archive not found, skipping test")
}
files, err := parser.ExtractArchive(archivePath)
if err != nil {
t.Fatalf("Failed to extract archive: %v", err)
}
p := &Parser{}
result, err := p.Parse(files)
if err != nil {
t.Fatalf("Failed to parse archive: %v", err)
}
if result.Hardware == nil {
t.Fatalf("expected hardware in parsed result")
}
// Expected timestamps are specific to this archive.
expectedGPU := time.Date(2026, 1, 22, 9, 45, 36, 0, time.UTC)
expectedNVSwitch := time.Date(2026, 1, 22, 9, 11, 32, 0, time.UTC)
if len(result.Hardware.GPUs) == 0 {
t.Fatalf("expected GPUs in parsed result")
}
for _, gpu := range result.Hardware.GPUs {
if !gpu.StatusCheckedAt.Equal(expectedGPU) {
t.Fatalf("expected GPU %s status_checked_at %s, got %s", gpu.Slot, expectedGPU.Format(time.RFC3339), gpu.StatusCheckedAt.Format(time.RFC3339))
}
if gpu.StatusAtCollect == nil || !gpu.StatusAtCollect.At.Equal(expectedGPU) {
t.Fatalf("expected GPU %s status_at_collection.at %s", gpu.Slot, expectedGPU.Format(time.RFC3339))
}
}
// Only devices whose class is "NVSwitch" or whose normalized slot starts with
// "NVSWITCH" are checked; the length guard keeps the prefix slice in bounds.
nvsCount := 0
for _, dev := range result.Hardware.PCIeDevices {
slot := normalizeNVSwitchSlot(dev.Slot)
if slot == "" {
continue
}
if dev.DeviceClass != "NVSwitch" && len(slot) < len("NVSWITCH") {
continue
}
if dev.DeviceClass != "NVSwitch" && slot[:len("NVSWITCH")] != "NVSWITCH" {
continue
}
nvsCount++
if !dev.StatusCheckedAt.Equal(expectedNVSwitch) {
t.Fatalf("expected NVSwitch %s status_checked_at %s, got %s", dev.Slot, expectedNVSwitch.Format(time.RFC3339), dev.StatusCheckedAt.Format(time.RFC3339))
}
if dev.StatusAtCollect == nil || !dev.StatusAtCollect.At.Equal(expectedNVSwitch) {
t.Fatalf("expected NVSwitch %s status_at_collection.at %s", dev.Slot, expectedNVSwitch.Format(time.RFC3339))
}
}
if nvsCount == 0 {
t.Fatalf("expected NVSwitch devices in parsed result")
}
}

View File

@@ -22,6 +22,7 @@ type SummaryEntry struct {
}
// gpuComponentIDRegex matches GPU component IDs from summary files, e.g.
// "SXM5_SN_1653925025497" (group 1: SXM index, group 2: serial number).
var gpuComponentIDRegex = regexp.MustCompile(`^SXM(\d+)_SN_(.+)$`)
// nvswitchInventoryComponentRegex matches NVSwitch component IDs from summary
// files, e.g. "NVSWITCH_NVSWITCH0_VendorID" (group 1: switch slot "NVSWITCH0").
var nvswitchInventoryComponentRegex = regexp.MustCompile(`^NVSWITCH_(NVSWITCH\d+)_`)
// ParseSummaryJSON parses summary.json file and returns events
func ParseSummaryJSON(content []byte) []models.Event {
@@ -121,6 +122,41 @@ func CollectGPUStatusesFromSummaryJSON(content []byte) map[string]string {
return statuses
}
// CollectGPUFailureDetailsFromSummaryJSON extracts per-GPU failure details from summary.json.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
//
// Only entries whose component ID matches gpuComponentIDRegex and which are not
// passing are considered. The detail is the trimmed Notes value, or the trimmed
// error code when Notes is blank or "OK" (any case). The first non-empty detail
// per component wins. Content that is not a JSON array of summary entries
// yields nil.
func CollectGPUFailureDetailsFromSummaryJSON(content []byte) map[string]string {
	var entries []SummaryEntry
	if json.Unmarshal(content, &entries) != nil {
		return nil
	}

	details := make(map[string]string)
	for idx := range entries {
		entry := &entries[idx]

		id := strings.TrimSpace(entry.ComponentID)
		if id == "" || !gpuComponentIDRegex.MatchString(id) {
			continue
		}
		// Keep first non-empty detail to avoid noisy overrides.
		if _, recorded := details[id]; recorded {
			continue
		}
		if isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
			continue
		}

		detail := strings.TrimSpace(entry.Notes)
		if detail == "" || strings.EqualFold(detail, "OK") {
			detail = strings.TrimSpace(entry.ErrorCode)
		}
		if detail != "" {
			details[id] = detail
		}
	}
	return details
}
// CollectGPUStatusesFromSummaryCSV extracts per-GPU PASS/FAIL status from summary.csv.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
@@ -155,6 +191,120 @@ func CollectGPUStatusesFromSummaryCSV(content []byte) map[string]string {
return statuses
}
// CollectNVSwitchStatusesFromSummaryJSON extracts per-NVSwitch PASS/FAIL status from summary.json.
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
//
// Every entry whose component ID matches nvswitchInventoryComponentRegex
// contributes "PASS" or "FAIL", combined per slot with mergeGPUStatus. Content
// that is not a JSON array of summary entries yields nil.
func CollectNVSwitchStatusesFromSummaryJSON(content []byte) map[string]string {
	var entries []SummaryEntry
	if err := json.Unmarshal(content, &entries); err != nil {
		return nil
	}

	bySlot := make(map[string]string)
	for idx := range entries {
		entry := &entries[idx]

		groups := nvswitchInventoryComponentRegex.FindStringSubmatch(strings.TrimSpace(entry.ComponentID))
		if len(groups) != 2 {
			continue
		}
		slot := strings.TrimSpace(groups[1])
		if slot == "" {
			continue
		}

		verdict := "FAIL"
		if isSummaryJSONRecordPassing(entry.ErrorCode, entry.Notes) {
			verdict = "PASS"
		}
		bySlot[slot] = mergeGPUStatus(bySlot[slot], verdict)
	}
	return bySlot
}
// CollectNVSwitchStatusesFromSummaryCSV extracts per-NVSwitch PASS/FAIL status from summary.csv.
// Key format in returned map is normalized switch slot (e.g. "NVSWITCH0").
//
// The first row is skipped as a header, as are rows with fewer than seven
// columns. Column 0 is read as the error code, column 5 as the component ID and
// column 6 as the notes. Per-slot results are combined with mergeGPUStatus.
// Content that cannot be read as CSV yields nil.
func CollectNVSwitchStatusesFromSummaryCSV(content []byte) map[string]string {
	const (
		colErrorCode = 0
		colComponent = 5
		colNotes     = 6
		minColumns   = 7
	)

	rows, err := csv.NewReader(strings.NewReader(string(content))).ReadAll()
	if err != nil {
		return nil
	}

	bySlot := make(map[string]string)
	for rowIdx, row := range rows {
		if rowIdx == 0 || len(row) < minColumns {
			continue
		}

		groups := nvswitchInventoryComponentRegex.FindStringSubmatch(strings.TrimSpace(row[colComponent]))
		if len(groups) != 2 {
			continue
		}
		slot := strings.TrimSpace(groups[1])
		if slot == "" {
			continue
		}

		verdict := "FAIL"
		if isSummaryCSVRecordPassing(strings.TrimSpace(row[colErrorCode]), strings.TrimSpace(row[colNotes])) {
			verdict = "PASS"
		}
		bySlot[slot] = mergeGPUStatus(bySlot[slot], verdict)
	}
	return bySlot
}
// CollectGPUFailureDetailsFromSummaryCSV extracts per-GPU failure details from summary.csv.
// Key format in returned map is component ID from summary (e.g. "SXM5_SN_1653925025497").
//
// The first row is skipped as a header, as are rows with fewer than seven
// columns. Column 0 is read as the error code, column 5 as the component ID and
// column 6 as the notes. Passing rows are ignored. The detail is the notes
// text, or the error code when the notes are blank or "OK" (any case); the
// first non-empty detail per component wins. Content that cannot be read as CSV
// yields nil.
func CollectGPUFailureDetailsFromSummaryCSV(content []byte) map[string]string {
	const (
		colErrorCode = 0
		colComponent = 5
		colNotes     = 6
		minColumns   = 7
	)

	rows, err := csv.NewReader(strings.NewReader(string(content))).ReadAll()
	if err != nil {
		return nil
	}

	details := make(map[string]string)
	for rowIdx, row := range rows {
		if rowIdx == 0 || len(row) < minColumns {
			continue
		}

		id := strings.TrimSpace(row[colComponent])
		if id == "" || !gpuComponentIDRegex.MatchString(id) {
			continue
		}
		if _, recorded := details[id]; recorded {
			continue
		}

		code := strings.TrimSpace(row[colErrorCode])
		detail := strings.TrimSpace(row[colNotes])
		if isSummaryCSVRecordPassing(code, detail) {
			continue
		}
		if detail == "" || strings.EqualFold(detail, "OK") {
			detail = code
		}
		if detail != "" {
			details[id] = detail
		}
	}
	return details
}
func isSummaryJSONRecordPassing(errorCode, notes string) bool {
_ = errorCode
return strings.TrimSpace(notes) == "OK"
@@ -213,6 +363,73 @@ func ApplyGPUStatuses(result *models.AnalysisResult, componentStatuses map[strin
}
}
// ApplyNVSwitchStatuses applies aggregated PASS/FAIL statuses from summary components to parsed NVSwitch devices.
//
// switchStatuses is keyed by switch slot. Each PCIe device whose slot, after
// normalizeNVSwitchSlot, starts with "NVSWITCH" (case-insensitive) and has a
// non-empty status in the map gets that status; all other devices are left
// unchanged. Nil or empty inputs are a no-op.
func ApplyNVSwitchStatuses(result *models.AnalysisResult, switchStatuses map[string]string) {
	if result == nil || result.Hardware == nil {
		return
	}
	devices := result.Hardware.PCIeDevices
	if len(devices) == 0 || len(switchStatuses) == 0 {
		return
	}

	for idx := range devices {
		slot := normalizeNVSwitchSlot(strings.TrimSpace(devices[idx].Slot))
		if slot == "" || !strings.HasPrefix(strings.ToUpper(slot), "NVSWITCH") {
			continue
		}
		if status, known := switchStatuses[slot]; known && status != "" {
			devices[idx].Status = status
		}
	}
}
// ApplyGPUFailureDetails maps parsed failure details from summary components to GPUs.
//
// componentDetails is keyed by summary component ID ("SXM<idx>_SN_<serial>").
// Each GPU receives the detail recorded for its serial number or, failing that,
// the detail recorded for its slot ("GPUSXM<idx>"). GPUs without a match are
// left unchanged, as are nil or empty inputs.
//
// When several component IDs resolve to the same slot or the same serial, the
// detail belonging to the lexicographically smallest component ID is used, so
// the result does not depend on Go's randomized map iteration order.
func ApplyGPUFailureDetails(result *models.AnalysisResult, componentDetails map[string]string) {
	if result == nil || result.Hardware == nil || len(result.Hardware.GPUs) == 0 || len(componentDetails) == 0 {
		return
	}

	slotDetails := make(map[string]string)   // key: GPUSXM<idx>
	serialDetails := make(map[string]string) // key: GPU serial
	slotSource := make(map[string]string)    // key: GPUSXM<idx>, value: component ID that supplied the detail
	serialSource := make(map[string]string)  // key: GPU serial, value: component ID that supplied the detail

	// record stores detail under key unless a detail from a smaller (or equal)
	// component ID is already present.
	record := func(details, sources map[string]string, key, id, detail string) {
		if prev, ok := sources[key]; ok && prev <= id {
			return
		}
		sources[key] = id
		details[key] = detail
	}

	for componentID, detail := range componentDetails {
		id := strings.TrimSpace(componentID)
		matches := gpuComponentIDRegex.FindStringSubmatch(id)
		if len(matches) != 3 {
			continue
		}
		detail = strings.TrimSpace(detail)
		if detail == "" {
			continue
		}

		record(slotDetails, slotSource, "GPUSXM"+matches[1], id, detail)
		if serial := strings.TrimSpace(matches[2]); serial != "" {
			record(serialDetails, serialSource, serial, id, detail)
		}
	}

	for i := range result.Hardware.GPUs {
		gpu := &result.Hardware.GPUs[i]

		// Serial is the more specific key; slot is the fallback.
		detail := ""
		if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
			detail = serialDetails[serial]
		}
		if detail == "" {
			detail = slotDetails[strings.TrimSpace(gpu.Slot)]
		}
		if detail != "" {
			gpu.ErrorDescription = detail
		}
	}
}
// formatSummaryDescription creates a human-readable description from summary entry
func formatSummaryDescription(entry SummaryEntry) string {
component := entry.ComponentID

View File

@@ -44,3 +44,79 @@ func TestApplyGPUStatuses_FromSummaryCSV_FailAndPass(t *testing.T) {
t.Fatalf("expected serial 222 status PASS, got %q", bySerial["222"])
}
}
// TestApplyGPUFailureDetails_FromSummaryJSON_BySerial feeds a single failing
// summary entry through CollectGPUFailureDetailsFromSummaryJSON and
// ApplyGPUFailureDetails and checks that only the GPU whose serial appears in
// the component ID receives the error description.
func TestApplyGPUFailureDetails_FromSummaryJSON_BySerial(t *testing.T) {
	summary := []byte(`[
{
"Error Code": "005-000-1-000000000363",
"Test": "gpumem",
"Component ID": "SXM5_SN_1653925025497",
"Notes": "Row remapping failed",
"Virtual ID": "gpumem",
"Ignore Error": "False"
}
]`)

	analysis := &models.AnalysisResult{
		Hardware: &models.HardwareConfig{
			GPUs: []models.GPU{
				{Slot: "GPUSXM5", SerialNumber: "1653925025497"},
				{Slot: "GPUSXM2", SerialNumber: "1653925024190"},
			},
		},
	}

	ApplyGPUFailureDetails(analysis, CollectGPUFailureDetailsFromSummaryJSON(summary))

	gpus := analysis.Hardware.GPUs
	// GPU 0 carries the serial named in the summary entry.
	if desc := gpus[0].ErrorDescription; desc != "Row remapping failed" {
		t.Fatalf("expected serial 1653925025497 error Row remapping failed, got %q", desc)
	}
	// GPU 1 is not mentioned in the summary and must stay untouched.
	if desc := gpus[1].ErrorDescription; desc != "" {
		t.Fatalf("expected no error description for healthy GPU, got %q", desc)
	}
}
// TestApplyNVSwitchStatuses_FromSummaryJSON runs two summary entries (one OK,
// one failing) through CollectNVSwitchStatusesFromSummaryJSON and
// ApplyNVSwitchStatuses and checks the resulting status of three NVSwitch
// devices, including one that is absent from the summary.
func TestApplyNVSwitchStatuses_FromSummaryJSON(t *testing.T) {
	summary := []byte(`[
{
"Error Code": "0",
"Test": "inventory",
"Component ID": "NVSWITCH_NVSWITCH0_VendorID",
"Notes": "OK",
"Virtual ID": "inventory",
"Ignore Error": "False"
},
{
"Error Code": "1",
"Test": "inventory",
"Component ID": "NVSWITCH_NVSWITCH1_LinkState",
"Notes": "Link down",
"Virtual ID": "inventory",
"Ignore Error": "False"
}
]`)

	analysis := &models.AnalysisResult{
		Hardware: &models.HardwareConfig{
			PCIeDevices: []models.PCIeDevice{
				{Slot: "NVSWITCH0", Status: "Unknown"},
				{Slot: "NVSWITCH1", Status: "Unknown"},
				{Slot: "NVSWITCH2", Status: "Unknown"},
			},
		},
	}

	ApplyNVSwitchStatuses(analysis, CollectNVSwitchStatusesFromSummaryJSON(summary))

	// One expectation per device, in device order; label feeds the failure message.
	expectations := []struct {
		want  string
		label string
	}{
		{want: "PASS", label: "NVSWITCH0 status PASS"},
		{want: "FAIL", label: "NVSWITCH1 status FAIL"},
		{want: "Unknown", label: "NVSWITCH2 status unchanged Unknown"},
	}
	for idx, exp := range expectations {
		if got := analysis.Hardware.PCIeDevices[idx].Status; got != exp.want {
			t.Fatalf("expected %s, got %q", exp.label, got)
		}
	}
}

View File

@@ -186,6 +186,9 @@ func parseGPUComponent(comp Component) *models.GPU {
switch prop.ID {
case "DeviceID":
deviceID = prop.GetValueAsString()
if deviceID != "" {
fmt.Sscanf(deviceID, "%x", &gpu.DeviceID)
}
case "Vendor":
gpu.Manufacturer = prop.GetValueAsString()
case "DeviceName":

View File

@@ -410,8 +410,12 @@ func (s *Server) handleGetSerials(w http.ResponseWriter, r *http.Request) {
if !hasUsableSerial(pcie.SerialNumber) {
continue
}
component := pcie.DeviceClass
if strings.EqualFold(strings.TrimSpace(pcie.DeviceClass), "NVSwitch") && strings.TrimSpace(pcie.PartNumber) != "" {
component = strings.TrimSpace(pcie.PartNumber)
}
serials = append(serials, SerialEntry{
Component: pcie.DeviceClass,
Component: component,
Location: pcie.Slot,
SerialNumber: strings.TrimSpace(pcie.SerialNumber),
Manufacturer: pcie.Manufacturer,
@@ -526,6 +530,36 @@ func extractFirmwareComponentAndModel(deviceName string) (component, model strin
return "NIC", "-"
}
// For "GPU GPUSXM5 (692-2G520-0280-501)" -> component: "GPU", model: "692-2G520-0280-501" (the text inside the parentheses); without parentheses the text after "GPU " is used as the model
if strings.HasPrefix(deviceName, "GPU ") {
if idx := strings.Index(deviceName, "("); idx != -1 {
model = strings.TrimSpace(strings.Trim(deviceName[idx:], "()"))
if model != "" {
return "GPU", model
}
}
model = strings.TrimSpace(strings.TrimPrefix(deviceName, "GPU "))
if model == "" {
return "GPU", "-"
}
return "GPU", model
}
// For "NVSwitch NVSWITCH2 (NVSWITCH2)" -> component: "NVSwitch", model: "NVSWITCH2" (the text inside the parentheses); without parentheses the text after "NVSwitch " is used as the model
if strings.HasPrefix(deviceName, "NVSwitch ") {
if idx := strings.Index(deviceName, "("); idx != -1 {
model = strings.TrimSpace(strings.Trim(deviceName[idx:], "()"))
if model != "" {
return "NVSwitch", model
}
}
model = strings.TrimSpace(strings.TrimPrefix(deviceName, "NVSwitch "))
if model == "" {
return "NVSwitch", "-"
}
return "NVSwitch", model
}
// For "HDD Samsung MZ7L33T8HBNA-00A07" -> component: "HDD", model: "Samsung MZ7L33T8HBNA-00A07"
if strings.HasPrefix(deviceName, "HDD ") {
return "HDD", strings.TrimPrefix(deviceName, "HDD ")

View File

@@ -0,0 +1,23 @@
package server
import "testing"
// TestExtractFirmwareComponentAndModel_GPUUsesPartNumberFromParentheses checks
// that a GPU firmware name with a parenthesized part number yields component
// "GPU" and the part number as the model.
func TestExtractFirmwareComponentAndModel_GPUUsesPartNumberFromParentheses(t *testing.T) {
	const deviceName = "GPU GPUSXM3 (692-2G520-0280-501)"

	gotComponent, gotModel := extractFirmwareComponentAndModel(deviceName)

	if want := "GPU"; gotComponent != want {
		t.Fatalf("expected component GPU, got %q", gotComponent)
	}
	if want := "692-2G520-0280-501"; gotModel != want {
		t.Fatalf("expected GPU model 692-2G520-0280-501, got %q", gotModel)
	}
}
// TestExtractFirmwareComponentAndModel_GPUFallbackWithoutParentheses checks
// that a GPU firmware name without parentheses yields component "GPU" and the
// text following the "GPU " prefix as the model.
func TestExtractFirmwareComponentAndModel_GPUFallbackWithoutParentheses(t *testing.T) {
	const deviceName = "GPU 692-2G520-0280-501"

	gotComponent, gotModel := extractFirmwareComponentAndModel(deviceName)

	if want := "GPU"; gotComponent != want {
		t.Fatalf("expected component GPU, got %q", gotComponent)
	}
	if want := "692-2G520-0280-501"; gotModel != want {
		t.Fatalf("expected GPU model 692-2G520-0280-501, got %q", gotModel)
	}
}

View File

@@ -903,9 +903,11 @@ function renderConfig(data) {
// PCIe Device Inventory tab
html += '<div class="config-tab-content" id="config-pcie">';
if (config.pcie_devices && config.pcie_devices.length > 0) {
html += '<h3>PCIe устройства</h3><table class="config-table"><thead><tr><th>Слот</th><th>BDF</th><th>Тип</th><th>Производитель</th><th>Vendor:Device ID</th><th>PCIe Link</th></tr></thead><tbody>';
config.pcie_devices.forEach(p => {
const hasPCIe = config.pcie_devices && config.pcie_devices.length > 0;
const hasGPUs = config.gpus && config.gpus.length > 0;
if (hasPCIe || hasGPUs) {
html += '<h3>PCIe устройства</h3><table class="config-table"><thead><tr><th>Слот</th><th>BDF</th><th>Тип</th><th>Модель</th><th>Производитель</th><th>Vendor:Device ID</th><th>PCIe Link</th></tr></thead><tbody>';
(config.pcie_devices || []).forEach(p => {
const pcieLink = formatPCIeLink(
p.link_width,
p.link_speed,
@@ -916,11 +918,30 @@ function renderConfig(data) {
<td>${escapeHtml(p.slot || '-')}</td>
<td><code>${escapeHtml(p.bdf || '-')}</code></td>
<td>${escapeHtml(p.device_class || '-')}</td>
<td>${escapeHtml(p.part_number || '-')}</td>
<td>${escapeHtml(p.manufacturer || '-')}</td>
<td><code>${p.vendor_id ? p.vendor_id.toString(16) : '-'}:${p.device_id ? p.device_id.toString(16) : '-'}</code></td>
<td>${pcieLink}</td>
</tr>`;
});
(config.gpus || []).forEach(gpu => {
const pcieLink = formatPCIeLink(
gpu.current_link_width || gpu.link_width,
gpu.current_link_speed || gpu.link_speed,
gpu.max_link_width,
gpu.max_link_speed
);
html += `<tr>
<td>${escapeHtml(gpu.slot || '-')}</td>
<td><code>${escapeHtml(gpu.bdf || '-')}</code></td>
<td>GPU</td>
<td>${escapeHtml(gpu.model || gpu.part_number || '-')}</td>
<td>${escapeHtml(gpu.manufacturer || '-')}</td>
<td><code>${gpu.vendor_id ? gpu.vendor_id.toString(16) : '-'}:${gpu.device_id ? gpu.device_id.toString(16) : '-'}</code></td>
<td>${pcieLink}</td>
</tr>`;
});
html += '</tbody></table>';
} else {
html += '<p class="no-data">Нет данных о PCIe устройствах</p>';