nvidia: improve component mapping, firmware, statuses and check times
This commit is contained in:
274
internal/parser/vendors/nvidia/component_status_time.go
vendored
Normal file
274
internal/parser/vendors/nvidia/component_status_time.go
vendored
Normal file
@@ -0,0 +1,274 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.mchus.pro/mchus/logpile/internal/models"
|
||||
"git.mchus.pro/mchus/logpile/internal/parser"
|
||||
)
|
||||
|
||||
// Patterns used to extract timestamps and component identifiers from NVIDIA
// diagnostic logs. They are compiled once at package initialization.
var (
	// verboseRunTestingLineRegex matches a trimmed verbose_run.log line such as
	// "2024-01-02 03:04:05,123 - Testing gpumem". Group 1 is the timestamp
	// without the digits after the comma, group 2 is the test name.
	verboseRunTestingLineRegex = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+\s+-\s+Testing\s+([a-zA-Z0-9_]+)\s*$`)

	// runLogStartTimeRegex matches a trimmed run.log line such as
	// "Start time  Tue, 02 Jan 2024 03:04:05". Group 1 is the timestamp.
	runLogStartTimeRegex = regexp.MustCompile(`^Start time\s+([A-Za-z]{3}, \d{2} [A-Za-z]{3} \d{4} \d{2}:\d{2}:\d{2})\s*$`)

	// runLogTestDurationRegex matches a trimmed run.log line such as
	// "Testing gpumem  PASS  [ 12:34s ]". Group 1 is the test name, groups 2
	// and 3 are the two numbers inside the brackets (read as minutes and
	// seconds by parseRunLogTestStartTimes).
	runLogTestDurationRegex = regexp.MustCompile(`^Testing\s+([a-zA-Z0-9_]+)\s+\S+\s+\[\s*([0-9]+):([0-9]{2})s\s*\]\s*$`)

	// modsStartLineRegex finds a "MODS start: Tue Jan  2 03:04:05 2024" line
	// anywhere in multi-line content. Group 1 is the timestamp; any whitespace
	// is accepted between its fields.
	modsStartLineRegex = regexp.MustCompile(`(?m)^MODS start:\s+([A-Za-z]{3}\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4})\s*$`)

	// gpuFieldiagOutputPathRegex matches the tail of a path like
	// "gpu_fieldiag/SXM3_SN_<serial>/output.log" (case-insensitive, slash or
	// backslash separators). Group 1 is the SXM index, group 2 the serial.
	gpuFieldiagOutputPathRegex = regexp.MustCompile(`(?i)gpu_fieldiag[\\/]+sxm(\d+)_sn_([^\\/]+)[\\/]+output\.log$`)

	// nvswitchDevnameRegex matches "devname=<id>,NVSWITCH<n>" fragments.
	// Group 1 is the "NVSWITCH<n>" slot name.
	nvswitchDevnameRegex = regexp.MustCompile(`devname=[^,\s]+,(NVSWITCH\d+)`)
)
|
||||
|
||||
// componentCheckTimes holds the check timestamps collected from NVIDIA
// diagnostic logs for GPUs and NVSwitches.
type componentCheckTimes struct {
	// GPUDefault is the fallback time for GPUs with no per-serial or
	// per-slot entry.
	GPUDefault time.Time
	// NVSwitchDefault is the fallback time for NVSwitches with no per-slot
	// entry.
	NVSwitchDefault time.Time

	GPUBySerial    map[string]time.Time // key: GPU serial
	GPUBySlot      map[string]time.Time // key: GPUSXM<idx>
	NVSwitchBySlot map[string]time.Time // key: NVSWITCH<idx>
}
|
||||
|
||||
// CollectGPUAndNVSwitchCheckTimes extracts GPU/NVSwitch check timestamps from NVIDIA logs.
|
||||
// Priority:
|
||||
// 1) verbose_run.log "Testing <test>" timestamps
|
||||
// 2) run.log start time + cumulative durations
|
||||
func CollectGPUAndNVSwitchCheckTimes(files []parser.ExtractedFile) componentCheckTimes {
|
||||
gpuBySerial := make(map[string]time.Time)
|
||||
gpuBySlot := make(map[string]time.Time)
|
||||
nvsBySlot := make(map[string]time.Time)
|
||||
|
||||
for _, f := range files {
|
||||
path := strings.TrimSpace(f.Path)
|
||||
pathLower := strings.ToLower(path)
|
||||
|
||||
// Per-GPU timestamp from gpu_fieldiag/<SXMx_SN_serial>/output.log
|
||||
if strings.HasSuffix(pathLower, "output.log") && strings.Contains(pathLower, "gpu_fieldiag/") {
|
||||
ts := parseModsStartTime(f.Content)
|
||||
if ts.IsZero() {
|
||||
continue
|
||||
}
|
||||
matches := gpuFieldiagOutputPathRegex.FindStringSubmatch(path)
|
||||
if len(matches) == 3 {
|
||||
slot := "GPUSXM" + strings.TrimSpace(matches[1])
|
||||
serial := strings.TrimSpace(matches[2])
|
||||
if slot != "" {
|
||||
gpuBySlot[slot] = ts
|
||||
}
|
||||
if serial != "" {
|
||||
gpuBySerial[serial] = ts
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-NVSwitch timestamp and slot list from nvswitch/output.log
|
||||
if strings.HasSuffix(pathLower, "nvswitch/output.log") || strings.HasSuffix(pathLower, "nvswitch\\output.log") {
|
||||
ts := parseModsStartTime(f.Content)
|
||||
if ts.IsZero() {
|
||||
continue
|
||||
}
|
||||
for _, slot := range parseNVSwitchSlotsFromOutput(f.Content) {
|
||||
nvsBySlot[slot] = ts
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
testStarts := make(map[string]time.Time)
|
||||
|
||||
if f := parser.FindFileByName(files, "verbose_run.log"); f != nil {
|
||||
for testName, ts := range parseVerboseRunTestStartTimes(f.Content) {
|
||||
testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
|
||||
}
|
||||
}
|
||||
|
||||
if len(testStarts) == 0 {
|
||||
if f := parser.FindFileByName(files, "run.log"); f != nil {
|
||||
for testName, ts := range parseRunLogTestStartTimes(f.Content) {
|
||||
testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return componentCheckTimes{
|
||||
GPUDefault: pickFirstTestTime(testStarts, "gpu_fieldiag", "gpumem", "gpustress", "pcie", "inventory"),
|
||||
NVSwitchDefault: pickFirstTestTime(testStarts, "nvswitch", "inventory"),
|
||||
GPUBySerial: gpuBySerial,
|
||||
GPUBySlot: gpuBySlot,
|
||||
NVSwitchBySlot: nvsBySlot,
|
||||
}
|
||||
}
|
||||
|
||||
func pickFirstTestTime(testStarts map[string]time.Time, names ...string) time.Time {
|
||||
for _, name := range names {
|
||||
if ts := testStarts[strings.ToLower(strings.TrimSpace(name))]; !ts.IsZero() {
|
||||
return ts
|
||||
}
|
||||
}
|
||||
return time.Time{}
|
||||
}
|
||||
|
||||
func parseVerboseRunTestStartTimes(content []byte) map[string]time.Time {
|
||||
result := make(map[string]time.Time)
|
||||
lines := strings.Split(string(content), "\n")
|
||||
for _, line := range lines {
|
||||
matches := verboseRunTestingLineRegex.FindStringSubmatch(strings.TrimSpace(line))
|
||||
if len(matches) != 3 {
|
||||
continue
|
||||
}
|
||||
|
||||
ts, err := time.ParseInLocation("2006-01-02 15:04:05", strings.TrimSpace(matches[1]), time.UTC)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
testName := strings.ToLower(strings.TrimSpace(matches[2]))
|
||||
if testName == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := result[testName]; !exists {
|
||||
result[testName] = ts
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func parseRunLogTestStartTimes(content []byte) map[string]time.Time {
|
||||
lines := strings.Split(string(content), "\n")
|
||||
start := time.Time{}
|
||||
for _, line := range lines {
|
||||
matches := runLogStartTimeRegex.FindStringSubmatch(strings.TrimSpace(line))
|
||||
if len(matches) != 2 {
|
||||
continue
|
||||
}
|
||||
parsed, err := time.ParseInLocation("Mon, 02 Jan 2006 15:04:05", strings.TrimSpace(matches[1]), time.UTC)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
start = parsed
|
||||
break
|
||||
}
|
||||
if start.IsZero() {
|
||||
return nil
|
||||
}
|
||||
|
||||
result := make(map[string]time.Time)
|
||||
cursor := start
|
||||
for _, line := range lines {
|
||||
matches := runLogTestDurationRegex.FindStringSubmatch(strings.TrimSpace(line))
|
||||
if len(matches) != 4 {
|
||||
continue
|
||||
}
|
||||
|
||||
testName := strings.ToLower(strings.TrimSpace(matches[1]))
|
||||
minutes, errMin := strconv.Atoi(strings.TrimSpace(matches[2]))
|
||||
seconds, errSec := strconv.Atoi(strings.TrimSpace(matches[3]))
|
||||
if errMin != nil || errSec != nil {
|
||||
continue
|
||||
}
|
||||
if _, exists := result[testName]; !exists {
|
||||
result[testName] = cursor
|
||||
}
|
||||
cursor = cursor.Add(time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func parseModsStartTime(content []byte) time.Time {
|
||||
matches := modsStartLineRegex.FindSubmatch(content)
|
||||
if len(matches) != 2 {
|
||||
return time.Time{}
|
||||
}
|
||||
tsRaw := strings.TrimSpace(string(matches[1]))
|
||||
if tsRaw == "" {
|
||||
return time.Time{}
|
||||
}
|
||||
ts, err := time.ParseInLocation("Mon Jan 2 15:04:05 2006", tsRaw, time.UTC)
|
||||
if err != nil {
|
||||
return time.Time{}
|
||||
}
|
||||
return ts
|
||||
}
|
||||
|
||||
func parseNVSwitchSlotsFromOutput(content []byte) []string {
|
||||
matches := nvswitchDevnameRegex.FindAllSubmatch(content, -1)
|
||||
if len(matches) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := make(map[string]struct{})
|
||||
out := make([]string, 0, len(matches))
|
||||
for _, m := range matches {
|
||||
if len(m) != 2 {
|
||||
continue
|
||||
}
|
||||
slot := strings.ToUpper(strings.TrimSpace(string(m[1])))
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := seen[slot]; exists {
|
||||
continue
|
||||
}
|
||||
seen[slot] = struct{}{}
|
||||
out = append(out, slot)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ApplyGPUAndNVSwitchCheckTimes writes parsed check timestamps to component status metadata.
|
||||
func ApplyGPUAndNVSwitchCheckTimes(result *models.AnalysisResult, times componentCheckTimes) {
|
||||
if result == nil || result.Hardware == nil {
|
||||
return
|
||||
}
|
||||
|
||||
for i := range result.Hardware.GPUs {
|
||||
gpu := &result.Hardware.GPUs[i]
|
||||
ts := time.Time{}
|
||||
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
|
||||
ts = times.GPUBySerial[serial]
|
||||
}
|
||||
if ts.IsZero() {
|
||||
ts = times.GPUBySlot[strings.ToUpper(strings.TrimSpace(gpu.Slot))]
|
||||
}
|
||||
if ts.IsZero() {
|
||||
ts = times.GPUDefault
|
||||
}
|
||||
if ts.IsZero() {
|
||||
continue
|
||||
}
|
||||
gpu.StatusCheckedAt = ts
|
||||
status := strings.TrimSpace(gpu.Status)
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
gpu.StatusAtCollect = &models.StatusAtCollection{
|
||||
Status: status,
|
||||
At: ts,
|
||||
}
|
||||
}
|
||||
|
||||
for i := range result.Hardware.PCIeDevices {
|
||||
dev := &result.Hardware.PCIeDevices[i]
|
||||
slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
slot = strings.ToUpper(slot)
|
||||
if !strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") &&
|
||||
!strings.HasPrefix(slot, "NVSWITCH") {
|
||||
continue
|
||||
}
|
||||
|
||||
ts := times.NVSwitchBySlot[slot]
|
||||
if ts.IsZero() {
|
||||
ts = times.NVSwitchDefault
|
||||
}
|
||||
if ts.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
dev.StatusCheckedAt = ts
|
||||
status := strings.TrimSpace(dev.Status)
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
dev.StatusAtCollect = &models.StatusAtCollection{
|
||||
Status: status,
|
||||
At: ts,
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user