Files
logpile/internal/parser/vendors/nvidia/component_status_time.go

275 lines
7.6 KiB
Go

package nvidia
import (
"regexp"
"strconv"
"strings"
"time"
"git.mchus.pro/mchus/logpile/internal/models"
"git.mchus.pro/mchus/logpile/internal/parser"
)
var verboseRunTestingLineRegex = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+\s+-\s+Testing\s+([a-zA-Z0-9_]+)\s*$`)
var runLogStartTimeRegex = regexp.MustCompile(`^Start time\s+([A-Za-z]{3}, \d{2} [A-Za-z]{3} \d{4} \d{2}:\d{2}:\d{2})\s*$`)
var runLogTestDurationRegex = regexp.MustCompile(`^Testing\s+([a-zA-Z0-9_]+)\s+\S+\s+\[\s*([0-9]+):([0-9]{2})s\s*\]\s*$`)
var modsStartLineRegex = regexp.MustCompile(`(?m)^MODS start:\s+([A-Za-z]{3}\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4})\s*$`)
var gpuFieldiagOutputPathRegex = regexp.MustCompile(`(?i)gpu_fieldiag[\\/]+sxm(\d+)_sn_([^\\/]+)[\\/]+output\.log$`)
var nvswitchDevnameRegex = regexp.MustCompile(`devname=[^,\s]+,(NVSWITCH\d+)`)
// componentCheckTimes bundles the check timestamps resolved for GPUs and
// NVSwitches: one default per component kind plus per-device lookups.
type componentCheckTimes struct {
	// GPUDefault is the timestamp used for a GPU with no per-device entry.
	GPUDefault time.Time
	// NVSwitchDefault is the timestamp used for an NVSwitch with no per-slot entry.
	NVSwitchDefault time.Time
	// GPUBySerial is keyed by GPU serial.
	GPUBySerial map[string]time.Time
	// GPUBySlot is keyed by GPUSXM<idx>.
	GPUBySlot map[string]time.Time
	// NVSwitchBySlot is keyed by NVSWITCH<idx>.
	NVSwitchBySlot map[string]time.Time
}
// CollectGPUAndNVSwitchCheckTimes extracts GPU/NVSwitch check timestamps from NVIDIA logs.
//
// Per-device timestamps are read from the "MODS start:" line of each
// gpu_fieldiag/<SXMx_SN_serial>/output.log and nvswitch/output.log file.
// Paths may use either "/" or "\" as the separator.
//
// Default timestamps come from per-test start times, by priority:
//  1. verbose_run.log "Testing <test>" timestamps
//  2. run.log start time + cumulative durations
//
// The returned maps are always non-nil; the default timestamps are zero when
// no matching test was found.
func CollectGPUAndNVSwitchCheckTimes(files []parser.ExtractedFile) componentCheckTimes {
	gpuBySerial := make(map[string]time.Time)
	gpuBySlot := make(map[string]time.Time)
	nvsBySlot := make(map[string]time.Time)

	for _, f := range files {
		path := strings.TrimSpace(f.Path)
		// Normalize separators once so the prefix/suffix checks below accept
		// backslash-separated paths, just as gpuFieldiagOutputPathRegex does.
		pathNorm := strings.ReplaceAll(strings.ToLower(path), "\\", "/")

		// Per-GPU timestamp from gpu_fieldiag/<SXMx_SN_serial>/output.log
		if strings.HasSuffix(pathNorm, "output.log") && strings.Contains(pathNorm, "gpu_fieldiag/") {
			ts := parseModsStartTime(f.Content)
			if ts.IsZero() {
				continue
			}
			matches := gpuFieldiagOutputPathRegex.FindStringSubmatch(path)
			if len(matches) == 3 {
				slot := "GPUSXM" + strings.TrimSpace(matches[1])
				serial := strings.TrimSpace(matches[2])
				if slot != "" {
					gpuBySlot[slot] = ts
				}
				if serial != "" {
					gpuBySerial[serial] = ts
				}
			}
		}

		// Per-NVSwitch timestamp and slot list from nvswitch/output.log
		if strings.HasSuffix(pathNorm, "nvswitch/output.log") {
			ts := parseModsStartTime(f.Content)
			if ts.IsZero() {
				continue
			}
			for _, slot := range parseNVSwitchSlotsFromOutput(f.Content) {
				nvsBySlot[slot] = ts
			}
		}
	}

	// Test start times keyed by lower-cased test name. run.log is consulted
	// only when verbose_run.log yields nothing.
	testStarts := make(map[string]time.Time)
	if f := parser.FindFileByName(files, "verbose_run.log"); f != nil {
		for testName, ts := range parseVerboseRunTestStartTimes(f.Content) {
			testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
		}
	}
	if len(testStarts) == 0 {
		if f := parser.FindFileByName(files, "run.log"); f != nil {
			for testName, ts := range parseRunLogTestStartTimes(f.Content) {
				testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
			}
		}
	}

	return componentCheckTimes{
		GPUDefault:      pickFirstTestTime(testStarts, "gpu_fieldiag", "gpumem", "gpustress", "pcie", "inventory"),
		NVSwitchDefault: pickFirstTestTime(testStarts, "nvswitch", "inventory"),
		GPUBySerial:     gpuBySerial,
		GPUBySlot:       gpuBySlot,
		NVSwitchBySlot:  nvsBySlot,
	}
}
// pickFirstTestTime walks names in order and returns the first non-zero
// timestamp found in testStarts. Each name is trimmed and lower-cased before
// the lookup. The zero time is returned when no name resolves.
func pickFirstTestTime(testStarts map[string]time.Time, names ...string) time.Time {
	for i := 0; i < len(names); i++ {
		key := strings.ToLower(strings.TrimSpace(names[i]))
		started, ok := testStarts[key]
		if ok && !started.IsZero() {
			return started
		}
	}
	var none time.Time
	return none
}
// parseVerboseRunTestStartTimes scans content line by line for timestamped
// "Testing <test>" entries and returns the first timestamp seen for each
// lower-cased test name. Timestamps are interpreted as UTC. Lines that do not
// match, or whose timestamp fails to parse, are skipped.
func parseVerboseRunTestStartTimes(content []byte) map[string]time.Time {
	const stampLayout = "2006-01-02 15:04:05"

	starts := map[string]time.Time{}
	for _, raw := range strings.Split(string(content), "\n") {
		fields := verboseRunTestingLineRegex.FindStringSubmatch(strings.TrimSpace(raw))
		if len(fields) != 3 {
			continue
		}
		name := strings.ToLower(strings.TrimSpace(fields[2]))
		if name == "" {
			continue
		}
		// Only the first valid occurrence of a test is recorded.
		if _, seen := starts[name]; seen {
			continue
		}
		when, err := time.ParseInLocation(stampLayout, strings.TrimSpace(fields[1]), time.UTC)
		if err != nil {
			continue
		}
		starts[name] = when
	}
	return starts
}
// parseRunLogTestStartTimes derives per-test start times from content by
// taking the first parseable "Start time" line (interpreted as UTC) and adding
// the durations of the "Testing <test> ... [ M:SSs ]" lines in file order.
// Each test is keyed by its lower-cased name and keeps the running time at its
// first occurrence. Returns nil when no start time can be determined.
func parseRunLogTestStartTimes(content []byte) map[string]time.Time {
	const startLayout = "Mon, 02 Jan 2006 15:04:05"

	lines := strings.Split(string(content), "\n")

	// Pass 1: find the run start time.
	var base time.Time
	for i := range lines {
		hit := runLogStartTimeRegex.FindStringSubmatch(strings.TrimSpace(lines[i]))
		if len(hit) != 2 {
			continue
		}
		parsed, err := time.ParseInLocation(startLayout, strings.TrimSpace(hit[1]), time.UTC)
		if err == nil {
			base = parsed
			break
		}
	}
	if base.IsZero() {
		return nil
	}

	// Pass 2: accumulate durations; a test starts where the previous one ended.
	starts := make(map[string]time.Time)
	next := base
	for i := range lines {
		hit := runLogTestDurationRegex.FindStringSubmatch(strings.TrimSpace(lines[i]))
		if len(hit) != 4 {
			continue
		}
		mins, err := strconv.Atoi(strings.TrimSpace(hit[2]))
		if err != nil {
			continue
		}
		secs, err := strconv.Atoi(strings.TrimSpace(hit[3]))
		if err != nil {
			continue
		}
		name := strings.ToLower(strings.TrimSpace(hit[1]))
		if _, dup := starts[name]; !dup {
			starts[name] = next
		}
		elapsed := time.Duration(mins)*time.Minute + time.Duration(secs)*time.Second
		next = next.Add(elapsed)
	}
	return starts
}
// parseModsStartTime returns the timestamp of the first "MODS start:" line in
// content, interpreted as UTC. The zero time is returned when no such line
// exists or its timestamp cannot be parsed.
func parseModsStartTime(content []byte) time.Time {
	const modsLayout = "Mon Jan 2 15:04:05 2006"

	var zero time.Time
	groups := modsStartLineRegex.FindSubmatch(content)
	if len(groups) != 2 {
		return zero
	}
	stamp := strings.TrimSpace(string(groups[1]))
	if stamp == "" {
		return zero
	}
	if parsed, err := time.ParseInLocation(modsLayout, stamp, time.UTC); err == nil {
		return parsed
	}
	return zero
}
// parseNVSwitchSlotsFromOutput collects the distinct NVSWITCH<idx> names that
// appear in "devname=" fields of content, upper-cased and in order of first
// appearance. Returns nil when content has no such field.
func parseNVSwitchSlotsFromOutput(content []byte) []string {
	found := nvswitchDevnameRegex.FindAllSubmatch(content, -1)
	if len(found) == 0 {
		return nil
	}

	slots := make([]string, 0, len(found))
	known := make(map[string]bool)
	for i := range found {
		if len(found[i]) != 2 {
			continue
		}
		name := strings.ToUpper(strings.TrimSpace(string(found[i][1])))
		if name == "" || known[name] {
			continue
		}
		known[name] = true
		slots = append(slots, name)
	}
	return slots
}
// ApplyGPUAndNVSwitchCheckTimes writes parsed check timestamps to component status metadata.
//
// For each GPU the timestamp is resolved by serial, then by upper-cased slot,
// then from times.GPUDefault. For each PCIe device that is an NVSwitch (by
// device class or by an NVSWITCH-prefixed normalized slot) it is resolved by
// slot, then from times.NVSwitchDefault. Components with no resolvable
// timestamp are left untouched. A nil result or nil result.Hardware is a no-op.
func ApplyGPUAndNVSwitchCheckTimes(result *models.AnalysisResult, times componentCheckTimes) {
	if result == nil || result.Hardware == nil {
		return
	}
	hw := result.Hardware

	// statusOrUnknown substitutes "Unknown" for a blank status.
	statusOrUnknown := func(raw string) string {
		if trimmed := strings.TrimSpace(raw); trimmed != "" {
			return trimmed
		}
		return "Unknown"
	}

	for idx := range hw.GPUs {
		gpu := &hw.GPUs[idx]

		var checkedAt time.Time
		if sn := strings.TrimSpace(gpu.SerialNumber); sn != "" {
			checkedAt = times.GPUBySerial[sn]
		}
		if checkedAt.IsZero() {
			slotKey := strings.ToUpper(strings.TrimSpace(gpu.Slot))
			checkedAt = times.GPUBySlot[slotKey]
		}
		if checkedAt.IsZero() {
			checkedAt = times.GPUDefault
		}
		if checkedAt.IsZero() {
			continue
		}

		gpu.StatusCheckedAt = checkedAt
		gpu.StatusAtCollect = &models.StatusAtCollection{
			Status: statusOrUnknown(gpu.Status),
			At:     checkedAt,
		}
	}

	for idx := range hw.PCIeDevices {
		dev := &hw.PCIeDevices[idx]

		slotKey := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
		if slotKey == "" {
			continue
		}
		slotKey = strings.ToUpper(slotKey)

		isNVSwitch := strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") ||
			strings.HasPrefix(slotKey, "NVSWITCH")
		if !isNVSwitch {
			continue
		}

		checkedAt, ok := times.NVSwitchBySlot[slotKey]
		if !ok || checkedAt.IsZero() {
			checkedAt = times.NVSwitchDefault
		}
		if checkedAt.IsZero() {
			continue
		}

		dev.StatusCheckedAt = checkedAt
		dev.StatusAtCollect = &models.StatusAtCollection{
			Status: statusOrUnknown(dev.Status),
			At:     checkedAt,
		}
	}
}