275 lines
7.6 KiB
Go
275 lines
7.6 KiB
Go
package nvidia
|
|
|
|
import (
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"git.mchus.pro/mchus/logpile/internal/models"
|
|
"git.mchus.pro/mchus/logpile/internal/parser"
|
|
)
|
|
|
|
var verboseRunTestingLineRegex = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+\s+-\s+Testing\s+([a-zA-Z0-9_]+)\s*$`)
|
|
var runLogStartTimeRegex = regexp.MustCompile(`^Start time\s+([A-Za-z]{3}, \d{2} [A-Za-z]{3} \d{4} \d{2}:\d{2}:\d{2})\s*$`)
|
|
var runLogTestDurationRegex = regexp.MustCompile(`^Testing\s+([a-zA-Z0-9_]+)\s+\S+\s+\[\s*([0-9]+):([0-9]{2})s\s*\]\s*$`)
|
|
var modsStartLineRegex = regexp.MustCompile(`(?m)^MODS start:\s+([A-Za-z]{3}\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4})\s*$`)
|
|
var gpuFieldiagOutputPathRegex = regexp.MustCompile(`(?i)gpu_fieldiag[\\/]+sxm(\d+)_sn_([^\\/]+)[\\/]+output\.log$`)
|
|
var nvswitchDevnameRegex = regexp.MustCompile(`devname=[^,\s]+,(NVSWITCH\d+)`)
|
|
|
|
// componentCheckTimes carries the check timestamps resolved for GPUs and
// NVSwitches: per-component lookups plus one default per component class.
type componentCheckTimes struct {
	// GPUDefault is the timestamp applied to a GPU that has no per-serial or
	// per-slot entry.
	GPUDefault time.Time
	// NVSwitchDefault is the timestamp applied to an NVSwitch that has no
	// per-slot entry.
	NVSwitchDefault time.Time

	// GPUBySerial is keyed by GPU serial.
	GPUBySerial map[string]time.Time
	// GPUBySlot is keyed by slot name in the form GPUSXM<idx>.
	GPUBySlot map[string]time.Time
	// NVSwitchBySlot is keyed by slot name in the form NVSWITCH<idx>.
	NVSwitchBySlot map[string]time.Time
}
|
|
|
|
// CollectGPUAndNVSwitchCheckTimes extracts GPU/NVSwitch check timestamps from NVIDIA logs.
|
|
// Priority:
|
|
// 1) verbose_run.log "Testing <test>" timestamps
|
|
// 2) run.log start time + cumulative durations
|
|
func CollectGPUAndNVSwitchCheckTimes(files []parser.ExtractedFile) componentCheckTimes {
|
|
gpuBySerial := make(map[string]time.Time)
|
|
gpuBySlot := make(map[string]time.Time)
|
|
nvsBySlot := make(map[string]time.Time)
|
|
|
|
for _, f := range files {
|
|
path := strings.TrimSpace(f.Path)
|
|
pathLower := strings.ToLower(path)
|
|
|
|
// Per-GPU timestamp from gpu_fieldiag/<SXMx_SN_serial>/output.log
|
|
if strings.HasSuffix(pathLower, "output.log") && strings.Contains(pathLower, "gpu_fieldiag/") {
|
|
ts := parseModsStartTime(f.Content)
|
|
if ts.IsZero() {
|
|
continue
|
|
}
|
|
matches := gpuFieldiagOutputPathRegex.FindStringSubmatch(path)
|
|
if len(matches) == 3 {
|
|
slot := "GPUSXM" + strings.TrimSpace(matches[1])
|
|
serial := strings.TrimSpace(matches[2])
|
|
if slot != "" {
|
|
gpuBySlot[slot] = ts
|
|
}
|
|
if serial != "" {
|
|
gpuBySerial[serial] = ts
|
|
}
|
|
}
|
|
}
|
|
|
|
// Per-NVSwitch timestamp and slot list from nvswitch/output.log
|
|
if strings.HasSuffix(pathLower, "nvswitch/output.log") || strings.HasSuffix(pathLower, "nvswitch\\output.log") {
|
|
ts := parseModsStartTime(f.Content)
|
|
if ts.IsZero() {
|
|
continue
|
|
}
|
|
for _, slot := range parseNVSwitchSlotsFromOutput(f.Content) {
|
|
nvsBySlot[slot] = ts
|
|
}
|
|
}
|
|
}
|
|
|
|
testStarts := make(map[string]time.Time)
|
|
|
|
if f := parser.FindFileByName(files, "verbose_run.log"); f != nil {
|
|
for testName, ts := range parseVerboseRunTestStartTimes(f.Content) {
|
|
testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
|
|
}
|
|
}
|
|
|
|
if len(testStarts) == 0 {
|
|
if f := parser.FindFileByName(files, "run.log"); f != nil {
|
|
for testName, ts := range parseRunLogTestStartTimes(f.Content) {
|
|
testStarts[strings.ToLower(strings.TrimSpace(testName))] = ts
|
|
}
|
|
}
|
|
}
|
|
|
|
return componentCheckTimes{
|
|
GPUDefault: pickFirstTestTime(testStarts, "gpu_fieldiag", "gpumem", "gpustress", "pcie", "inventory"),
|
|
NVSwitchDefault: pickFirstTestTime(testStarts, "nvswitch", "inventory"),
|
|
GPUBySerial: gpuBySerial,
|
|
GPUBySlot: gpuBySlot,
|
|
NVSwitchBySlot: nvsBySlot,
|
|
}
|
|
}
|
|
|
|
// pickFirstTestTime walks names in order and returns the first non-zero start
// time recorded in testStarts. Each name is trimmed and lower-cased before the
// lookup. The zero time is returned when no name has a non-zero entry.
func pickFirstTestTime(testStarts map[string]time.Time, names ...string) time.Time {
	for idx := range names {
		key := strings.ToLower(strings.TrimSpace(names[idx]))
		started, known := testStarts[key]
		if !known || started.IsZero() {
			continue
		}
		return started
	}

	var none time.Time
	return none
}
|
|
|
|
func parseVerboseRunTestStartTimes(content []byte) map[string]time.Time {
|
|
result := make(map[string]time.Time)
|
|
lines := strings.Split(string(content), "\n")
|
|
for _, line := range lines {
|
|
matches := verboseRunTestingLineRegex.FindStringSubmatch(strings.TrimSpace(line))
|
|
if len(matches) != 3 {
|
|
continue
|
|
}
|
|
|
|
ts, err := time.ParseInLocation("2006-01-02 15:04:05", strings.TrimSpace(matches[1]), time.UTC)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
testName := strings.ToLower(strings.TrimSpace(matches[2]))
|
|
if testName == "" {
|
|
continue
|
|
}
|
|
if _, exists := result[testName]; !exists {
|
|
result[testName] = ts
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
func parseRunLogTestStartTimes(content []byte) map[string]time.Time {
|
|
lines := strings.Split(string(content), "\n")
|
|
start := time.Time{}
|
|
for _, line := range lines {
|
|
matches := runLogStartTimeRegex.FindStringSubmatch(strings.TrimSpace(line))
|
|
if len(matches) != 2 {
|
|
continue
|
|
}
|
|
parsed, err := time.ParseInLocation("Mon, 02 Jan 2006 15:04:05", strings.TrimSpace(matches[1]), time.UTC)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
start = parsed
|
|
break
|
|
}
|
|
if start.IsZero() {
|
|
return nil
|
|
}
|
|
|
|
result := make(map[string]time.Time)
|
|
cursor := start
|
|
for _, line := range lines {
|
|
matches := runLogTestDurationRegex.FindStringSubmatch(strings.TrimSpace(line))
|
|
if len(matches) != 4 {
|
|
continue
|
|
}
|
|
|
|
testName := strings.ToLower(strings.TrimSpace(matches[1]))
|
|
minutes, errMin := strconv.Atoi(strings.TrimSpace(matches[2]))
|
|
seconds, errSec := strconv.Atoi(strings.TrimSpace(matches[3]))
|
|
if errMin != nil || errSec != nil {
|
|
continue
|
|
}
|
|
if _, exists := result[testName]; !exists {
|
|
result[testName] = cursor
|
|
}
|
|
cursor = cursor.Add(time.Duration(minutes)*time.Minute + time.Duration(seconds)*time.Second)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
func parseModsStartTime(content []byte) time.Time {
|
|
matches := modsStartLineRegex.FindSubmatch(content)
|
|
if len(matches) != 2 {
|
|
return time.Time{}
|
|
}
|
|
tsRaw := strings.TrimSpace(string(matches[1]))
|
|
if tsRaw == "" {
|
|
return time.Time{}
|
|
}
|
|
ts, err := time.ParseInLocation("Mon Jan 2 15:04:05 2006", tsRaw, time.UTC)
|
|
if err != nil {
|
|
return time.Time{}
|
|
}
|
|
return ts
|
|
}
|
|
|
|
func parseNVSwitchSlotsFromOutput(content []byte) []string {
|
|
matches := nvswitchDevnameRegex.FindAllSubmatch(content, -1)
|
|
if len(matches) == 0 {
|
|
return nil
|
|
}
|
|
seen := make(map[string]struct{})
|
|
out := make([]string, 0, len(matches))
|
|
for _, m := range matches {
|
|
if len(m) != 2 {
|
|
continue
|
|
}
|
|
slot := strings.ToUpper(strings.TrimSpace(string(m[1])))
|
|
if slot == "" {
|
|
continue
|
|
}
|
|
if _, exists := seen[slot]; exists {
|
|
continue
|
|
}
|
|
seen[slot] = struct{}{}
|
|
out = append(out, slot)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ApplyGPUAndNVSwitchCheckTimes writes parsed check timestamps to component status metadata.
|
|
func ApplyGPUAndNVSwitchCheckTimes(result *models.AnalysisResult, times componentCheckTimes) {
|
|
if result == nil || result.Hardware == nil {
|
|
return
|
|
}
|
|
|
|
for i := range result.Hardware.GPUs {
|
|
gpu := &result.Hardware.GPUs[i]
|
|
ts := time.Time{}
|
|
if serial := strings.TrimSpace(gpu.SerialNumber); serial != "" {
|
|
ts = times.GPUBySerial[serial]
|
|
}
|
|
if ts.IsZero() {
|
|
ts = times.GPUBySlot[strings.ToUpper(strings.TrimSpace(gpu.Slot))]
|
|
}
|
|
if ts.IsZero() {
|
|
ts = times.GPUDefault
|
|
}
|
|
if ts.IsZero() {
|
|
continue
|
|
}
|
|
gpu.StatusCheckedAt = ts
|
|
status := strings.TrimSpace(gpu.Status)
|
|
if status == "" {
|
|
status = "Unknown"
|
|
}
|
|
gpu.StatusAtCollect = &models.StatusAtCollection{
|
|
Status: status,
|
|
At: ts,
|
|
}
|
|
}
|
|
|
|
for i := range result.Hardware.PCIeDevices {
|
|
dev := &result.Hardware.PCIeDevices[i]
|
|
slot := normalizeNVSwitchSlot(strings.TrimSpace(dev.Slot))
|
|
if slot == "" {
|
|
continue
|
|
}
|
|
slot = strings.ToUpper(slot)
|
|
if !strings.EqualFold(strings.TrimSpace(dev.DeviceClass), "NVSwitch") &&
|
|
!strings.HasPrefix(slot, "NVSWITCH") {
|
|
continue
|
|
}
|
|
|
|
ts := times.NVSwitchBySlot[slot]
|
|
if ts.IsZero() {
|
|
ts = times.NVSwitchDefault
|
|
}
|
|
if ts.IsZero() {
|
|
continue
|
|
}
|
|
|
|
dev.StatusCheckedAt = ts
|
|
status := strings.TrimSpace(dev.Status)
|
|
if status == "" {
|
|
status = "Unknown"
|
|
}
|
|
dev.StatusAtCollect = &models.StatusAtCollection{
|
|
Status: status,
|
|
At: ts,
|
|
}
|
|
}
|
|
}
|