Unify NVIDIA GPU recovery paths
This commit is contained in:
@@ -105,6 +105,7 @@ var (
|
||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||
benchmarkGeteuid = os.Geteuid
|
||||
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||
benchmarkSleep = time.Sleep
|
||||
)
|
||||
|
||||
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||
}
|
||||
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||
)
|
||||
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
"",
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
||||
}
|
||||
var failed []int
|
||||
for _, idx := range gpuIndices {
|
||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||
failed = append(failed, idx)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
|
||||
Reference in New Issue
Block a user