Unify NVIDIA GPU recovery paths

This commit is contained in:
2026-04-23 20:31:41 +03:00
parent 6112094d45
commit 749fc8a94d
6 changed files with 278 additions and 82 deletions

View File

@@ -105,6 +105,7 @@ var (
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkResetNvidiaGPU = resetNvidiaGPU
benchmarkSleep = time.Sleep
)
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
}
out, err := benchmarkResetNvidiaGPU(gpuIndex)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
)
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
for _, line := range strings.Split(trimmed, "\n") {
line = strings.TrimSpace(line)
if line != "" {
logFunc(line)
}
}
}
rc := 0
if err != nil {
rc = 1
}
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
fmt.Sprintf("rc: %d", rc),
"",
)
return err
}
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
if len(gpuIndices) == 0 {
return nil
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
}
var failed []int
for _, idx := range gpuIndices {
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
failed = append(failed, idx)
if logFunc != nil {
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
}
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})