Update power benchmark GPU reset flow

Fix NVIDIA self-heal recovery flow
Reset GPUs before power benchmark
2026-04-20 09:46:00 +03:00 · 2026-04-20 09:43:22 +03:00 · 2026-04-20 09:42:19 +03:00 · 2026-04-20 09:28:58 +03:00 · 2026-04-20 09:26:56 +03:00 · 2026-04-20 09:26:29 +03:00
7 changed files with 527 additions and 50 deletions
--- a/audit/bee
+++ b/audit/bee
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
 	VBIOS                string
 	PowerLimitW          float64
 	DefaultPowerLimitW   float64
 	MinPowerLimitW       float64
 	MaxPowerLimitW       float64
 	MaxGraphicsClockMHz  float64
 	MaxMemoryClockMHz    float64
 	BaseGraphicsClockMHz float64
@@ -95,6 +97,8 @@ var (
 	benchmarkReadyPattern      = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
 	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
 	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
 	benchmarkGeteuid           = os.Geteuid
 	benchmarkSleep             = time.Sleep
 )
 // benchmarkPrecisionPhases lists the precision categories run as individual
@@ -220,8 +224,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
 		return "hw_thermal"
 	case diff.SWThermalSlowdownUS > 0:
 		return "sw_thermal"
 	case diff.HWPowerBrakeSlowdownUS > 0:
 		return "hw_power_brake"
 	default:
 		return ""
 	}
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
 	return nil
 }
 // resetBenchmarkGPUs runs `nvidia-smi -i <index> -r` for every index in
 // gpuIndices before a power benchmark and returns the indices whose reset did
 // not complete.
 //
 // An empty gpuIndices yields nil. When the effective UID is not 0 nothing is
 // executed and a copy of gpuIndices is returned, i.e. every GPU counts as
 // "not reset". Otherwise KillTestWorkers is called first, then each GPU is
 // reset in order; a successful reset is followed by a one second pause via
 // benchmarkSleep. logFunc may be nil, in which case progress is not reported.
 func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
 	// emit forwards msg to logFunc when the caller supplied one.
 	emit := func(msg string) {
 		if logFunc != nil {
 			logFunc(msg)
 		}
 	}

 	if len(gpuIndices) == 0 {
 		return nil
 	}
 	if benchmarkGeteuid() != 0 {
 		emit("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
 		// Hand back a private copy so the caller's slice is never aliased.
 		skipped := make([]int, len(gpuIndices))
 		copy(skipped, gpuIndices)
 		return skipped
 	}

 	for _, worker := range KillTestWorkers() {
 		emit(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", worker.PID, worker.Name))
 	}

 	var failed []int
 	for _, gpu := range gpuIndices {
 		logName := fmt.Sprintf("power-preflight-gpu-%d-reset.log", gpu)
 		resetCmd := []string{"nvidia-smi", "-i", strconv.Itoa(gpu), "-r"}
 		_, err := runSATCommandCtx(ctx, verboseLog, logName, resetCmd, nil, logFunc)
 		if err == nil {
 			emit(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", gpu))
 			// Pause after a successful reset before touching the next GPU.
 			benchmarkSleep(time.Second)
 			continue
 		}
 		failed = append(failed, gpu)
 		emit(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", gpu, err))
 	}
 	return failed
 }
 func benchmarkPowerEngine() string {
 	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
 	case BenchmarkPowerEngineTargetedPower:
@@ -351,9 +386,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-	// Enrich with max clocks from verbose output — covers GPUs where
+	// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
-	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
+	// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -737,8 +772,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
 // The minimal fallback omits clock fields entirely — clocks.max.* returns
-// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
+// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
-// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
+// then recovered from nvidia-smi -q.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
@@ -758,12 +793,9 @@ var benchmarkGPUInfoQueries = []struct {
 	},
 }
-// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
+// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
-// any GPU in infoByIndex where those values are still zero.  It parses the
+// for fields that may be missing from --query-gpu on some driver versions.
-// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
+func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 // This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
 // return exit status 2 but the verbose query works fine.
 func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
 		return
 	}
@@ -784,6 +816,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
 	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
 	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
 	minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
 	maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
 	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
 	shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
 	slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -843,6 +877,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 				}
 			}
 		}
 		if info.MinPowerLimitW == 0 {
 			if m := minPwrRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
 					info.MinPowerLimitW = v
 				}
 			}
 		}
 		if info.MaxPowerLimitW == 0 {
 			if m := maxPwrRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
 					info.MaxPowerLimitW = v
 				}
 			}
 		}
 		if info.MultiprocessorCount == 0 {
 			if m := smCountRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3043,7 +3091,6 @@ func runBenchmarkPowerCalibration(
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
 	const calibSearchTolerance = 10
@@ -3090,8 +3137,9 @@ func runBenchmarkPowerCalibration(
 		originalLimitW int
 		appliedLimitW  int
 		minLimitW      int
-		lo             int // highest verified-stable limit (assumed: minLimitW)
+		lo             int // highest verified-stable limit
 		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
 		loVerified     bool
 		calib          benchmarkPowerCalibrationResult
 		converged      bool
 	}
@@ -3113,23 +3161,17 @@ func runBenchmarkPowerCalibration(
 		if defaultLimitW <= 0 {
 			defaultLimitW = originalLimitW
 		}
-		appliedLimitW := originalLimitW
+		appliedLimitW := initialBenchmarkCalibrationLimitW(info)
 		if appliedLimitW <= 0 {
 			appliedLimitW = defaultLimitW
 		}
-		minLimitW := appliedLimitW
+		minLimitW := int(math.Round(info.MinPowerLimitW))
-		switch {
+		if minLimitW <= 0 {
-		case defaultLimitW > 0:
+			minLimitW = appliedLimitW
 			minLimitW = defaultLimitW - maxDerateW
 			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
 			if minLimitW < floorByRatio {
 				minLimitW = floorByRatio
 			}
 		case appliedLimitW > 0:
 			minLimitW = appliedLimitW - maxDerateW
 		}
-		if minLimitW < calibSearchTolerance {
+		maxLimitW := int(math.Round(info.MaxPowerLimitW))
-			minLimitW = calibSearchTolerance
+		if maxLimitW > 0 && appliedLimitW > maxLimitW {
 			appliedLimitW = maxLimitW
 		}
 		s := &gpuCalibState{
 			idx:            idx,
@@ -3141,11 +3183,24 @@ func runBenchmarkPowerCalibration(
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
 		if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
 			s.appliedLimitW = minLimitW
 			s.hi = minLimitW + 1
 		}
 		if info.MinPowerLimitW <= 0 {
 			s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
 		}
 		if seedLimits != nil {
 			if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
 				// A previously validated limit is only a starting point. Re-run
 				// targeted_power under the current multi-GPU thermal load and derate
 				// again if this step shows new throttling.
 				if seedW < s.minLimitW {
 					seedW = s.minLimitW
 				}
 				if maxLimitW > 0 && seedW > maxLimitW {
 					seedW = maxLimitW
 				}
 				if canDerate {
 					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
 				}
@@ -3333,6 +3388,7 @@ calibDone:
 				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				s.lo = s.appliedLimitW
 				s.loVerified = true
 				if canDerate && s.hi-s.lo > calibSearchTolerance {
 					next := roundTo5W((s.lo + s.hi) / 2)
 					if next > s.lo && next < s.hi {
@@ -3371,7 +3427,23 @@ calibDone:
 			s.hi = s.appliedLimitW
 			if s.hi-s.lo <= calibSearchTolerance {
-				if s.lo > s.minLimitW {
+				if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
 						s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
 						logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
 						s.converged = true
 						continue
 					}
 					s.appliedLimitW = s.minLimitW
 					s.calib.AppliedPowerLimitW = float64(s.minLimitW)
 					s.calib.Derated = s.minLimitW < s.originalLimitW
 					s.info.PowerLimitW = float64(s.minLimitW)
 					infoByIndex[s.idx] = s.info
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
 					logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
 					continue
 				}
 				if s.loVerified {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
 						s.appliedLimitW = s.lo
@@ -3383,7 +3455,8 @@ calibDone:
 						s.calib.Completed = true
 					}
 				} else {
-					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
 					logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
@@ -3398,9 +3471,7 @@ calibDone:
 				next = (s.lo + s.hi) / 2
 			}
 			if next < s.minLimitW {
-				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+				next = s.minLimitW
 				s.converged = true
 				continue
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
 				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
@@ -3439,6 +3510,24 @@ func roundTo5W(w int) int {
 	return ((w + 2) / 5) * 5
 }
 // initialBenchmarkCalibrationLimitW returns the power limit, rounded to whole
 // watts, at which power calibration should start for the given GPU.
 //
 // The default power limit is preferred; when it is not positive the current
 // power limit is used, and when that is not positive either the maximum power
 // limit is used. A positive maximum power limit also acts as an upper bound on
 // the result. The result is not positive when none of the three is positive.
 func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
 	watts := func(v float64) int { return int(math.Round(v)) }

 	ceilingW := watts(info.MaxPowerLimitW)
 	startW := ceilingW
 	switch def, cur := watts(info.DefaultPowerLimitW), watts(info.PowerLimitW); {
 	case def > 0:
 		startW = def
 	case cur > 0:
 		startW = cur
 	}
 	if ceilingW > 0 && startW > ceilingW {
 		return ceilingW
 	}
 	return startW
 }
 // meanFanRPM returns the average RPM across a set of fan readings.
 func meanFanRPM(fans []FanReading) float64 {
 	if len(fans) == 0 {
@@ -4096,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
 	}
 	verboseLog := filepath.Join(runDir, "verbose.log")
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		return "", infoErr
 	}
 	// Capture full nvidia-smi -q snapshot at the start of the run.
 	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
 		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
 	}
 	hostname, _ := os.Hostname()
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
@@ -4114,6 +4195,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		SelectedGPUIndices: append([]int(nil), selected...),
 		OverallStatus:      "OK",
 	}
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		return "", infoErr
 	}
 	// Capture full nvidia-smi -q snapshot at the start of the run.
 	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
 		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
 	// Sample server idle power before any GPU load.
@@ -4139,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
 		_ = os.MkdirAll(singleDir, 0755)
 		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
 			result.Findings = append(result.Findings,
 				fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
 		}
 		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
 		singlePowerStopCh := make(chan struct{})
 		singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -1,8 +1,13 @@
 package platform
 import (
 	"context"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 )
 func TestResolveBenchmarkProfile(t *testing.T) {
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
 	}
 }
 // TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons checks that
 // benchmarkCalibrationThrottleReason reports no reason when only a power
 // counter (SW power cap, HW power brake) advanced, and still reports
 // "hw_thermal" / "sw_thermal" when a thermal counter advanced.
 func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
 	t.Parallel()

 	// Every probe compares against an all-zero baseline.
 	reasonFor := func(after BenchmarkThrottleCounters) string {
 		return benchmarkCalibrationThrottleReason(BenchmarkThrottleCounters{}, after)
 	}
 	const oneSecondUS = 1_000_000

 	if got := reasonFor(BenchmarkThrottleCounters{SWPowerCapUS: oneSecondUS}); got != "" {
 		t.Fatalf("sw_power_cap should be ignored, got %q", got)
 	}
 	if got := reasonFor(BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: oneSecondUS}); got != "" {
 		t.Fatalf("hw_power_brake should be ignored, got %q", got)
 	}
 	if got := reasonFor(BenchmarkThrottleCounters{HWThermalSlowdownUS: oneSecondUS}); got != "hw_thermal" {
 		t.Fatalf("hw_thermal mismatch: got %q", got)
 	}
 	if got := reasonFor(BenchmarkThrottleCounters{SWThermalSlowdownUS: oneSecondUS}); got != "sw_thermal" {
 		t.Fatalf("sw_thermal mismatch: got %q", got)
 	}
 }
 // TestResetBenchmarkGPUsSkipsWithoutRoot checks that resetBenchmarkGPUs runs
 // no command when the effective UID is not 0, logs that the reset was skipped,
 // and reports every requested GPU index as not reset.
 //
 // The test replaces the package-level benchmarkGeteuid and satExecCommand
 // hooks. It therefore must not call t.Parallel: another parallel test swapping
 // the same hooks would race with it and could observe the wrong UID.
 func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
 	oldGeteuid := benchmarkGeteuid
 	oldExec := satExecCommand
 	benchmarkGeteuid = func() int { return 1000 }
 	// Any attempt to spawn a process is a failure: the non-root path must not
 	// execute anything.
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		t.Fatalf("unexpected command: %s %v", name, args)
 		return nil
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
 		satExecCommand = oldExec
 	})
 	var logs []string
 	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
 		logs = append(logs, line)
 	})
 	if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
 		t.Fatalf("logs=%q want substring %q", got, want)
 	}
 	if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
 		t.Fatalf("failed=%v want [0 2]", failed)
 	}
 }
 // TestResetBenchmarkGPUsResetsEachGPU checks that, with an effective UID of 0,
 // resetBenchmarkGPUs invokes `nvidia-smi -i <index> -r` once per requested GPU,
 // in order, and reports no failures when every invocation succeeds.
 //
 // nvidia-smi is replaced by a shell script that appends its arguments to a log
 // file. The test swaps the package-level benchmarkGeteuid, benchmarkSleep and
 // satLookPath hooks, so it must not call t.Parallel: a parallel test replacing
 // the same hooks would race with it.
 //
 // NOTE(review): resetBenchmarkGPUs also calls KillTestWorkers, which is not
 // stubbed here — confirm that it is harmless on the machine running the tests.
 func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
 	dir := t.TempDir()
 	script := filepath.Join(dir, "nvidia-smi")
 	argsLog := filepath.Join(dir, "args.log")
 	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
 		t.Fatalf("write script: %v", err)
 	}
 	oldGeteuid := benchmarkGeteuid
 	oldSleep := benchmarkSleep
 	oldLookPath := satLookPath
 	benchmarkGeteuid = func() int { return 0 }
 	// Skip the post-reset pause so the test stays fast.
 	benchmarkSleep = func(time.Duration) {}
 	satLookPath = func(file string) (string, error) {
 		if file == "nvidia-smi" {
 			return script, nil
 		}
 		return exec.LookPath(file)
 	}
 	t.Cleanup(func() {
 		benchmarkGeteuid = oldGeteuid
 		benchmarkSleep = oldSleep
 		satLookPath = oldLookPath
 	})
 	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
 	if len(failed) != 0 {
 		t.Fatalf("failed=%v want no failures", failed)
 	}
 	raw, err := os.ReadFile(argsLog)
 	if err != nil {
 		t.Fatalf("read args log: %v", err)
 	}
 	got := strings.Fields(string(raw))
 	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
 	if strings.Join(got, " ") != strings.Join(want, " ") {
 		t.Fatalf("args=%v want %v", got, want)
 	}
 }
 func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	t.Parallel()
@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
 	}
 }
 // TestInitialBenchmarkCalibrationLimitW exercises the starting-limit selection
 // of initialBenchmarkCalibrationLimitW: default limit first, then current
 // limit, then maximum limit, with the maximum limit acting as a cap.
 func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
 	t.Parallel()

 	type scenario struct {
 		name string
 		info benchmarkGPUInfo
 		want int
 	}
 	scenarios := []scenario{
 		{
 			name: "prefers default tdp over current derated limit",
 			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 600, MaxPowerLimitW: 600},
 			want: 600,
 		},
 		{
 			name: "caps default tdp to reported max limit",
 			info: benchmarkGPUInfo{PowerLimitW: 500, DefaultPowerLimitW: 700, MaxPowerLimitW: 650},
 			want: 650,
 		},
 		{
 			name: "falls back to current limit when default missing",
 			info: benchmarkGPUInfo{PowerLimitW: 525, MaxPowerLimitW: 600},
 			want: 525,
 		},
 		{
 			name: "falls back to max limit when only that is known",
 			info: benchmarkGPUInfo{MaxPowerLimitW: 575},
 			want: 575,
 		},
 	}

 	for _, sc := range scenarios {
 		sc := sc
 		t.Run(sc.name, func(t *testing.T) {
 			got := initialBenchmarkCalibrationLimitW(sc.info)
 			if got != sc.want {
 				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", sc.info, got, sc.want)
 			}
 		})
 	}
 }
 func TestParseBenchmarkBurnLog(t *testing.T) {
 	t.Parallel()
@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	}
 }
-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()
 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
    Min Power Limit                       : 200.00 W
    Max Power Limit                       : 600.00 W
    Default Power Limit                   : 575.00 W
    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
 	if infoByIndex[0].MinPowerLimitW != 200 {
 		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
 	}
 	if infoByIndex[0].MaxPowerLimitW != 600 {
 		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
 	}
 	if infoByIndex[0].DefaultPowerLimitW != 575 {
 		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
 	}
 	if infoByIndex[0].PowerLimitW != 560 {
 		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
 	}
 }
-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()
 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Min Power Limit                       : 100.00 W
    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
 			Index:               0,
 			BusID:               "00000000:4E:00.0",
 			MaxGraphicsClockMHz: 2430,
 			MaxMemoryClockMHz:   12481,
 			MinPowerLimitW:      200,
 			MaxPowerLimitW:      600,
 		},
 	}
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
 	if infoByIndex[0].MinPowerLimitW != 200 {
 		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
 	}
 }
--- a/audit/internal/platform/nvidia_recover.go
+++ b/audit/internal/platform/nvidia_recover.go
@@ -0,0 +1,30 @@
 package platform
 import (
 	"fmt"
 	"os/exec"
 	"time"
 )
 // nvidiaRecoverHelper is the path of the recovery helper script that
 // runNvidiaRecover executes through sudo.
 const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"

 // runNvidiaRecover runs the recovery helper with args under sudo and returns
 // its combined stdout/stderr together with the command error.
 //
 // When systemd-run is found on PATH the helper is launched through it as a
 // uniquely named transient oneshot unit (--pipe --wait --collect); otherwise
 // the helper is executed directly.
 func runNvidiaRecover(args ...string) (string, error) {
 	var argv []string
 	if _, err := exec.LookPath("systemd-run"); err == nil {
 		argv = append(argv,
 			"systemd-run",
 			"--quiet",
 			"--pipe",
 			"--wait",
 			"--collect",
 			"--service-type=oneshot",
 			// The nanosecond timestamp keeps unit names distinct between calls.
 			"--unit", fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano()),
 		)
 	}
 	argv = append(argv, nvidiaRecoverHelper)
 	argv = append(argv, args...)

 	out, err := exec.Command("sudo", argv...).CombinedOutput()
 	return string(out), err
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
 	if index < 0 {
 		return "", fmt.Errorf("gpu index must be >= 0")
 	}
-	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
+	out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
-	if len(raw) == 0 && err == nil {
+	if strings.TrimSpace(out) == "" && err == nil {
-		raw = []byte("GPU reset completed.\n")
+		out = "GPU reset completed.\n"
 	}
-	return string(raw), err
+	return out, err
 }
 // RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
--- a/audit/internal/platform/services.go
+++ b/audit/internal/platform/services.go
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
 }
 func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
 	if name == "bee-nvidia" && action == ServiceRestart {
 		return runNvidiaRecover("restart-drivers")
 	}
 	// bee-web runs as the bee user; sudo is required to control system services.
 	// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
 	raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
--- a/iso/overlay/usr/local/bin/bee-nvidia-recover
+++ b/iso/overlay/usr/local/bin/bee-nvidia-recover
@@ -0,0 +1,178 @@
 #!/bin/sh
 # bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
 set -u
 # log prints one status line, tagged with the helper's name, to stdout.
 # printf is used instead of echo because echo's treatment of backslashes and
 # of option-like text differs between shells; printf '%s\n' prints the message
 # verbatim.
 log() {
    printf '%s\n' "[bee-nvidia-recover] $*"
 }
 # log_blocker prints one line naming a process or service that was holding the
 # GPU. The text is often a raw process command line, so it is emitted with
 # printf '%s\n' to keep backslashes and option-like text verbatim (echo would
 # interpret them in some shells).
 log_blocker() {
    printf '%s\n' "[bee-nvidia-recover] blocker: $*"
 }
 # usage prints the supported sub-commands to stdout; callers redirect it to
 # stderr when reporting a bad invocation.
 usage() {
    cat <<'EOF'
 usage:
  bee-nvidia-recover restart-drivers
  bee-nvidia-recover reset-gpu <index>
 EOF
 }
 # unit_exists succeeds when `systemctl cat` can show the unit named by $1.
 # All output is discarded; only the exit status is used.
 unit_exists() {
    systemctl cat "$1" >/dev/null 2>&1
 }
 # unit_is_active succeeds when `systemctl is-active` reports the unit named by
 # $1 as active. Runs quietly; only the exit status is used.
 unit_is_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
 }
 # stop_unit_if_active stops the unit named by $1 when it is currently active.
 # Returns 0 when a stop was requested (regardless of systemctl's own result)
 # and 1 when the unit was not active, so callers can remember what to restart.
 stop_unit_if_active() {
    unit="$1"
    unit_is_active "$unit" || return 1
    log "stopping $unit"
    systemctl stop "$unit"
    return 0
 }
 # start_unit_if_marked starts the unit named by $1 when the marker in $2 is
 # "1" and the unit exists. Returns 0 when nothing had to be started, otherwise
 # the exit status of `systemctl start`.
 start_unit_if_marked() {
    unit="$1"
    marker="$2"
    [ "$marker" = "1" ] || return 0
    unit_exists "$unit" || return 0
    log "starting $unit"
    systemctl start "$unit"
 }
 # wait_for_process_exit polls once per second until no process with the exact
 # name $1 is running. Returns 0 as soon as the process is gone, or logs a
 # warning and returns 1 when it is still present at the 15th check.
 wait_for_process_exit() {
    name="$1"
    tries=1
    while pgrep -x "$name" >/dev/null 2>&1; do
        if [ "$tries" -ge 15 ]; then
            log "WARN: $name is still running after stop request"
            return 1
        fi
        tries=$((tries + 1))
        sleep 1
    done
    return 0
 }
 # kill_pattern terminates every process whose full command line matches the
 # pattern in $1. Matching processes are first reported as blockers, then sent
 # SIGTERM and, one second later, SIGKILL. Does nothing when no process
 # matches; always returns 0.
 kill_pattern() {
    pattern="$1"
    pgrep -f "$pattern" >/dev/null 2>&1 || return 0
    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
        if [ -n "$line" ]; then
            log_blocker "$line"
        fi
    done
    log "killing processes matching: $pattern"
    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
    # Give the processes a moment to exit before forcing them.
    sleep 1
    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
 }
 # drain_gpu_clients stops or kills everything in its list that may be holding
 # the NVIDIA devices: display managers, nvidia-fabricmanager, nv-hostengine and
 # a fixed set of GPU tools and stress workers.
 #
 # Sets the globals display_was_active and fabric_was_active to 1 when the
 # corresponding service was stopped here, so restore_gpu_clients can start it
 # again afterwards.
 drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0
    for unit in display-manager.service lightdm.service; do
        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
            log_blocker "service $unit"
            display_was_active=1
        fi
    done
    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
        log_blocker "service nvidia-fabricmanager.service"
        fabric_was_active=1
    fi
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        # List by exact process name (-x), like the detection above. Matching
        # the anchored pattern against the full command line (-f) would miss a
        # daemon started with a path or with arguments.
        pgrep -ax nv-hostengine 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        # Escalate to SIGKILL only when the daemon ignores SIGTERM.
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    fi
    for pattern in \
        "nvidia-smi" \
        "dcgmi" \
        "nvvs" \
        "dcgmproftester" \
        "all_reduce_perf" \
        "nvtop" \
        "bee-gpu-burn" \
        "bee-john-gpu-stress" \
        "bee-nccl-gpu-stress" \
        "Xorg" \
        "Xwayland"; do
        kill_pattern "$pattern"
    done
 }
 # restore_gpu_clients brings the GPU software stack back after a reset or a
 # driver reload: it enables persistence mode, starts nv-hostengine when it is
 # installed and not running, and restarts the services that
 # drain_gpu_clients marked as previously active.
 restore_gpu_clients() {
    if ! command -v nvidia-smi >/dev/null 2>&1; then
        :
    elif nvidia-smi -pm 1 >/dev/null 2>&1; then
        log "enabled NVIDIA persistence mode"
    else
        log "WARN: failed to enable NVIDIA persistence mode"
    fi
    if command -v nv-hostengine >/dev/null 2>&1; then
        if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
            log "starting nv-hostengine"
            nv-hostengine
        fi
    fi
    # The markers default to 0 so this also works under `set -u` when
    # drain_gpu_clients has not run.
    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
    start_unit_if_marked display-manager.service "${display_was_active:-0}"
    [ "${display_was_active:-0}" = "1" ] || return 0
    unit_exists lightdm.service || return 0
    # Start lightdm explicitly when starting display-manager did not bring it up.
    if ! unit_is_active lightdm.service; then
        start_unit_if_marked lightdm.service "1"
    fi
 }
 # restart_drivers drains GPU clients, unloads the NVIDIA kernel modules,
 # removes the stale device nodes, reloads the driver stack through
 # bee-nvidia-load and finally restores the clients.
 #
 # Returns non-zero when a module could not be unloaded, when the driver reload
 # failed, or when restoring the clients failed. The restore step always runs so
 # that stopped services are brought back even after a failure.
 restart_drivers() {
    restart_rc=0
    drain_gpu_clients
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            if ! rmmod "$mod"; then
                log "WARN: failed to unload module $mod"
                restart_rc=1
            fi
        fi
    done
    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
    log "reloading NVIDIA driver stack"
    if ! /usr/local/bin/bee-nvidia-load; then
        log "WARN: NVIDIA driver reload failed"
        restart_rc=1
    fi
    restore_gpu_clients || restart_rc=1
    return "$restart_rc"
 }
 # reset_gpu drains GPU clients, resets the GPU with index $1 through
 # `nvidia-smi -r`, and restores the clients.
 #
 # Returns 2 without touching anything when $1 is not a non-negative decimal
 # number. Otherwise returns the exit status of nvidia-smi when the reset
 # failed, or the status of the restore step when the reset succeeded. The
 # restore step always runs so stopped services come back either way.
 reset_gpu() {
    index="$1"
    case "$index" in
        ''|*[!0-9]*)
            log "invalid GPU index: $index" >&2
            return 2
            ;;
    esac
    drain_gpu_clients
    log "resetting GPU $index"
    reset_rc=0
    nvidia-smi -r -i "$index" || reset_rc=$?
    if [ "$reset_rc" -ne 0 ]; then
        log "WARN: GPU $index reset failed (exit status $reset_rc)"
    fi
    restore_gpu_clients || [ "$reset_rc" -ne 0 ] || reset_rc=1
    return "$reset_rc"
 }
 # Entry point: dispatch on the first argument. An unknown or missing
 # sub-command, or a wrong argument count for reset-gpu, prints the usage text
 # to stderr and exits with status 2.
 case "${1:-}" in
    restart-drivers)
        restart_drivers
        ;;
    reset-gpu)
        [ "$#" -eq 2 ] || { usage >&2; exit 2; }
        reset_gpu "$2"
        ;;
    *)
        usage >&2
        exit 2
        ;;
 esac
Author	SHA1	Message	Date
Mikhail Chusavitin	5f0103635b	Update power benchmark GPU reset flow	2026-04-20 09:46:00 +03:00
Mikhail Chusavitin	84a2551dc0	Fix NVIDIA self-heal recovery flow	2026-04-20 09:43:22 +03:00
Mikhail Chusavitin	1cfabc9230	Reset GPUs before power benchmark	2026-04-20 09:42:19 +03:00
Mikhail Chusavitin	5dc711de23	Start power calibration from full GPU TDP	2026-04-20 09:28:58 +03:00
Mikhail Chusavitin	ab802719f8	Use real NVIDIA power-limit bounds in benchmark	2026-04-20 09:26:56 +03:00
Mikhail Chusavitin	a94e8007f8	Ignore power throttling in benchmark calibration	2026-04-20 09:26:29 +03:00