Compare commits

...

6 Commits
v8.31 ... v8.32

Author SHA1 Message Date
Mikhail Chusavitin
5f0103635b Update power benchmark GPU reset flow 2026-04-20 09:46:00 +03:00
Mikhail Chusavitin
84a2551dc0 Fix NVIDIA self-heal recovery flow 2026-04-20 09:43:22 +03:00
Mikhail Chusavitin
1cfabc9230 Reset GPUs before power benchmark 2026-04-20 09:42:19 +03:00
Mikhail Chusavitin
5dc711de23 Start power calibration from full GPU TDP 2026-04-20 09:28:58 +03:00
Mikhail Chusavitin
ab802719f8 Use real NVIDIA power-limit bounds in benchmark 2026-04-20 09:26:56 +03:00
Mikhail Chusavitin
a94e8007f8 Ignore power throttling in benchmark calibration 2026-04-20 09:26:29 +03:00
7 changed files with 527 additions and 50 deletions

BIN
audit/bee

Binary file not shown.

View File

@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
VBIOS string
PowerLimitW float64
DefaultPowerLimitW float64
MinPowerLimitW float64
MaxPowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
@@ -95,6 +97,8 @@ var (
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
benchmarkGeteuid = os.Geteuid
benchmarkSleep = time.Sleep
)
// benchmarkPrecisionPhases lists the precision categories run as individual
@@ -220,8 +224,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
return "hw_thermal"
case diff.SWThermalSlowdownUS > 0:
return "sw_thermal"
case diff.HWPowerBrakeSlowdownUS > 0:
return "hw_power_brake"
default:
return ""
}
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
return nil
}
// resetBenchmarkGPUs runs a pre-flight "nvidia-smi -r" reset on every GPU in
// gpuIndices so the power benchmark starts from clean throttle counters.
//
// Requires root: without it no reset is attempted and every requested index is
// returned as failed. Stale test workers are killed first so they cannot hold
// the devices open. Returns the indices whose reset command failed (nil when
// all succeeded); progress and failures go through logFunc when it is non-nil.
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
	if len(gpuIndices) == 0 {
		return nil
	}
	// emit forwards a line to logFunc when a logger was supplied.
	emit := func(msg string) {
		if logFunc != nil {
			logFunc(msg)
		}
	}
	if benchmarkGeteuid() != 0 {
		emit("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
		// Nothing was reset — report every requested index as failed,
		// returning a copy so callers cannot alias the input slice.
		out := make([]int, len(gpuIndices))
		copy(out, gpuIndices)
		return out
	}
	// Stale workers from a previous run would keep the GPUs busy and make the
	// reset fail; kill them first and log what was removed.
	if killed := KillTestWorkers(); len(killed) > 0 {
		for _, p := range killed {
			emit(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
		}
	}
	var failed []int
	for _, gpu := range gpuIndices {
		logName := fmt.Sprintf("power-preflight-gpu-%d-reset.log", gpu)
		args := []string{"nvidia-smi", "-i", strconv.Itoa(gpu), "-r"}
		if _, err := runSATCommandCtx(ctx, verboseLog, logName, args, nil, logFunc); err != nil {
			failed = append(failed, gpu)
			emit(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", gpu, err))
			continue
		}
		emit(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", gpu))
		// Give the driver a moment to settle before touching the next GPU.
		benchmarkSleep(time.Second)
	}
	return failed
}
func benchmarkPowerEngine() string {
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
case BenchmarkPowerEngineTargetedPower:
@@ -351,9 +386,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial"
}
// Enrich with max clocks from verbose output — covers GPUs where
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 {
@@ -737,8 +772,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
// then recovered from nvidia-smi -q.
var benchmarkGPUInfoQueries = []struct {
fields string
extended bool // whether this query includes optional extended fields
@@ -758,12 +793,9 @@ var benchmarkGPUInfoQueries = []struct {
},
}
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
// for fields that may be missing from --query-gpu on some driver versions.
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
return
}
@@ -784,6 +816,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -843,6 +877,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
}
}
}
if info.MinPowerLimitW == 0 {
if m := minPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MinPowerLimitW = v
}
}
}
if info.MaxPowerLimitW == 0 {
if m := maxPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MaxPowerLimitW = v
}
}
}
if info.MultiprocessorCount == 0 {
if m := smCountRe.FindSubmatch(section); m != nil {
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3043,7 +3091,6 @@ func runBenchmarkPowerCalibration(
if calibDurationSec <= 0 {
calibDurationSec = 120
}
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
@@ -3090,8 +3137,9 @@ func runBenchmarkPowerCalibration(
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
lo int // highest verified-stable limit
hi int // lowest verified-unstable limit (exclusive sentinel above start)
loVerified bool
calib benchmarkPowerCalibrationResult
converged bool
}
@@ -3113,23 +3161,17 @@ func runBenchmarkPowerCalibration(
if defaultLimitW <= 0 {
defaultLimitW = originalLimitW
}
appliedLimitW := originalLimitW
appliedLimitW := initialBenchmarkCalibrationLimitW(info)
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
}
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
minLimitW := int(math.Round(info.MinPowerLimitW))
if minLimitW <= 0 {
minLimitW = appliedLimitW
}
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
maxLimitW := int(math.Round(info.MaxPowerLimitW))
if maxLimitW > 0 && appliedLimitW > maxLimitW {
appliedLimitW = maxLimitW
}
s := &gpuCalibState{
idx: idx,
@@ -3141,11 +3183,24 @@ func runBenchmarkPowerCalibration(
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
s.appliedLimitW = minLimitW
s.hi = minLimitW + 1
}
if info.MinPowerLimitW <= 0 {
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
}
if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if seedW < s.minLimitW {
seedW = s.minLimitW
}
if maxLimitW > 0 && seedW > maxLimitW {
seedW = maxLimitW
}
if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
}
@@ -3333,6 +3388,7 @@ calibDone:
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
s.loVerified = true
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
@@ -3371,7 +3427,23 @@ calibDone:
s.hi = s.appliedLimitW
if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW {
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
s.converged = true
continue
}
s.appliedLimitW = s.minLimitW
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
s.calib.Derated = s.minLimitW < s.originalLimitW
s.info.PowerLimitW = float64(s.minLimitW)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
continue
}
if s.loVerified {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo
@@ -3383,7 +3455,8 @@ calibDone:
s.calib.Completed = true
}
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
@@ -3398,9 +3471,7 @@ calibDone:
next = (s.lo + s.hi) / 2
}
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.converged = true
continue
next = s.minLimitW
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
@@ -3439,6 +3510,24 @@ func roundTo5W(w int) int {
return ((w + 2) / 5) * 5
}
// initialBenchmarkCalibrationLimitW picks the whole-watt power limit at which
// calibration should begin for a GPU. Preference order: the factory default
// TDP, then the currently applied limit, then the reported maximum. Whatever
// is chosen is capped to the reported maximum settable limit when one is
// known. Returns 0 (or the raw fallback) when no limit is reported.
func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
	// toW rounds a reported float limit to whole watts.
	toW := func(v float64) int { return int(math.Round(v)) }
	maxW := toW(info.MaxPowerLimitW)
	startW := toW(info.DefaultPowerLimitW)
	if startW <= 0 {
		startW = toW(info.PowerLimitW)
	}
	if startW <= 0 {
		startW = maxW
	}
	// Never start above what the board allows to be set.
	if maxW > 0 && startW > maxW {
		startW = maxW
	}
	return startW
}
// meanFanRPM returns the average RPM across a set of fan readings.
func meanFanRPM(fans []FanReading) float64 {
if len(fans) == 0 {
@@ -4096,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
@@ -4114,6 +4195,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
// Capture full nvidia-smi -q snapshot at the start of the run.
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
durationSec := powerBenchDurationSec(opts.Profile)
// Sample server idle power before any GPU load.
@@ -4139,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
result.Findings = append(result.Findings,
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
}
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
singlePowerStopCh := make(chan struct{})
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)

View File

@@ -1,8 +1,13 @@
package platform
import (
"context"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"time"
)
func TestResolveBenchmarkProfile(t *testing.T) {
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
}
}
// TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons checks that
// calibration treats only thermal throttling as a failure signal: the two
// power-related counters (sw_power_cap, hw_power_brake) are expected while
// searching for a power limit and must map to an empty reason, while the
// thermal counters keep their reason strings.
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
	t.Parallel()
	// An all-zero "before" snapshot makes every non-zero "after" field a diff.
	before := BenchmarkThrottleCounters{}
	if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
		t.Fatalf("sw_power_cap should be ignored, got %q", got)
	}
	if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
		t.Fatalf("hw_power_brake should be ignored, got %q", got)
	}
	if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
		t.Fatalf("hw_thermal mismatch: got %q", got)
	}
	if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
		t.Fatalf("sw_thermal mismatch: got %q", got)
	}
}
// TestResetBenchmarkGPUsSkipsWithoutRoot verifies that without root privileges
// resetBenchmarkGPUs issues no commands, logs the skip, and reports every
// requested GPU index as failed.
//
// NOTE: this test rewires the package-level benchmarkGeteuid and
// satExecCommand hooks. It must NOT call t.Parallel(): the sibling test
// TestResetBenchmarkGPUsResetsEachGPU stubs the same globals with different
// values, and running both in parallel is a data race on those variables.
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
	oldGeteuid := benchmarkGeteuid
	oldExec := satExecCommand
	// Pretend to be a non-root user; any spawned command is a test failure.
	benchmarkGeteuid = func() int { return 1000 }
	satExecCommand = func(name string, args ...string) *exec.Cmd {
		t.Fatalf("unexpected command: %s %v", name, args)
		return nil
	}
	t.Cleanup(func() {
		benchmarkGeteuid = oldGeteuid
		satExecCommand = oldExec
	})
	var logs []string
	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
		logs = append(logs, line)
	})
	if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
		t.Fatalf("logs=%q want substring %q", got, want)
	}
	// Every requested GPU must be reported back as not reset.
	if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
		t.Fatalf("failed=%v want [0 2]", failed)
	}
}
// TestResetBenchmarkGPUsResetsEachGPU stubs nvidia-smi with a shell script
// that records its arguments, then checks that resetBenchmarkGPUs issues one
// "-i <idx> -r" reset per requested GPU, in order, with no failures.
//
// NOTE: this test rewires the package-level benchmarkGeteuid, benchmarkSleep
// and satLookPath hooks. It must NOT call t.Parallel(): the sibling test
// TestResetBenchmarkGPUsSkipsWithoutRoot stubs overlapping globals with
// different values, and running both in parallel is a data race.
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
	dir := t.TempDir()
	script := filepath.Join(dir, "nvidia-smi")
	argsLog := filepath.Join(dir, "args.log")
	// Fake nvidia-smi: append the argument vector to args.log and succeed.
	if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
		t.Fatalf("write script: %v", err)
	}
	oldGeteuid := benchmarkGeteuid
	oldSleep := benchmarkSleep
	oldLookPath := satLookPath
	benchmarkGeteuid = func() int { return 0 } // pretend to be root
	benchmarkSleep = func(time.Duration) {}    // skip the settle delay
	satLookPath = func(file string) (string, error) {
		if file == "nvidia-smi" {
			return script, nil
		}
		return exec.LookPath(file)
	}
	t.Cleanup(func() {
		benchmarkGeteuid = oldGeteuid
		benchmarkSleep = oldSleep
		satLookPath = oldLookPath
	})
	failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
	if len(failed) != 0 {
		t.Fatalf("failed=%v want no failures", failed)
	}
	raw, err := os.ReadFile(argsLog)
	if err != nil {
		t.Fatalf("read args log: %v", err)
	}
	// Two invocations are expected, one per GPU, in request order.
	got := strings.Fields(string(raw))
	want := []string{"-i", "2", "-r", "-i", "5", "-r"}
	if strings.Join(got, " ") != strings.Join(want, " ") {
		t.Fatalf("args=%v want %v", got, want)
	}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
}
}
// TestInitialBenchmarkCalibrationLimitW covers the fallback chain used to pick
// the calibration starting point: prefer the default TDP, fall back to the
// current limit, then to the reported maximum, and always cap the result to
// the maximum settable limit when one is known.
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
	t.Parallel()
	cases := []struct {
		name string
		info benchmarkGPUInfo
		want int
	}{
		{
			name: "prefers default tdp over current derated limit",
			info: benchmarkGPUInfo{
				PowerLimitW:        500,
				DefaultPowerLimitW: 600,
				MaxPowerLimitW:     600,
			},
			want: 600,
		},
		{
			name: "caps default tdp to reported max limit",
			info: benchmarkGPUInfo{
				PowerLimitW:        500,
				DefaultPowerLimitW: 700,
				MaxPowerLimitW:     650,
			},
			want: 650,
		},
		{
			name: "falls back to current limit when default missing",
			info: benchmarkGPUInfo{
				PowerLimitW:    525,
				MaxPowerLimitW: 600,
			},
			want: 525,
		},
		{
			name: "falls back to max limit when only that is known",
			info: benchmarkGPUInfo{
				MaxPowerLimitW: 575,
			},
			want: 575,
		},
	}
	for _, tc := range cases {
		tc := tc // capture for the parallel-safe closure (pre-Go 1.22 loop semantics)
		t.Run(tc.name, func(t *testing.T) {
			if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
				t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
			}
		})
	}
}
func TestParseBenchmarkBurnLog(t *testing.T) {
t.Parallel()
@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
}
}
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
Min Power Limit : 200.00 W
Max Power Limit : 600.00 W
Default Power Limit : 575.00 W
Current Power Limit : 560.00 W
Clocks
Graphics : 2422 MHz
Memory : 12481 MHz
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
1: {Index: 1, BusID: "00000000:4F:00.0"},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
}
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
}
if infoByIndex[0].MaxPowerLimitW != 600 {
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
}
if infoByIndex[0].DefaultPowerLimitW != 575 {
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
}
if infoByIndex[0].PowerLimitW != 560 {
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
}
}
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Min Power Limit : 100.00 W
Max Power Limit : 900.00 W
Max Clocks
Graphics : 9999 MHz
Memory : 9999 MHz
`)
// Already populated — must not be overwritten.
infoByIndex := map[int]benchmarkGPUInfo{
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
0: {
Index: 0,
BusID: "00000000:4E:00.0",
MaxGraphicsClockMHz: 2430,
MaxMemoryClockMHz: 12481,
MinPowerLimitW: 200,
MaxPowerLimitW: 600,
},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
}
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
}
}

View File

@@ -0,0 +1,30 @@
package platform
import (
"fmt"
"os/exec"
"time"
)
// nvidiaRecoverHelper is the absolute path of the privileged shell helper that
// drains GPU clients and performs GPU resets or driver reloads on our behalf.
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
// runNvidiaRecover invokes the privileged bee-nvidia-recover helper with the
// given arguments and returns its combined stdout/stderr output.
//
// When systemd-run is available the helper runs inside a transient oneshot
// unit (uniquely named per invocation so concurrent calls cannot collide);
// otherwise it is executed directly. Both paths go through sudo because the
// helper needs root.
func runNvidiaRecover(args ...string) (string, error) {
	helperArgs := append([]string{nvidiaRecoverHelper}, args...)
	if _, lookErr := exec.LookPath("systemd-run"); lookErr != nil {
		// No systemd-run on PATH: run the helper directly under sudo.
		out, err := exec.Command("sudo", helperArgs...).CombinedOutput()
		return string(out), err
	}
	unitName := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
	sudoArgs := make([]string, 0, 9+len(helperArgs))
	sudoArgs = append(sudoArgs,
		"systemd-run",
		"--quiet",
		"--pipe",
		"--wait",
		"--collect",
		"--service-type=oneshot",
		"--unit", unitName,
	)
	sudoArgs = append(sudoArgs, helperArgs...)
	out, err := exec.Command("sudo", sudoArgs...).CombinedOutput()
	return string(out), err
}

View File

@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
if index < 0 {
return "", fmt.Errorf("gpu index must be >= 0")
}
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
if len(raw) == 0 && err == nil {
raw = []byte("GPU reset completed.\n")
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
if strings.TrimSpace(out) == "" && err == nil {
out = "GPU reset completed.\n"
}
return string(raw), err
return out, err
}
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.

View File

@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
}
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
if name == "bee-nvidia" && action == ServiceRestart {
return runNvidiaRecover("restart-drivers")
}
// bee-web runs as the bee user; sudo is required to control system services.
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()

View File

@@ -0,0 +1,178 @@
#!/bin/sh
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
set -u
# log prints a namespaced progress message to stdout.
log() {
echo "[bee-nvidia-recover] $*"
}
# log_blocker reports a process or service that was holding the GPU and had
# to be stopped before recovery could proceed.
log_blocker() {
echo "[bee-nvidia-recover] blocker: $*"
}
# usage prints the accepted command forms.
usage() {
cat <<'EOF'
usage:
bee-nvidia-recover restart-drivers
bee-nvidia-recover reset-gpu <index>
EOF
}
# unit_exists succeeds when systemd knows the given unit.
unit_exists() {
systemctl cat "$1" >/dev/null 2>&1
}
# unit_is_active succeeds when the given unit is currently active.
unit_is_active() {
systemctl is-active --quiet "$1" 2>/dev/null
}
# stop_unit_if_active stops the unit when it is running.
# Returns 0 if a stop was issued, 1 if the unit was already inactive, so
# callers can record whether the unit needs restarting after recovery.
stop_unit_if_active() {
unit="$1"
if unit_is_active "$unit"; then
log "stopping $unit"
systemctl stop "$unit"
return 0
fi
return 1
}
# start_unit_if_marked starts the unit only when marker is "1" and the unit
# exists — the restore-phase counterpart of stop_unit_if_active.
start_unit_if_marked() {
unit="$1"
marker="$2"
if [ "$marker" = "1" ] && unit_exists "$unit"; then
log "starting $unit"
systemctl start "$unit"
fi
}
# wait_for_process_exit polls (1 s interval, ~15 s budget) until no process
# with the exact name remains. Returns 1 and logs a warning on timeout.
wait_for_process_exit() {
name="$1"
tries=0
while pgrep -x "$name" >/dev/null 2>&1; do
tries=$((tries + 1))
if [ "$tries" -ge 15 ]; then
log "WARN: $name is still running after stop request"
return 1
fi
sleep 1
done
return 0
}
# kill_pattern terminates every process whose command line matches pattern:
# each match is logged as a blocker, then SIGTERM is sent, one second is
# allowed for a clean exit, and SIGKILL sweeps up the rest. Best-effort —
# pkill failures are deliberately ignored.
kill_pattern() {
pattern="$1"
if pgrep -f "$pattern" >/dev/null 2>&1; then
pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
[ -n "$line" ] || continue
log_blocker "$line"
done
log "killing processes matching: $pattern"
pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
sleep 1
pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
fi
}
# drain_gpu_clients stops every known NVIDIA client so a GPU reset or driver
# reload can proceed. It sets the deliberately-global display_was_active and
# fabric_was_active flags so restore_gpu_clients (run later in this same
# shell) can restart only what was actually running before the drain.
drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0
    # Display managers hold the GPU through Xorg/Wayland; stop them first.
    for unit in display-manager.service lightdm.service; do
        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
            log_blocker "service $unit"
            display_was_active=1
        fi
    done
    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
        log_blocker "service nvidia-fabricmanager.service"
        fabric_was_active=1
    fi
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        # Use -ax so the listing matches by exact process name — the same test
        # as the pgrep -x guard above — while still printing the full command
        # line. The previous `-af "^nv-hostengine$"` form anchored the pattern
        # against the whole command line and missed daemons started via an
        # absolute path, leaving the blocker log empty for them.
        pgrep -ax nv-hostengine 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    fi
    # Best-effort sweep of remaining GPU tools and stress workloads.
    for pattern in \
        "nvidia-smi" \
        "dcgmi" \
        "nvvs" \
        "dcgmproftester" \
        "all_reduce_perf" \
        "nvtop" \
        "bee-gpu-burn" \
        "bee-john-gpu-stress" \
        "bee-nccl-gpu-stress" \
        "Xorg" \
        "Xwayland"; do
        kill_pattern "$pattern"
    done
}
# restore_gpu_clients re-enables persistence mode and restarts the services
# and daemons that drain_gpu_clients stopped, based on the display_was_active
# and fabric_was_active globals it recorded. The :-0 defaults keep set -u
# safe if drain_gpu_clients did not run in this shell.
restore_gpu_clients() {
if command -v nvidia-smi >/dev/null 2>&1; then
if nvidia-smi -pm 1 >/dev/null 2>&1; then
log "enabled NVIDIA persistence mode"
else
log "WARN: failed to enable NVIDIA persistence mode"
fi
fi
if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
log "starting nv-hostengine"
nv-hostengine
fi
start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
start_unit_if_marked display-manager.service "${display_was_active:-0}"
# display-manager.service may be an alias unit; make sure lightdm itself is
# back up when a display manager was running before the drain.
if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
start_unit_if_marked lightdm.service "1"
fi
}
# restart_drivers unloads the whole NVIDIA kernel module stack and reloads it
# via bee-nvidia-load, draining clients first and restoring them afterwards.
# Returns the exit status of the driver reload: previously the trailing
# restore_gpu_clients call masked it, so a failed reload looked successful to
# the Go caller (ServiceDo -> runNvidiaRecover checks the exit status).
restart_drivers() {
    drain_gpu_clients
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            rmmod "$mod"
        fi
    done
    # Stale device nodes can confuse the reload; remove them best-effort.
    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
    log "reloading NVIDIA driver stack"
    /usr/local/bin/bee-nvidia-load
    rc=$?
    restore_gpu_clients
    return "$rc"
}
# reset_gpu resets one GPU by index after draining clients.
# Returns the exit status of `nvidia-smi -r`, not of the client restore, so a
# failed reset is reported back to the caller (ResetNvidiaGPU checks it).
reset_gpu() {
    index="$1"
    drain_gpu_clients
    log "resetting GPU $index"
    nvidia-smi -r -i "$index"
    rc=$?
    restore_gpu_clients
    return "$rc"
}
# Entry point: dispatch on the first argument. Unknown or malformed commands
# print usage to stderr and exit 2; otherwise the script's exit status is that
# of the selected recovery action.
cmd="${1:-}"
case "$cmd" in
restart-drivers)
restart_drivers
;;
reset-gpu)
# reset-gpu requires exactly one extra argument: the GPU index.
if [ "$#" -ne 2 ]; then
usage >&2
exit 2
fi
reset_gpu "$2"
;;
*)
usage >&2
exit 2
;;
esac