Compare commits
6 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 5f0103635b | |
| | 84a2551dc0 | |
| | 1cfabc9230 | |
| | 5dc711de23 | |
| | ab802719f8 | |
| | a94e8007f8 | |
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
    VBIOS string
    PowerLimitW float64
    DefaultPowerLimitW float64
    MinPowerLimitW float64
    MaxPowerLimitW float64
    MaxGraphicsClockMHz float64
    MaxMemoryClockMHz float64
    BaseGraphicsClockMHz float64
@@ -95,6 +97,8 @@ var (
    benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
    benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
    benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
    benchmarkGeteuid = os.Geteuid
    benchmarkSleep = time.Sleep
)

// benchmarkPrecisionPhases lists the precision categories run as individual
@@ -220,8 +224,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
        return "hw_thermal"
    case diff.SWThermalSlowdownUS > 0:
        return "sw_thermal"
    case diff.HWPowerBrakeSlowdownUS > 0:
        return "hw_power_brake"
    default:
        return ""
    }
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
    return nil
}

func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
    if len(gpuIndices) == 0 {
        return nil
    }
    if benchmarkGeteuid() != 0 {
        if logFunc != nil {
            logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
        }
        return append([]int(nil), gpuIndices...)
    }
    if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
        for _, p := range killed {
            logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
        }
    }
    var failed []int
    for _, idx := range gpuIndices {
        name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
        if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
            failed = append(failed, idx)
            if logFunc != nil {
                logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
            }
            continue
        }
        if logFunc != nil {
            logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
        }
        benchmarkSleep(time.Second)
    }
    return failed
}

func benchmarkPowerEngine() string {
    switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
    case BenchmarkPowerEngineTargetedPower:
@@ -351,9 +386,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
        result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
        result.Normalization.Status = "partial"
    }
    // Enrich with max clocks from verbose output — covers GPUs where
    // clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
    enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
    // Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
    // are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
    enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)

    activeApps, err := queryActiveComputeApps(selected)
    if err == nil && len(activeApps) > 0 {
@@ -737,8 +772,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
// then recovered from nvidia-smi -q.
var benchmarkGPUInfoQueries = []struct {
    fields string
    extended bool // whether this query includes optional extended fields
@@ -758,12 +793,9 @@ var benchmarkGPUInfoQueries = []struct {
    },
}

// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
// for fields that may be missing from --query-gpu on some driver versions.
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
    if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
        return
    }
@@ -784,6 +816,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
    maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
    defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
    currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
    minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
    maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
    smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
    shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
    slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -843,6 +877,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
                }
            }
        }
        if info.MinPowerLimitW == 0 {
            if m := minPwrRe.FindSubmatch(section); m != nil {
                if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
                    info.MinPowerLimitW = v
                }
            }
        }
        if info.MaxPowerLimitW == 0 {
            if m := maxPwrRe.FindSubmatch(section); m != nil {
                if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
                    info.MaxPowerLimitW = v
                }
            }
        }
        if info.MultiprocessorCount == 0 {
            if m := smCountRe.FindSubmatch(section); m != nil {
                if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
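For reference, the regexes above all follow the same shape: scan one GPU section of nvidia-smi -q text, capture a numeric field, and keep it only when it parses as a positive number. The standalone sketch below illustrates that pattern for the Min Power Limit field; the helper name and sample text are illustrative and not part of this change.

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// parseMinPowerLimitW is a hypothetical helper mirroring the minPwrRe lookup above:
// it returns the first "Min Power Limit : <n> W" value found in one GPU section.
func parseMinPowerLimitW(section []byte) (float64, bool) {
	re := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
	m := re.FindSubmatch(section)
	if m == nil {
		return 0, false
	}
	v, err := strconv.ParseFloat(string(m[1]), 64)
	if err != nil || v <= 0 {
		return 0, false
	}
	return v, true
}

func main() {
	section := []byte("    Min Power Limit : 200.00 W\n")
	if w, ok := parseMinPowerLimitW(section); ok {
		fmt.Printf("min power limit: %.0f W\n", w) // min power limit: 200 W
	}
}
```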
@@ -3043,7 +3091,6 @@ func runBenchmarkPowerCalibration(
    if calibDurationSec <= 0 {
        calibDurationSec = 120
    }
    const maxDerateW = 150
    // calibSearchTolerance is the binary-search convergence threshold in watts.
    // When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
    const calibSearchTolerance = 10
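The tolerance and the 5 W rounding drive a plain bisection over candidate power limits. The toy sketch below shows how lo and hi narrow until their gap is within the tolerance; the stable() probe and the starting bounds are invented for illustration, while roundTo5W is copied from this change.

```go
package main

import "fmt"

// roundTo5W matches the helper added in this change: round to the nearest 5 W.
func roundTo5W(w int) int { return ((w + 2) / 5) * 5 }

func main() {
	// Toy stand-in for one calibration step: pretend anything >= 540 W throttles.
	stable := func(limitW int) bool { return limitW < 540 }

	const tolerance = 10 // calibSearchTolerance in the diff
	lo, hi := 450, 601   // lo: highest verified-stable, hi: lowest known-unstable (sentinel)
	for hi-lo > tolerance {
		next := roundTo5W((lo + hi) / 2)
		if next <= lo || next >= hi {
			break // rounding left no untested point between lo and hi
		}
		if stable(next) {
			lo = next
		} else {
			hi = next
		}
	}
	fmt.Printf("converged: use %d W (lo=%d hi=%d)\n", lo, lo, hi)
}
```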
@@ -3090,8 +3137,9 @@ func runBenchmarkPowerCalibration(
        originalLimitW int
        appliedLimitW int
        minLimitW int
        lo int // highest verified-stable limit (assumed: minLimitW)
        lo int // highest verified-stable limit
        hi int // lowest verified-unstable limit (exclusive sentinel above start)
        loVerified bool
        calib benchmarkPowerCalibrationResult
        converged bool
    }
@@ -3113,23 +3161,17 @@ func runBenchmarkPowerCalibration(
        if defaultLimitW <= 0 {
            defaultLimitW = originalLimitW
        }
        appliedLimitW := originalLimitW
        appliedLimitW := initialBenchmarkCalibrationLimitW(info)
        if appliedLimitW <= 0 {
            appliedLimitW = defaultLimitW
        }
        minLimitW := appliedLimitW
        switch {
        case defaultLimitW > 0:
            minLimitW = defaultLimitW - maxDerateW
            floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
            if minLimitW < floorByRatio {
                minLimitW = floorByRatio
            }
        case appliedLimitW > 0:
            minLimitW = appliedLimitW - maxDerateW
        minLimitW := int(math.Round(info.MinPowerLimitW))
        if minLimitW <= 0 {
            minLimitW = appliedLimitW
        }
        if minLimitW < calibSearchTolerance {
            minLimitW = calibSearchTolerance
        maxLimitW := int(math.Round(info.MaxPowerLimitW))
        if maxLimitW > 0 && appliedLimitW > maxLimitW {
            appliedLimitW = maxLimitW
        }
        s := &gpuCalibState{
            idx: idx,
@@ -3141,11 +3183,24 @@ func runBenchmarkPowerCalibration(
            hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
            calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
        }
        if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
            s.appliedLimitW = minLimitW
            s.hi = minLimitW + 1
        }
        if info.MinPowerLimitW <= 0 {
            s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
        }
        if seedLimits != nil {
            if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
                // A previously validated limit is only a starting point. Re-run
                // targeted_power under the current multi-GPU thermal load and derate
                // again if this step shows new throttling.
                if seedW < s.minLimitW {
                    seedW = s.minLimitW
                }
                if maxLimitW > 0 && seedW > maxLimitW {
                    seedW = maxLimitW
                }
                if canDerate {
                    _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
                }
@@ -3333,6 +3388,7 @@ calibDone:
            s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
            logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
            s.lo = s.appliedLimitW
            s.loVerified = true
            if canDerate && s.hi-s.lo > calibSearchTolerance {
                next := roundTo5W((s.lo + s.hi) / 2)
                if next > s.lo && next < s.hi {
@@ -3371,7 +3427,23 @@ calibDone:
            s.hi = s.appliedLimitW

            if s.hi-s.lo <= calibSearchTolerance {
                if s.lo > s.minLimitW {
                if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
                    if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
                        s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
                        logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
                        s.converged = true
                        continue
                    }
                    s.appliedLimitW = s.minLimitW
                    s.calib.AppliedPowerLimitW = float64(s.minLimitW)
                    s.calib.Derated = s.minLimitW < s.originalLimitW
                    s.info.PowerLimitW = float64(s.minLimitW)
                    infoByIndex[s.idx] = s.info
                    s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
                    logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
                    continue
                }
                if s.loVerified {
                    s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
                    if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
                        s.appliedLimitW = s.lo
@@ -3383,7 +3455,8 @@ calibDone:
                        s.calib.Completed = true
                    }
                } else {
                    s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
                    s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
                    logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
                }
                s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
                s.converged = true
@@ -3398,9 +3471,7 @@ calibDone:
                next = (s.lo + s.hi) / 2
            }
            if next < s.minLimitW {
                s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
                s.converged = true
                continue
                next = s.minLimitW
            }
            if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
                s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
@@ -3439,6 +3510,24 @@ func roundTo5W(w int) int {
    return ((w + 2) / 5) * 5
}

func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
    defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
    currentLimitW := int(math.Round(info.PowerLimitW))
    maxLimitW := int(math.Round(info.MaxPowerLimitW))

    startW := defaultLimitW
    if startW <= 0 {
        startW = currentLimitW
    }
    if startW <= 0 {
        startW = maxLimitW
    }
    if maxLimitW > 0 && startW > maxLimitW {
        startW = maxLimitW
    }
    return startW
}

// meanFanRPM returns the average RPM across a set of fan readings.
func meanFanRPM(fans []FanReading) float64 {
    if len(fans) == 0 {
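A compact way to read the new start-limit selection: prefer the default (factory TDP), then the current limit, then the max limit, and never start above the reported max. The self-contained mirror below is illustrative only (pickStartLimit is a hypothetical stand-in, not part of the change); the expected values match the table test added later in this diff.

```go
package main

import "fmt"

// pickStartLimit mirrors the fallback order of initialBenchmarkCalibrationLimitW:
// default limit, then current limit, then max limit, capped at the reported max.
func pickStartLimit(defaultW, currentW, maxW int) int {
	start := defaultW
	if start <= 0 {
		start = currentW
	}
	if start <= 0 {
		start = maxW
	}
	if maxW > 0 && start > maxW {
		start = maxW
	}
	return start
}

func main() {
	fmt.Println(pickStartLimit(600, 500, 600)) // 600: ignore an existing derate, start from TDP
	fmt.Println(pickStartLimit(700, 500, 650)) // 650: cap the start at the reported max
	fmt.Println(pickStartLimit(0, 525, 600))   // 525: fall back to the current limit
}
```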
@@ -4096,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
        return "", fmt.Errorf("mkdir %s: %w", runDir, err)
    }
    verboseLog := filepath.Join(runDir, "verbose.log")
    infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
    if infoErr != nil {
        return "", infoErr
    }
    // Capture full nvidia-smi -q snapshot at the start of the run.
    if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
        _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
    }
    hostname, _ := os.Hostname()
    result := NvidiaPowerBenchResult{
        BenchmarkVersion: benchmarkVersion,
@@ -4114,6 +4195,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
        SelectedGPUIndices: append([]int(nil), selected...),
        OverallStatus: "OK",
    }
    infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
    if infoErr != nil {
        return "", infoErr
    }
    // Capture full nvidia-smi -q snapshot at the start of the run.
    if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
        _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
    }
    durationSec := powerBenchDurationSec(opts.Profile)

    // Sample server idle power before any GPU load.
@@ -4139,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
        singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
        _ = os.MkdirAll(singleDir, 0755)
        singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
        if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
            result.Findings = append(result.Findings,
                fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
        }
        logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
        singlePowerStopCh := make(chan struct{})
        singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)

@@ -1,8 +1,13 @@
package platform

import (
    "context"
    "os"
    "os/exec"
    "path/filepath"
    "strings"
    "testing"
    "time"
)

func TestResolveBenchmarkProfile(t *testing.T) {
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
    }
}

func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
    t.Parallel()

    before := BenchmarkThrottleCounters{}
    if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
        t.Fatalf("sw_power_cap should be ignored, got %q", got)
    }
    if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
        t.Fatalf("hw_power_brake should be ignored, got %q", got)
    }
    if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
        t.Fatalf("hw_thermal mismatch: got %q", got)
    }
    if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
        t.Fatalf("sw_thermal mismatch: got %q", got)
    }
}

func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
    t.Parallel()

    oldGeteuid := benchmarkGeteuid
    oldExec := satExecCommand
    benchmarkGeteuid = func() int { return 1000 }
    satExecCommand = func(name string, args ...string) *exec.Cmd {
        t.Fatalf("unexpected command: %s %v", name, args)
        return nil
    }
    t.Cleanup(func() {
        benchmarkGeteuid = oldGeteuid
        satExecCommand = oldExec
    })

    var logs []string
    failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
        logs = append(logs, line)
    })
    if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
        t.Fatalf("logs=%q want substring %q", got, want)
    }
    if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
        t.Fatalf("failed=%v want [0 2]", failed)
    }
}

func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
    t.Parallel()

    dir := t.TempDir()
    script := filepath.Join(dir, "nvidia-smi")
    argsLog := filepath.Join(dir, "args.log")
    if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
        t.Fatalf("write script: %v", err)
    }

    oldGeteuid := benchmarkGeteuid
    oldSleep := benchmarkSleep
    oldLookPath := satLookPath
    benchmarkGeteuid = func() int { return 0 }
    benchmarkSleep = func(time.Duration) {}
    satLookPath = func(file string) (string, error) {
        if file == "nvidia-smi" {
            return script, nil
        }
        return exec.LookPath(file)
    }
    t.Cleanup(func() {
        benchmarkGeteuid = oldGeteuid
        benchmarkSleep = oldSleep
        satLookPath = oldLookPath
    })

    failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
    if len(failed) != 0 {
        t.Fatalf("failed=%v want no failures", failed)
    }
    raw, err := os.ReadFile(argsLog)
    if err != nil {
        t.Fatalf("read args log: %v", err)
    }
    got := strings.Fields(string(raw))
    want := []string{"-i", "2", "-r", "-i", "5", "-r"}
    if strings.Join(got, " ") != strings.Join(want, " ") {
        t.Fatalf("args=%v want %v", got, want)
    }
}

func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
    t.Parallel()

@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
    }
}

func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
    t.Parallel()

    cases := []struct {
        name string
        info benchmarkGPUInfo
        want int
    }{
        {
            name: "prefers default tdp over current derated limit",
            info: benchmarkGPUInfo{
                PowerLimitW: 500,
                DefaultPowerLimitW: 600,
                MaxPowerLimitW: 600,
            },
            want: 600,
        },
        {
            name: "caps default tdp to reported max limit",
            info: benchmarkGPUInfo{
                PowerLimitW: 500,
                DefaultPowerLimitW: 700,
                MaxPowerLimitW: 650,
            },
            want: 650,
        },
        {
            name: "falls back to current limit when default missing",
            info: benchmarkGPUInfo{
                PowerLimitW: 525,
                MaxPowerLimitW: 600,
            },
            want: 525,
        },
        {
            name: "falls back to max limit when only that is known",
            info: benchmarkGPUInfo{
                MaxPowerLimitW: 575,
            },
            want: 575,
        },
    }

    for _, tc := range cases {
        tc := tc
        t.Run(tc.name, func(t *testing.T) {
            if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
                t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
            }
        })
    }
}

func TestParseBenchmarkBurnLog(t *testing.T) {
    t.Parallel()

@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
    }
}

func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
    t.Parallel()

    nvsmiQ := []byte(`
GPU 00000000:4E:00.0
    Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
    Min Power Limit : 200.00 W
    Max Power Limit : 600.00 W
    Default Power Limit : 575.00 W
    Current Power Limit : 560.00 W
    Clocks
        Graphics : 2422 MHz
        Memory : 12481 MHz
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
        1: {Index: 1, BusID: "00000000:4F:00.0"},
    }

    enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
    enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

    if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
        t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
    if infoByIndex[1].MaxMemoryClockMHz != 12481 {
        t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
    }
    if infoByIndex[0].MinPowerLimitW != 200 {
        t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
    }
    if infoByIndex[0].MaxPowerLimitW != 600 {
        t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
    }
    if infoByIndex[0].DefaultPowerLimitW != 575 {
        t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
    }
    if infoByIndex[0].PowerLimitW != 560 {
        t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
    }
}

func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
    t.Parallel()

    nvsmiQ := []byte(`
GPU 00000000:4E:00.0
    Min Power Limit : 100.00 W
    Max Power Limit : 900.00 W
    Max Clocks
        Graphics : 9999 MHz
        Memory : 9999 MHz
`)
    // Already populated — must not be overwritten.
    infoByIndex := map[int]benchmarkGPUInfo{
        0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
        0: {
            Index: 0,
            BusID: "00000000:4E:00.0",
            MaxGraphicsClockMHz: 2430,
            MaxMemoryClockMHz: 12481,
            MinPowerLimitW: 200,
            MaxPowerLimitW: 600,
        },
    }

    enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
    enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

    if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
        t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
    }
    if infoByIndex[0].MinPowerLimitW != 200 {
        t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
    }
}
audit/internal/platform/nvidia_recover.go (new file, 30 lines)
@@ -0,0 +1,30 @@
package platform

import (
    "fmt"
    "os/exec"
    "time"
)

const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"

func runNvidiaRecover(args ...string) (string, error) {
    helperArgs := append([]string{nvidiaRecoverHelper}, args...)
    if _, err := exec.LookPath("systemd-run"); err == nil {
        unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
        cmdArgs := []string{
            "systemd-run",
            "--quiet",
            "--pipe",
            "--wait",
            "--collect",
            "--service-type=oneshot",
            "--unit", unit,
        }
        cmdArgs = append(cmdArgs, helperArgs...)
        raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
        return string(raw), err
    }
    raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
    return string(raw), err
}
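When systemd-run is present, the helper is wrapped in a transient oneshot unit, presumably so it runs outside the calling service's own unit while its output is still collected via --pipe --wait. The sketch below only prints the command shape runNvidiaRecover would build for a reset-gpu call; it does not execute anything.

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

func main() {
	// Illustration only: the argv shape built above when systemd-run is available.
	// The real code runs it via exec.Command("sudo", cmdArgs...).CombinedOutput().
	helperArgs := []string{"/usr/local/bin/bee-nvidia-recover", "reset-gpu", "0"}
	unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
	cmdArgs := append([]string{
		"systemd-run", "--quiet", "--pipe", "--wait", "--collect",
		"--service-type=oneshot", "--unit", unit,
	}, helperArgs...)
	fmt.Println("sudo", strings.Join(cmdArgs, " "))
}
```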
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
    if index < 0 {
        return "", fmt.Errorf("gpu index must be >= 0")
    }
    raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
    if len(raw) == 0 && err == nil {
        raw = []byte("GPU reset completed.\n")
    out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
    if strings.TrimSpace(out) == "" && err == nil {
        out = "GPU reset completed.\n"
    }
    return string(raw), err
    return out, err
}

// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
}

func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
    if name == "bee-nvidia" && action == ServiceRestart {
        return runNvidiaRecover("restart-drivers")
    }
    // bee-web runs as the bee user; sudo is required to control system services.
    // /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
    raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
iso/overlay/usr/local/bin/bee-nvidia-recover (new executable file, 178 lines)
@@ -0,0 +1,178 @@
#!/bin/sh
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.

set -u

log() {
  echo "[bee-nvidia-recover] $*"
}

log_blocker() {
  echo "[bee-nvidia-recover] blocker: $*"
}

usage() {
  cat <<'EOF'
usage:
  bee-nvidia-recover restart-drivers
  bee-nvidia-recover reset-gpu <index>
EOF
}

unit_exists() {
  systemctl cat "$1" >/dev/null 2>&1
}

unit_is_active() {
  systemctl is-active --quiet "$1" 2>/dev/null
}

stop_unit_if_active() {
  unit="$1"
  if unit_is_active "$unit"; then
    log "stopping $unit"
    systemctl stop "$unit"
    return 0
  fi
  return 1
}

start_unit_if_marked() {
  unit="$1"
  marker="$2"
  if [ "$marker" = "1" ] && unit_exists "$unit"; then
    log "starting $unit"
    systemctl start "$unit"
  fi
}

wait_for_process_exit() {
  name="$1"
  tries=0
  while pgrep -x "$name" >/dev/null 2>&1; do
    tries=$((tries + 1))
    if [ "$tries" -ge 15 ]; then
      log "WARN: $name is still running after stop request"
      return 1
    fi
    sleep 1
  done
  return 0
}

kill_pattern() {
  pattern="$1"
  if pgrep -f "$pattern" >/dev/null 2>&1; then
    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
      [ -n "$line" ] || continue
      log_blocker "$line"
    done
    log "killing processes matching: $pattern"
    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
    sleep 1
    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
  fi
}

drain_gpu_clients() {
  display_was_active=0
  fabric_was_active=0

  for unit in display-manager.service lightdm.service; do
    if unit_exists "$unit" && stop_unit_if_active "$unit"; then
      log_blocker "service $unit"
      display_was_active=1
    fi
  done

  if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
    log_blocker "service nvidia-fabricmanager.service"
    fabric_was_active=1
  fi

  if pgrep -x nv-hostengine >/dev/null 2>&1; then
    pgrep -af "^nv-hostengine$" 2>/dev/null | while IFS= read -r line; do
      [ -n "$line" ] || continue
      log_blocker "$line"
    done
    log "stopping nv-hostengine"
    pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
    wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
  fi

  for pattern in \
    "nvidia-smi" \
    "dcgmi" \
    "nvvs" \
    "dcgmproftester" \
    "all_reduce_perf" \
    "nvtop" \
    "bee-gpu-burn" \
    "bee-john-gpu-stress" \
    "bee-nccl-gpu-stress" \
    "Xorg" \
    "Xwayland"; do
    kill_pattern "$pattern"
  done
}

restore_gpu_clients() {
  if command -v nvidia-smi >/dev/null 2>&1; then
    if nvidia-smi -pm 1 >/dev/null 2>&1; then
      log "enabled NVIDIA persistence mode"
    else
      log "WARN: failed to enable NVIDIA persistence mode"
    fi
  fi

  if command -v nv-hostengine >/dev/null 2>&1 && ! pgrep -x nv-hostengine >/dev/null 2>&1; then
    log "starting nv-hostengine"
    nv-hostengine
  fi

  start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
  start_unit_if_marked display-manager.service "${display_was_active:-0}"
  if [ "${display_was_active:-0}" = "1" ] && unit_exists lightdm.service && ! unit_is_active lightdm.service; then
    start_unit_if_marked lightdm.service "1"
  fi
}

restart_drivers() {
  drain_gpu_clients
  for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
    if lsmod | awk '{print $1}' | grep -qx "$mod"; then
      log "unloading module $mod"
      rmmod "$mod"
    fi
  done
  rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true
  log "reloading NVIDIA driver stack"
  /usr/local/bin/bee-nvidia-load
  restore_gpu_clients
}

reset_gpu() {
  index="$1"
  drain_gpu_clients
  log "resetting GPU $index"
  nvidia-smi -r -i "$index"
  restore_gpu_clients
}

cmd="${1:-}"
case "$cmd" in
  restart-drivers)
    restart_drivers
    ;;
  reset-gpu)
    if [ "$#" -ne 2 ]; then
      usage >&2
      exit 2
    fi
    reset_gpu "$2"
    ;;
  *)
    usage >&2
    exit 2
    ;;
esac