Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 |
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
|
|||||||
VBIOS string
|
VBIOS string
|
||||||
PowerLimitW float64
|
PowerLimitW float64
|
||||||
DefaultPowerLimitW float64
|
DefaultPowerLimitW float64
|
||||||
|
MinPowerLimitW float64
|
||||||
|
MaxPowerLimitW float64
|
||||||
MaxGraphicsClockMHz float64
|
MaxGraphicsClockMHz float64
|
||||||
MaxMemoryClockMHz float64
|
MaxMemoryClockMHz float64
|
||||||
BaseGraphicsClockMHz float64
|
BaseGraphicsClockMHz float64
|
||||||
@@ -95,6 +97,8 @@ var (
|
|||||||
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||||
@@ -220,8 +224,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
|
|||||||
return "hw_thermal"
|
return "hw_thermal"
|
||||||
case diff.SWThermalSlowdownUS > 0:
|
case diff.SWThermalSlowdownUS > 0:
|
||||||
return "sw_thermal"
|
return "sw_thermal"
|
||||||
case diff.HWPowerBrakeSlowdownUS > 0:
|
|
||||||
return "hw_power_brake"
|
|
||||||
default:
|
default:
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if benchmarkGeteuid() != 0 {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
|
||||||
|
}
|
||||||
|
return append([]int(nil), gpuIndices...)
|
||||||
|
}
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var failed []int
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||||
|
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||||
|
failed = append(failed, idx)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
|
||||||
|
}
|
||||||
|
benchmarkSleep(time.Second)
|
||||||
|
}
|
||||||
|
return failed
|
||||||
|
}
|
||||||
|
|
||||||
func benchmarkPowerEngine() string {
|
func benchmarkPowerEngine() string {
|
||||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||||
case BenchmarkPowerEngineTargetedPower:
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
@@ -351,9 +386,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
// Enrich with max clocks from verbose output — covers GPUs where
|
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
|
||||||
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
|
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
|
||||||
|
|
||||||
activeApps, err := queryActiveComputeApps(selected)
|
activeApps, err := queryActiveComputeApps(selected)
|
||||||
if err == nil && len(activeApps) > 0 {
|
if err == nil && len(activeApps) > 0 {
|
||||||
@@ -737,8 +772,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
// all driver versions, so we fall back to the base set if the full query fails.
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
||||||
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
|
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
|
||||||
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
|
// then recovered from nvidia-smi -q.
|
||||||
var benchmarkGPUInfoQueries = []struct {
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
fields string
|
fields string
|
||||||
extended bool // whether this query includes optional extended fields
|
extended bool // whether this query includes optional extended fields
|
||||||
@@ -758,12 +793,9 @@ var benchmarkGPUInfoQueries = []struct {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
|
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
|
||||||
// any GPU in infoByIndex where those values are still zero. It parses the
|
// for fields that may be missing from --query-gpu on some driver versions.
|
||||||
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
|
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||||
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
|
|
||||||
// return exit status 2 but the verbose query works fine.
|
|
||||||
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
|
||||||
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -784,6 +816,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||||
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||||
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
@@ -843,6 +877,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if info.MinPowerLimitW == 0 {
|
||||||
|
if m := minPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MinPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.MaxPowerLimitW == 0 {
|
||||||
|
if m := maxPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MaxPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if info.MultiprocessorCount == 0 {
|
if info.MultiprocessorCount == 0 {
|
||||||
if m := smCountRe.FindSubmatch(section); m != nil {
|
if m := smCountRe.FindSubmatch(section); m != nil {
|
||||||
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
||||||
@@ -3043,7 +3091,6 @@ func runBenchmarkPowerCalibration(
|
|||||||
if calibDurationSec <= 0 {
|
if calibDurationSec <= 0 {
|
||||||
calibDurationSec = 120
|
calibDurationSec = 120
|
||||||
}
|
}
|
||||||
const maxDerateW = 150
|
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
const calibSearchTolerance = 10
|
const calibSearchTolerance = 10
|
||||||
@@ -3090,8 +3137,9 @@ func runBenchmarkPowerCalibration(
|
|||||||
originalLimitW int
|
originalLimitW int
|
||||||
appliedLimitW int
|
appliedLimitW int
|
||||||
minLimitW int
|
minLimitW int
|
||||||
lo int // highest verified-stable limit (assumed: minLimitW)
|
lo int // highest verified-stable limit
|
||||||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||||||
|
loVerified bool
|
||||||
calib benchmarkPowerCalibrationResult
|
calib benchmarkPowerCalibrationResult
|
||||||
converged bool
|
converged bool
|
||||||
}
|
}
|
||||||
@@ -3113,23 +3161,17 @@ func runBenchmarkPowerCalibration(
|
|||||||
if defaultLimitW <= 0 {
|
if defaultLimitW <= 0 {
|
||||||
defaultLimitW = originalLimitW
|
defaultLimitW = originalLimitW
|
||||||
}
|
}
|
||||||
appliedLimitW := originalLimitW
|
appliedLimitW := initialBenchmarkCalibrationLimitW(info)
|
||||||
if appliedLimitW <= 0 {
|
if appliedLimitW <= 0 {
|
||||||
appliedLimitW = defaultLimitW
|
appliedLimitW = defaultLimitW
|
||||||
}
|
}
|
||||||
minLimitW := appliedLimitW
|
minLimitW := int(math.Round(info.MinPowerLimitW))
|
||||||
switch {
|
if minLimitW <= 0 {
|
||||||
case defaultLimitW > 0:
|
minLimitW = appliedLimitW
|
||||||
minLimitW = defaultLimitW - maxDerateW
|
|
||||||
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
|
|
||||||
if minLimitW < floorByRatio {
|
|
||||||
minLimitW = floorByRatio
|
|
||||||
}
|
|
||||||
case appliedLimitW > 0:
|
|
||||||
minLimitW = appliedLimitW - maxDerateW
|
|
||||||
}
|
}
|
||||||
if minLimitW < calibSearchTolerance {
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
minLimitW = calibSearchTolerance
|
if maxLimitW > 0 && appliedLimitW > maxLimitW {
|
||||||
|
appliedLimitW = maxLimitW
|
||||||
}
|
}
|
||||||
s := &gpuCalibState{
|
s := &gpuCalibState{
|
||||||
idx: idx,
|
idx: idx,
|
||||||
@@ -3141,11 +3183,24 @@ func runBenchmarkPowerCalibration(
|
|||||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||||
}
|
}
|
||||||
|
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
|
||||||
|
s.appliedLimitW = minLimitW
|
||||||
|
s.hi = minLimitW + 1
|
||||||
|
}
|
||||||
|
if info.MinPowerLimitW <= 0 {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
|
||||||
|
}
|
||||||
if seedLimits != nil {
|
if seedLimits != nil {
|
||||||
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
||||||
// A previously validated limit is only a starting point. Re-run
|
// A previously validated limit is only a starting point. Re-run
|
||||||
// targeted_power under the current multi-GPU thermal load and derate
|
// targeted_power under the current multi-GPU thermal load and derate
|
||||||
// again if this step shows new throttling.
|
// again if this step shows new throttling.
|
||||||
|
if seedW < s.minLimitW {
|
||||||
|
seedW = s.minLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && seedW > maxLimitW {
|
||||||
|
seedW = maxLimitW
|
||||||
|
}
|
||||||
if canDerate {
|
if canDerate {
|
||||||
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
||||||
}
|
}
|
||||||
@@ -3333,6 +3388,7 @@ calibDone:
|
|||||||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||||
s.lo = s.appliedLimitW
|
s.lo = s.appliedLimitW
|
||||||
|
s.loVerified = true
|
||||||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||||||
next := roundTo5W((s.lo + s.hi) / 2)
|
next := roundTo5W((s.lo + s.hi) / 2)
|
||||||
if next > s.lo && next < s.hi {
|
if next > s.lo && next < s.hi {
|
||||||
@@ -3371,7 +3427,23 @@ calibDone:
|
|||||||
s.hi = s.appliedLimitW
|
s.hi = s.appliedLimitW
|
||||||
|
|
||||||
if s.hi-s.lo <= calibSearchTolerance {
|
if s.hi-s.lo <= calibSearchTolerance {
|
||||||
if s.lo > s.minLimitW {
|
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
|
||||||
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
|
||||||
|
s.converged = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.appliedLimitW = s.minLimitW
|
||||||
|
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
|
||||||
|
s.calib.Derated = s.minLimitW < s.originalLimitW
|
||||||
|
s.info.PowerLimitW = float64(s.minLimitW)
|
||||||
|
infoByIndex[s.idx] = s.info
|
||||||
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s.loVerified {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||||||
s.appliedLimitW = s.lo
|
s.appliedLimitW = s.lo
|
||||||
@@ -3383,7 +3455,8 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3398,9 +3471,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
next = s.minLimitW
|
||||||
s.converged = true
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
@@ -3439,6 +3510,24 @@ func roundTo5W(w int) int {
|
|||||||
return ((w + 2) / 5) * 5
|
return ((w + 2) / 5) * 5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
|
||||||
|
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
|
||||||
|
currentLimitW := int(math.Round(info.PowerLimitW))
|
||||||
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
|
|
||||||
|
startW := defaultLimitW
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = currentLimitW
|
||||||
|
}
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && startW > maxLimitW {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
return startW
|
||||||
|
}
|
||||||
|
|
||||||
// meanFanRPM returns the average RPM across a set of fan readings.
|
// meanFanRPM returns the average RPM across a set of fan readings.
|
||||||
func meanFanRPM(fans []FanReading) float64 {
|
func meanFanRPM(fans []FanReading) float64 {
|
||||||
if len(fans) == 0 {
|
if len(fans) == 0 {
|
||||||
@@ -4096,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
}
|
}
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
|
||||||
if infoErr != nil {
|
|
||||||
return "", infoErr
|
|
||||||
}
|
|
||||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
|
||||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
|
||||||
}
|
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -4114,6 +4195,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
|
if infoErr != nil {
|
||||||
|
return "", infoErr
|
||||||
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample server idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
@@ -4139,6 +4228,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
|
result.Findings = append(result.Findings,
|
||||||
|
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldExec := satExecCommand
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
t.Fatalf("unexpected command: %s %v", name, args)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
satExecCommand = oldExec
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
script := filepath.Join(dir, "nvidia-smi")
|
||||||
|
argsLog := filepath.Join(dir, "args.log")
|
||||||
|
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write script: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "nvidia-smi" {
|
||||||
|
return script, nil
|
||||||
|
}
|
||||||
|
return exec.LookPath(file)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
satLookPath = oldLookPath
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(argsLog)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read args log: %v", err)
|
||||||
|
}
|
||||||
|
got := strings.Fields(string(raw))
|
||||||
|
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||||
|
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||||
|
t.Fatalf("args=%v want %v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
Clocks
|
Clocks
|
||||||
Graphics : 2422 MHz
|
Graphics : 2422 MHz
|
||||||
Memory : 12481 MHz
|
Memory : 12481 MHz
|
||||||
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
|
|||||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
|
|||||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
Max Clocks
|
Max Clocks
|
||||||
Graphics : 9999 MHz
|
Graphics : 9999 MHz
|
||||||
Memory : 9999 MHz
|
Memory : 9999 MHz
|
||||||
`)
|
`)
|
||||||
// Already populated — must not be overwritten.
|
// Already populated — must not be overwritten.
|
||||||
infoByIndex := map[int]benchmarkGPUInfo{
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nvidiaRecoverHelper is the absolute path of the helper executable that
// runNvidiaRecover launches through sudo.
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
|||||||
if index < 0 {
|
if index < 0 {
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
}
|
}
|
||||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
if len(raw) == 0 && err == nil {
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
raw = []byte("GPU reset completed.\n")
|
out = "GPU reset completed.\n"
|
||||||
}
|
}
|
||||||
return string(raw), err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return runNvidiaRecover("restart-drivers")
|
||||||
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
|
|||||||
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
@@ -0,0 +1,178 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||||
|
|
||||||
|
# Treat expansion of an unset variable as an error. Note that errexit (-e) is
# not enabled here, so a failing command does not abort the script by itself.
set -u
|
||||||
|
|
||||||
|
# log MESSAGE...
# Write the arguments, joined by spaces, as one line on stdout prefixed with
# the script tag.
# printf is used instead of echo: how echo treats backslash sequences is
# implementation-defined (dash expands them), which would mangle messages
# that embed arbitrary text such as patterns or unit names.
log() {
    printf '%s\n' "[bee-nvidia-recover] $*"
}
|
||||||
|
|
||||||
|
# log_blocker DESCRIPTION...
# Report one GPU "blocker" (a service or process found holding the GPU) as a
# single stdout line tagged "blocker:".
# The description is often a raw process command line taken from pgrep, so it
# is emitted with printf rather than echo to keep backslash sequences
# verbatim (echo's handling of them is implementation-defined).
log_blocker() {
    printf '%s\n' "[bee-nvidia-recover] blocker: $*"
}
|
||||||
|
|
||||||
|
# usage
# Print the command synopsis on stdout; callers redirect it to stderr when
# reporting an invocation error.
usage() {
    printf '%s\n' \
        'usage:' \
        'bee-nvidia-recover restart-drivers' \
        'bee-nvidia-recover reset-gpu <index>'
}
|
||||||
|
|
||||||
|
# unit_exists UNIT
# Succeed when `systemctl cat UNIT` succeeds; all of its output is discarded.
# The exit status is that of systemctl.
unit_exists() {
    systemctl cat "$1" 1>/dev/null 2>/dev/null
}
|
||||||
|
|
||||||
|
# unit_is_active UNIT
# Succeed when `systemctl is-active --quiet UNIT` reports the unit as active.
# Error output is discarded; the exit status is that of systemctl.
unit_is_active() {
    systemctl is-active --quiet "$1" 2>/dev/null
}
|
||||||
|
|
||||||
|
# stop_unit_if_active UNIT
# Stop UNIT when it is currently active.
# Returns 0 when the unit was active and a stop was requested (regardless of
# whether `systemctl stop` itself succeeded), 1 when it was not active.
# Callers use the result as a "was running before" marker.
stop_unit_if_active() {
    unit="$1"
    unit_is_active "$unit" || return 1

    log "stopping $unit"
    systemctl stop "$unit"
    return 0
}
|
||||||
|
|
||||||
|
# start_unit_if_marked UNIT MARKER
# Start UNIT only when MARKER is "1" and the unit exists.
# Returns 0 when nothing had to be started, otherwise the exit status of
# `systemctl start`.
start_unit_if_marked() {
    unit="$1"
    marker="$2"

    [ "$marker" = "1" ] || return 0
    unit_exists "$unit" || return 0

    log "starting $unit"
    systemctl start "$unit"
}
|
||||||
|
|
||||||
|
# wait_for_process_exit NAME
# Poll once per second until no process whose name is exactly NAME remains.
# Returns 0 as soon as the process is gone; after the 15th check that still
# finds it, logs a warning and returns 1.
wait_for_process_exit() {
    name="$1"
    tries=0

    while :; do
        pgrep -x "$name" >/dev/null 2>&1 || return 0

        tries=$((tries + 1))
        if [ "$tries" -ge 15 ]; then
            log "WARN: $name is still running after stop request"
            return 1
        fi
        sleep 1
    done
}
|
||||||
|
|
||||||
|
# kill_pattern PATTERN
# Terminate every process whose full command line matches PATTERN (pgrep -f
# semantics). Does nothing when there is no match.
# Each matching process is first reported as a blocker, then sent SIGTERM
# and, one second later, SIGKILL. Always returns 0.
kill_pattern() {
    pattern="$1"

    pgrep -f "$pattern" >/dev/null 2>&1 || return 0

    # Report what is about to be killed before any signal is sent.
    pgrep -af "$pattern" 2>/dev/null | while IFS= read -r line; do
        if [ -n "$line" ]; then
            log_blocker "$line"
        fi
    done

    log "killing processes matching: $pattern"
    pkill -TERM -f "$pattern" >/dev/null 2>&1 || true
    sleep 1
    # Anything that ignored SIGTERM is force-killed; a miss is not an error.
    pkill -KILL -f "$pattern" >/dev/null 2>&1 || true
}
|
||||||
|
|
||||||
|
# drain_gpu_clients
# Stop or kill everything this script knows may be holding the NVIDIA devices
# so a reset or module unload can proceed:
#   1. display manager units (display-manager.service, lightdm.service),
#   2. nvidia-fabricmanager.service,
#   3. the nv-hostengine daemon (SIGTERM, then SIGKILL if it does not exit),
#   4. a fixed list of GPU tools and stress programs, matched by command line.
# Everything found is reported through log_blocker.
#
# Sets the globals display_was_active and fabric_was_active to 1 when the
# corresponding service had to be stopped; restore_gpu_clients reads them to
# decide what to start again.
drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0

    for unit in display-manager.service lightdm.service; do
        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
            log_blocker "service $unit"
            display_was_active=1
        fi
    done

    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
        log_blocker "service nvidia-fabricmanager.service"
        fabric_was_active=1
    fi

    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        # List with the same exact-name match (-x) that detected the daemon.
        # Matching the full command line against ^nv-hostengine$ would list
        # nothing when it was started with a path or with arguments.
        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        # Escalate to SIGKILL only when the graceful stop timed out.
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    fi

    for pattern in \
        "nvidia-smi" \
        "dcgmi" \
        "nvvs" \
        "dcgmproftester" \
        "all_reduce_perf" \
        "nvtop" \
        "bee-gpu-burn" \
        "bee-john-gpu-stress" \
        "bee-nccl-gpu-stress" \
        "Xorg" \
        "Xwayland"; do
        kill_pattern "$pattern"
    done
}
|
||||||
|
|
||||||
|
# restore_gpu_clients
# Bring back the GPU clients after a reset or driver reload:
#   - enable persistence mode through nvidia-smi, when nvidia-smi is installed,
#   - launch nv-hostengine when it is installed and not already running,
#   - start nvidia-fabricmanager.service and the display manager again if
#     drain_gpu_clients recorded them as previously active.
# Reads the globals fabric_was_active and display_was_active, defaulting to 0
# when they are unset.
restore_gpu_clients() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        if nvidia-smi -pm 1 >/dev/null 2>&1; then
            log "enabled NVIDIA persistence mode"
        else
            log "WARN: failed to enable NVIDIA persistence mode"
        fi
    fi

    if command -v nv-hostengine >/dev/null 2>&1; then
        if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
            log "starting nv-hostengine"
            nv-hostengine
        fi
    fi

    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
    start_unit_if_marked display-manager.service "${display_was_active:-0}"

    # lightdm is started explicitly only when starting display-manager.service
    # above did not already leave it active.
    case "${display_was_active:-0}" in
        1)
            if unit_exists lightdm.service && ! unit_is_active lightdm.service; then
                start_unit_if_marked lightdm.service "1"
            fi
            ;;
    esac
}
|
||||||
|
|
||||||
|
# restart_drivers
# Drain GPU clients, unload the NVIDIA kernel modules, remove the stale device
# nodes, reload the driver stack through bee-nvidia-load, then restore the
# clients.
#
# Returns non-zero when a loaded module could not be unloaded or when
# bee-nvidia-load failed; otherwise returns the status of restore_gpu_clients.
# The status is tracked explicitly because errexit is not enabled: without it
# the function would report only the last command's result, and a failed
# reload would look like success to the caller.
restart_drivers() {
    restart_rc=0

    drain_gpu_clients

    # Unload order as listed: nvidia_uvm, nvidia_drm, nvidia_modeset, nvidia.
    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            if ! rmmod "$mod"; then
                log "WARN: failed to unload module $mod"
                restart_rc=1
            fi
        fi
    done

    # Best-effort cleanup of device nodes left behind by the old driver.
    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true

    log "reloading NVIDIA driver stack"
    /usr/local/bin/bee-nvidia-load || restart_rc=$?

    # Clients are restored even after a failure so services are not left down.
    restore_gpu_clients
    restart_restore_rc=$?
    if [ "$restart_rc" -eq 0 ]; then
        restart_rc=$restart_restore_rc
    fi
    return "$restart_rc"
}
|
||||||
|
|
||||||
|
# reset_gpu INDEX
# Drain GPU clients, run `nvidia-smi -r -i INDEX`, then restore the clients.
#
# Returns the exit status of nvidia-smi when the reset failed; otherwise the
# status of restore_gpu_clients. The reset status is captured explicitly
# because restore_gpu_clients runs afterwards and would otherwise overwrite
# it, making a failed reset look successful to the caller.
reset_gpu() {
    index="$1"

    drain_gpu_clients

    log "resetting GPU $index"
    reset_rc=0
    nvidia-smi -r -i "$index" || reset_rc=$?

    # Clients are restored even after a failed reset so services come back.
    restore_gpu_clients
    reset_restore_rc=$?
    if [ "$reset_rc" -eq 0 ]; then
        reset_rc=$reset_restore_rc
    fi
    return "$reset_rc"
}
|
||||||
|
|
||||||
|
# Entry point: dispatch on the first argument.
#   restart-drivers     reload the NVIDIA driver stack
#   reset-gpu <index>   reset one GPU; requires exactly two arguments
# Anything else prints the usage text to stderr and exits with status 2.
# The script's exit status is that of the selected action.
cmd="${1:-}"

if [ "$cmd" = "restart-drivers" ]; then
    restart_drivers
elif [ "$cmd" = "reset-gpu" ] && [ "$#" -eq 2 ]; then
    reset_gpu "$2"
else
    usage >&2
    exit 2
fi
|
||||||
Reference in New Issue
Block a user