Compare commits
21 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,3 +3,4 @@
|
|||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
build-cache/
|
build-cache/
|
||||||
|
audit/bee
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
|
|||||||
VBIOS string
|
VBIOS string
|
||||||
PowerLimitW float64
|
PowerLimitW float64
|
||||||
DefaultPowerLimitW float64
|
DefaultPowerLimitW float64
|
||||||
|
MinPowerLimitW float64
|
||||||
|
MaxPowerLimitW float64
|
||||||
MaxGraphicsClockMHz float64
|
MaxGraphicsClockMHz float64
|
||||||
MaxMemoryClockMHz float64
|
MaxMemoryClockMHz float64
|
||||||
BaseGraphicsClockMHz float64
|
BaseGraphicsClockMHz float64
|
||||||
@@ -65,6 +67,13 @@ type benchmarkPowerCalibrationResult struct {
|
|||||||
MetricRows []GPUMetricRow
|
MetricRows []GPUMetricRow
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type benchmarkPowerCalibrationRunSummary struct {
|
||||||
|
LoadedSDR benchmarkSDRSeriesSummary
|
||||||
|
AvgFanRPM float64
|
||||||
|
AvgFanDutyCyclePct float64
|
||||||
|
FanSamples int
|
||||||
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
name string
|
name string
|
||||||
category string
|
category string
|
||||||
@@ -95,6 +104,8 @@ var (
|
|||||||
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||||
@@ -220,8 +231,6 @@ func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters)
|
|||||||
return "hw_thermal"
|
return "hw_thermal"
|
||||||
case diff.SWThermalSlowdownUS > 0:
|
case diff.SWThermalSlowdownUS > 0:
|
||||||
return "sw_thermal"
|
return "sw_thermal"
|
||||||
case diff.HWPowerBrakeSlowdownUS > 0:
|
|
||||||
return "hw_power_brake"
|
|
||||||
default:
|
default:
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
@@ -240,6 +249,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if benchmarkGeteuid() != 0 {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
|
||||||
|
}
|
||||||
|
return append([]int(nil), gpuIndices...)
|
||||||
|
}
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var failed []int
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||||
|
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||||
|
failed = append(failed, idx)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
|
||||||
|
}
|
||||||
|
benchmarkSleep(time.Second)
|
||||||
|
}
|
||||||
|
return failed
|
||||||
|
}
|
||||||
|
|
||||||
func benchmarkPowerEngine() string {
|
func benchmarkPowerEngine() string {
|
||||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||||
case BenchmarkPowerEngineTargetedPower:
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
@@ -351,9 +393,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
// Enrich with max clocks from verbose output — covers GPUs where
|
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
|
||||||
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
|
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
|
||||||
|
|
||||||
activeApps, err := queryActiveComputeApps(selected)
|
activeApps, err := queryActiveComputeApps(selected)
|
||||||
if err == nil && len(activeApps) > 0 {
|
if err == nil && len(activeApps) > 0 {
|
||||||
@@ -737,8 +779,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
// all driver versions, so we fall back to the base set if the full query fails.
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
||||||
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
|
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
|
||||||
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
|
// then recovered from nvidia-smi -q.
|
||||||
var benchmarkGPUInfoQueries = []struct {
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
fields string
|
fields string
|
||||||
extended bool // whether this query includes optional extended fields
|
extended bool // whether this query includes optional extended fields
|
||||||
@@ -758,12 +800,9 @@ var benchmarkGPUInfoQueries = []struct {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
|
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
|
||||||
// any GPU in infoByIndex where those values are still zero. It parses the
|
// for fields that may be missing from --query-gpu on some driver versions.
|
||||||
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
|
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||||
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
|
|
||||||
// return exit status 2 but the verbose query works fine.
|
|
||||||
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
|
||||||
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -784,6 +823,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||||
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||||
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
||||||
@@ -843,6 +884,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if info.MinPowerLimitW == 0 {
|
||||||
|
if m := minPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MinPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.MaxPowerLimitW == 0 {
|
||||||
|
if m := maxPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.MaxPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if info.MultiprocessorCount == 0 {
|
if info.MultiprocessorCount == 0 {
|
||||||
if m := smCountRe.FindSubmatch(section); m != nil {
|
if m := smCountRe.FindSubmatch(section); m != nil {
|
||||||
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
||||||
@@ -2365,6 +2420,16 @@ type sdrPowerSnapshot struct {
|
|||||||
SkippedSensors []string // sensors rejected during self-healing
|
SkippedSensors []string // sensors rejected during self-healing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type benchmarkSDRSeriesSummary struct {
|
||||||
|
PSUInW float64
|
||||||
|
PSUOutW float64
|
||||||
|
GPUSlotW float64
|
||||||
|
PSUSlots map[string]BenchmarkPSUSlotPower
|
||||||
|
Samples int
|
||||||
|
|
||||||
|
SkippedSensors []string
|
||||||
|
}
|
||||||
|
|
||||||
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
|
// sdrSensor is a name+watts pair used for GPU slot self-healing filtering.
|
||||||
type sdrSensor struct {
|
type sdrSensor struct {
|
||||||
name string
|
name string
|
||||||
@@ -2494,6 +2559,137 @@ func sampleIPMISDRPowerSensors() sdrPowerSnapshot {
|
|||||||
return snap
|
return snap
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func startIPMISDRSampler(stopCh <-chan struct{}, intervalSec int) <-chan []sdrPowerSnapshot {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []sdrPowerSnapshot, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []sdrPowerSnapshot
|
||||||
|
record := func() {
|
||||||
|
snap := sampleIPMISDRPowerSensors()
|
||||||
|
if snap.PSUInW <= 0 && snap.PSUOutW <= 0 && snap.GPUSlotW <= 0 && len(snap.PSUSlots) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
samples = append(samples, snap)
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
func summarizeSDRPowerSeries(samples []sdrPowerSnapshot) benchmarkSDRSeriesSummary {
|
||||||
|
var summary benchmarkSDRSeriesSummary
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return summary
|
||||||
|
}
|
||||||
|
|
||||||
|
type slotAggregate struct {
|
||||||
|
inputs []float64
|
||||||
|
outputs []float64
|
||||||
|
status string
|
||||||
|
}
|
||||||
|
|
||||||
|
slotAgg := make(map[string]*slotAggregate)
|
||||||
|
skippedSet := make(map[string]struct{})
|
||||||
|
var inputTotals []float64
|
||||||
|
var outputTotals []float64
|
||||||
|
var gpuSlotTotals []float64
|
||||||
|
|
||||||
|
for _, sample := range samples {
|
||||||
|
if sample.PSUInW > 0 {
|
||||||
|
inputTotals = append(inputTotals, sample.PSUInW)
|
||||||
|
}
|
||||||
|
if sample.PSUOutW > 0 {
|
||||||
|
outputTotals = append(outputTotals, sample.PSUOutW)
|
||||||
|
}
|
||||||
|
if sample.GPUSlotW > 0 {
|
||||||
|
gpuSlotTotals = append(gpuSlotTotals, sample.GPUSlotW)
|
||||||
|
}
|
||||||
|
for _, skipped := range sample.SkippedSensors {
|
||||||
|
if skipped != "" {
|
||||||
|
skippedSet[skipped] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for slot, reading := range sample.PSUSlots {
|
||||||
|
agg := slotAgg[slot]
|
||||||
|
if agg == nil {
|
||||||
|
agg = &slotAggregate{}
|
||||||
|
slotAgg[slot] = agg
|
||||||
|
}
|
||||||
|
if reading.InputW != nil && *reading.InputW > 0 {
|
||||||
|
agg.inputs = append(agg.inputs, *reading.InputW)
|
||||||
|
}
|
||||||
|
if reading.OutputW != nil && *reading.OutputW > 0 {
|
||||||
|
agg.outputs = append(agg.outputs, *reading.OutputW)
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case reading.Status == "":
|
||||||
|
case agg.status == "":
|
||||||
|
agg.status = reading.Status
|
||||||
|
case agg.status == "OK" && reading.Status != "OK":
|
||||||
|
agg.status = reading.Status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
summary.PSUInW = benchmarkMean(inputTotals)
|
||||||
|
summary.PSUOutW = benchmarkMean(outputTotals)
|
||||||
|
summary.GPUSlotW = benchmarkMean(gpuSlotTotals)
|
||||||
|
summary.Samples = len(samples)
|
||||||
|
|
||||||
|
if len(slotAgg) > 0 {
|
||||||
|
summary.PSUSlots = make(map[string]BenchmarkPSUSlotPower, len(slotAgg))
|
||||||
|
for slot, agg := range slotAgg {
|
||||||
|
reading := BenchmarkPSUSlotPower{Status: agg.status}
|
||||||
|
if mean := benchmarkMean(agg.inputs); mean > 0 {
|
||||||
|
v := mean
|
||||||
|
reading.InputW = &v
|
||||||
|
}
|
||||||
|
if mean := benchmarkMean(agg.outputs); mean > 0 {
|
||||||
|
v := mean
|
||||||
|
reading.OutputW = &v
|
||||||
|
}
|
||||||
|
summary.PSUSlots[slot] = reading
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(skippedSet) > 0 {
|
||||||
|
summary.SkippedSensors = make([]string, 0, len(skippedSet))
|
||||||
|
for skipped := range skippedSet {
|
||||||
|
summary.SkippedSensors = append(summary.SkippedSensors, skipped)
|
||||||
|
}
|
||||||
|
sort.Strings(summary.SkippedSensors)
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectIPMISDRPowerSeries(ctx context.Context, durationSec, intervalSec int) benchmarkSDRSeriesSummary {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return benchmarkSDRSeriesSummary{}
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startIPMISDRSampler(stopCh, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return summarizeSDRPowerSeries(<-doneCh)
|
||||||
|
}
|
||||||
|
|
||||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||||
func queryIPMIServerPowerW() (float64, error) {
|
func queryIPMIServerPowerW() (float64, error) {
|
||||||
@@ -3038,12 +3234,12 @@ func runBenchmarkPowerCalibration(
|
|||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
seedLimits map[int]int,
|
seedLimits map[int]int,
|
||||||
durationSec int,
|
durationSec int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow, benchmarkPowerCalibrationRunSummary) {
|
||||||
calibDurationSec := durationSec
|
calibDurationSec := durationSec
|
||||||
|
var runSummary benchmarkPowerCalibrationRunSummary
|
||||||
if calibDurationSec <= 0 {
|
if calibDurationSec <= 0 {
|
||||||
calibDurationSec = 120
|
calibDurationSec = 120
|
||||||
}
|
}
|
||||||
const maxDerateW = 150
|
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
const calibSearchTolerance = 10
|
const calibSearchTolerance = 10
|
||||||
@@ -3058,12 +3254,12 @@ func runBenchmarkPowerCalibration(
|
|||||||
if engine == BenchmarkPowerEngineTargetedPower {
|
if engine == BenchmarkPowerEngineTargetedPower {
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
||||||
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil, runSummary
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||||
@@ -3090,8 +3286,9 @@ func runBenchmarkPowerCalibration(
|
|||||||
originalLimitW int
|
originalLimitW int
|
||||||
appliedLimitW int
|
appliedLimitW int
|
||||||
minLimitW int
|
minLimitW int
|
||||||
lo int // highest verified-stable limit (assumed: minLimitW)
|
lo int // highest verified-stable limit
|
||||||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||||||
|
loVerified bool
|
||||||
calib benchmarkPowerCalibrationResult
|
calib benchmarkPowerCalibrationResult
|
||||||
converged bool
|
converged bool
|
||||||
}
|
}
|
||||||
@@ -3113,23 +3310,17 @@ func runBenchmarkPowerCalibration(
|
|||||||
if defaultLimitW <= 0 {
|
if defaultLimitW <= 0 {
|
||||||
defaultLimitW = originalLimitW
|
defaultLimitW = originalLimitW
|
||||||
}
|
}
|
||||||
appliedLimitW := originalLimitW
|
appliedLimitW := initialBenchmarkCalibrationLimitW(info)
|
||||||
if appliedLimitW <= 0 {
|
if appliedLimitW <= 0 {
|
||||||
appliedLimitW = defaultLimitW
|
appliedLimitW = defaultLimitW
|
||||||
}
|
}
|
||||||
minLimitW := appliedLimitW
|
minLimitW := int(math.Round(info.MinPowerLimitW))
|
||||||
switch {
|
if minLimitW <= 0 {
|
||||||
case defaultLimitW > 0:
|
minLimitW = appliedLimitW
|
||||||
minLimitW = defaultLimitW - maxDerateW
|
|
||||||
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
|
|
||||||
if minLimitW < floorByRatio {
|
|
||||||
minLimitW = floorByRatio
|
|
||||||
}
|
}
|
||||||
case appliedLimitW > 0:
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
minLimitW = appliedLimitW - maxDerateW
|
if maxLimitW > 0 && appliedLimitW > maxLimitW {
|
||||||
}
|
appliedLimitW = maxLimitW
|
||||||
if minLimitW < calibSearchTolerance {
|
|
||||||
minLimitW = calibSearchTolerance
|
|
||||||
}
|
}
|
||||||
s := &gpuCalibState{
|
s := &gpuCalibState{
|
||||||
idx: idx,
|
idx: idx,
|
||||||
@@ -3141,11 +3332,24 @@ func runBenchmarkPowerCalibration(
|
|||||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||||
}
|
}
|
||||||
|
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
|
||||||
|
s.appliedLimitW = minLimitW
|
||||||
|
s.hi = minLimitW + 1
|
||||||
|
}
|
||||||
|
if info.MinPowerLimitW <= 0 {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
|
||||||
|
}
|
||||||
if seedLimits != nil {
|
if seedLimits != nil {
|
||||||
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
||||||
// A previously validated limit is only a starting point. Re-run
|
// A previously validated limit is only a starting point. Re-run
|
||||||
// targeted_power under the current multi-GPU thermal load and derate
|
// targeted_power under the current multi-GPU thermal load and derate
|
||||||
// again if this step shows new throttling.
|
// again if this step shows new throttling.
|
||||||
|
if seedW < s.minLimitW {
|
||||||
|
seedW = s.minLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && seedW > maxLimitW {
|
||||||
|
seedW = maxLimitW
|
||||||
|
}
|
||||||
if canDerate {
|
if canDerate {
|
||||||
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
||||||
}
|
}
|
||||||
@@ -3220,6 +3424,10 @@ calibDone:
|
|||||||
}
|
}
|
||||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||||
doneCh := make(chan sharedAttemptResult, 1)
|
doneCh := make(chan sharedAttemptResult, 1)
|
||||||
|
sdrStopCh := make(chan struct{})
|
||||||
|
sdrDoneCh := startIPMISDRSampler(sdrStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
|
fanStopCh := make(chan struct{})
|
||||||
|
fanDoneCh := startBenchmarkFanSampler(fanStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
go func() {
|
||||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
||||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||||
@@ -3259,6 +3467,10 @@ calibDone:
|
|||||||
}
|
}
|
||||||
ticker.Stop()
|
ticker.Stop()
|
||||||
cancelAttempt()
|
cancelAttempt()
|
||||||
|
close(sdrStopCh)
|
||||||
|
close(fanStopCh)
|
||||||
|
attemptSDRSummary := summarizeSDRPowerSeries(<-sdrDoneCh)
|
||||||
|
attemptFanSummary := <-fanDoneCh
|
||||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||||
// Accumulate telemetry rows with attempt stage label.
|
// Accumulate telemetry rows with attempt stage label.
|
||||||
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||||
@@ -3296,10 +3508,14 @@ calibDone:
|
|||||||
busyDelaySec = 1
|
busyDelaySec = 1
|
||||||
|
|
||||||
// Per-GPU analysis and binary search update.
|
// Per-GPU analysis and binary search update.
|
||||||
|
attemptStable := ar.err == nil
|
||||||
for _, s := range active {
|
for _, s := range active {
|
||||||
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
||||||
summary := summarizeBenchmarkTelemetry(perGPU)
|
summary := summarizeBenchmarkTelemetry(perGPU)
|
||||||
throttle := throttleReasons[s.idx]
|
throttle := throttleReasons[s.idx]
|
||||||
|
if throttle != "" || summary.P95PowerW <= 0 {
|
||||||
|
attemptStable = false
|
||||||
|
}
|
||||||
|
|
||||||
// Cooling warning: thermal throttle with fans not at maximum.
|
// Cooling warning: thermal throttle with fans not at maximum.
|
||||||
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
||||||
@@ -3333,6 +3549,7 @@ calibDone:
|
|||||||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||||
s.lo = s.appliedLimitW
|
s.lo = s.appliedLimitW
|
||||||
|
s.loVerified = true
|
||||||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||||||
next := roundTo5W((s.lo + s.hi) / 2)
|
next := roundTo5W((s.lo + s.hi) / 2)
|
||||||
if next > s.lo && next < s.hi {
|
if next > s.lo && next < s.hi {
|
||||||
@@ -3371,7 +3588,23 @@ calibDone:
|
|||||||
s.hi = s.appliedLimitW
|
s.hi = s.appliedLimitW
|
||||||
|
|
||||||
if s.hi-s.lo <= calibSearchTolerance {
|
if s.hi-s.lo <= calibSearchTolerance {
|
||||||
if s.lo > s.minLimitW {
|
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
|
||||||
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
|
||||||
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
|
||||||
|
s.converged = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.appliedLimitW = s.minLimitW
|
||||||
|
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
|
||||||
|
s.calib.Derated = s.minLimitW < s.originalLimitW
|
||||||
|
s.info.PowerLimitW = float64(s.minLimitW)
|
||||||
|
infoByIndex[s.idx] = s.info
|
||||||
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s.loVerified {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||||||
s.appliedLimitW = s.lo
|
s.appliedLimitW = s.lo
|
||||||
@@ -3383,7 +3616,8 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3398,9 +3632,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
next = s.minLimitW
|
||||||
s.converged = true
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||||
@@ -3416,6 +3648,16 @@ calibDone:
|
|||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
||||||
}
|
}
|
||||||
|
if attemptStable {
|
||||||
|
if attemptSDRSummary.Samples > 0 {
|
||||||
|
runSummary.LoadedSDR = attemptSDRSummary
|
||||||
|
}
|
||||||
|
if attemptFanSummary.FanSamples > 0 {
|
||||||
|
runSummary.AvgFanRPM = attemptFanSummary.AvgFanRPM
|
||||||
|
runSummary.AvgFanDutyCyclePct = attemptFanSummary.AvgFanDutyCyclePct
|
||||||
|
runSummary.FanSamples = attemptFanSummary.FanSamples
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, s := range states {
|
for _, s := range states {
|
||||||
@@ -3424,7 +3666,7 @@ calibDone:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||||
return results, restore, allCalibRows
|
return results, restore, allCalibRows, runSummary
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||||
@@ -3439,6 +3681,24 @@ func roundTo5W(w int) int {
|
|||||||
return ((w + 2) / 5) * 5
|
return ((w + 2) / 5) * 5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func initialBenchmarkCalibrationLimitW(info benchmarkGPUInfo) int {
|
||||||
|
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
|
||||||
|
currentLimitW := int(math.Round(info.PowerLimitW))
|
||||||
|
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||||
|
|
||||||
|
startW := defaultLimitW
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = currentLimitW
|
||||||
|
}
|
||||||
|
if startW <= 0 {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
if maxLimitW > 0 && startW > maxLimitW {
|
||||||
|
startW = maxLimitW
|
||||||
|
}
|
||||||
|
return startW
|
||||||
|
}
|
||||||
|
|
||||||
// meanFanRPM returns the average RPM across a set of fan readings.
|
// meanFanRPM returns the average RPM across a set of fan readings.
|
||||||
func meanFanRPM(fans []FanReading) float64 {
|
func meanFanRPM(fans []FanReading) float64 {
|
||||||
if len(fans) == 0 {
|
if len(fans) == 0 {
|
||||||
@@ -3451,6 +3711,47 @@ func meanFanRPM(fans []FanReading) float64 {
|
|||||||
return sum / float64(len(fans))
|
return sum / float64(len(fans))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func startBenchmarkFanSampler(stopCh <-chan struct{}, intervalSec int) <-chan benchmarkPowerCalibrationRunSummary {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan benchmarkPowerCalibrationRunSummary, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var rpmSamples []float64
|
||||||
|
var dutySamples []float64
|
||||||
|
record := func() {
|
||||||
|
fans, err := sampleFanSpeeds()
|
||||||
|
if err != nil || len(fans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if rpm := meanFanRPM(fans); rpm > 0 {
|
||||||
|
rpmSamples = append(rpmSamples, rpm)
|
||||||
|
}
|
||||||
|
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok && duty > 0 {
|
||||||
|
dutySamples = append(dutySamples, duty)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- benchmarkPowerCalibrationRunSummary{
|
||||||
|
AvgFanRPM: benchmarkMean(rpmSamples),
|
||||||
|
AvgFanDutyCyclePct: benchmarkMean(dutySamples),
|
||||||
|
FanSamples: len(rpmSamples),
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
func powerBenchDurationSec(profile string) int {
|
func powerBenchDurationSec(profile string) int {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
@@ -3479,41 +3780,39 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||||
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||||
fmt.Fprintf(&b, "**Server power delta (IPMI DCMI):** %.0f W \n", sp.DeltaW)
|
sourceLabel := "autotuned source"
|
||||||
if sp.PSUInputLoadedW > 0 {
|
switch normalizeBenchmarkPowerSource(sp.Source) {
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
fmt.Fprintf(&b, "**PSU AC input Δ (IPMI SDR):** %.0f W \n", psuDelta)
|
sourceLabel = "autotuned source (SDR PSU AC input)"
|
||||||
|
case BenchmarkPowerSourceDCMI:
|
||||||
|
sourceLabel = "autotuned source (DCMI)"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU actual sum):** %.2f \n", sp.ReportingRatio)
|
fmt.Fprintf(&b, "**Server power delta (%s):** %.0f W \n", sourceLabel, sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, "**Reporting ratio:** %.2f \n", sp.ReportingRatio)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
// Server power comparison table.
|
// Server power comparison table.
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||||
|
selectedSource := normalizeBenchmarkPowerSource(sp.Source)
|
||||||
|
selectedSourceLabel := "Selected source"
|
||||||
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput {
|
||||||
|
selectedSourceLabel = "Selected source (SDR PSU AC input)"
|
||||||
|
} else if selectedSource == BenchmarkPowerSourceDCMI {
|
||||||
|
selectedSourceLabel = "Selected source (DCMI)"
|
||||||
|
}
|
||||||
var spRows [][]string
|
var spRows [][]string
|
||||||
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
|
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||||
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
|
||||||
if sp.GPUSlotTotalW > 0 {
|
|
||||||
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
|
|
||||||
}
|
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " idle power", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||||
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " loaded power", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||||
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
spRows = append(spRows, []string{selectedSourceLabel + " Δ power (loaded − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||||
}
|
}
|
||||||
if sp.PSUInputLoadedW > 0 {
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 {
|
||||||
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
spRows = append(spRows, []string{"PSU AC input (idle avg, pre-load phase)", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||||
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
spRows = append(spRows, []string{"PSU AC input (loaded avg, final phase)", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
|
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", fmt.Sprintf("%.0f W", psuDelta)})
|
||||||
}
|
|
||||||
if sp.PSUOutputLoadedW > 0 {
|
|
||||||
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
|
|
||||||
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
|
|
||||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
|
||||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
|
||||||
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
ratio := sp.ReportingRatio
|
ratio := sp.ReportingRatio
|
||||||
@@ -3530,8 +3829,8 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||||
}
|
}
|
||||||
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
if selectedSource == BenchmarkPowerSourceSDRPSUInput && sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||||
sdrNote := ""
|
sdrNote := ""
|
||||||
@@ -3543,12 +3842,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
sdrNote = "✗ significant discrepancy"
|
sdrNote = "✗ significant discrepancy"
|
||||||
}
|
}
|
||||||
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
spRows = append(spRows, []string{"PSU AC input reporting ratio", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
|
spRows = append(spRows, []string{"IPMI availability", "not available — IPMI not supported or ipmitool not found"})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
|
b.WriteString(fmtMDTable([]string{"Metric", "Value"}, spRows))
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||||
}
|
}
|
||||||
@@ -3600,11 +3899,10 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
psuDistRows = append(psuDistRows, []string{
|
psuDistRows = append(psuDistRows, []string{
|
||||||
slot,
|
slot,
|
||||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
|
||||||
deltaStr, status,
|
deltaStr, status,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
|
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle avg)", "AC Input (loaded avg)", "Load Δ", "Status"}, psuDistRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3652,7 +3950,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
fan,
|
fan,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
|
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Avg Fan RPM (duty%)"}, sgRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -3761,7 +4059,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
for _, slot := range psuSlots {
|
for _, slot := range psuSlots {
|
||||||
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
||||||
}
|
}
|
||||||
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
|
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Avg Fan RPM (duty%)")
|
||||||
|
|
||||||
var psuRows [][]string
|
var psuRows [][]string
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
@@ -3842,7 +4140,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
pdRows = append(pdRows, []string{
|
pdRows = append(pdRows, []string{
|
||||||
fmt.Sprintf("GPU %d", gpu.Index),
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
|
|
||||||
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
||||||
fmt.Sprintf("%.0f W", stable),
|
fmt.Sprintf("%.0f W", stable),
|
||||||
realization,
|
realization,
|
||||||
@@ -3855,13 +4152,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
pdRows = append(pdRows, []string{
|
pdRows = append(pdRows, []string{
|
||||||
"**Platform**",
|
"**Platform**",
|
||||||
fmt.Sprintf("**%.0f W**", totalDefault),
|
|
||||||
"—",
|
"—",
|
||||||
fmt.Sprintf("**%.0f W**", totalStable),
|
fmt.Sprintf("**%.0f W**", totalStable),
|
||||||
fmt.Sprintf("**%s**", platformReal),
|
fmt.Sprintf("**%s**", platformReal),
|
||||||
"",
|
"",
|
||||||
})
|
})
|
||||||
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
b.WriteString(fmtMDTable([]string{"GPU", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||||
@@ -4011,7 +4307,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
{"Avg Temp °C", singleTemp},
|
{"Avg Temp °C", singleTemp},
|
||||||
{"Power W", singlePwr},
|
{"Power W", singlePwr},
|
||||||
{"Per GPU wall W", singleWall},
|
{"Per GPU wall W", singleWall},
|
||||||
{"Fan RPM (duty%)", singleFan},
|
{"Avg Fan RPM (duty%)", singleFan},
|
||||||
}
|
}
|
||||||
if lastStep != nil {
|
if lastStep != nil {
|
||||||
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
||||||
@@ -4096,14 +4392,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
}
|
}
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
|
||||||
if infoErr != nil {
|
|
||||||
return "", infoErr
|
|
||||||
}
|
|
||||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
|
||||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
|
||||||
}
|
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -4114,23 +4402,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
|
if infoErr != nil {
|
||||||
|
return "", infoErr
|
||||||
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample server idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
var serverIdleOK bool
|
var serverIdleOK bool
|
||||||
|
idleSDRStopCh := make(chan struct{})
|
||||||
|
idleSDRCh := startIPMISDRSampler(idleSDRStopCh, benchmarkPowerAutotuneSampleInterval)
|
||||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
sdrIdle := sampleIPMISDRPowerSensors()
|
close(idleSDRStopCh)
|
||||||
|
sdrIdle := summarizeSDRPowerSeries(<-idleSDRCh)
|
||||||
psuBefore := psuStatusSnapshot()
|
psuBefore := psuStatusSnapshot()
|
||||||
|
|
||||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||||
singleIPMILoadedW := make(map[int]float64, len(selected))
|
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||||
|
singleRunSummaryByIndex := make(map[int]benchmarkPowerCalibrationRunSummary, len(selected))
|
||||||
var allRestoreActions []benchmarkRestoreAction
|
var allRestoreActions []benchmarkRestoreAction
|
||||||
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||||
var allPowerRows []GPUMetricRow
|
var allPowerRows []GPUMetricRow
|
||||||
@@ -4139,24 +4439,28 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||||
|
result.Findings = append(result.Findings,
|
||||||
|
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
singlePowerStopCh := make(chan struct{})
|
singlePowerStopCh := make(chan struct{})
|
||||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
c, restore, singleRows, singleRun := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
close(singlePowerStopCh)
|
close(singlePowerStopCh)
|
||||||
sdrSingle := sampleIPMISDRPowerSensors()
|
|
||||||
if samples := <-singlePowerCh; len(samples) > 0 {
|
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||||
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && singleRun.LoadedSDR.PSUInW > 0 {
|
||||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
singleIPMILoadedW[idx] = singleRun.LoadedSDR.PSUInW
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR avg): %.0f W", idx, singleRun.LoadedSDR.PSUInW))
|
||||||
}
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
calibByIndex[idx] = r
|
calibByIndex[idx] = r
|
||||||
}
|
}
|
||||||
|
singleRunSummaryByIndex[idx] = singleRun
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||||||
@@ -4199,11 +4503,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||||
gpu.Telemetry = &t
|
gpu.Telemetry = &t
|
||||||
}
|
}
|
||||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
if singleRun := singleRunSummaryByIndex[idx]; singleRun.AvgFanRPM > 0 {
|
||||||
gpu.AvgFanRPM = meanFanRPM(fans)
|
gpu.AvgFanRPM = singleRun.AvgFanRPM
|
||||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
gpu.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||||
gpu.AvgFanDutyCyclePct = duty
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
gpus = append(gpus, gpu)
|
gpus = append(gpus, gpu)
|
||||||
}
|
}
|
||||||
@@ -4259,10 +4561,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||||
var serverLoadedW float64
|
var serverLoadedW float64
|
||||||
var serverLoadedOK bool
|
var serverLoadedOK bool
|
||||||
// sdrLastStep retains the SDR snapshot from the last ramp step while GPUs are
|
// sdrLastStep retains the phase-averaged SDR readings from the last ramp step
|
||||||
// still loaded. Used as PSUInputLoadedW in the summary instead of re-sampling
|
// while GPUs are loaded. Used in the summary instead of re-sampling after the
|
||||||
// after the test when GPUs have already returned to idle.
|
// test when GPUs have already returned to idle.
|
||||||
var sdrLastStep sdrPowerSnapshot
|
var sdrLastStep benchmarkSDRSeriesSummary
|
||||||
|
|
||||||
// Step 1: reuse single-card calibration result directly.
|
// Step 1: reuse single-card calibration result directly.
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -4283,6 +4585,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
ramp.ServerLoadedW = w
|
ramp.ServerLoadedW = w
|
||||||
ramp.ServerDeltaW = w - serverIdleW
|
ramp.ServerDeltaW = w - serverIdleW
|
||||||
}
|
}
|
||||||
|
if singleRun := singleRunSummaryByIndex[firstIdx]; singleRun.AvgFanRPM > 0 {
|
||||||
|
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||||
|
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||||
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
@@ -4333,7 +4639,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
stepPowerStopCh := make(chan struct{})
|
stepPowerStopCh := make(chan struct{})
|
||||||
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
stepCalib, stepRestore, stepRows, stepRun := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
close(stepPowerStopCh)
|
close(stepPowerStopCh)
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
@@ -4404,10 +4710,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-step PSU slot snapshot — also used as the authoritative loaded power
|
// Per-step PSU slot readings are averaged over the whole load phase rather
|
||||||
// source when SDR PSU sensors are available (more accurate than DCMI on
|
// than captured as a single end-of-phase snapshot.
|
||||||
// servers where DCMI covers only a subset of installed PSUs).
|
sdrStep := stepRun.LoadedSDR
|
||||||
sdrStep := sampleIPMISDRPowerSensors()
|
|
||||||
if len(sdrStep.PSUSlots) > 0 {
|
if len(sdrStep.PSUSlots) > 0 {
|
||||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||||
}
|
}
|
||||||
@@ -4425,7 +4730,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
||||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||||
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
|
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR avg): %.0f W", step, sdrStep.PSUInW))
|
||||||
if step == len(result.RecommendedSlotOrder) {
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
serverLoadedW = sdrStep.PSUInW
|
serverLoadedW = sdrStep.PSUInW
|
||||||
serverLoadedOK = true
|
serverLoadedOK = true
|
||||||
@@ -4433,12 +4738,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fan state at end of ramp step.
|
// Fan values are phase averages over the same load window.
|
||||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
if stepRun.AvgFanRPM > 0 {
|
||||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
ramp.AvgFanRPM = stepRun.AvgFanRPM
|
||||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
ramp.AvgFanDutyCyclePct = stepRun.AvgFanDutyCyclePct
|
||||||
ramp.AvgFanDutyCyclePct = duty
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-GPU telemetry from this ramp step's calibration.
|
// Per-GPU telemetry from this ramp step's calibration.
|
||||||
@@ -4491,8 +4794,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||||
if result.ServerPower != nil {
|
if result.ServerPower != nil {
|
||||||
// Use the SDR snapshot from the last ramp step (GPUs still loaded) rather
|
// Use the SDR phase average from the last ramp step (GPUs still loaded)
|
||||||
// than re-sampling here, which would capture post-test idle state.
|
// rather than re-sampling here, which would capture post-test idle state.
|
||||||
sdrLoaded := sdrLastStep
|
sdrLoaded := sdrLastStep
|
||||||
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
result.ServerPower.PSUInputIdleW = sdrIdle.PSUInW
|
||||||
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
result.ServerPower.PSUInputLoadedW = sdrLoaded.PSUInW
|
||||||
@@ -4512,6 +4815,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||||
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
"SDR sensors skipped (self-healed): "+strings.Join(sdrLoaded.SkippedSensors, "; "))
|
||||||
}
|
}
|
||||||
|
if sdrLoaded.Samples > 0 {
|
||||||
|
result.ServerPower.Notes = append(result.ServerPower.Notes,
|
||||||
|
fmt.Sprintf("Final SDR PSU loaded values are phase averages across %d sample(s) from the last full-load step.", sdrLoaded.Samples))
|
||||||
|
}
|
||||||
// Detect DCMI partial coverage: direct SDR comparison first,
|
// Detect DCMI partial coverage: direct SDR comparison first,
|
||||||
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
// ramp heuristic as fallback when SDR PSU sensors are absent.
|
||||||
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
dcmiUnreliable := detectDCMIPartialCoverage(result.ServerPower) ||
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
@@ -164,6 +169,93 @@ func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldExec := satExecCommand
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
t.Fatalf("unexpected command: %s %v", name, args)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
satExecCommand = oldExec
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
script := filepath.Join(dir, "nvidia-smi")
|
||||||
|
argsLog := filepath.Join(dir, "args.log")
|
||||||
|
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write script: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "nvidia-smi" {
|
||||||
|
return script, nil
|
||||||
|
}
|
||||||
|
return exec.LookPath(file)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
satLookPath = oldLookPath
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(argsLog)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read args log: %v", err)
|
||||||
|
}
|
||||||
|
got := strings.Fields(string(raw))
|
||||||
|
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||||
|
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||||
|
t.Fatalf("args=%v want %v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -179,6 +271,59 @@ func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -338,12 +483,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
Clocks
|
Clocks
|
||||||
Graphics : 2422 MHz
|
Graphics : 2422 MHz
|
||||||
Memory : 12481 MHz
|
Memory : 12481 MHz
|
||||||
@@ -365,7 +514,7 @@ GPU 00000000:4F:00.0
|
|||||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
@@ -379,25 +528,49 @@ GPU 00000000:4F:00.0
|
|||||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvsmiQ := []byte(`
|
nvsmiQ := []byte(`
|
||||||
GPU 00000000:4E:00.0
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
Max Clocks
|
Max Clocks
|
||||||
Graphics : 9999 MHz
|
Graphics : 9999 MHz
|
||||||
Memory : 9999 MHz
|
Memory : 9999 MHz
|
||||||
`)
|
`)
|
||||||
// Already populated — must not be overwritten.
|
// Already populated — must not be overwritten.
|
||||||
infoByIndex := map[int]benchmarkGPUInfo{
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
}
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
@@ -30,10 +30,10 @@ import (
|
|||||||
// Sources:
|
// Sources:
|
||||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
@@ -48,15 +48,15 @@ const (
|
|||||||
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
SATEstimatedMemoryStressSec = 140
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUValidatePerGPUSec = 85
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaGPUStressPerGPUSec = 450
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedStressPerGPUSec = 350
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
SATEstimatedNvidiaTargetedPowerPerGPUSec = 350
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
SATEstimatedNvidiaPulseTestSec = 5000
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
@@ -407,11 +407,11 @@ func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
|||||||
if index < 0 {
|
if index < 0 {
|
||||||
return "", fmt.Errorf("gpu index must be >= 0")
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
}
|
}
|
||||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
if len(raw) == 0 && err == nil {
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
raw = []byte("GPU reset completed.\n")
|
out = "GPU reset completed.\n"
|
||||||
}
|
}
|
||||||
return string(raw), err
|
return out, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return runNvidiaRecover("restart-drivers")
|
||||||
|
}
|
||||||
// bee-web runs as the bee user; sudo is required to control system services.
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
|
|||||||
@@ -35,9 +35,11 @@ func validateTotalValidateSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUValidateSec +
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
platform.SATEstimatedMemoryValidateSec +
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,12 +49,14 @@ func validateTotalStressSec(n int) int {
|
|||||||
}
|
}
|
||||||
total := platform.SATEstimatedCPUStressSec +
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
platform.SATEstimatedMemoryStressSec +
|
platform.SATEstimatedMemoryStressSec +
|
||||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
|
||||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
|
||||||
platform.SATEstimatedNvidiaPulseTestSec +
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
platform.SATEstimatedNvidiaInterconnectSec +
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
platform.SATEstimatedNvidiaBandwidthSec
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
return total
|
return total
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -128,33 +132,16 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
func() string {
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
if n > 0 {
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
|
||||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
|
||||||
validateFmtDur(perV), validateFmtDur(perS))
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
@@ -162,16 +149,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
func() string {
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
|
||||||
s := "Skipped in Validate. "
|
|
||||||
if n > 0 {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
|
||||||
} else {
|
|
||||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
|
||||||
}
|
|
||||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
|
||||||
}(),
|
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
`</div>` +
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
@@ -382,8 +360,8 @@ function runSATWithOverrides(target, overrides) {
|
|||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => streamSATTask(d.task_id, title, false));
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
}
|
}
|
||||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
const nvidiaPerGPUTargets = [];
|
||||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
function satAllGPUIndicesForMulti() {
|
function satAllGPUIndicesForMulti() {
|
||||||
return Promise.resolve(satSelectedGPUIndices());
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
}
|
}
|
||||||
@@ -417,40 +395,9 @@ function runNvidiaFabricValidate(target) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function runNvidiaValidateSet(target) {
|
function runNvidiaValidateSet(target) {
|
||||||
return loadSatNvidiaGPUs().then(gpus => {
|
|
||||||
const selected = satSelectedGPUIndices();
|
const selected = satSelectedGPUIndices();
|
||||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
if (!picked.length) {
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
throw new Error('Select at least one NVIDIA GPU.');
|
|
||||||
}
|
|
||||||
if (picked.length === 1) {
|
|
||||||
const gpu = picked[0];
|
|
||||||
return runSATWithOverrides(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
|
||||||
});
|
|
||||||
}
|
|
||||||
document.getElementById('sat-output').style.display='block';
|
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
|
||||||
const term = document.getElementById('sat-terminal');
|
|
||||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
|
||||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
|
||||||
const runNext = (idx) => {
|
|
||||||
if (idx >= picked.length) return Promise.resolve();
|
|
||||||
const gpu = picked[idx];
|
|
||||||
const gpuLabel = satGPUDisplayName(gpu);
|
|
||||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
|
||||||
return enqueueSATTarget(target, {
|
|
||||||
gpu_indices: [Number(gpu.index)],
|
|
||||||
display_name: labelBase + ' (' + gpuLabel + ')',
|
|
||||||
}).then(d => {
|
|
||||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
|
||||||
}).then(function() {
|
|
||||||
return runNext(idx + 1);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
return runNext(0);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
function runAMDValidateSet() {
|
function runAMDValidateSet() {
|
||||||
const targets = selectedAMDValidateTargets();
|
const targets = selectedAMDValidateTargets();
|
||||||
|
|||||||
@@ -126,6 +126,37 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# sync_builder_workdir SRC_DIR DST_DIR
#
# Mirrors the builder source tree SRC_DIR into the staging workdir DST_DIR with
# rsync (deleting files that no longer exist in the source), while leaving
# build products in DST_DIR alone (cache/, chroot/, .build/, *.iso and the
# *.packages / *.contents / *.files listings are excluded from the sync).
# Exits the script with status 1 if the staged tree lacks the grub-efi
# grub.cfg or still contains a grub-pc bootloader directory.
sync_builder_workdir() {
    src_dir="$1"
    dst_dir="$2"

    mkdir -p "$dst_dir"

    # Historical bug: a reused workdir could retain config/bootloaders/grub-pc
    # after the source tree switched to grub-efi only. Drop the whole
    # bootloaders directory up front so stale templates can never end up in a
    # new ISO build.
    rm -rf "$dst_dir/config/bootloaders"

    rsync -a --delete \
        --exclude='cache/' \
        --exclude='chroot/' \
        --exclude='.build/' \
        --exclude='*.iso' \
        --exclude='*.packages' \
        --exclude='*.contents' \
        --exclude='*.files' \
        "$src_dir/" "$dst_dir/"

    # Post-conditions: grub-efi template present, no grub-pc leftovers.
    [ -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ] || {
        echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
        exit 1
    }
    [ ! -e "$dst_dir/config/bootloaders/grub-pc" ] || {
        echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
        exit 1
    }
}
|
||||||
|
|
||||||
iso_list_files() {
|
iso_list_files() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
|
|
||||||
@@ -466,6 +497,75 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# validate_iso_live_boot_entries ISO_PATH
#
# Extracts boot/grub/grub.cfg and isolinux/live.cfg from the ISO and checks
# that:
#   - no live-build placeholders (@APPEND_LIVE@, @KERNEL_LIVE@, @INITRD_LIVE@)
#     were left unresolved in either file;
#   - the GRUB menu has the "EASY-BEE" and "EASY-BEE -- load to RAM (toram)"
#     entries;
#   - both bootloader configs have a boot=live entry and a boot=live ... toram
#     entry.
# The first failing check is reported on stderr and the script exits with
# status 1. The temporary files are removed on every path once they exist.
validate_iso_live_boot_entries() {
    iso_path="$1"
    echo "=== validating live boot entries in ISO ==="

    [ -f "$iso_path" ] || {
        echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
        exit 1
    }
    require_iso_reader "$iso_path" >/dev/null 2>&1 || {
        echo "ERROR: ISO reader unavailable for live boot validation" >&2
        exit 1
    }

    # Do not continue with empty paths if a temp file cannot be created.
    grub_cfg="$(mktemp)" || {
        echo "ERROR: failed to create temp file for live boot validation" >&2
        exit 1
    }
    isolinux_cfg="$(mktemp)" || {
        echo "ERROR: failed to create temp file for live boot validation" >&2
        rm -f "$grub_cfg"
        exit 1
    }

    # Run the checks in order; remember only the first failure so the temp
    # files are cleaned up in exactly one place below.
    live_boot_err=""
    if ! iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg"; then
        live_boot_err="failed to read boot/grub/grub.cfg from ISO"
    elif ! iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg"; then
        live_boot_err="failed to read isolinux/live.cfg from ISO"
    # Multiple -e patterns instead of '\|': alternation inside a basic regular
    # expression is a GNU extension that POSIX leaves undefined.
    elif grep -q -e '@APPEND_LIVE@' -e '@KERNEL_LIVE@' -e '@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
        live_boot_err="unresolved live-build placeholders remain in ISO bootloader config"
    elif ! grep -q 'menuentry "EASY-BEE"' "$grub_cfg"; then
        live_boot_err="GRUB default EASY-BEE entry is missing"
    elif ! grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg"; then
        live_boot_err="GRUB toram entry is missing"
    elif ! grep -q 'linux .*boot=live ' "$grub_cfg"; then
        live_boot_err="GRUB live entry is missing boot=live"
    elif ! grep -q 'linux .*boot=live .*toram ' "$grub_cfg"; then
        live_boot_err="GRUB toram entry is missing boot=live or toram"
    elif ! grep -q 'append .*boot=live ' "$isolinux_cfg"; then
        live_boot_err="isolinux live entry is missing boot=live"
    elif ! grep -q 'append .*boot=live .*toram ' "$isolinux_cfg"; then
        live_boot_err="isolinux toram entry is missing boot=live or toram"
    fi

    rm -f "$grub_cfg" "$isolinux_cfg"

    if [ -n "$live_boot_err" ]; then
        echo "ERROR: $live_boot_err" >&2
        exit 1
    fi
    echo "=== live boot validation OK ==="
}
|
||||||
|
|
||||||
validate_iso_nvidia_runtime() {
|
validate_iso_nvidia_runtime() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
@@ -558,6 +658,21 @@ extract_live_grub_entry() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# load_live_build_append LB_DIR
#
# Sources LB_DIR/config/binary and copies LB_BOOTAPPEND_LIVE into the
# live_build_append variable. Returns 1 when the config file does not exist or
# LB_BOOTAPPEND_LIVE is empty/unset after sourcing it; returns 0 otherwise.
load_live_build_append() {
    lb_dir="$1"
    binary_cfg="$lb_dir/config/binary"
    if [ ! -f "$binary_cfg" ]; then
        return 1
    fi

    # config/binary is written by live-build and consists of shell variable
    # assignments such as LB_BOOTAPPEND_LIVE="boot=live ...", so it can be
    # sourced directly.
    # shellcheck disable=SC1090
    . "$binary_cfg"

    if [ -z "${LB_BOOTAPPEND_LIVE:-}" ]; then
        return 1
    fi
    live_build_append="$LB_BOOTAPPEND_LIVE"
    return 0
}
|
||||||
|
|
||||||
extract_live_isolinux_entry() {
|
extract_live_isolinux_entry() {
|
||||||
cfg="$1"
|
cfg="$1"
|
||||||
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
@@ -594,36 +709,15 @@ echo " Hardware Audit LiveCD"
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux ${kernel} ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
|
||||||
linux ${kernel} ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd ${initrd}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "\${grub_platform}" = "efi" ]; then
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
@@ -699,13 +793,18 @@ enforce_live_build_bootloader_assets() {
|
|||||||
grub_dir="$lb_dir/binary/boot/grub"
|
grub_dir="$lb_dir/binary/boot/grub"
|
||||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
if ! load_live_build_append "$lb_dir"; then
|
||||||
|
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||||
|
live_build_append=""
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -f "$grub_cfg" ]; then
|
if [ -f "$grub_cfg" ]; then
|
||||||
if extract_live_grub_entry "$grub_cfg"; then
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
mkdir -p "$grub_dir/live-theme"
|
mkdir -p "$grub_dir/live-theme"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "$grub_append" "$grub_initrd"
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||||
@@ -714,7 +813,7 @@ enforce_live_build_bootloader_assets() {
|
|||||||
|
|
||||||
if [ -f "$isolinux_cfg" ]; then
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||||
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "$isolinux_append"
|
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||||
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
@@ -1112,15 +1211,7 @@ echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
|||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||||
--exclude='cache/' \
|
|
||||||
--exclude='chroot/' \
|
|
||||||
--exclude='.build/' \
|
|
||||||
--exclude='*.iso' \
|
|
||||||
--exclude='*.packages' \
|
|
||||||
--exclude='*.contents' \
|
|
||||||
--exclude='*.files' \
|
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
|
||||||
|
|
||||||
# Share deb package cache across variants.
|
# Share deb package cache across variants.
|
||||||
# Restore: populate work dir cache from shared cache before build.
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
@@ -1411,8 +1502,11 @@ dump_memtest_debug "pre-build" "${LB_DIR}"
|
|||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
echo "=== enforcing canonical bootloader assets ==="
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "rm -f '${LB_DIR}/live-image-amd64.hybrid.iso' && lb binary_iso 2>&1"
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||||
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
@@ -1438,6 +1532,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ insmod serial
|
|||||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
insmod gfxterm
|
insmod gfxterm
|
||||||
insmod png
|
|
||||||
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
|
insmod png
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
|
|||||||
@@ -1,46 +1,15 @@
|
|||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
submenu "EASY-BEE (advanced options) -->" {
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
menuentry "EASY-BEE — load to RAM (toram)" {
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE — GSP=off" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — KMS + GSP=off" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE — fail-safe" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 78 KiB |
@@ -5,12 +5,10 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
#bee logo — centered, upper third of screen
|
#bee logo - centered, upper third of screen
|
||||||
+ image {
|
+ image {
|
||||||
top = 4%
|
top = 4%
|
||||||
left = 50%-200
|
left = 50%-200
|
||||||
width = 400
|
|
||||||
height = 400
|
|
||||||
file = "bee-logo.png"
|
file = "bee-logo.png"
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -36,11 +34,11 @@ terminal-font: "Unifont Regular 16"
|
|||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
selected_item_font = "Unifont Regular 16"
|
selected_item_font = "Unifont Regular 16"
|
||||||
item_height = 16
|
item_height = 20
|
||||||
item_padding = 0
|
item_padding = 2
|
||||||
item_spacing = 4
|
item_spacing = 4
|
||||||
icon_width = 0
|
icon_width = 0
|
||||||
icon_heigh = 0
|
icon_height = 0
|
||||||
item_icon_space = 0
|
item_icon_space = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
178
iso/overlay/usr/local/bin/bee-nvidia-recover
Executable file
@@ -0,0 +1,178 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-nvidia-recover — drain NVIDIA clients, then reset a GPU or reload drivers.
|
||||||
|
|
||||||
|
set -u
|
||||||
|
|
||||||
|
# log MESSAGE...
# Write one line to stdout: the tool prefix followed by all arguments joined
# by spaces. printf is used instead of echo because echo's treatment of
# backslashes is implementation-defined, and messages may embed arbitrary
# text such as process command lines.
log() {
    printf '%s\n' "[bee-nvidia-recover] $*"
}
|
||||||
|
|
||||||
|
# log_blocker DESCRIPTION...
# Write one line to stdout naming something that was holding the GPU
# (a service or a process command line), tagged with "blocker:".
# printf keeps backslashes in command lines verbatim; echo may interpret them.
log_blocker() {
    printf '%s\n' "[bee-nvidia-recover] blocker: $*"
}
|
||||||
|
|
||||||
|
# usage
# Print the command synopsis to stdout; callers redirect it to stderr.
usage() {
    printf '%s\n' \
        'usage:' \
        'bee-nvidia-recover restart-drivers' \
        'bee-nvidia-recover reset-gpu <index>'
}
|
||||||
|
|
||||||
|
# unit_exists UNIT
# Succeed when `systemctl cat UNIT` succeeds; all systemctl output is
# discarded and its exit status is returned unchanged.
unit_exists() {
    systemctl cat "$1" > /dev/null 2>&1
    return "$?"
}
|
||||||
|
|
||||||
|
# unit_is_active UNIT
# Succeed when `systemctl is-active --quiet UNIT` succeeds; stderr is
# discarded and the exit status is returned unchanged.
unit_is_active() {
    systemctl is-active --quiet "$1" 2> /dev/null
    return "$?"
}
|
||||||
|
|
||||||
|
# stop_unit_if_active UNIT
# Stop UNIT only when it is currently active.
# Returns 0 when the unit was active and a stop was requested (whatever the
# outcome of the stop itself), 1 when the unit was not active.
stop_unit_if_active() {
    unit=$1
    # Nothing to do for an inactive unit; tell the caller it was not running.
    unit_is_active "$unit" || return 1
    log "stopping $unit"
    systemctl stop "$unit"
    return 0
}
|
||||||
|
|
||||||
|
# start_unit_if_marked UNIT MARKER
# Start UNIT when MARKER is exactly "1" and the unit exists.
# Returns 0 when nothing was started, otherwise the status of
# `systemctl start`.
start_unit_if_marked() {
    unit=$1
    marker=$2
    [ "$marker" = "1" ] || return 0
    unit_exists "$unit" || return 0
    log "starting $unit"
    systemctl start "$unit"
}
|
||||||
|
|
||||||
|
# wait_for_process_exit NAME
# Poll once per second until no process whose name is exactly NAME remains.
# Gives up on the 15th consecutive positive check: logs a warning and
# returns 1. Returns 0 as soon as the process is gone.
wait_for_process_exit() {
    name=$1
    tries=0
    until ! pgrep -x "$name" > /dev/null 2>&1; do
        tries=$((tries + 1))
        if [ "$tries" -lt 15 ]; then
            sleep 1
            continue
        fi
        log "WARN: $name is still running after stop request"
        return 1
    done
    return 0
}
|
||||||
|
|
||||||
|
# kill_pattern PATTERN
# When any process command line matches PATTERN: report each match as a
# blocker, send SIGTERM to all matches, wait one second, then send SIGKILL.
# pkill failures are ignored, so the function always returns 0.
kill_pattern() {
    pattern=$1
    # Fast exit when nothing matches.
    pgrep -f "$pattern" > /dev/null 2>&1 || return 0

    pgrep -af "$pattern" 2> /dev/null | while IFS= read -r line; do
        if [ -n "$line" ]; then
            log_blocker "$line"
        fi
    done

    log "killing processes matching: $pattern"
    pkill -TERM -f "$pattern" > /dev/null 2>&1 || true
    sleep 1
    pkill -KILL -f "$pattern" > /dev/null 2>&1 || true
}
|
||||||
|
|
||||||
|
# drain_gpu_clients
# Stop everything this script knows may hold the NVIDIA devices open:
#   1. display manager units (display-manager.service, lightdm.service),
#   2. nvidia-fabricmanager.service,
#   3. the nv-hostengine process (SIGTERM, then SIGKILL if it does not exit),
#   4. a fixed list of GPU tools and X servers, matched by command line.
# Sets the globals display_was_active and fabric_was_active to 1 when the
# corresponding service was active and a stop was requested; these are read
# later when clients are restored.
drain_gpu_clients() {
    display_was_active=0
    fabric_was_active=0

    for unit in display-manager.service lightdm.service; do
        if unit_exists "$unit" && stop_unit_if_active "$unit"; then
            log_blocker "service $unit"
            display_was_active=1
        fi
    done

    if unit_exists nvidia-fabricmanager.service && stop_unit_if_active nvidia-fabricmanager.service; then
        log_blocker "service nvidia-fabricmanager.service"
        fabric_was_active=1
    fi

    if pgrep -x nv-hostengine >/dev/null 2>&1; then
        # List by exact process name (-x) with the full command line (-a).
        # An anchored -f pattern would be compared against the whole command
        # line and miss a daemon started with a path or arguments.
        pgrep -a -x nv-hostengine 2>/dev/null | while IFS= read -r line; do
            [ -n "$line" ] || continue
            log_blocker "$line"
        done
        log "stopping nv-hostengine"
        pkill -TERM -x nv-hostengine >/dev/null 2>&1 || true
        # Escalate to SIGKILL only when the graceful stop timed out.
        wait_for_process_exit nv-hostengine || pkill -KILL -x nv-hostengine >/dev/null 2>&1 || true
    fi

    for pattern in \
        "nvidia-smi" \
        "dcgmi" \
        "nvvs" \
        "dcgmproftester" \
        "all_reduce_perf" \
        "nvtop" \
        "bee-gpu-burn" \
        "bee-john-gpu-stress" \
        "bee-nccl-gpu-stress" \
        "Xorg" \
        "Xwayland"; do
        kill_pattern "$pattern"
    done
}
|
||||||
|
|
||||||
|
# restore_gpu_clients
# Bring GPU clients back after a reset or driver reload:
#   - enable persistence mode through nvidia-smi when the tool is installed,
#   - launch nv-hostengine when it is installed and not already running,
#   - restart the fabric manager and display manager units when the globals
#     fabric_was_active / display_was_active are "1" (unset counts as 0),
#   - start lightdm.service as well when the display was active and lightdm
#     exists but is still inactive.
restore_gpu_clients() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        if ! nvidia-smi -pm 1 >/dev/null 2>&1; then
            log "WARN: failed to enable NVIDIA persistence mode"
        else
            log "enabled NVIDIA persistence mode"
        fi
    fi

    if command -v nv-hostengine >/dev/null 2>&1; then
        if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
            log "starting nv-hostengine"
            nv-hostengine
        fi
    fi

    start_unit_if_marked nvidia-fabricmanager.service "${fabric_was_active:-0}"
    start_unit_if_marked display-manager.service "${display_was_active:-0}"

    case "${display_was_active:-0}" in
        1)
            if unit_exists lightdm.service && ! unit_is_active lightdm.service; then
                start_unit_if_marked lightdm.service "1"
            fi
            ;;
    esac
}
|
||||||
|
|
||||||
|
# restart_drivers
# Drain GPU clients, unload the NVIDIA kernel modules that are currently
# loaded (in the order nvidia_uvm, nvidia_drm, nvidia_modeset, nvidia),
# remove the NVIDIA device nodes, run /usr/local/bin/bee-nvidia-load, and
# restore the clients.
# The sequence of actions is always carried out in full, but a failed
# rmmod or a failed bee-nvidia-load is logged and reported to the caller.
# Returns 1 when any unload or the reload failed; otherwise the status of
# restore_gpu_clients.
restart_drivers() {
    restart_rc=0

    drain_gpu_clients

    for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia; do
        if lsmod | awk '{print $1}' | grep -qx "$mod"; then
            log "unloading module $mod"
            if ! rmmod "$mod"; then
                log "WARN: failed to unload module $mod"
                restart_rc=1
            fi
        fi
    done

    # Best effort: the nodes may not exist, so rm failures are ignored.
    rm -f /dev/nvidiactl /dev/nvidia-uvm /dev/nvidia-uvm-tools /dev/nvidia[0-9]* 2>/dev/null || true

    log "reloading NVIDIA driver stack"
    if ! /usr/local/bin/bee-nvidia-load; then
        log "WARN: bee-nvidia-load reported a failure"
        restart_rc=1
    fi

    # Always restore clients, even after a failure, so stopped services
    # are not left down.
    restore_gpu_clients
    restore_rc=$?
    if [ "$restart_rc" -eq 0 ]; then
        restart_rc=$restore_rc
    fi
    return "$restart_rc"
}
|
||||||
|
|
||||||
|
# reset_gpu INDEX
# Drain GPU clients, ask nvidia-smi to reset the GPU selected by INDEX
# (`nvidia-smi -r -i INDEX`), then restore the clients.
# Returns 2 without touching anything when INDEX is empty. Otherwise the
# clients are always restored, and the function returns the nvidia-smi exit
# status when the reset failed, or the status of restore_gpu_clients when
# the reset succeeded.
reset_gpu() {
    index="$1"
    if [ -z "$index" ]; then
        log "ERROR: reset-gpu requires a non-empty GPU index"
        return 2
    fi

    drain_gpu_clients

    log "resetting GPU $index"
    reset_rc=0
    nvidia-smi -r -i "$index" || reset_rc=$?
    if [ "$reset_rc" -ne 0 ]; then
        log "ERROR: reset of GPU $index failed with exit status $reset_rc"
    fi

    # Restore even after a failed reset so stopped services come back.
    restore_gpu_clients
    restore_rc=$?
    if [ "$reset_rc" -eq 0 ]; then
        reset_rc=$restore_rc
    fi
    return "$reset_rc"
}
|
||||||
|
|
||||||
|
# Command dispatch. The first argument selects the action; anything
# unrecognised, or reset-gpu without exactly one index argument, prints the
# usage text to stderr and exits with status 2.
cmd=${1:-}
case "$cmd" in
    restart-drivers)
        restart_drivers
        ;;
    reset-gpu)
        [ "$#" -eq 2 ] || {
            usage >&2
            exit 2
        }
        reset_gpu "$2"
        ;;
    *)
        usage >&2
        exit 2
        ;;
esac
|
||||||
Reference in New Issue
Block a user