Reset GPUs before power benchmark
This commit is contained in:
@@ -97,6 +97,8 @@ var (
|
|||||||
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
|
||||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||||
|
benchmarkGeteuid = os.Geteuid
|
||||||
|
benchmarkSleep = time.Sleep
|
||||||
)
|
)
|
||||||
|
|
||||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||||
@@ -240,6 +242,39 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if benchmarkGeteuid() != 0 {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("power benchmark pre-flight: root privileges unavailable, GPU reset skipped")
|
||||||
|
}
|
||||||
|
return append([]int(nil), gpuIndices...)
|
||||||
|
}
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var failed []int
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||||
|
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||||
|
failed = append(failed, idx)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset completed", idx))
|
||||||
|
}
|
||||||
|
benchmarkSleep(time.Second)
|
||||||
|
}
|
||||||
|
return failed
|
||||||
|
}
|
||||||
|
|
||||||
func benchmarkPowerEngine() string {
|
func benchmarkPowerEngine() string {
|
||||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||||
case BenchmarkPowerEngineTargetedPower:
|
case BenchmarkPowerEngineTargetedPower:
|
||||||
@@ -4150,14 +4185,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
}
|
}
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
|
||||||
if infoErr != nil {
|
|
||||||
return "", infoErr
|
|
||||||
}
|
|
||||||
// Capture full nvidia-smi -q snapshot at the start of the run.
|
|
||||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
|
||||||
}
|
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -4168,6 +4195,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
|
if failed := resetBenchmarkGPUs(ctx, verboseLog, selected, logFunc); len(failed) > 0 {
|
||||||
|
result.Findings = append(result.Findings,
|
||||||
|
fmt.Sprintf("GPU reset pre-flight did not complete for GPU(s) %s; throttle counters may contain stale state.", joinIndexList(failed)))
|
||||||
|
}
|
||||||
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
|
if infoErr != nil {
|
||||||
|
return "", infoErr
|
||||||
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample server idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
@@ -182,6 +187,75 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldExec := satExecCommand
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
t.Fatalf("unexpected command: %s %v", name, args)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
satExecCommand = oldExec
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
script := filepath.Join(dir, "nvidia-smi")
|
||||||
|
argsLog := filepath.Join(dir, "args.log")
|
||||||
|
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||||
|
t.Fatalf("write script: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "nvidia-smi" {
|
||||||
|
return script, nil
|
||||||
|
}
|
||||||
|
return exec.LookPath(file)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
satLookPath = oldLookPath
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(argsLog)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read args log: %v", err)
|
||||||
|
}
|
||||||
|
got := strings.Fields(string(raw))
|
||||||
|
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||||
|
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||||
|
t.Fatalf("args=%v want %v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user