Refine burn UI and NVIDIA stress flows
This commit is contained in:
@@ -38,6 +38,12 @@ var (
|
||||
"/opt/rocm/bin/rvs",
|
||||
"/opt/rocm-*/bin/rvs",
|
||||
}
|
||||
dcgmProfTesterCandidates = []string{
|
||||
"dcgmproftester",
|
||||
"dcgmproftester13",
|
||||
"dcgmproftester12",
|
||||
"dcgmproftester11",
|
||||
}
|
||||
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||
@@ -277,6 +283,80 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
name: "02-dcgmi-targeted-power.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
name: "02-dcgmi-pulse-test.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
name: "02-dcgmi-nvbandwidth.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||
}
|
||||
@@ -293,6 +373,23 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
name: "02-dcgmi-targeted-stress.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
if len(gpuIndices) > 0 {
|
||||
return dedupeSortedIndices(gpuIndices), nil
|
||||
@@ -473,6 +570,31 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
}
|
||||
|
||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||
args := []string{"dcgmi", "diag", "-r", name}
|
||||
if durationSec > 0 {
|
||||
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
func normalizeNvidiaBurnDuration(durationSec int) int {
|
||||
if durationSec <= 0 {
|
||||
return 300
|
||||
}
|
||||
return durationSec
|
||||
}
|
||||
|
||||
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
}
|
||||
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
@@ -642,6 +764,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "not found in path") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
strings.Contains(text, "unknown command") ||
|
||||
strings.Contains(text, "not implemented") ||
|
||||
@@ -748,6 +871,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||
for _, candidate := range dcgmProfTesterCandidates {
|
||||
if path, err := satLookPath(candidate); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, errors.New("dcgmproftester not found in PATH")
|
||||
}
|
||||
|
||||
func ensureAMDRuntimeReady() error {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user