Add NVIDIA stress loader selection and DCGM 4 support
This commit is contained in:
178
audit/internal/platform/nvidia_stress.go
Normal file
178
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
normalizeNvidiaStressOptions(&opts)
|
||||
|
||||
job, err := buildNvidiaStressJob(opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return satJob{}, err
|
||||
}
|
||||
|
||||
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||
switch loader {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-gpu-burn.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderJohn:
|
||||
cmd := []string{
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-john-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
default:
|
||||
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
case NvidiaStressLoaderJohn:
|
||||
opts.Loader = NvidiaStressLoaderJohn
|
||||
default:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
}
|
||||
|
||||
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
|
||||
selected := all
|
||||
if len(include) > 0 {
|
||||
want := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
want[idx] = struct{}{}
|
||||
}
|
||||
selected = selected[:0]
|
||||
for _, idx := range all {
|
||||
if _, ok := want[idx]; ok {
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||
}
|
||||
out := append([]int(nil), selected...)
|
||||
sort.Ints(out)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
func dedupeSortedIndices(values []int) []int {
|
||||
if len(values) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := make(map[int]struct{}, len(values))
|
||||
out := make([]int, 0, len(values))
|
||||
for _, value := range values {
|
||||
if value < 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[value]; ok {
|
||||
continue
|
||||
}
|
||||
seen[value] = struct{}{}
|
||||
out = append(out, value)
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func joinIndexList(values []int) string {
|
||||
parts := make([]string, 0, len(values))
|
||||
for _, value := range values {
|
||||
parts = append(parts, strconv.Itoa(value))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
Reference in New Issue
Block a user