package platform import ( "context" "fmt" "sort" "strconv" "strings" ) func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) { normalizeNvidiaStressOptions(&opts) job, err := buildNvidiaStressJob(opts) if err != nil { return "", err } return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}}, job, {name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, }, logFunc) } func nvidiaStressArchivePrefix(loader string) string { switch strings.TrimSpace(strings.ToLower(loader)) { case NvidiaStressLoaderJohn: return "gpu-nvidia-john" case NvidiaStressLoaderNCCL: return "gpu-nvidia-nccl" default: return "gpu-nvidia-burn" } } func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) { selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) if err != nil { return satJob{}, err } loader := strings.TrimSpace(strings.ToLower(opts.Loader)) switch loader { case "", NvidiaStressLoaderBuiltin: cmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(opts.DurationSec), "--size-mb", strconv.Itoa(opts.SizeMB), } if len(selected) > 0 { cmd = append(cmd, "--devices", joinIndexList(selected)) } return satJob{ name: "03-bee-gpu-burn.log", cmd: cmd, collectGPU: true, gpuIndices: selected, }, nil case NvidiaStressLoaderJohn: cmd := []string{ "bee-john-gpu-stress", "--seconds", strconv.Itoa(opts.DurationSec), } if len(selected) > 0 { cmd = append(cmd, "--devices", joinIndexList(selected)) } return satJob{ name: "03-john-gpu-stress.log", cmd: cmd, collectGPU: true, gpuIndices: selected, }, nil case NvidiaStressLoaderNCCL: cmd := []string{ "bee-nccl-gpu-stress", "--seconds", strconv.Itoa(opts.DurationSec), } if len(selected) > 0 { cmd = append(cmd, "--devices", joinIndexList(selected)) } return satJob{ name: "03-bee-nccl-gpu-stress.log", cmd: cmd, collectGPU: true, gpuIndices: selected, }, nil default: return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader) } } func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) { if opts.DurationSec <= 0 { opts.DurationSec = 300 } // SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime. switch strings.TrimSpace(strings.ToLower(opts.Loader)) { case "", NvidiaStressLoaderBuiltin: opts.Loader = NvidiaStressLoaderBuiltin case NvidiaStressLoaderJohn: opts.Loader = NvidiaStressLoaderJohn case NvidiaStressLoaderNCCL: opts.Loader = NvidiaStressLoaderNCCL default: opts.Loader = NvidiaStressLoaderBuiltin } opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices) opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices) } func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) { all, err := listNvidiaGPUIndices() if err != nil { return nil, err } if len(all) == 0 { return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs") } selected := all if len(include) > 0 { want := make(map[int]struct{}, len(include)) for _, idx := range include { want[idx] = struct{}{} } selected = selected[:0] for _, idx := range all { if _, ok := want[idx]; ok { selected = append(selected, idx) } } } if len(exclude) > 0 { skip := make(map[int]struct{}, len(exclude)) for _, idx := range exclude { skip[idx] = struct{}{} } filtered := selected[:0] for _, idx := range selected { if _, ok := skip[idx]; ok { continue } filtered = append(filtered, idx) } selected = filtered } if len(selected) == 0 { return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters") } out := append([]int(nil), selected...) sort.Ints(out) return out, nil } func listNvidiaGPUIndices() ([]int, error) { out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output() if err != nil { return nil, fmt.Errorf("nvidia-smi: %w", err) } var indices []int for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue } idx, err := strconv.Atoi(line) if err != nil { continue } indices = append(indices, idx) } return dedupeSortedIndices(indices), nil } func dedupeSortedIndices(values []int) []int { if len(values) == 0 { return nil } seen := make(map[int]struct{}, len(values)) out := make([]int, 0, len(values)) for _, value := range values { if value < 0 { continue } if _, ok := seen[value]; ok { continue } seen[value] = struct{}{} out = append(out, value) } sort.Ints(out) return out } func joinIndexList(values []int) string { parts := make([]string, 0, len(values)) for _, value := range values { parts = append(parts, strconv.Itoa(value)) } return strings.Join(parts, ",") }