- Burn tab: replace 6 flat cards with 3 grouped cards (GPU Stress, Compute Stress, Platform Thermal Cycling) + global Burn Profile - Run All button at top enqueues all enabled tests across all cards - GPU Stress: tool checkboxes enabled/disabled via new /api/gpu/tools endpoint based on driver status (/dev/nvidia0, /dev/kfd) - Compute Stress: checkboxes for cpu/memory-stress/stressapptest - Platform Thermal Cycling: component checkboxes (cpu/nvidia/amd) with platform_components param wired through to PlatformStressOptions - bee-gpu-burn: default size-mb changed from 64 to 0 (auto); script now queries nvidia-smi memory.total per GPU and uses 95% of it - platform_stress: removed hardcoded --size-mb 64; respects Components field to selectively run CPU and/or GPU load goroutines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
204 lines
5.1 KiB
Go
204 lines
5.1 KiB
Go
package platform
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
|
normalizeNvidiaStressOptions(&opts)
|
|
|
|
job, err := buildNvidiaStressJob(opts)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
|
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
|
job,
|
|
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
|
}, logFunc)
|
|
}
|
|
|
|
func nvidiaStressArchivePrefix(loader string) string {
|
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
|
case NvidiaStressLoaderJohn:
|
|
return "gpu-nvidia-john"
|
|
case NvidiaStressLoaderNCCL:
|
|
return "gpu-nvidia-nccl"
|
|
default:
|
|
return "gpu-nvidia-burn"
|
|
}
|
|
}
|
|
|
|
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
|
if err != nil {
|
|
return satJob{}, err
|
|
}
|
|
|
|
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
|
switch loader {
|
|
case "", NvidiaStressLoaderBuiltin:
|
|
cmd := []string{
|
|
"bee-gpu-burn",
|
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
|
}
|
|
if len(selected) > 0 {
|
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
|
}
|
|
return satJob{
|
|
name: "03-bee-gpu-burn.log",
|
|
cmd: cmd,
|
|
collectGPU: true,
|
|
gpuIndices: selected,
|
|
}, nil
|
|
case NvidiaStressLoaderJohn:
|
|
cmd := []string{
|
|
"bee-john-gpu-stress",
|
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
|
}
|
|
if len(selected) > 0 {
|
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
|
}
|
|
return satJob{
|
|
name: "03-john-gpu-stress.log",
|
|
cmd: cmd,
|
|
collectGPU: true,
|
|
gpuIndices: selected,
|
|
}, nil
|
|
case NvidiaStressLoaderNCCL:
|
|
cmd := []string{
|
|
"bee-nccl-gpu-stress",
|
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
|
}
|
|
if len(selected) > 0 {
|
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
|
}
|
|
return satJob{
|
|
name: "03-bee-nccl-gpu-stress.log",
|
|
cmd: cmd,
|
|
collectGPU: true,
|
|
gpuIndices: selected,
|
|
}, nil
|
|
default:
|
|
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
|
}
|
|
}
|
|
|
|
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
|
if opts.DurationSec <= 0 {
|
|
opts.DurationSec = 300
|
|
}
|
|
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
|
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
|
case "", NvidiaStressLoaderBuiltin:
|
|
opts.Loader = NvidiaStressLoaderBuiltin
|
|
case NvidiaStressLoaderJohn:
|
|
opts.Loader = NvidiaStressLoaderJohn
|
|
case NvidiaStressLoaderNCCL:
|
|
opts.Loader = NvidiaStressLoaderNCCL
|
|
default:
|
|
opts.Loader = NvidiaStressLoaderBuiltin
|
|
}
|
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
|
}
|
|
|
|
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
|
all, err := listNvidiaGPUIndices()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(all) == 0 {
|
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
|
}
|
|
|
|
selected := all
|
|
if len(include) > 0 {
|
|
want := make(map[int]struct{}, len(include))
|
|
for _, idx := range include {
|
|
want[idx] = struct{}{}
|
|
}
|
|
selected = selected[:0]
|
|
for _, idx := range all {
|
|
if _, ok := want[idx]; ok {
|
|
selected = append(selected, idx)
|
|
}
|
|
}
|
|
}
|
|
if len(exclude) > 0 {
|
|
skip := make(map[int]struct{}, len(exclude))
|
|
for _, idx := range exclude {
|
|
skip[idx] = struct{}{}
|
|
}
|
|
filtered := selected[:0]
|
|
for _, idx := range selected {
|
|
if _, ok := skip[idx]; ok {
|
|
continue
|
|
}
|
|
filtered = append(filtered, idx)
|
|
}
|
|
selected = filtered
|
|
}
|
|
if len(selected) == 0 {
|
|
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
|
}
|
|
out := append([]int(nil), selected...)
|
|
sort.Ints(out)
|
|
return out, nil
|
|
}
|
|
|
|
func listNvidiaGPUIndices() ([]int, error) {
|
|
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
|
}
|
|
var indices []int
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
idx, err := strconv.Atoi(line)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
indices = append(indices, idx)
|
|
}
|
|
return dedupeSortedIndices(indices), nil
|
|
}
|
|
|
|
// dedupeSortedIndices returns the distinct non-negative entries of values in
// ascending order. Negative entries are discarded. An empty or nil input
// yields nil. The input slice is not modified.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	// Copy the non-negative entries so sorting never touches the caller's slice.
	kept := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			kept = append(kept, v)
		}
	}
	sort.Ints(kept)

	// After sorting, duplicates are adjacent; compact them in place.
	uniq := kept[:0]
	for _, v := range kept {
		if n := len(uniq); n > 0 && uniq[n-1] == v {
			continue
		}
		uniq = append(uniq, v)
	}
	return uniq
}
|
|
|
|
// joinIndexList renders values as decimal numbers separated by commas, with no
// surrounding whitespace. An empty or nil slice yields the empty string.
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|