Add NVIDIA stress loader selection and DCGM 4 support
This commit is contained in:
178
audit/internal/platform/nvidia_stress.go
Normal file
178
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
normalizeNvidiaStressOptions(&opts)
|
||||
|
||||
job, err := buildNvidiaStressJob(opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return satJob{}, err
|
||||
}
|
||||
|
||||
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||
switch loader {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-gpu-burn.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderJohn:
|
||||
cmd := []string{
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-john-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
default:
|
||||
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
case NvidiaStressLoaderJohn:
|
||||
opts.Loader = NvidiaStressLoaderJohn
|
||||
default:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
}
|
||||
|
||||
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
|
||||
selected := all
|
||||
if len(include) > 0 {
|
||||
want := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
want[idx] = struct{}{}
|
||||
}
|
||||
selected = selected[:0]
|
||||
for _, idx := range all {
|
||||
if _, ok := want[idx]; ok {
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||
}
|
||||
out := append([]int(nil), selected...)
|
||||
sort.Ints(out)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
// dedupeSortedIndices returns the distinct non-negative entries of values in
// ascending order. The input slice is not modified.
//
// An empty or nil input yields nil. A non-empty input whose entries are all
// negative yields an empty, non-nil slice.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	kept := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			kept = append(kept, v)
		}
	}
	sort.Ints(kept)

	// Once sorted, duplicates are adjacent, so they can be squeezed out in place.
	uniq := kept[:0]
	for _, v := range kept {
		if n := len(uniq); n > 0 && uniq[n-1] == v {
			continue
		}
		uniq = append(uniq, v)
	}
	return uniq
}
|
||||
|
||||
// joinIndexList renders values as base-10 integers separated by commas, with
// no spaces (for example "0,2"). An empty or nil slice yields "".
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||
@@ -423,7 +423,10 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-stress")
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -136,7 +136,8 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
@@ -176,8 +177,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
|
||||
@@ -425,14 +425,12 @@ type satStats struct {
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -130,26 +130,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
||||
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
var env []string
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
ids := make([]string, len(opts.GPUIndices))
|
||||
for i, idx := range opts.GPUIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
}
|
||||
cmd := []string{
|
||||
"bee-gpu-stress",
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil)
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
@@ -323,8 +318,9 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
//
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
|
||||
@@ -31,8 +31,8 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
@@ -80,13 +80,10 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
@@ -97,6 +94,40 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 600,
|
||||
Loader: NvidiaStressLoaderJohn,
|
||||
ExcludeGPUIndices: []int{1},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -122,8 +153,8 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
|
||||
@@ -51,6 +51,19 @@ type ToolStatus struct {
|
||||
OK bool
|
||||
}
|
||||
|
||||
// Loader names accepted in NvidiaStressOptions.Loader.
const (
	// NvidiaStressLoaderBuiltin selects the bee-gpu-burn tool. It is also
	// what an empty Loader value resolves to.
	NvidiaStressLoaderBuiltin = "builtin"
	// NvidiaStressLoaderJohn selects the bee-john-gpu-stress tool.
	NvidiaStressLoaderJohn = "john"
)
|
||||
|
||||
// NvidiaStressOptions configures the NVIDIA GPU stress pack.
type NvidiaStressOptions struct {
	// DurationSec is passed to the loader as --seconds. Values <= 0 are
	// replaced with 300 by normalizeNvidiaStressOptions.
	DurationSec int
	// SizeMB is passed to the builtin loader as --size-mb; the John loader
	// does not receive it. Values <= 0 are replaced with 64 by
	// normalizeNvidiaStressOptions.
	SizeMB int
	// Loader names the stress tool: NvidiaStressLoaderBuiltin or
	// NvidiaStressLoaderJohn. An empty value means the builtin loader.
	Loader string
	// GPUIndices, when non-empty, restricts the run to these GPU indices.
	GPUIndices []int
	// ExcludeGPUIndices lists GPU indices to remove from the selection.
	ExcludeGPUIndices []int
}
|
||||
|
||||
func New() *System {
|
||||
return &System{}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user