A single dcgmproftester process without -i only loads GPU 0 regardless of CUDA_VISIBLE_DEVICES. Now always routes multi-GPU runs through bee-dcgmproftester-staggered (--stagger-seconds 0 for parallel mode), which spawns one process per GPU so all GPUs are loaded simultaneously. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1547 lines
47 KiB
Go
1547 lines
47 KiB
Go
package platform
|
||
|
||
import (
|
||
"archive/tar"
|
||
"bufio"
|
||
"bytes"
|
||
"compress/gzip"
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"sort"
|
||
"strconv"
|
||
"strings"
|
||
"sync"
|
||
"syscall"
|
||
"time"
|
||
)
|
||
|
||
// Estimated wall-clock durations for each SAT/validate test, derived from real
// production logs in _benchmark/_v8/.
//
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
// the corresponding Run*Pack function change, re-measure the wall-clock duration
// from actual task logs and update the matching constant here.
//
// Sources:
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
const (
	// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU stress: stress-ng 1800 s (stress mode default).
	SATEstimatedCPUStressSec = 1800

	// RAM: memtester 256 MB / 1 pass.
	SATEstimatedMemoryValidateSec = 70
	// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
	SATEstimatedMemoryStressSec = 140

	// NVIDIA dcgmi diag Level 2 (medium), per GPU, sequential.
	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
	// NVIDIA dcgmi diag Level 3 (targeted stress), per GPU, sequential.
	SATEstimatedNvidiaGPUStressPerGPUSec = 450

	// NVIDIA dcgmi targeted_stress 300 s + overhead, per GPU, sequential.
	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
	// NVIDIA dcgmi targeted_power 300 s + overhead, per GPU, sequential.
	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350

	// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
	SATEstimatedNvidiaPulseTestSec = 5000

	// NCCL all_reduce_perf, all GPUs simultaneously.
	SATEstimatedNvidiaInterconnectSec = 300
	// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
	// without a user-configurable time limit; duration is determined by nvbandwidth itself.
	SATEstimatedNvidiaBandwidthSec = 2700
)
|
||
|
||
var (
	// Test seams: package-level indirections over os/exec and filesystem
	// helpers so tests can stub external commands and probes.
	satExecCommand  = exec.Command
	satLookPath     = exec.LookPath
	satGlob         = filepath.Glob
	satStat         = os.Stat
	satFreeMemBytes = freeMemBytes

	// Known install locations for ROCm tooling. The "rocm-*" globs match
	// versioned side-by-side installs (e.g. /opt/rocm-6.x).
	rocmSMIExecutableGlobs = []string{
		"/opt/rocm/bin/rocm-smi",
		"/opt/rocm-*/bin/rocm-smi",
	}
	rocmSMIScriptGlobs = []string{
		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
	}
	rvsExecutableGlobs = []string{
		"/opt/rocm/bin/rvs",
		"/opt/rocm-*/bin/rvs",
	}
	// dcgmproftester binary names to try, unversioned name first, then
	// CUDA-versioned variants newest-first.
	dcgmProfTesterCandidates = []string{
		"dcgmproftester",
		"dcgmproftester13",
		"dcgmproftester12",
		"dcgmproftester11",
	}
)
|
||
|
||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||
// Returns combined stdout+stderr as a byte slice.
|
||
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
||
pr, pw := io.Pipe()
|
||
cmd.Stdout = pw
|
||
cmd.Stderr = pw
|
||
|
||
var buf bytes.Buffer
|
||
var wg sync.WaitGroup
|
||
wg.Add(1)
|
||
go func() {
|
||
defer wg.Done()
|
||
scanner := bufio.NewScanner(pr)
|
||
for scanner.Scan() {
|
||
line := scanner.Text()
|
||
buf.WriteString(line + "\n")
|
||
if logFunc != nil {
|
||
logFunc(line)
|
||
}
|
||
}
|
||
}()
|
||
|
||
err := cmd.Start()
|
||
if err != nil {
|
||
_ = pw.Close()
|
||
wg.Wait()
|
||
return nil, err
|
||
}
|
||
waitErr := cmd.Wait()
|
||
_ = pw.Close()
|
||
wg.Wait()
|
||
return buf.Bytes(), waitErr
|
||
}
|
||
|
||
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
	Index    int    `json:"index"`     // nvidia-smi GPU index
	Name     string `json:"name"`      // product name as reported by nvidia-smi
	MemoryMB int    `json:"memory_mb"` // total memory in MB (0 if unparseable)
}
|
||
|
||
// NvidiaGPUStatus is one row of per-GPU health derived from a wide
// nvidia-smi query. Unparseable rows are preserved via RawLine with
// ParseFailure set rather than dropped.
type NvidiaGPUStatus struct {
	Index        int    `json:"index"`
	Name         string `json:"name"`
	BDF          string `json:"bdf,omitempty"`    // normalized lowercase PCI bus id
	Serial       string `json:"serial,omitempty"`
	Status       string `json:"status"` // "OK", "RESET_REQUIRED", or "UNKNOWN"
	RawLine      string `json:"raw_line,omitempty"`
	NeedsReset   bool   `json:"needs_reset"`
	ParseFailure bool   `json:"parse_failure,omitempty"` // row could not be parsed into fields
}
|
||
|
||
// nvidiaGPUHealth is an internal per-GPU health record (unexported twin of
// NvidiaGPUStatus used during pack runs).
type nvidiaGPUHealth struct {
	Index        int
	Name         string
	NeedsReset   bool   // GPU reported as requiring a reset
	RawLine      string // original nvidia-smi row, kept for diagnostics
	ParseFailure bool   // row could not be parsed
}
|
||
|
||
// nvidiaGPUStatusFile accumulates per-GPU outcome data written out after an
// acceptance-pack run (selection, run status, health observations).
type nvidiaGPUStatusFile struct {
	Index      int
	Name       string
	RunStatus  string
	Reason     string
	Health     string
	HealthRaw  string
	Observed   bool   // a health observation was recorded for this GPU
	Selected   bool   // GPU was part of the run's selection
	FailingJob string // name of the job that failed for this GPU, if any
}
|
||
|
||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
	Index int    `json:"index"` // position in rocm-smi output order
	Name  string `json:"name"`  // product name column from rocm-smi CSV
}
|
||
|
||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||
func (s *System) DetectGPUVendor() string {
|
||
if _, err := os.Stat("/dev/nvidia0"); err == nil {
|
||
return "nvidia"
|
||
}
|
||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||
return "amd"
|
||
}
|
||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||
text := strings.ToLower(string(raw))
|
||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||
return "amd"
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||
out, err := runROCmSMI("--showproductname", "--csv")
|
||
if err != nil {
|
||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||
}
|
||
var gpus []AMDGPUInfo
|
||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
|
||
continue
|
||
}
|
||
parts := strings.SplitN(line, ",", 2)
|
||
name := ""
|
||
if len(parts) >= 2 {
|
||
name = strings.TrimSpace(parts[1])
|
||
}
|
||
idx := len(gpus)
|
||
gpus = append(gpus, AMDGPUInfo{Index: idx, Name: name})
|
||
}
|
||
return gpus, nil
|
||
}
|
||
|
||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||
if err := ensureAMDRuntimeReady(); err != nil {
|
||
return "", err
|
||
}
|
||
cfgFile := "/tmp/bee-amd-mem.conf"
|
||
cfg := `actions:
|
||
- name: mem_integrity
|
||
device: all
|
||
module: mem
|
||
parallel: true
|
||
duration: 60000
|
||
copy_matrix: false
|
||
target_stress: 90
|
||
matrix_size: 8640
|
||
`
|
||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||
if err := ensureAMDRuntimeReady(); err != nil {
|
||
return "", err
|
||
}
|
||
cfgFile := "/tmp/bee-amd-babel.conf"
|
||
cfg := `actions:
|
||
- name: babel_mem_bw
|
||
device: all
|
||
module: babel
|
||
parallel: true
|
||
copy_matrix: true
|
||
target_stress: 90
|
||
matrix_size: 134217728
|
||
`
|
||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||
seconds := durationSec
|
||
if seconds <= 0 {
|
||
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||
}
|
||
if err := ensureAMDRuntimeReady(); err != nil {
|
||
return "", err
|
||
}
|
||
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||
rvsCfg := amdStressRVSConfig(seconds)
|
||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||
}
|
||
|
||
func amdStressRVSConfig(seconds int) string {
|
||
return fmt.Sprintf(`actions:
|
||
- name: gst_stress
|
||
device: all
|
||
module: gst
|
||
parallel: true
|
||
duration: %d
|
||
copy_matrix: false
|
||
target_stress: 90
|
||
matrix_size_a: 8640
|
||
matrix_size_b: 8640
|
||
matrix_size_c: 8640
|
||
`, seconds*1000)
|
||
}
|
||
|
||
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||
return []satJob{
|
||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||
}
|
||
}
|
||
|
||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||
out, err := exec.Command("nvidia-smi",
|
||
"--query-gpu=index,name,memory.total",
|
||
"--format=csv,noheader,nounits").Output()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||
}
|
||
var gpus []NvidiaGPU
|
||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" {
|
||
continue
|
||
}
|
||
parts := strings.SplitN(line, ", ", 3)
|
||
if len(parts) != 3 {
|
||
continue
|
||
}
|
||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||
if err != nil {
|
||
continue
|
||
}
|
||
memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
||
gpus = append(gpus, NvidiaGPU{
|
||
Index: idx,
|
||
Name: strings.TrimSpace(parts[1]),
|
||
MemoryMB: memMB,
|
||
})
|
||
}
|
||
sort.Slice(gpus, func(i, j int) bool {
|
||
return gpus[i].Index < gpus[j].Index
|
||
})
|
||
return gpus, nil
|
||
}
|
||
|
||
// ListNvidiaGPUStatuses returns per-GPU health rows from a wide nvidia-smi
// query. Rows that cannot be parsed are still returned with Status "UNKNOWN"
// and ParseFailure set, so no GPU silently disappears from the listing.
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi: %w", err)
	}
	var gpus []NvidiaGPUStatus
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Only the first four columns are parsed into fields; the full row
		// is preserved in RawLine for the remaining metrics.
		parts := strings.Split(line, ",")
		if len(parts) < 4 {
			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
			continue
		}
		// Scan the whole row (case-insensitively) for the reset marker —
		// presumably nvidia-smi emits "GPU Requires Reset" text in place of
		// metrics for a wedged GPU; TODO confirm exact marker placement.
		upper := strings.ToUpper(line)
		needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
		status := "OK"
		if needsReset {
			status = "RESET_REQUIRED"
		}
		gpus = append(gpus, NvidiaGPUStatus{
			Index:      idx,
			Name:       strings.TrimSpace(parts[1]),
			BDF:        normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
			Serial:     strings.TrimSpace(parts[3]),
			Status:     status,
			RawLine:    line,
			NeedsReset: needsReset,
		})
	}
	sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
	return gpus, nil
}
|
||
|
||
// normalizeNvidiaBusID lowercases a PCI bus id and shortens an extended
// domain field to its final four hex digits, so e.g. "00000000:3B:00.0"
// and "0000:3b:00.0" normalize to the same value.
func normalizeNvidiaBusID(v string) string {
	id := strings.ToLower(strings.TrimSpace(v))
	segs := strings.Split(id, ":")
	if len(segs) != 3 || len(segs[0]) <= 4 {
		// Not a domain:bus:dev.fn triple, or domain already <= 4 chars.
		return id
	}
	segs[0] = segs[0][len(segs[0])-4:]
	return strings.Join(segs, ":")
}
|
||
|
||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||
if index < 0 {
|
||
return "", fmt.Errorf("gpu index must be >= 0")
|
||
}
|
||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||
if len(raw) == 0 && err == nil {
|
||
raw = []byte("GPU reset completed.\n")
|
||
}
|
||
return string(raw), err
|
||
}
|
||
|
||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
gpuCount := len(selected)
|
||
if gpuCount < 1 {
|
||
gpuCount = 1
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||
), logFunc)
|
||
}
|
||
|
||
// RunNvidiaOfficialComputePack runs a dcgmproftester-based compute burn.
//
// Multi-GPU runs always go through bee-dcgmproftester-staggered, which
// spawns one dcgmproftester process per GPU (stagger 0 = all simultaneous),
// because a single dcgmproftester process without -i only loads GPU 0
// regardless of CUDA_VISIBLE_DEVICES. Single-GPU runs invoke dcgmproftester
// directly with CUDA_VISIBLE_DEVICES pinning.
//
// durationSec <= 0 falls back to the 300 s default (normalizeNvidiaBurnDuration).
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	var (
		profCmd []string // argv for the burn job
		profEnv []string // extra env (only set for the single-GPU path)
	)
	if len(selected) > 1 {
		// For multiple GPUs, always spawn one dcgmproftester process per GPU via
		// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
		// A single dcgmproftester process without -i only loads GPU 0 regardless
		// of CUDA_VISIBLE_DEVICES.
		stagger := staggerSec
		if stagger < 0 {
			stagger = 0
		}
		profCmd = []string{
			"bee-dcgmproftester-staggered",
			"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
			"--stagger-seconds", strconv.Itoa(stagger),
			"--devices", joinIndexList(selected),
		}
	} else {
		// Single GPU: run dcgmproftester directly (test 1004), pinned via
		// CUDA_VISIBLE_DEVICES.
		profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
		if err != nil {
			return "", err
		}
		profEnv = nvidiaVisibleDevicesEnv(selected)
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
		satJob{
			name:       "03-dcgmproftester.log",
			cmd:        profCmd,
			env:        profEnv,
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}
|
||
|
||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||
for _, p := range killed {
|
||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||
}
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{
|
||
name: "02-dcgmi-targeted-power.log",
|
||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||
collectGPU: true,
|
||
gpuIndices: selected,
|
||
},
|
||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||
), logFunc)
|
||
}
|
||
|
||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||
for _, p := range killed {
|
||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||
}
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{
|
||
name: "02-dcgmi-pulse-test.log",
|
||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||
collectGPU: true,
|
||
gpuIndices: selected,
|
||
},
|
||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||
), logFunc)
|
||
}
|
||
|
||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||
for _, p := range killed {
|
||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||
}
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{
|
||
name: "02-dcgmi-nvbandwidth.log",
|
||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||
collectGPU: true,
|
||
gpuIndices: selected,
|
||
},
|
||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||
), logFunc)
|
||
}
|
||
|
||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||
}
|
||
|
||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||
// ctx cancellation kills the running job.
|
||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||
}
|
||
|
||
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||
for _, p := range killed {
|
||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||
}
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{
|
||
name: "02-dcgmi-targeted-stress.log",
|
||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||
collectGPU: true,
|
||
gpuIndices: selected,
|
||
},
|
||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||
), logFunc)
|
||
}
|
||
|
||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||
if len(gpuIndices) > 0 {
|
||
return dedupeSortedIndices(gpuIndices), nil
|
||
}
|
||
all, err := listNvidiaGPUIndices()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(all) == 0 {
|
||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||
}
|
||
return all, nil
|
||
}
|
||
|
||
func memoryStressSizeArg() string {
|
||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||
return fmt.Sprintf("%dM", mb)
|
||
}
|
||
availBytes := satFreeMemBytes()
|
||
if availBytes <= 0 {
|
||
return "80%"
|
||
}
|
||
availMB := availBytes / (1024 * 1024)
|
||
targetMB := (availMB * 2) / 3
|
||
if targetMB >= 256 {
|
||
targetMB = (targetMB / 256) * 256
|
||
}
|
||
if targetMB <= 0 {
|
||
return "80%"
|
||
}
|
||
return fmt.Sprintf("%dM", targetMB)
|
||
}
|
||
|
||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||
if sizeMB <= 0 {
|
||
sizeMB = 256
|
||
}
|
||
if passes <= 0 {
|
||
passes = 1
|
||
}
|
||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||
// intentionally conservative enough for healthy systems while avoiding the
|
||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||
timeoutSec := sizeMB*passes*20/100 + 60
|
||
if timeoutSec < 180 {
|
||
timeoutSec = 180
|
||
}
|
||
if timeoutSec > 900 {
|
||
timeoutSec = 900
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||
seconds := durationSec
|
||
if seconds <= 0 {
|
||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||
}
|
||
// Base the default on current MemAvailable and keep headroom for the OS and
|
||
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||
sizeArg := memoryStressSizeArg()
|
||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||
"stress-ng", "--vm", "1",
|
||
"--vm-bytes", sizeArg,
|
||
"--vm-method", "all",
|
||
"--timeout", fmt.Sprintf("%d", seconds),
|
||
"--metrics-brief",
|
||
}},
|
||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||
seconds := durationSec
|
||
if seconds <= 0 {
|
||
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||
}
|
||
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||
{name: "02-stressapptest.log", cmd: cmd},
|
||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||
if durationSec <= 0 {
|
||
durationSec = 60
|
||
}
|
||
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||
}, logFunc)
|
||
}
|
||
|
||
// RunStorageAcceptancePack runs per-device storage diagnostics for every
// device listStorageDevices reports, writing one log file per device/command
// plus a summary.txt into a timestamped run directory under baseDir
// (default /var/log/bee-sat). extended is forwarded to storageSATCommands.
// ctx cancellation stops the remaining devices/commands; results gathered so
// far are still summarized.
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "storage-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	devices, err := listStorageDevices()
	if err != nil {
		return "", err
	}
	sort.Strings(devices)

	var summary strings.Builder
	stats := satStats{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	if len(devices) == 0 {
		// No devices counts as an UNSUPPORTED result rather than a failure.
		fmt.Fprintln(&summary, "devices=0")
		stats.Unsupported++
	} else {
		fmt.Fprintf(&summary, "devices=%d\n", len(devices))
	}

	for index, devPath := range devices {
		if ctx.Err() != nil {
			break
		}
		// Log files are named "<NN>-<device>-<MM>-<command>.log".
		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
		commands := storageSATCommands(devPath, extended)
		for cmdIndex, job := range commands {
			if ctx.Err() != nil {
				break
			}
			name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
			out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
			if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
				return "", writeErr
			}
			// Classify the result and record per-command rc/status lines in
			// the summary under a "<device>_<command>" key.
			status, rc := classifySATResult(job.name, out, err)
			stats.Add(status)
			key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
			fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
			fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
		}
	}

	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}
	return runDir, nil
}
|
||
|
||
// satJob is one command within an acceptance pack: the log file it writes,
// the argv to run, and optional environment / GPU-metric collection.
type satJob struct {
	name       string   // log file name within the run directory
	cmd        []string // argv to execute
	env        []string // extra env vars (appended to os.Environ)
	collectGPU bool     // collect GPU metrics via nvidia-smi while this job runs
	gpuIndices []int    // GPU indices to collect metrics for (empty = all)
}
|
||
|
||
// satStats tallies per-job outcomes for a pack run's summary.
type satStats struct {
	OK          int
	Failed      int
	Unsupported int // jobs whose tool is unavailable on this host
}
|
||
|
||
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||
out := make([]satJob, 0, len(jobs)+1)
|
||
out = append(out, satJob{
|
||
name: "00-nvidia-smi-persistence-mode.log",
|
||
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||
})
|
||
out = append(out, jobs...)
|
||
return out
|
||
}
|
||
|
||
func nvidiaSATJobs() []satJob {
|
||
return withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||
)
|
||
}
|
||
|
||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||
if diagLevel < 1 || diagLevel > 4 {
|
||
diagLevel = 3
|
||
}
|
||
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||
if len(gpuIndices) > 0 {
|
||
ids := make([]string, len(gpuIndices))
|
||
for i, idx := range gpuIndices {
|
||
ids[i] = strconv.Itoa(idx)
|
||
}
|
||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||
}
|
||
return withNvidiaPersistenceMode(
|
||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||
)
|
||
}
|
||
|
||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||
args := []string{"dcgmi", "diag", "-r", name}
|
||
if durationSec > 0 {
|
||
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||
}
|
||
if len(gpuIndices) > 0 {
|
||
args = append(args, "-i", joinIndexList(gpuIndices))
|
||
}
|
||
return args
|
||
}
|
||
|
||
// normalizeNvidiaBurnDuration maps non-positive durations to the 300 s
// default used by the NVIDIA burn packs.
func normalizeNvidiaBurnDuration(durationSec int) int {
	if durationSec > 0 {
		return durationSec
	}
	return 300
}
|
||
|
||
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||
if len(gpuIndices) == 0 {
|
||
return nil
|
||
}
|
||
return []string{
|
||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||
}
|
||
}
|
||
|
||
// runAcceptancePackCtx runs a SAT job pack sequentially under ctx and collects
// every artifact into a fresh timestamped run directory, which it returns.
//
// Layout written under <baseDir>/<prefix>-<UTC-timestamp>/:
//   - one <job.name> file per job containing the job's output
//   - verbose.log with start/finish/rc/duration bookkeeping
//   - summary.txt with per-job rc/status lines plus aggregate counters
//   - for packs whose prefix starts with "gpu-nvidia": gpu-<N>-status.txt files
//
// A nil ctx falls back to context.Background(); an empty baseDir to
// /var/log/bee-sat. Cancellation stops before the next job and aborts the run
// with ctx.Err() after the in-flight job's output has been written.
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, prefix+"-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	var summary strings.Builder
	stats := satStats{}
	// NVIDIA packs get extra per-GPU status tracking and pre/post health checks.
	nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
	perGPU := map[int]*nvidiaGPUStatusFile{}
	selectedGPUIndices := map[int]struct{}{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	for _, job := range jobs {
		if ctx.Err() != nil {
			break
		}
		// Record which GPUs this job explicitly targets so status files can
		// distinguish "selected" GPUs from merely observed ones.
		for _, idx := range job.gpuIndices {
			selectedGPUIndices[idx] = struct{}{}
			status := perGPU[idx]
			if status == nil {
				status = &nvidiaGPUStatusFile{Index: idx}
				perGPU[idx] = status
			}
			status.Selected = true
		}
		// Expand the {{run_dir}} placeholder in job arguments.
		cmd := make([]string, 0, len(job.cmd))
		for _, arg := range job.cmd {
			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
		}

		var out []byte
		var err error

		// Pre-check: skip stress-style jobs entirely if a targeted GPU already
		// requires a reset, recording the health message as the job's output.
		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
				if logFunc != nil {
					logFunc(msg)
				}
				out = []byte(msg + "\n")
				err = healthErr
			}
		}

		if err == nil {
			if job.collectGPU {
				// collectGPU jobs also sample GPU metrics while they run.
				out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
			} else {
				out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
			}
		}

		// Post-check: a job that "succeeded" but left a GPU in a
		// reset-required state is still a failure; append the health message.
		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
				if logFunc != nil {
					logFunc(msg)
				}
				if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
					out = append(out, '\n')
				}
				out = append(out, []byte(msg+"\n")...)
				if err == nil {
					err = healthErr
				}
			}
		}

		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
			return "", writeErr
		}
		if ctx.Err() != nil {
			return "", ctx.Err()
		}
		status, rc := classifySATResult(job.name, out, err)
		stats.Add(status)
		if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
			for _, idx := range job.gpuIndices {
				updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
			}
		}
		// Summary key: job name without the leading "0" and ".log" suffix,
		// e.g. "04-dcgmi-diag.log" -> "4-dcgmi-diag".
		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
	}
	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}
	if nvidiaPack {
		if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
			return "", err
		}
	}

	return runDir, nil
}
|
||
|
||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||
entry := perGPU[idx]
|
||
if entry == nil {
|
||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||
perGPU[idx] = entry
|
||
}
|
||
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||
entry.RunStatus = status
|
||
entry.FailingJob = jobName
|
||
entry.Reason = firstLine(detail)
|
||
}
|
||
}
|
||
|
||
// writeNvidiaGPUStatusFiles writes one gpu-<N>-status.txt key=value file per
// known GPU into runDir. It merges three inputs:
//   - perGPU: per-job run results accumulated during the pack,
//   - live health from nvidia-smi (reset-required detection), best-effort, and
//   - selected: the GPU indices the pack explicitly targeted.
//
// GPUs with no job-level status inherit the pack's overall status; a GPU that
// currently requires a reset is forced to FAILED regardless.
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
	health, err := readNvidiaGPUHealth()
	// Health readout is best-effort: on error we simply skip enrichment.
	if err == nil {
		for _, gpu := range health {
			entry := perGPU[gpu.Index]
			if entry == nil {
				entry = &nvidiaGPUStatusFile{Index: gpu.Index}
				perGPU[gpu.Index] = entry
			}
			entry.Name = gpu.Name
			entry.Observed = true
			entry.HealthRaw = gpu.RawLine
			if gpu.NeedsReset {
				entry.Health = "RESET_REQUIRED"
				// Escalate the run status to FAILED unless something even more
				// severe is already recorded (FAILED ties still overwrite).
				if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
					entry.RunStatus = "FAILED"
					if strings.TrimSpace(entry.Reason) == "" {
						entry.Reason = "GPU requires reset"
					}
				}
			} else {
				entry.Health = "OK"
			}
		}
	}
	// Make sure every explicitly-selected GPU gets a status file even if no
	// job ran against it and nvidia-smi did not report it.
	for idx := range selected {
		entry := perGPU[idx]
		if entry == nil {
			entry = &nvidiaGPUStatusFile{Index: idx}
			perGPU[idx] = entry
		}
		entry.Selected = true
	}
	// Deterministic output order.
	var indices []int
	for idx := range perGPU {
		indices = append(indices, idx)
	}
	sort.Ints(indices)
	for _, idx := range indices {
		entry := perGPU[idx]
		// Fill defaults for fields no source populated.
		if entry.RunStatus == "" {
			entry.RunStatus = overall
		}
		if entry.Health == "" {
			entry.Health = "UNKNOWN"
		}
		if entry.Name == "" {
			entry.Name = "Unknown GPU"
		}
		var body strings.Builder
		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
		fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
		fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
		fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
		fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
		fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
		if strings.TrimSpace(entry.FailingJob) != "" {
			fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
		}
		if strings.TrimSpace(entry.Reason) != "" {
			fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
		}
		if strings.TrimSpace(entry.HealthRaw) != "" {
			fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
		}
		if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
			return err
		}
	}
	return nil
}
|
||
|
||
// nvidiaSATStatusSeverity ranks a SAT status string for worst-wins merging:
// FAILED (3) > PARTIAL/UNSUPPORTED (2) > OK (1) > anything else (0).
// Comparison is case-insensitive and ignores surrounding whitespace.
func nvidiaSATStatusSeverity(status string) int {
	normalized := strings.ToUpper(strings.TrimSpace(status))
	if normalized == "FAILED" {
		return 3
	}
	if normalized == "PARTIAL" || normalized == "UNSUPPORTED" {
		return 2
	}
	if normalized == "OK" {
		return 1
	}
	return 0
}
|
||
|
||
// firstLine returns the first line of s with surrounding whitespace removed;
// an all-whitespace input yields "".
func firstLine(s string) string {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return ""
	}
	head, _, _ := strings.Cut(trimmed, "\n")
	return strings.TrimSpace(head)
}
|
||
|
||
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||
if job.collectGPU {
|
||
return true
|
||
}
|
||
name := strings.ToLower(strings.TrimSpace(job.name))
|
||
return strings.Contains(name, "dcgmi") ||
|
||
strings.Contains(name, "gpu-burn") ||
|
||
strings.Contains(name, "gpu-stress") ||
|
||
strings.Contains(name, "dcgmproftester")
|
||
}
|
||
|
||
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||
health, err := readNvidiaGPUHealth()
|
||
if err != nil {
|
||
return "", nil
|
||
}
|
||
var bad []nvidiaGPUHealth
|
||
selectedSet := make(map[int]struct{}, len(selected))
|
||
for _, idx := range selected {
|
||
selectedSet[idx] = struct{}{}
|
||
}
|
||
for _, gpu := range health {
|
||
if len(selectedSet) > 0 {
|
||
if _, ok := selectedSet[gpu.Index]; !ok {
|
||
continue
|
||
}
|
||
}
|
||
if gpu.NeedsReset {
|
||
bad = append(bad, gpu)
|
||
}
|
||
}
|
||
if len(bad) == 0 {
|
||
return "", nil
|
||
}
|
||
lines := make([]string, 0, len(bad)+1)
|
||
lines = append(lines, "NVIDIA GPU health check failed:")
|
||
for _, gpu := range bad {
|
||
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||
}
|
||
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||
}
|
||
|
||
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||
out, err := satExecCommand(
|
||
"nvidia-smi",
|
||
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||
"--format=csv,noheader,nounits",
|
||
).Output()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||
}
|
||
return parseNvidiaGPUHealth(string(out)), nil
|
||
}
|
||
|
||
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||
var gpus []nvidiaGPUHealth
|
||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" {
|
||
continue
|
||
}
|
||
parts := strings.Split(line, ",")
|
||
if len(parts) < 2 {
|
||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||
continue
|
||
}
|
||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||
if err != nil {
|
||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||
continue
|
||
}
|
||
upper := strings.ToUpper(line)
|
||
gpus = append(gpus, nvidiaGPUHealth{
|
||
Index: idx,
|
||
Name: strings.TrimSpace(parts[1]),
|
||
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||
RawLine: line,
|
||
})
|
||
}
|
||
return gpus
|
||
}
|
||
|
||
// runSATCommandCtx resolves and runs one SAT command under ctx, streaming its
// output through logFunc and recording start/finish/rc/duration lines in the
// verbose log. It returns the captured output; on resolution failure the
// error text itself becomes the "output" so the per-job log file is never
// empty. env entries (KEY=VALUE) are appended to the current environment.
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
	start := time.Now().UTC()
	resolvedCmd, err := resolveSATCommand(cmd)
	// Log the start even if resolution failed (resolvedCmd is empty then).
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
		"cmd: "+strings.Join(resolvedCmd, " "),
	)
	if logFunc != nil {
		logFunc(fmt.Sprintf("=== %s ===", name))
	}
	if err != nil {
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
			"rc: 1",
			fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
			"",
		)
		return []byte(err.Error() + "\n"), err
	}

	c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
	// Run the child in its own process group so cancellation can kill the
	// entire tree (stress tools fork workers) via the negative-pid kill below.
	c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	c.Cancel = func() error {
		if c.Process != nil {
			_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
		}
		return nil
	}
	if len(env) > 0 {
		c.Env = append(os.Environ(), env...)
	}
	out, err := streamExecOutput(c, logFunc)

	rc := 0
	if err != nil {
		rc = 1
	}
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
		fmt.Sprintf("rc: %d", rc),
		fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
		"",
	)
	return out, err
}
|
||
|
||
// listStorageDevices returns /dev paths of non-USB, disk-type block devices as
// reported by lsblk -dn (no partitions, no header). Filtering lives in
// parseStorageDevices.
func listStorageDevices() ([]string, error) {
	out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
	if err != nil {
		return nil, err
	}
	return parseStorageDevices(string(out)), nil
}
|
||
|
||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||
selfTestLevel := "1"
|
||
if extended {
|
||
selfTestLevel = "2"
|
||
}
|
||
return []satJob{
|
||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||
}
|
||
}
|
||
smartTestType := "short"
|
||
if extended {
|
||
smartTestType = "long"
|
||
}
|
||
return []satJob{
|
||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||
}
|
||
}
|
||
|
||
func (s *satStats) Add(status string) {
|
||
switch status {
|
||
case "OK":
|
||
s.OK++
|
||
case "UNSUPPORTED":
|
||
s.Unsupported++
|
||
default:
|
||
s.Failed++
|
||
}
|
||
}
|
||
|
||
func (s satStats) Overall() string {
|
||
if s.Failed > 0 {
|
||
return "FAILED"
|
||
}
|
||
if s.Unsupported > 0 {
|
||
return "PARTIAL"
|
||
}
|
||
return "OK"
|
||
}
|
||
|
||
// writeSATStats appends the aggregate pack verdict and per-outcome job
// counters to the key=value summary body.
func writeSATStats(summary *strings.Builder, stats satStats) {
	fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
	fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
	fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
	fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
}
|
||
|
||
// classifySATResult maps one job's output and error into a (status, rc) pair:
// "OK"/0 on success; otherwise rc is 1 and the status is "UNSUPPORTED" when
// the output looks like a missing tool/feature rather than a hardware fault,
// or "FAILED" for everything else.
func classifySATResult(name string, out []byte, err error) (string, int) {
	if err == nil {
		return "OK", 0
	}

	text := strings.ToLower(string(out))
	// No output at all means the tool failed to start (mlock limit, binary missing,
	// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
	if strings.TrimSpace(text) == "" {
		return "UNSUPPORTED", 1
	}
	// Substrings that indicate "tool/feature unavailable", not hardware failure.
	unsupportedMarkers := []string{
		"unsupported",
		"not supported",
		"not found in path",
		"invalid opcode",
		"unknown command",
		"not implemented",
		"not available",
		"cuda_error_system_not_ready",
		"no such device",
		// nvidia-smi on a machine with no NVIDIA GPU
		"couldn't communicate with the nvidia driver",
		"no nvidia gpu",
	}
	for _, marker := range unsupportedMarkers {
		if strings.Contains(text, marker) {
			return "UNSUPPORTED", 1
		}
	}
	if strings.Contains(name, "self-test") {
		// Some NVMe firmwares start self-test but never expose progress to nvme-cli
		// while waiting, so the CLI stops polling without proving device failure.
		if strings.Contains(text, "no progress for") && strings.Contains(text, "stop waiting") {
			return "UNSUPPORTED", 1
		}
		if strings.Contains(text, "aborted") {
			return "UNSUPPORTED", 1
		}
	}
	return "FAILED", 1
}
|
||
|
||
// runSATCommand is the context-free sibling of runSATCommandCtx: it resolves
// and runs one SAT command via the satExecCommand seam, streams output through
// logFunc, and records start/finish/rc/duration lines in the verbose log.
// Unlike the Ctx variant it supports neither cancellation nor extra env vars.
// NOTE(review): the bookkeeping here duplicates runSATCommandCtx — keep the
// two in sync when changing log format or rc semantics.
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
	start := time.Now().UTC()
	resolvedCmd, err := resolveSATCommand(cmd)
	// Log the start even if resolution failed (resolvedCmd is empty then).
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
		"cmd: "+strings.Join(resolvedCmd, " "),
	)
	if logFunc != nil {
		logFunc(fmt.Sprintf("=== %s ===", name))
	}
	if err != nil {
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
			"rc: 1",
			fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
			"",
		)
		// The error text becomes the job output so the log file is never empty.
		return []byte(err.Error() + "\n"), err
	}

	out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)

	rc := 0
	if err != nil {
		rc = 1
	}
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
		fmt.Sprintf("rc: %d", rc),
		fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
		"",
	)
	return out, err
}
|
||
|
||
// runROCmSMI resolves the rocm-smi tool (PATH, /opt/rocm binaries, or a
// python3 script fallback — see resolveROCmSMICommand) and runs it with args,
// returning combined stdout+stderr.
func runROCmSMI(args ...string) ([]byte, error) {
	cmd, err := resolveROCmSMICommand(args...)
	if err != nil {
		return nil, err
	}
	return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}
|
||
|
||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||
if len(cmd) == 0 {
|
||
return nil, errors.New("empty SAT command")
|
||
}
|
||
switch cmd[0] {
|
||
case "rocm-smi":
|
||
return resolveROCmSMICommand(cmd[1:]...)
|
||
case "rvs":
|
||
return resolveRVSCommand(cmd[1:]...)
|
||
}
|
||
path, err := satLookPath(cmd[0])
|
||
if err != nil {
|
||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||
}
|
||
return append([]string{path}, cmd[1:]...), nil
|
||
}
|
||
|
||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||
if path, err := satLookPath("rvs"); err == nil {
|
||
return append([]string{path}, args...), nil
|
||
}
|
||
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||
return append([]string{path}, args...), nil
|
||
}
|
||
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||
}
|
||
|
||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||
return append([]string{path}, args...), nil
|
||
}
|
||
|
||
for _, path := range rocmSMIExecutableCandidates() {
|
||
return append([]string{path}, args...), nil
|
||
}
|
||
|
||
pythonPath, pyErr := satLookPath("python3")
|
||
if pyErr == nil {
|
||
for _, script := range rocmSMIScriptCandidates() {
|
||
cmd := []string{pythonPath, script}
|
||
cmd = append(cmd, args...)
|
||
return cmd, nil
|
||
}
|
||
}
|
||
|
||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||
}
|
||
|
||
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||
for _, candidate := range dcgmProfTesterCandidates {
|
||
if path, err := satLookPath(candidate); err == nil {
|
||
return append([]string{path}, args...), nil
|
||
}
|
||
}
|
||
return nil, errors.New("dcgmproftester not found in PATH")
|
||
}
|
||
|
||
// ensureAMDRuntimeReady verifies the AMD compute runtime is usable. The KFD
// device node is the strongest signal; failing that, an amdgpu module in the
// "live" init state is accepted. Anything else is an error describing which
// stage of initialization is missing.
func ensureAMDRuntimeReady() error {
	if _, err := os.Stat("/dev/kfd"); err == nil {
		return nil
	}
	raw, readErr := os.ReadFile("/sys/module/amdgpu/initstate")
	if readErr != nil {
		return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
	}
	state := strings.TrimSpace(string(raw))
	if strings.EqualFold(state, "live") {
		return nil
	}
	return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
}
|
||
|
||
// rocmSMIExecutableCandidates returns existing rocm-smi binaries matching the
// rocmSMIExecutableGlobs patterns, sorted and de-duplicated.
func rocmSMIExecutableCandidates() []string {
	return expandExistingPaths(rocmSMIExecutableGlobs)
}
|
||
|
||
// rocmSMIScriptCandidates returns existing rocm-smi Python scripts matching
// the rocmSMIScriptGlobs patterns, sorted and de-duplicated.
func rocmSMIScriptCandidates() []string {
	return expandExistingPaths(rocmSMIScriptGlobs)
}
|
||
|
||
func expandExistingPaths(patterns []string) []string {
|
||
seen := make(map[string]struct{})
|
||
var paths []string
|
||
for _, pattern := range patterns {
|
||
matches, err := satGlob(pattern)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
sort.Strings(matches)
|
||
for _, match := range matches {
|
||
if _, err := satStat(match); err != nil {
|
||
continue
|
||
}
|
||
if _, ok := seen[match]; ok {
|
||
continue
|
||
}
|
||
seen[match] = struct{}{}
|
||
paths = append(paths, match)
|
||
}
|
||
}
|
||
return paths
|
||
}
|
||
|
||
// parseStorageDevices extracts /dev paths from "NAME TYPE TRAN" lsblk output,
// keeping only TYPE=="disk" rows and dropping USB-attached devices (removable
// media are not SAT targets).
func parseStorageDevices(raw string) []string {
	var devices []string
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		if fields[1] != "disk" {
			continue
		}
		if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
			continue
		}
		devices = append(devices, "/dev/"+fields[0])
	}
	return devices
}
|
||
|
||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
//
// A sampler goroutine polls sampleGPUMetrics once per second (errors are
// skipped silently) and stamps each row with the elapsed wall-clock seconds.
// metricRows is only appended by the goroutine and only read after the
// stop/done channel handshake below, so no further synchronization is needed.
// The artifact writes are best-effort: their errors are ignored and the
// command's own output/error is always returned.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
	stopCh := make(chan struct{}) // closed to tell the sampler to stop
	doneCh := make(chan struct{}) // closed by the sampler once it has exited
	var metricRows []GPUMetricRow
	start := time.Now()

	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				samples, err := sampleGPUMetrics(gpuIndices)
				if err != nil {
					// Sampling is best-effort; just miss this tick.
					continue
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()

	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)

	// Stop the sampler and wait for it to finish before reading metricRows.
	close(stopCh)
	<-doneCh

	if len(metricRows) > 0 {
		_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
		_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
	}

	return out, err
}
|
||
|
||
// appendSATVerboseLog appends each line (newline-terminated) to the verbose
// log at path. It is strictly best-effort: an empty path or any open/write
// failure is silently ignored so logging never disturbs the SAT run itself.
func appendSATVerboseLog(path string, lines ...string) {
	if path == "" {
		return
	}
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return
	}
	defer f.Close()
	var chunk strings.Builder
	for _, line := range lines {
		chunk.WriteString(line)
		chunk.WriteByte('\n')
	}
	_, _ = io.WriteString(f, chunk.String())
}
|
||
|
||
// envInt reads a positive integer from environment variable name, returning
// fallback when the variable is unset, blank, non-numeric, zero, or negative.
func envInt(name string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(name))
	if raw == "" {
		return fallback
	}
	if value, err := strconv.Atoi(raw); err == nil && value > 0 {
		return value
	}
	return fallback
}
|
||
|
||
// createTarGz archives srcDir into a gzip-compressed tarball at dst. Entry
// names are relative to srcDir's parent (so the archive unpacks into a
// directory named after srcDir) and are normalized to forward slashes, as the
// tar format requires. Only regular files produce entries: directories and
// non-regular files (symlinks, device nodes, ...) are skipped — previously a
// symlink produced a header with an empty link target followed by a size
// mismatch from io.Copy, breaking the archive.
func createTarGz(dst, srcDir string) error {
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer out.Close()

	gz := gzip.NewWriter(out)
	defer gz.Close()

	tw := tar.NewWriter(gz)
	defer tw.Close()

	base := filepath.Dir(srcDir)
	return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.Mode().IsRegular() {
			// Skip directories, symlinks, sockets, devices, etc.
			return nil
		}
		header, err := tar.FileInfoHeader(info, "")
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(base, path)
		if err != nil {
			return err
		}
		// tar entry names are always slash-separated, regardless of OS.
		header.Name = filepath.ToSlash(rel)
		if err := tw.WriteHeader(header); err != nil {
			return err
		}
		f, err := os.Open(path)
		if err != nil {
			return err
		}
		defer f.Close()
		_, err = io.Copy(tw, f)
		return err
	})
}
|