package platform import ( "archive/tar" "bufio" "bytes" "compress/gzip" "context" "errors" "fmt" "io" "os" "os/exec" "path/filepath" "sort" "strconv" "strings" "sync" "syscall" "time" ) var ( satExecCommand = exec.Command satLookPath = exec.LookPath satGlob = filepath.Glob satStat = os.Stat satFreeMemBytes = freeMemBytes rocmSMIExecutableGlobs = []string{ "/opt/rocm/bin/rocm-smi", "/opt/rocm-*/bin/rocm-smi", } rocmSMIScriptGlobs = []string{ "/opt/rocm/libexec/rocm_smi/rocm_smi.py", "/opt/rocm-*/libexec/rocm_smi/rocm_smi.py", } rvsExecutableGlobs = []string{ "/opt/rocm/bin/rvs", "/opt/rocm-*/bin/rvs", } dcgmProfTesterCandidates = []string{ "dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11", } ) // streamExecOutput runs cmd and streams each output line to logFunc (if non-nil). // Returns combined stdout+stderr as a byte slice. func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) { pr, pw := io.Pipe() cmd.Stdout = pw cmd.Stderr = pw var buf bytes.Buffer var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() scanner := bufio.NewScanner(pr) for scanner.Scan() { line := scanner.Text() buf.WriteString(line + "\n") if logFunc != nil { logFunc(line) } } }() err := cmd.Start() if err != nil { _ = pw.Close() wg.Wait() return nil, err } waitErr := cmd.Wait() _ = pw.Close() wg.Wait() return buf.Bytes(), waitErr } // NvidiaGPU holds basic GPU info from nvidia-smi. 
type NvidiaGPU struct {
	Index    int    `json:"index"`
	Name     string `json:"name"`
	MemoryMB int    `json:"memory_mb"`
}

// NvidiaGPUStatus is the externally visible per-GPU status row derived from
// a single CSV line of nvidia-smi output.
type NvidiaGPUStatus struct {
	Index        int    `json:"index"`
	Name         string `json:"name"`
	BDF          string `json:"bdf,omitempty"`
	Serial       string `json:"serial,omitempty"`
	Status       string `json:"status"` // "OK", "RESET_REQUIRED", or "UNKNOWN" on parse failure
	RawLine      string `json:"raw_line,omitempty"`
	NeedsReset   bool   `json:"needs_reset"`
	ParseFailure bool   `json:"parse_failure,omitempty"`
}

// nvidiaGPUHealth is the internal health snapshot parsed from nvidia-smi
// by parseNvidiaGPUHealth.
type nvidiaGPUHealth struct {
	Index        int
	Name         string
	NeedsReset   bool
	RawLine      string
	ParseFailure bool
}

// nvidiaGPUStatusFile accumulates per-GPU results while a pack runs; it is
// flushed to gpu-<n>-status.txt files by writeNvidiaGPUStatusFiles.
type nvidiaGPUStatusFile struct {
	Index      int
	Name       string
	RunStatus  string
	Reason     string
	Health     string
	HealthRaw  string
	Observed   bool
	Selected   bool
	FailingJob string
}

// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
type AMDGPUInfo struct {
	Index int    `json:"index"`
	Name  string `json:"name"`
}

// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd
// exists or lspci reports an AMD device, or "" otherwise.
func (s *System) DetectGPUVendor() string {
	// Device nodes are the cheapest, most reliable signal when a driver is loaded.
	if _, err := os.Stat("/dev/nvidia0"); err == nil {
		return "nvidia"
	}
	if _, err := os.Stat("/dev/kfd"); err == nil {
		return "amd"
	}
	// Fall back to scanning lspci output, which works even without a loaded
	// AMD GPU driver. Note there is no equivalent lspci fallback for NVIDIA.
	if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
		text := strings.ToLower(string(raw))
		if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
			return "amd"
		}
	}
	return ""
}

// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) { out, err := runROCmSMI("--showproductname", "--csv") if err != nil { return nil, fmt.Errorf("rocm-smi: %w", err) } var gpus []AMDGPUInfo for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(strings.ToLower(line), "device") { continue } parts := strings.SplitN(line, ",", 2) name := "" if len(parts) >= 2 { name = strings.TrimSpace(parts[1]) } idx := len(gpus) gpus = append(gpus, AMDGPUInfo{Index: idx, Name: name}) } return gpus, nil } // RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi. func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{ {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, {name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}}, {name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, }, logFunc) } // RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test. 
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { if err := ensureAMDRuntimeReady(); err != nil { return "", err } cfgFile := "/tmp/bee-amd-mem.conf" cfg := `actions: - name: mem_integrity device: all module: mem parallel: true duration: 60000 copy_matrix: false target_stress: 90 matrix_size: 8640 ` _ = os.WriteFile(cfgFile, []byte(cfg), 0644) return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{ {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, {name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}}, {name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}}, }, logFunc) } // RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools. func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { if err := ensureAMDRuntimeReady(); err != nil { return "", err } cfgFile := "/tmp/bee-amd-babel.conf" cfg := `actions: - name: babel_mem_bw device: all module: babel parallel: true copy_matrix: true target_stress: 90 matrix_size: 134217728 ` _ = os.WriteFile(cfgFile, []byte(cfg), 0644) return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{ {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, {name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}}, {name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}}, {name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}}, }, logFunc) } // RunAMDStressPack runs an AMD GPU burn-in pack. // Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern. 
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { seconds := durationSec if seconds <= 0 { seconds = envInt("BEE_AMD_STRESS_SECONDS", 300) } if err := ensureAMDRuntimeReady(); err != nil { return "", err } // Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute. rvsCfg := amdStressRVSConfig(seconds) cfgFile := "/tmp/bee-amd-gst.conf" _ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644) return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc) } func amdStressRVSConfig(seconds int) string { return fmt.Sprintf(`actions: - name: gst_stress device: all module: gst parallel: true duration: %d copy_matrix: false target_stress: 90 matrix_size_a: 8640 matrix_size_b: 8640 matrix_size_c: 8640 `, seconds*1000) } func amdStressJobs(seconds int, cfgFile string) []satJob { return []satJob{ {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, {name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}}, {name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}}, {name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}}, } } // ListNvidiaGPUs returns GPUs visible to nvidia-smi. 
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
	out, err := exec.Command("nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits").Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi: %w", err)
	}
	var gpus []NvidiaGPU
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// nvidia-smi separates CSV fields with ", "; malformed rows are skipped.
		parts := strings.SplitN(line, ", ", 3)
		if len(parts) != 3 {
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			continue
		}
		// Memory parse failures degrade to 0 rather than dropping the GPU.
		memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
		gpus = append(gpus, NvidiaGPU{
			Index:    idx,
			Name:     strings.TrimSpace(parts[1]),
			MemoryMB: memMB,
		})
	}
	sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
	return gpus, nil
}

// ListNvidiaGPUStatuses returns one status row per GPU reported by nvidia-smi.
// Rows that cannot be parsed are kept with Status "UNKNOWN" and ParseFailure set,
// so a broken GPU never silently disappears from the list.
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi: %w", err)
	}
	var gpus []NvidiaGPUStatus
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		if len(parts) < 4 {
			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
			continue
		}
		// nvidia-smi prints "GPU Requires Reset" in value columns of a sick GPU;
		// a case-insensitive substring match over the whole row catches it.
		upper := strings.ToUpper(line)
		needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
		status := "OK"
		if needsReset {
			status = "RESET_REQUIRED"
		}
		gpus = append(gpus, NvidiaGPUStatus{
			Index:      idx,
			Name:       strings.TrimSpace(parts[1]),
			BDF:        normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
			Serial:     strings.TrimSpace(parts[3]),
			Status:     status,
			RawLine:    line,
			NeedsReset: needsReset,
		})
	}
	sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
	return gpus, nil
}

// normalizeNvidiaBusID lowercases a PCI bus id and trims an extended domain
// (e.g. "00000000:3b:00.0") down to the conventional 4-hex-digit form.
func normalizeNvidiaBusID(v string) string {
	v = strings.TrimSpace(strings.ToLower(v))
	parts := strings.Split(v, ":")
	if len(parts) == 3 && len(parts[0]) > 4 {
		parts[0] = parts[0][len(parts[0])-4:]
		return strings.Join(parts, ":")
	}
	return v
}

// ResetNvidiaGPU runs `nvidia-smi -r` for one GPU index and returns its output.
func (s *System) ResetNvidiaGPU(index int) (string, error) {
	if index < 0 {
		return "", fmt.Errorf("gpu index must be >= 0")
	}
	raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
	// Some driver versions print nothing on success; synthesize a message so
	// callers always have something to show.
	if len(raw) == 0 && err == nil {
		raw = []byte("GPU reset completed.\n")
	}
	return string(raw), err
}

// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	// detect GPU count; on error the split of "" still yields one element,
	// and the floor below guarantees at least one GPU is requested.
	out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
	gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
	if gpuCount < 1 {
		gpuCount = 1
	}
	return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{name: "02-all-reduce-perf.log", cmd: []string{
			"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(gpuCount), "--iters", "20",
		}},
	), logFunc)
}

// RunNvidiaOfficialComputePack drives dcgmproftester (fp32 matmul, test 1004)
// against the selected GPUs, with GPU metric collection during the run.
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
	if err != nil {
		return "", err
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
		satJob{
			name:       "03-dcgmproftester.log",
			cmd:        profCmd,
			env:        nvidiaVisibleDevicesEnv(selected),
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}

// RunNvidiaTargetedPowerPack runs the DCGM targeted_power diagnostic.
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{
			name:       "02-dcgmi-targeted-power.log",
			cmd:        nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}

// RunNvidiaPulseTestPack runs the DCGM pulse_test diagnostic.
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{
			name:       "02-dcgmi-pulse-test.log",
			cmd:        nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}

// RunNvidiaBandwidthPack runs the DCGM nvbandwidth diagnostic (no duration knob).
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{
			name:       "02-dcgmi-nvbandwidth.log",
			cmd:        nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}

// RunNvidiaAcceptancePack runs the legacy (non-DCGM) NVIDIA pack with a
// background context; see nvidiaSATJobs for the job list.
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
	return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}

// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
}

// RunNvidiaTargetedStressValidatePack runs the DCGM targeted_stress diagnostic
// as a validate-style stress test, with a pre-flight sweep of stale workers.
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
	selected, err := resolveDCGMGPUIndices(gpuIndices)
	if err != nil {
		return "", err
	}
	// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
	// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
	if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
		for _, p := range killed {
			logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
		}
	}
	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{
			name:       "02-dcgmi-targeted-stress.log",
			cmd:        nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
			collectGPU: true,
			gpuIndices: selected,
		},
		satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	), logFunc)
}

// resolveDCGMGPUIndices returns the explicit indices (deduped and sorted), or
// every index nvidia-smi reports when the caller passed none. Errors when no
// GPU is visible so packs fail early with a clear message.
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
	if len(gpuIndices) > 0 {
		return dedupeSortedIndices(gpuIndices), nil
	}
	all, err := listNvidiaGPUIndices()
	if err != nil {
		return nil, err
	}
	if len(all) == 0 {
		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
	}
	return all, nil
}

// memoryStressSizeArg computes the stress-ng --vm-bytes argument: an explicit
// env override, otherwise ~2/3 of MemAvailable rounded down to a 256 MiB
// multiple, falling back to "80%" when free memory cannot be determined.
func memoryStressSizeArg() string {
	if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
		return fmt.Sprintf("%dM", mb)
	}
	availBytes := satFreeMemBytes()
	if availBytes <= 0 {
		return "80%"
	}
	availMB := availBytes / (1024 * 1024)
	targetMB := (availMB * 2) / 3
	// Round down to a 256 MiB multiple for tidy sizes on larger hosts.
	if targetMB >= 256 {
		targetMB = (targetMB / 256) * 256
	}
	if targetMB <= 0 {
		return "80%"
	}
	return fmt.Sprintf("%dM", targetMB)
}

// RunMemoryAcceptancePack runs memtester with the given size/passes
// (defaults: 256 MiB, 1 pass).
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
	if sizeMB <= 0 {
		sizeMB = 256
	}
	if passes <= 0 {
		passes = 1
	}
	return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
		{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
	}, logFunc)
}

// RunMemoryStressPack runs stress-ng's VM stressors for durationSec seconds
// (fallback: BEE_VM_STRESS_SECONDS, default 300).
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
	seconds := durationSec
	if seconds <= 0 {
		seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
	}
	// Base the default on current MemAvailable and keep headroom for the OS and
	// concurrent stressors so mixed burn runs do not trip the OOM killer.
	sizeArg := memoryStressSizeArg()
	return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
		{name: "02-stress-ng-vm.log", cmd: []string{
			"stress-ng", "--vm", "1", "--vm-bytes", sizeArg, "--vm-method", "all", "--timeout", fmt.Sprintf("%d", seconds), "--metrics-brief",
		}},
		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
	}, logFunc)
}

// RunSATStressPack runs stressapptest with cache-coherency checks for
// durationSec seconds (fallback: BEE_SAT_STRESS_SECONDS, default 300).
// BEE_SAT_STRESS_MB optionally pins the memory footprint.
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
	seconds := durationSec
	if seconds <= 0 {
		seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
	}
	cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
	if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
		cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
	}
	return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
		{name: "01-free-before.log", cmd: []string{"free", "-h"}},
		{name: "02-stressapptest.log", cmd: cmd},
		{name: "03-free-after.log", cmd: []string{"free", "-h"}},
	}, logFunc)
}

// RunCPUAcceptancePack runs stress-ng across all CPUs with sensor snapshots
// before and after (default duration 60s).
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
	if durationSec <= 0 {
		durationSec = 60
	}
	return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
		{name: "01-lscpu.log", cmd: []string{"lscpu"}},
		{name: "02-sensors-before.log", cmd: []string{"sensors"}},
		{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
		{name: "04-sensors-after.log", cmd: []string{"sensors"}},
	}, logFunc)
}

// RunStorageAcceptancePack runs SMART/NVMe self-tests against every block
// device, writing per-device logs plus a summary, and returns the archive path.
// It manages its own run directory instead of going through runAcceptancePackCtx
// because the job list is generated per device.
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "storage-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	devices, err := listStorageDevices()
	if err != nil {
		return "", err
	}
	sort.Strings(devices)
	var summary strings.Builder
	stats := satStats{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	if len(devices) == 0 {
		// No disks at all counts as UNSUPPORTED, not a pass.
		fmt.Fprintln(&summary, "devices=0")
		stats.Unsupported++
	} else {
		fmt.Fprintf(&summary, "devices=%d\n", len(devices))
	}
	for index, devPath := range devices {
		if ctx.Err() != nil {
			break
		}
		prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
		commands := storageSATCommands(devPath, extended)
		for cmdIndex, job := range commands {
			if ctx.Err() != nil {
				break
			}
			name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
			out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
			if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
				return "", writeErr
			}
			status, rc := classifySATResult(job.name, out, err)
			stats.Add(status)
			key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
			fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
			fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
		}
	}
	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}
	archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}

// satJob is one command of an acceptance pack; name doubles as the log filename.
type satJob struct {
	name       string
	cmd        []string
	env        []string // extra env vars (appended to os.Environ)
	collectGPU bool     // collect GPU metrics via nvidia-smi while this job runs
	gpuIndices []int    // GPU indices to collect metrics for (empty = all)
}

// satStats tallies per-job outcomes for the run summary.
type satStats struct {
	OK          int
	Failed      int
	Unsupported int
}

// withNvidiaPersistenceMode prepends a job that enables persistence mode so
// subsequent jobs do not pay driver re-init latency.
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
	out := make([]satJob, 0, len(jobs)+1)
	out = append(out, satJob{
		name: "00-nvidia-smi-persistence-mode.log",
		cmd:  []string{"nvidia-smi", "-pm", "1"},
	})
	out = append(out, jobs...)
	return out
}

// nvidiaSATJobs is the legacy NVIDIA job list (bug report + short bee-gpu-burn).
func nvidiaSATJobs() []satJob {
	return withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
		satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
		satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
	)
}

// nvidiaDCGMJobs builds the `dcgmi diag -r <level>` job list; out-of-range
// levels are clamped to 3, and explicit GPU indices become a `-i` list.
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
	if diagLevel < 1 || diagLevel > 4 {
		diagLevel = 3
	}
	diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
	if len(gpuIndices) > 0 {
		ids := make([]string, len(gpuIndices))
		for i, idx := range gpuIndices {
			ids[i] = strconv.Itoa(idx)
		}
		diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
	}
	return withNvidiaPersistenceMode(
		satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
		satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
		satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
	)
}

// nvidiaDCGMNamedDiagCommand builds `dcgmi diag -r <name>` with an optional
// per-test duration parameter and GPU index list.
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
	args := []string{"dcgmi", "diag", "-r", name}
	if durationSec > 0 {
		args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
	}
	if len(gpuIndices) > 0 {
		args = append(args, "-i", joinIndexList(gpuIndices))
	}
	return args
}

// normalizeNvidiaBurnDuration substitutes the 300s default for non-positive values.
func normalizeNvidiaBurnDuration(durationSec int) int {
	if durationSec <= 0 {
		return 300
	}
	return durationSec
}

// nvidiaVisibleDevicesEnv limits a CUDA tool to the given GPUs; PCI_BUS_ID
// ordering keeps CUDA indices aligned with nvidia-smi indices.
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
	if len(gpuIndices) == 0 {
		return nil
	}
	return []string{
		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
	}
}

// runAcceptancePackCtx executes jobs sequentially in a fresh timestamped run
// directory, classifies each result, writes summary.txt (plus per-GPU status
// files for NVIDIA packs), and returns the path of the .tar.gz archive.
// NVIDIA jobs that stress the GPU get a health check before AND after the run
// so a "GPU Requires Reset" condition is attributed to the job that caused it.
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, prefix+"-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	var summary strings.Builder
	stats := satStats{}
	nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
	perGPU := map[int]*nvidiaGPUStatusFile{}
	selectedGPUIndices := map[int]struct{}{}
	fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
	for _, job := range jobs {
		if ctx.Err() != nil {
			break
		}
		for _, idx := range job.gpuIndices {
			selectedGPUIndices[idx] = struct{}{}
			status := perGPU[idx]
			if status == nil {
				status = &nvidiaGPUStatusFile{Index: idx}
				perGPU[idx] = status
			}
			status.Selected = true
		}
		// Expand the {{run_dir}} placeholder used by e.g. nvidia-bug-report.sh.
		cmd := make([]string, 0, len(job.cmd))
		for _, arg := range job.cmd {
			cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
		}
		var out []byte
		var err error
		// Pre-run health gate: do not launch a stress job on a GPU that
		// already requires a reset.
		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
				if logFunc != nil {
					logFunc(msg)
				}
				out = []byte(msg + "\n")
				err = healthErr
			}
		}
		if err == nil {
			if job.collectGPU {
				out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
			} else {
				out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
			}
		}
		// Post-run health check: a job that knocked a GPU into reset-required
		// state is marked failed even if its exit code was 0.
		if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
			if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
				if logFunc != nil {
					logFunc(msg)
				}
				if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
					out = append(out, '\n')
				}
				out = append(out, []byte(msg+"\n")...)
				if err == nil {
					err = healthErr
				}
			}
		}
		if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
			return "", writeErr
		}
		if ctx.Err() != nil {
			return "", ctx.Err()
		}
		status, rc := classifySATResult(job.name, out, err)
		stats.Add(status)
		if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
			for _, idx := range job.gpuIndices {
				updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
			}
		}
		// NOTE(review): TrimPrefix strips only a single leading "0", so
		// "01-foo.log" keys as "1-foo" — presumably intentional for the
		// summary key format; confirm before changing.
		key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
		fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
		fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
	}
	writeSATStats(&summary, stats)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
		return "", err
	}
	if nvidiaPack {
		if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
			return "", err
		}
	}
	archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", err
	}
	return archive, nil
}

// updateNvidiaGPUStatus records a job result for one GPU, keeping whichever
// status has equal or higher severity (later jobs win ties).
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
	entry := perGPU[idx]
	if entry == nil {
		entry = &nvidiaGPUStatusFile{Index: idx}
		perGPU[idx] = entry
	}
	if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
		entry.RunStatus = status
		entry.FailingJob = jobName
		entry.Reason = firstLine(detail)
	}
}

// writeNvidiaGPUStatusFiles merges a final health snapshot into the per-GPU
// run records and writes one gpu-<n>-status.txt per known index.
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
	health, err := readNvidiaGPUHealth()
	// A failed health read is non-fatal: files are still written from run data.
	if err == nil {
		for _, gpu := range health {
			entry := perGPU[gpu.Index]
			if entry == nil {
				entry = &nvidiaGPUStatusFile{Index: gpu.Index}
				perGPU[gpu.Index] = entry
			}
			entry.Name = gpu.Name
			entry.Observed = true
			entry.HealthRaw = gpu.RawLine
			if gpu.NeedsReset {
				entry.Health = "RESET_REQUIRED"
				// A reset-required GPU is a failure regardless of job outcomes.
				if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
					entry.RunStatus = "FAILED"
					if strings.TrimSpace(entry.Reason) == "" {
						entry.Reason = "GPU requires reset"
					}
				}
			} else {
				entry.Health = "OK"
			}
		}
	}
	for idx := range selected {
		entry := perGPU[idx]
		if entry == nil {
			entry = &nvidiaGPUStatusFile{Index: idx}
			perGPU[idx] = entry
		}
		entry.Selected = true
	}
	// Deterministic file order: iterate indices sorted, not in map order.
	var indices []int
	for idx := range perGPU {
		indices = append(indices, idx)
	}
	sort.Ints(indices)
	for _, idx := range indices {
		entry := perGPU[idx]
		if entry.RunStatus == "" {
			entry.RunStatus = overall
		}
		if entry.Health == "" {
			entry.Health = "UNKNOWN"
		}
		if entry.Name == "" {
			entry.Name = "unknown"
		}
		var body strings.Builder
		fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
		fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
		fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
		fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
		fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
		fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
		if strings.TrimSpace(entry.FailingJob) != "" {
			fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
		}
		if strings.TrimSpace(entry.Reason) != "" {
			fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
		}
		if strings.TrimSpace(entry.HealthRaw) != "" {
			fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
		}
		if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
			return err
		}
	}
	return nil
}

// nvidiaSATStatusSeverity orders statuses: FAILED > PARTIAL/UNSUPPORTED > OK > unset.
func nvidiaSATStatusSeverity(status string) int {
	switch strings.ToUpper(strings.TrimSpace(status)) {
	case "FAILED":
		return 3
	case "PARTIAL", "UNSUPPORTED":
		return 2
	case "OK":
		return 1
	default:
		return 0
	}
}

// firstLine returns the first non-empty line of s, trimmed.
func firstLine(s string) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	if idx := strings.IndexByte(s, '\n'); idx >= 0 {
		return strings.TrimSpace(s[:idx])
	}
	return s
}

// nvidiaJobNeedsHealthCheck reports whether a job actually stresses GPUs and
// therefore warrants before/after health checks (matched by job name).
func nvidiaJobNeedsHealthCheck(job satJob) bool {
	if job.collectGPU {
		return true
	}
	name := strings.ToLower(strings.TrimSpace(job.name))
	return strings.Contains(name, "dcgmi") || strings.Contains(name, "gpu-burn") || strings.Contains(name, "gpu-stress") || strings.Contains(name, "dcgmproftester")
}

// checkNvidiaJobHealth returns a non-nil error (plus a human-readable message)
// when any selected GPU reports "GPU Requires Reset". An unreadable nvidia-smi
// is treated as healthy so the health gate never blocks a run by itself.
func checkNvidiaJobHealth(selected []int) (string, error) {
	health, err := readNvidiaGPUHealth()
	if err != nil {
		return "", nil
	}
	var bad []nvidiaGPUHealth
	selectedSet := make(map[int]struct{}, len(selected))
	for _, idx := range selected {
		selectedSet[idx] = struct{}{}
	}
	for _, gpu := range health {
		// With an explicit selection, only those GPUs are considered.
		if len(selectedSet) > 0 {
			if _, ok := selectedSet[gpu.Index]; !ok {
				continue
			}
		}
		if gpu.NeedsReset {
			bad = append(bad, gpu)
		}
	}
	if len(bad) == 0 {
		return "", nil
	}
	lines := make([]string, 0, len(bad)+1)
	lines = append(lines, "NVIDIA GPU health check failed:")
	for _, gpu := range bad {
		lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
	}
	return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
}

// readNvidiaGPUHealth queries nvidia-smi and parses one health row per GPU.
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi: %w", err)
	}
	return parseNvidiaGPUHealth(string(out)), nil
}

// parseNvidiaGPUHealth parses nvidia-smi CSV rows; unparseable rows are kept
// with ParseFailure set so they remain visible to callers.
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
	var gpus []nvidiaGPUHealth
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		if len(parts) < 2 {
			gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
		if err != nil {
			gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
			continue
		}
		upper := strings.ToUpper(line)
		gpus = append(gpus, nvidiaGPUHealth{
			Index:      idx,
			Name:       strings.TrimSpace(parts[1]),
			NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
			RawLine:    line,
		})
	}
	return gpus
}

// runSATCommandCtx resolves and runs one job command under ctx, streaming its
// output and bracketing it with start/finish records in the verbose log.
// The child gets its own process group so cancellation kills the whole tree.
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
	start := time.Now().UTC()
	resolvedCmd, err := resolveSATCommand(cmd)
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
		"cmd: "+strings.Join(resolvedCmd, " "),
	)
	if logFunc != nil {
		logFunc(fmt.Sprintf("=== %s ===", name))
	}
	if err != nil {
		// Unresolvable command: log the failure and surface the error text as
		// the job output so classifySATResult can inspect it.
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
			"rc: 1",
			fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
			"",
		)
		return []byte(err.Error() + "\n"), err
	}
	c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
	c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	// On ctx cancel, SIGKILL the negative pid = the entire process group.
	c.Cancel = func() error {
		if c.Process != nil {
			_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
		}
		return nil
	}
	if len(env) > 0 {
		c.Env = append(os.Environ(), env...)
	}
	out, err := streamExecOutput(c, logFunc)
	rc := 0
	if err != nil {
		rc = 1
	}
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
		fmt.Sprintf("rc: %d", rc),
		fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
		"",
	)
	return out, err
}

// listStorageDevices returns block device paths parsed from lsblk output.
func listStorageDevices() ([]string, error) {
	out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
	if err != nil {
		return nil, err
	}
	return parseStorageDevices(string(out)), nil
}

// storageSATCommands builds the per-device job list: nvme-cli self-test for
// NVMe devices, smartctl for everything else. extended selects the long test.
func storageSATCommands(devPath string, extended bool) []satJob {
	if strings.Contains(filepath.Base(devPath), "nvme") {
		selfTestLevel := "1" // 1 = short, 2 = extended device self-test
		if extended {
			selfTestLevel = "2"
		}
		return []satJob{
			{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
			{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
			{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
		}
	}
	smartTestType := "short"
	if extended {
		smartTestType = "long"
	}
	return []satJob{
		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
		// NOTE(review): the job name says "short" even when smartTestType is
		// "long" — log filenames may be misleading for extended runs; confirm
		// whether downstream consumers key on this exact name before renaming.
		{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
	}
}

// Add tallies one classified job result.
func (s *satStats) Add(status string) {
	switch status {
	case "OK":
		s.OK++
	case "UNSUPPORTED":
		s.Unsupported++
	default:
		s.Failed++
	}
}

// Overall collapses the tally: any failure → FAILED, else any unsupported →
// PARTIAL, else OK.
func (s satStats) Overall() string {
	if s.Failed > 0 {
		return "FAILED"
	}
	if s.Unsupported > 0 {
		return "PARTIAL"
	}
	return "OK"
}

// writeSATStats appends the aggregate counters to the run summary.
func writeSATStats(summary *strings.Builder, stats satStats) {
	fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
	fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
	fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
	fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
}

// classifySATResult maps a job's output and error to ("OK"|"UNSUPPORTED"|"FAILED", rc).
// Errors whose output indicates a missing/unsupported tool are downgraded to
// UNSUPPORTED so an absent diagnostic never counts as a hardware failure.
func classifySATResult(name string, out []byte, err error) (string, int) {
	rc := 0
	if err != nil {
		rc = 1
	}
	if err == nil {
		return "OK", rc
	}
	text := strings.ToLower(string(out))
	// No output at all means the tool failed to start (mlock limit, binary missing,
	// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
	if len(strings.TrimSpace(text)) == 0 {
		return "UNSUPPORTED", rc
	}
	if strings.Contains(text, "unsupported") ||
		strings.Contains(text, "not supported") ||
		strings.Contains(text, "not found in path") ||
		strings.Contains(text, "invalid opcode") ||
		strings.Contains(text, "unknown command") ||
		strings.Contains(text, "not implemented") ||
		strings.Contains(text, "not available") ||
		strings.Contains(text, "cuda_error_system_not_ready") ||
		strings.Contains(text, "no such device") ||
		// nvidia-smi on a machine with no NVIDIA GPU
		strings.Contains(text, "couldn't communicate with the nvidia driver") ||
		strings.Contains(text, "no nvidia gpu") ||
		// Some NVMe firmwares start self-test but never expose progress to nvme-cli
		// while waiting, so the CLI stops polling without proving device failure.
		(strings.Contains(name, "self-test") && strings.Contains(text, "no progress for") && strings.Contains(text, "stop waiting")) ||
		(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
		return "UNSUPPORTED", rc
	}
	return "FAILED", rc
}

// runSATCommand is the non-cancellable variant of runSATCommandCtx, using the
// stubbable satExecCommand. Same logging protocol, no process-group handling.
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
	start := time.Now().UTC()
	resolvedCmd, err := resolveSATCommand(cmd)
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
		"cmd: "+strings.Join(resolvedCmd, " "),
	)
	if logFunc != nil {
		logFunc(fmt.Sprintf("=== %s ===", name))
	}
	if err != nil {
		appendSATVerboseLog(verboseLog,
			fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
			"rc: 1",
			fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
			"",
		)
		return []byte(err.Error() + "\n"), err
	}
	out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
	rc := 0
	if err != nil {
		rc = 1
	}
	appendSATVerboseLog(verboseLog,
		fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
		fmt.Sprintf("rc: %d", rc),
		fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
		"",
	)
	return out, err
}

// runROCmSMI resolves the rocm-smi binary/script and returns its combined output.
func runROCmSMI(args ...string) ([]byte, error) {
	cmd, err := resolveROCmSMICommand(args...)
	if err != nil {
		return nil, err
	}
	return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
}

// resolveSATCommand maps a job command to a runnable argv, substituting the
// discovered install path for tools (rocm-smi, rvs, ...) that are not on PATH.
func resolveSATCommand(cmd []string) ([]string, error) {
	if len(cmd) == 0 {
		return nil, errors.New("empty SAT command")
	}
	switch cmd[0] {
	case "rocm-smi":
		return resolveROCmSMICommand(cmd[1:]...)
	case "rvs":
		return resolveRVSCommand(cmd[1:]...)
} path, err := satLookPath(cmd[0]) if err != nil { return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err) } return append([]string{path}, cmd[1:]...), nil } func resolveRVSCommand(args ...string) ([]string, error) { if path, err := satLookPath("rvs"); err == nil { return append([]string{path}, args...), nil } for _, path := range expandExistingPaths(rvsExecutableGlobs) { return append([]string{path}, args...), nil } return nil, errors.New("rvs not found in PATH or under /opt/rocm") } func resolveROCmSMICommand(args ...string) ([]string, error) { if path, err := satLookPath("rocm-smi"); err == nil { return append([]string{path}, args...), nil } for _, path := range rocmSMIExecutableCandidates() { return append([]string{path}, args...), nil } pythonPath, pyErr := satLookPath("python3") if pyErr == nil { for _, script := range rocmSMIScriptCandidates() { cmd := []string{pythonPath, script} cmd = append(cmd, args...) return cmd, nil } } return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm") } func resolveDCGMProfTesterCommand(args ...string) ([]string, error) { for _, candidate := range dcgmProfTesterCandidates { if path, err := satLookPath(candidate); err == nil { return append([]string{path}, args...), nil } } return nil, errors.New("dcgmproftester not found in PATH") } func ensureAMDRuntimeReady() error { if _, err := os.Stat("/dev/kfd"); err == nil { return nil } if raw, err := os.ReadFile("/sys/module/amdgpu/initstate"); err == nil { state := strings.TrimSpace(string(raw)) if strings.EqualFold(state, "live") { return nil } return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state) } return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded") } func rocmSMIExecutableCandidates() []string { return expandExistingPaths(rocmSMIExecutableGlobs) } func rocmSMIScriptCandidates() []string { return expandExistingPaths(rocmSMIScriptGlobs) } func 
expandExistingPaths(patterns []string) []string { seen := make(map[string]struct{}) var paths []string for _, pattern := range patterns { matches, err := satGlob(pattern) if err != nil { continue } sort.Strings(matches) for _, match := range matches { if _, err := satStat(match); err != nil { continue } if _, ok := seen[match]; ok { continue } seen[match] = struct{}{} paths = append(paths, match) } } return paths } func parseStorageDevices(raw string) []string { var devices []string for _, line := range strings.Split(strings.TrimSpace(raw), "\n") { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) < 2 || fields[1] != "disk" { continue } if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") { continue } devices = append(devices, "/dev/"+fields[0]) } return devices } // runSATCommandWithMetrics runs a command while collecting GPU metrics in the background. // On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir. func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow start := time.Now() go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: samples, err := sampleGPUMetrics(gpuIndices) if err != nil { continue } elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } metricRows = append(metricRows, samples...) 
} } }() out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) close(stopCh) <-doneCh if len(metricRows) > 0 { _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows) chart := RenderGPUTerminalChart(metricRows) _ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644) } return out, err } func appendSATVerboseLog(path string, lines ...string) { if path == "" { return } f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) if err != nil { return } defer f.Close() for _, line := range lines { _, _ = io.WriteString(f, line+"\n") } } func envInt(name string, fallback int) int { raw := strings.TrimSpace(os.Getenv(name)) if raw == "" { return fallback } value, err := strconv.Atoi(raw) if err != nil || value <= 0 { return fallback } return value } func createTarGz(dst, srcDir string) error { file, err := os.Create(dst) if err != nil { return err } defer file.Close() gz := gzip.NewWriter(file) defer gz.Close() tw := tar.NewWriter(gz) defer tw.Close() base := filepath.Dir(srcDir) return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { if err != nil { return err } if info.IsDir() { return nil } header, err := tar.FileInfoHeader(info, "") if err != nil { return err } rel, err := filepath.Rel(base, path) if err != nil { return err } header.Name = rel if err := tw.WriteHeader(header); err != nil { return err } file, err := os.Open(path) if err != nil { return err } defer file.Close() _, err = io.Copy(tw, file) return err }) }