package platform import ( "archive/tar" "compress/gzip" "context" "fmt" "io" "os" "os/exec" "path/filepath" "sort" "strconv" "strings" "time" ) // NvidiaGPU holds basic GPU info from nvidia-smi. type NvidiaGPU struct { Index int Name string MemoryMB int } // ListNvidiaGPUs returns GPUs visible to nvidia-smi. func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) { out, err := exec.Command("nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader,nounits").Output() if err != nil { return nil, fmt.Errorf("nvidia-smi: %w", err) } var gpus []NvidiaGPU for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue } parts := strings.SplitN(line, ", ", 3) if len(parts) != 3 { continue } idx, err := strconv.Atoi(strings.TrimSpace(parts[0])) if err != nil { continue } memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2])) gpus = append(gpus, NvidiaGPU{ Index: idx, Name: strings.TrimSpace(parts[1]), MemoryMB: memMB, }) } return gpus, nil } func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) { return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs()) } // RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration, // GPU memory size, and GPU index selection. ctx cancellation kills the running job. func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) { return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices)) } func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) { sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128) passes := envInt("BEE_MEMTESTER_PASSES", 1) return runAcceptancePack(baseDir, "memory", []satJob{ {name: "01-free-before.log", cmd: []string{"free", "-h"}}, {name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}}, {name: "03-free-after.log", cmd: []string{"free", "-h"}}, }) } func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) { if durationSec <= 0 { durationSec = 60 } return runAcceptancePack(baseDir, "cpu", []satJob{ {name: "01-lscpu.log", cmd: []string{"lscpu"}}, {name: "02-sensors-before.log", cmd: []string{"sensors"}}, {name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}}, {name: "04-sensors-after.log", cmd: []string{"sensors"}}, }) } func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } ts := time.Now().UTC().Format("20060102-150405") runDir := filepath.Join(baseDir, "storage-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", err } verboseLog := filepath.Join(runDir, "verbose.log") devices, err := listStorageDevices() if err != nil { return "", err } sort.Strings(devices) var summary strings.Builder stats := satStats{} fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) if len(devices) == 0 { fmt.Fprintln(&summary, "devices=0") stats.Unsupported++ } else { fmt.Fprintf(&summary, "devices=%d\n", len(devices)) } for index, devPath := range devices { prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath)) commands := storageSATCommands(devPath) for cmdIndex, job := range commands { name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name) out, err := runSATCommand(verboseLog, job.name, job.cmd) if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil { return "", writeErr } status, rc := classifySATResult(job.name, out, err) stats.Add(status) key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_") fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc) fmt.Fprintf(&summary, "%s_status=%s\n", key, status) } } writeSATStats(&summary, stats) if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { return "", err } archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz") if err := createTarGz(archive, runDir); err != nil { return "", err } return archive, nil } type satJob struct { name string cmd []string env []string // extra env vars (appended to os.Environ) collectGPU bool // collect GPU metrics via nvidia-smi while this job runs gpuIndices []int // GPU indices to collect metrics for (empty = all) } type satStats struct { OK int Failed int Unsupported int } func nvidiaSATJobs() []satJob { seconds := envInt("BEE_GPU_STRESS_SECONDS", 5) sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64) return []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, {name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}}, } } func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } ts := time.Now().UTC().Format("20060102-150405") runDir := filepath.Join(baseDir, prefix+"-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", err } verboseLog := filepath.Join(runDir, "verbose.log") var summary strings.Builder stats := satStats{} fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) for _, job := range jobs { cmd := make([]string, 0, len(job.cmd)) for _, arg := range job.cmd { cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir)) } out, err := runSATCommand(verboseLog, job.name, cmd) if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { return "", writeErr } status, rc := classifySATResult(job.name, out, err) stats.Add(status) key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log") fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc) fmt.Fprintf(&summary, "%s_status=%s\n", key, status) } writeSATStats(&summary, stats) if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { return "", err } archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz") if err := createTarGz(archive, runDir); err != nil { return "", err } return archive, nil } func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob { var env []string if len(gpuIndices) > 0 { ids := make([]string, len(gpuIndices)) for i, idx := range gpuIndices { ids[i] = strconv.Itoa(idx) } env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")} } return []satJob{ {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, { name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)}, env: env, collectGPU: true, gpuIndices: gpuIndices, }, } } func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) { if baseDir == "" { baseDir = "/var/log/bee-sat" } ts := time.Now().UTC().Format("20060102-150405") runDir := filepath.Join(baseDir, prefix+"-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", err } verboseLog := filepath.Join(runDir, "verbose.log") var summary strings.Builder stats := satStats{} fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339)) for _, job := range jobs { if ctx.Err() != nil { break } cmd := make([]string, 0, len(job.cmd)) for _, arg := range job.cmd { cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir)) } var out []byte var err error if job.collectGPU { out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir) } else { out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env) } if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil { return "", writeErr } status, rc := classifySATResult(job.name, out, err) stats.Add(status) key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log") fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc) fmt.Fprintf(&summary, "%s_status=%s\n", key, status) } writeSATStats(&summary, stats) if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil { return "", err } archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz") if err := createTarGz(archive, runDir); err != nil { return "", err } return archive, nil } func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) { start := time.Now().UTC() appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), "cmd: "+strings.Join(cmd, " "), ) c := exec.CommandContext(ctx, cmd[0], cmd[1:]...) if len(env) > 0 { c.Env = append(os.Environ(), env...) } out, err := c.CombinedOutput() rc := 0 if err != nil { rc = 1 } appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name), fmt.Sprintf("rc: %d", rc), fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()), "", ) return out, err } func listStorageDevices() ([]string, error) { out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output() if err != nil { return nil, err } var devices []string for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) != 2 || fields[1] != "disk" { continue } devices = append(devices, "/dev/"+fields[0]) } return devices, nil } func storageSATCommands(devPath string) []satJob { if strings.Contains(filepath.Base(devPath), "nvme") { return []satJob{ {name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}}, {name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}}, {name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}}, } } return []satJob{ {name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}}, {name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}}, } } func (s *satStats) Add(status string) { switch status { case "OK": s.OK++ case "UNSUPPORTED": s.Unsupported++ default: s.Failed++ } } func (s satStats) Overall() string { if s.Failed > 0 { return "FAILED" } if s.Unsupported > 0 { return "PARTIAL" } return "OK" } func writeSATStats(summary *strings.Builder, stats satStats) { fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall()) fmt.Fprintf(summary, "job_ok=%d\n", stats.OK) fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed) fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported) } func classifySATResult(name string, out []byte, err error) (string, int) { rc := 0 if err != nil { rc = 1 } if err == nil { return "OK", rc } text := strings.ToLower(string(out)) if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "invalid opcode") || strings.Contains(text, "unknown command") || strings.Contains(text, "not implemented") || strings.Contains(text, "not available") || strings.Contains(text, "cuda_error_system_not_ready") || strings.Contains(text, "no such device") || (strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) { return "UNSUPPORTED", rc } return "FAILED", rc } func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) { start := time.Now().UTC() appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name), "cmd: "+strings.Join(cmd, " "), ) out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput() rc := 0 if err != nil { rc = 1 } appendSATVerboseLog(verboseLog, fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name), fmt.Sprintf("rc: %d", rc), fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()), "", ) return out, err } // runSATCommandWithMetrics runs a command while collecting GPU metrics in the background. // On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir. func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow start := time.Now() go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: samples, err := sampleGPUMetrics(gpuIndices) if err != nil { continue } elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } metricRows = append(metricRows, samples...) } } }() out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env) close(stopCh) <-doneCh if len(metricRows) > 0 { _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows) chart := RenderGPUTerminalChart(metricRows) _ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644) } return out, err } func appendSATVerboseLog(path string, lines ...string) { if path == "" { return } f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) if err != nil { return } defer f.Close() for _, line := range lines { _, _ = io.WriteString(f, line+"\n") } } func envInt(name string, fallback int) int { raw := strings.TrimSpace(os.Getenv(name)) if raw == "" { return fallback } value, err := strconv.Atoi(raw) if err != nil || value <= 0 { return fallback } return value } func createTarGz(dst, srcDir string) error { file, err := os.Create(dst) if err != nil { return err } defer file.Close() gz := gzip.NewWriter(file) defer gz.Close() tw := tar.NewWriter(gz) defer tw.Close() base := filepath.Dir(srcDir) return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { if err != nil { return err } if info.IsDir() { return nil } header, err := tar.FileInfoHeader(info, "") if err != nil { return err } rel, err := filepath.Rel(base, path) if err != nil { return err } header.Name = rel if err := tw.WriteHeader(header); err != nil { return err } file, err := os.Open(path) if err != nil { return err } defer file.Close() _, err = io.Copy(tw, file) return err }) }