--start is not a valid nvme-cli flag; correct syntax is -s 1 (short test). Add --wait so the command blocks until the test completes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
531 lines
15 KiB
Go
531 lines
15 KiB
Go
package platform
|
|
|
|
import (
|
|
"archive/tar"
|
|
"compress/gzip"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// NvidiaGPU is one row of GPU information as parsed from nvidia-smi output
// by ListNvidiaGPUs.
type NvidiaGPU struct {
	// Index is the value of the nvidia-smi "index" query field.
	Index int
	// Name is the value of the nvidia-smi "name" query field.
	Name string
	// MemoryMB is the value of the nvidia-smi "memory.total" query field,
	// requested without a unit suffix.
	MemoryMB int
}
|
|
|
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
|
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|
out, err := exec.Command("nvidia-smi",
|
|
"--query-gpu=index,name,memory.total",
|
|
"--format=csv,noheader,nounits").Output()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
|
}
|
|
var gpus []NvidiaGPU
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
parts := strings.SplitN(line, ", ", 3)
|
|
if len(parts) != 3 {
|
|
continue
|
|
}
|
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
|
gpus = append(gpus, NvidiaGPU{
|
|
Index: idx,
|
|
Name: strings.TrimSpace(parts[1]),
|
|
MemoryMB: memMB,
|
|
})
|
|
}
|
|
return gpus, nil
|
|
}
|
|
|
|
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
|
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
|
}
|
|
|
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
|
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
|
}
|
|
|
|
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
|
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
|
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
|
return runAcceptancePack(baseDir, "memory", []satJob{
|
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
|
})
|
|
}
|
|
|
|
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
|
if durationSec <= 0 {
|
|
durationSec = 60
|
|
}
|
|
return runAcceptancePack(baseDir, "cpu", []satJob{
|
|
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
|
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
|
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
|
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
|
})
|
|
}
|
|
|
|
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
|
if baseDir == "" {
|
|
baseDir = "/var/log/bee-sat"
|
|
}
|
|
ts := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, "storage-"+ts)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", err
|
|
}
|
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
|
|
devices, err := listStorageDevices()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
sort.Strings(devices)
|
|
|
|
var summary strings.Builder
|
|
stats := satStats{}
|
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
if len(devices) == 0 {
|
|
fmt.Fprintln(&summary, "devices=0")
|
|
stats.Unsupported++
|
|
} else {
|
|
fmt.Fprintf(&summary, "devices=%d\n", len(devices))
|
|
}
|
|
|
|
for index, devPath := range devices {
|
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
|
commands := storageSATCommands(devPath)
|
|
for cmdIndex, job := range commands {
|
|
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
|
out, err := runSATCommand(verboseLog, job.name, job.cmd)
|
|
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
|
return "", writeErr
|
|
}
|
|
status, rc := classifySATResult(job.name, out, err)
|
|
stats.Add(status)
|
|
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
}
|
|
}
|
|
|
|
writeSATStats(&summary, stats)
|
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
|
if err := createTarGz(archive, runDir); err != nil {
|
|
return "", err
|
|
}
|
|
return archive, nil
|
|
}
|
|
|
|
// satJob describes one command of an acceptance pack.
type satJob struct {
	// name identifies the job; the pack runners also use it as the log file
	// name or as part of it.
	name string
	// cmd is the argv to execute; cmd[0] is the program.
	cmd []string
	// env holds extra environment variables (appended to os.Environ).
	env []string
	// collectGPU requests GPU metric collection via nvidia-smi while the job
	// runs.
	collectGPU bool
	// gpuIndices lists the GPU indices to collect metrics for (empty = all).
	gpuIndices []int
}
|
|
|
|
// satStats counts job outcomes of one acceptance run, one counter per
// status bucket.
type satStats struct {
	// OK counts jobs classified as "OK".
	OK int
	// Failed counts jobs whose status is neither "OK" nor "UNSUPPORTED".
	Failed int
	// Unsupported counts jobs classified as "UNSUPPORTED".
	Unsupported int
}
|
|
|
|
func nvidiaSATJobs() []satJob {
|
|
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
|
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
|
return []satJob{
|
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
|
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
|
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
|
}
|
|
}
|
|
|
|
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
|
if baseDir == "" {
|
|
baseDir = "/var/log/bee-sat"
|
|
}
|
|
ts := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", err
|
|
}
|
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
|
|
var summary strings.Builder
|
|
stats := satStats{}
|
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
for _, job := range jobs {
|
|
cmd := make([]string, 0, len(job.cmd))
|
|
for _, arg := range job.cmd {
|
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
|
}
|
|
out, err := runSATCommand(verboseLog, job.name, cmd)
|
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
|
return "", writeErr
|
|
}
|
|
status, rc := classifySATResult(job.name, out, err)
|
|
stats.Add(status)
|
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
}
|
|
writeSATStats(&summary, stats)
|
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
|
if err := createTarGz(archive, runDir); err != nil {
|
|
return "", err
|
|
}
|
|
return archive, nil
|
|
}
|
|
|
|
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
|
var env []string
|
|
if len(gpuIndices) > 0 {
|
|
ids := make([]string, len(gpuIndices))
|
|
for i, idx := range gpuIndices {
|
|
ids[i] = strconv.Itoa(idx)
|
|
}
|
|
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
|
}
|
|
return []satJob{
|
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
|
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
|
{
|
|
name: "05-bee-gpu-stress.log",
|
|
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
|
env: env,
|
|
collectGPU: true,
|
|
gpuIndices: gpuIndices,
|
|
},
|
|
}
|
|
}
|
|
|
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
|
if baseDir == "" {
|
|
baseDir = "/var/log/bee-sat"
|
|
}
|
|
ts := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", err
|
|
}
|
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
|
|
var summary strings.Builder
|
|
stats := satStats{}
|
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
for _, job := range jobs {
|
|
if ctx.Err() != nil {
|
|
break
|
|
}
|
|
cmd := make([]string, 0, len(job.cmd))
|
|
for _, arg := range job.cmd {
|
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
|
}
|
|
|
|
var out []byte
|
|
var err error
|
|
|
|
if job.collectGPU {
|
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
|
} else {
|
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
|
}
|
|
|
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
|
return "", writeErr
|
|
}
|
|
status, rc := classifySATResult(job.name, out, err)
|
|
stats.Add(status)
|
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
}
|
|
writeSATStats(&summary, stats)
|
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
|
if err := createTarGz(archive, runDir); err != nil {
|
|
return "", err
|
|
}
|
|
return archive, nil
|
|
}
|
|
|
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
|
start := time.Now().UTC()
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
|
"cmd: "+strings.Join(cmd, " "),
|
|
)
|
|
|
|
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
|
if len(env) > 0 {
|
|
c.Env = append(os.Environ(), env...)
|
|
}
|
|
out, err := c.CombinedOutput()
|
|
|
|
rc := 0
|
|
if err != nil {
|
|
rc = 1
|
|
}
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
|
fmt.Sprintf("rc: %d", rc),
|
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
|
"",
|
|
)
|
|
return out, err
|
|
}
|
|
|
|
// listStorageDevices returns the /dev paths of all whole disks reported by
// "lsblk -dn -o NAME,TYPE".
//
// Only lines that consist of exactly two fields with the second equal to
// "disk" are kept. A failure to run lsblk is returned wrapped with the
// "lsblk: " prefix so callers can tell which tool failed.
func listStorageDevices() ([]string, error) {
	out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
	if err != nil {
		return nil, fmt.Errorf("lsblk: %w", err)
	}

	var devices []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		// strings.Fields already ignores leading and trailing whitespace.
		fields := strings.Fields(line)
		if len(fields) != 2 || fields[1] != "disk" {
			continue
		}
		devices = append(devices, "/dev/"+fields[0])
	}
	return devices, nil
}
|
|
|
|
func storageSATCommands(devPath string) []satJob {
|
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
|
return []satJob{
|
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
|
}
|
|
}
|
|
return []satJob{
|
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
|
}
|
|
}
|
|
|
|
func (s *satStats) Add(status string) {
|
|
switch status {
|
|
case "OK":
|
|
s.OK++
|
|
case "UNSUPPORTED":
|
|
s.Unsupported++
|
|
default:
|
|
s.Failed++
|
|
}
|
|
}
|
|
|
|
func (s satStats) Overall() string {
|
|
if s.Failed > 0 {
|
|
return "FAILED"
|
|
}
|
|
if s.Unsupported > 0 {
|
|
return "PARTIAL"
|
|
}
|
|
return "OK"
|
|
}
|
|
|
|
func writeSATStats(summary *strings.Builder, stats satStats) {
|
|
fmt.Fprintf(summary, "overall_status=%s\n", stats.Overall())
|
|
fmt.Fprintf(summary, "job_ok=%d\n", stats.OK)
|
|
fmt.Fprintf(summary, "job_failed=%d\n", stats.Failed)
|
|
fmt.Fprintf(summary, "job_unsupported=%d\n", stats.Unsupported)
|
|
}
|
|
|
|
// classifySATResult maps a finished job to a status string and a return code.
//
// A nil err yields ("OK", 0). Any non-nil err yields rc 1 together with
// "UNSUPPORTED" when the lower-cased output contains one of the known
// markers, or when the job name contains "self-test" and the output mentions
// "aborted"; otherwise the status is "FAILED".
func classifySATResult(name string, out []byte, err error) (string, int) {
	if err == nil {
		return "OK", 0
	}
	const rc = 1

	lowered := strings.ToLower(string(out))
	markers := []string{
		"unsupported",
		"not supported",
		"invalid opcode",
		"unknown command",
		"not implemented",
		"not available",
		"cuda_error_system_not_ready",
		"no such device",
	}
	for _, marker := range markers {
		if strings.Contains(lowered, marker) {
			return "UNSUPPORTED", rc
		}
	}
	// An aborted self-test is treated as unsupported rather than as a failure.
	if strings.Contains(name, "self-test") && strings.Contains(lowered, "aborted") {
		return "UNSUPPORTED", rc
	}
	return "FAILED", rc
}
|
|
|
|
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
|
start := time.Now().UTC()
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
|
"cmd: "+strings.Join(cmd, " "),
|
|
)
|
|
|
|
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
|
|
|
rc := 0
|
|
if err != nil {
|
|
rc = 1
|
|
}
|
|
appendSATVerboseLog(verboseLog,
|
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
|
fmt.Sprintf("rc: %d", rc),
|
|
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
|
"",
|
|
)
|
|
return out, err
|
|
}
|
|
|
|
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
|
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
|
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
|
stopCh := make(chan struct{})
|
|
doneCh := make(chan struct{})
|
|
var metricRows []GPUMetricRow
|
|
start := time.Now()
|
|
|
|
go func() {
|
|
defer close(doneCh)
|
|
ticker := time.NewTicker(time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-stopCh:
|
|
return
|
|
case <-ticker.C:
|
|
samples, err := sampleGPUMetrics(gpuIndices)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
elapsed := time.Since(start).Seconds()
|
|
for i := range samples {
|
|
samples[i].ElapsedSec = elapsed
|
|
}
|
|
metricRows = append(metricRows, samples...)
|
|
}
|
|
}
|
|
}()
|
|
|
|
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
|
|
|
close(stopCh)
|
|
<-doneCh
|
|
|
|
if len(metricRows) > 0 {
|
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
|
chart := RenderGPUTerminalChart(metricRows)
|
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
|
}
|
|
|
|
return out, err
|
|
}
|
|
|
|
// appendSATVerboseLog appends each of lines, followed by a newline, to the
// file at path, creating the file if needed.
//
// Logging is best-effort: an empty path, a failure to open the file and any
// write error are all silently ignored.
func appendSATVerboseLog(path string, lines ...string) {
	if path == "" {
		return
	}
	const flags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
	logFile, err := os.OpenFile(path, flags, 0644)
	if err != nil {
		return
	}
	defer logFile.Close()

	for i := range lines {
		// Write errors are deliberately dropped; the verbose log must never
		// make a job fail.
		_, _ = logFile.WriteString(lines[i] + "\n")
	}
}
|
|
|
|
// envInt reads the environment variable name as a positive integer.
//
// fallback is returned when the variable is unset or blank, is not a valid
// integer, or is zero or negative. Surrounding whitespace in the value is
// ignored.
func envInt(name string, fallback int) int {
	text := strings.TrimSpace(os.Getenv(name))
	if text == "" {
		return fallback
	}
	if n, err := strconv.Atoi(text); err == nil && n > 0 {
		return n
	}
	return fallback
}
|
|
|
|
// createTarGz packs every non-directory entry below srcDir into a gzip
// compressed tar archive at dst.
//
// Entry names are relative to the parent of srcDir, so the archive unpacks
// into a single directory named after srcDir; they always use forward
// slashes. Directories themselves are not stored as entries.
//
// The first error encountered is returned. This includes errors from closing
// the tar writer, the gzip writer and the destination file, because those
// calls flush the final archive bytes: ignoring them could report a
// truncated archive as success.
func createTarGz(dst, srcDir string) (err error) {
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	gz := gzip.NewWriter(out)
	tw := tar.NewWriter(gz)
	defer func() {
		// Close innermost first so each layer can flush into the next one.
		// A close error is only reported when nothing failed earlier.
		for _, c := range []io.Closer{tw, gz, out} {
			if closeErr := c.Close(); closeErr != nil && err == nil {
				err = closeErr
			}
		}
	}()

	base := filepath.Dir(srcDir)
	return filepath.Walk(srcDir, func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if info.IsDir() {
			return nil
		}
		header, hdrErr := tar.FileInfoHeader(info, "")
		if hdrErr != nil {
			return hdrErr
		}
		rel, relErr := filepath.Rel(base, path)
		if relErr != nil {
			return relErr
		}
		// FileInfoHeader only fills in the base name; store the relative path.
		header.Name = filepath.ToSlash(rel)
		if writeErr := tw.WriteHeader(header); writeErr != nil {
			return writeErr
		}
		src, openErr := os.Open(path)
		if openErr != nil {
			return openErr
		}
		// Runs when this callback returns, i.e. once per file.
		defer src.Close()
		_, copyErr := io.Copy(tw, src)
		return copyErr
	})
}
|