release: v3.1

This commit is contained in:
2026-03-28 22:51:36 +03:00
parent 0dbfaf6121
commit 59a1d4b209
22 changed files with 1560 additions and 269 deletions

View File

@@ -90,6 +90,12 @@ func (s *System) DetectGPUVendor() string {
if _, err := os.Stat("/dev/kfd"); err == nil {
return "amd"
}
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
text := strings.ToLower(string(raw))
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
return "amd"
}
}
return ""
}
@@ -117,8 +123,8 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
}
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
@@ -128,14 +134,20 @@ func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (str
// RunAMDStressPack runs an AMD GPU burn-in pack.
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_AMD_STRESS_SECONDS", 300)
return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
}
if err := ensureAMDRuntimeReady(); err != nil {
return "", err
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
"rocm-smi", "--showtemp", "--showpower",
fmt.Sprintf("--duration=%d", seconds),
"bash", "-lc",
fmt.Sprintf("end=$((SECONDS+%d)); while [ \"$SECONDS\" -lt \"$end\" ]; do rocm-smi --showtemp --showpower --csv; sleep 1; done", seconds),
}},
}, logFunc)
}
@@ -191,7 +203,7 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
@@ -202,24 +214,27 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
}
func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
passes := envInt("BEE_MEMTESTER_PASSES", 1)
return runAcceptancePack(baseDir, "memory", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_VM_STRESS_SECONDS", 300)
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
}
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
sizeArg := "80%"
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
sizeArg = fmt.Sprintf("%dM", mb)
}
return runAcceptancePack(baseDir, "memory-stress", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stress-ng-vm.log", cmd: []string{
"stress-ng", "--vm", "1",
@@ -232,24 +247,27 @@ func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (stri
}, logFunc)
}
func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_SAT_STRESS_SECONDS", 300)
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
seconds := durationSec
if seconds <= 0 {
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
}
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
}
return runAcceptancePack(baseDir, "sat-stress", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stressapptest.log", cmd: cmd},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
if durationSec <= 0 {
durationSec = 60
}
return runAcceptancePack(baseDir, "cpu", []satJob{
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
@@ -257,7 +275,7 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc f
}, logFunc)
}
func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -285,11 +303,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string))
}
for index, devPath := range devices {
if ctx.Err() != nil {
break
}
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
commands := storageSATCommands(devPath)
for cmdIndex, job := range commands {
if ctx.Err() != nil {
break
}
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc)
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
return "", writeErr
}
@@ -338,49 +362,6 @@ func nvidiaSATJobs() []satJob {
}
}
// runAcceptancePack executes every job in jobs sequentially and bundles the
// results into a tar.gz archive.
//
// A run directory named "<prefix>-<UTC timestamp>" is created under baseDir
// (which defaults to /var/log/bee-sat when empty). For each job, every
// occurrence of the "{{run_dir}}" placeholder in its arguments is replaced
// with the run directory path, the command is run through runSATCommand, and
// its output is written to a file named after the job inside the run
// directory. The per-job return code and status from classifySATResult are
// appended to summary.txt, followed by the aggregated stats.
//
// It returns the path of the created archive. Failing to create the run
// directory, write any output file, write the summary, or build the archive
// aborts the run and returns that error; an error from a job command itself
// is only passed to classifySATResult and does not stop the run.
func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}

	runName := prefix + "-" + time.Now().UTC().Format("20060102-150405")
	outDir := filepath.Join(baseDir, runName)
	if mkErr := os.MkdirAll(outDir, 0755); mkErr != nil {
		return "", mkErr
	}
	verbosePath := filepath.Join(outDir, "verbose.log")

	var report strings.Builder
	totals := satStats{}
	fmt.Fprintf(&report, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))

	for _, j := range jobs {
		// Expand the run-directory placeholder in each argument.
		argv := make([]string, len(j.cmd))
		for i := range j.cmd {
			argv[i] = strings.ReplaceAll(j.cmd[i], "{{run_dir}}", outDir)
		}

		output, runErr := runSATCommand(verbosePath, j.name, argv, logFunc)

		// The output is persisted regardless of whether the command succeeded.
		logPath := filepath.Join(runDir(outDir), j.name)
		if wErr := os.WriteFile(logPath, output, 0644); wErr != nil {
			return "", wErr
		}

		status, rc := classifySATResult(j.name, output, runErr)
		totals.Add(status)

		// Summary key: job file name without one leading "0" and the ".log" suffix.
		label := strings.TrimPrefix(j.name, "0")
		label = strings.TrimSuffix(label, ".log")
		fmt.Fprintf(&report, "%s_rc=%d\n", label, rc)
		fmt.Fprintf(&report, "%s_status=%s\n", label, status)
	}
	writeSATStats(&report, totals)

	summaryPath := filepath.Join(outDir, "summary.txt")
	if wErr := os.WriteFile(summaryPath, []byte(report.String()), 0644); wErr != nil {
		return "", wErr
	}

	archivePath := filepath.Join(baseDir, runName+".tar.gz")
	if tarErr := createTarGz(archivePath, outDir); tarErr != nil {
		return "", tarErr
	}
	return archivePath, nil
}

// runDir returns dir unchanged; it only names the role of the path at the
// call site (the directory that receives per-job output files).
func runDir(dir string) string { return dir }
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
if diagLevel < 1 || diagLevel > 4 {
diagLevel = 3
@@ -402,6 +383,9 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -649,6 +633,20 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
}
// ensureAMDRuntimeReady checks whether the AMD GPU runtime appears usable.
//
// It returns nil when /dev/kfd exists, or when
// /sys/module/amdgpu/initstate is readable and contains "live" (compared
// case-insensitively after trimming surrounding whitespace). If the initstate
// file is readable but holds any other value, the error reports that value.
// If the file cannot be read at all, the error states that /dev/kfd is
// missing and amdgpu is not loaded.
func ensureAMDRuntimeReady() error {
	_, kfdErr := os.Stat("/dev/kfd")
	if kfdErr == nil {
		return nil
	}

	data, readErr := os.ReadFile("/sys/module/amdgpu/initstate")
	if readErr != nil {
		return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
	}

	initState := strings.TrimSpace(string(data))
	if !strings.EqualFold(initState, "live") {
		return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", initState)
	}
	return nil
}
// rocmSMIExecutableCandidates returns the result of passing
// rocmSMIExecutableGlobs to expandExistingPaths.
func rocmSMIExecutableCandidates() []string {
	candidates := expandExistingPaths(rocmSMIExecutableGlobs)
	return candidates
}