feat: task queue, UI overhaul, burn tests, install-to-RAM

- Task queue: all SAT/audit jobs enqueue and run one-at-a-time;
  tasks persist past page navigation; new Tasks page with cancel/priority/log stream
- UI: consolidate nav (Validate, Burn, Tasks, Tools); Audit becomes modal;
  Dashboard hardware summary badges + split metrics charts (load/temp/power);
  Tools page consolidates network, services, install, support bundle
- AMD GPU: acceptance test and stress burn cards; GPU presence API greys
  out irrelevant SAT cards automatically
- Burn tests: Memory Stress (stress-ng --vm), SAT Stress (stressapptest)
- Install to RAM: copies squashfs to /dev/shm, re-associates loop devices
  via LOOP_CHANGE_FD ioctl so live media can be ejected
- Charts: relative time axis (0 = now, negative left)
- memtester: LimitMEMLOCK=infinity in bee-web.service; empty output → UNSUPPORTED
- SAT overlay applied dynamically on every /audit.json serve
- MIME panic guard for LiveCD ramdisk I/O errors
- ISO: add memtest86+, stressapptest packages; memtest86+ GRUB entry;
  disable screensaver/DPMS in bee-openbox-session
- Unknown SAT status severity = 1 (does not override OK)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-28 21:15:11 +03:00
parent 911745e4da
commit 0a98ed8ae9
22 changed files with 1964 additions and 326 deletions

View File

@@ -2,6 +2,8 @@ package platform
import (
"archive/tar"
"bufio"
"bytes"
"compress/gzip"
"context"
"errors"
@@ -13,6 +15,7 @@ import (
"sort"
"strconv"
"strings"
"sync"
"time"
)
@@ -32,6 +35,40 @@ var (
}
)
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
// Returns combined stdout+stderr as a byte slice.
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
pr, pw := io.Pipe()
cmd.Stdout = pw
cmd.Stderr = pw
var buf bytes.Buffer
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
scanner := bufio.NewScanner(pr)
for scanner.Scan() {
line := scanner.Text()
buf.WriteString(line + "\n")
if logFunc != nil {
logFunc(line)
}
}
}()
err := cmd.Start()
if err != nil {
_ = pw.Close()
wg.Wait()
return nil, err
}
waitErr := cmd.Wait()
_ = pw.Close()
wg.Wait()
return buf.Bytes(), waitErr
}
// NvidiaGPU holds basic GPU info from nvidia-smi.
type NvidiaGPU struct {
Index int
@@ -80,13 +117,27 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
}
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
func (s *System) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
})
}, logFunc)
}
// RunAMDStressPack runs an AMD GPU burn-in pack.
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
func (s *System) RunAMDStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_AMD_STRESS_SECONDS", 300)
return runAcceptancePack(baseDir, "gpu-amd-stress", []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
{name: fmt.Sprintf("03-rocm-smi-monitor-%ds.log", seconds), cmd: []string{
"rocm-smi", "--showtemp", "--showpower",
fmt.Sprintf("--duration=%d", seconds),
}},
}, logFunc)
}
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
@@ -123,7 +174,7 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
// Measures collective communication bandwidth over NVLink/PCIe.
func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, error) {
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
// detect GPU count
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
@@ -136,32 +187,65 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string) (string, erro
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
})
}, logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
}
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
// gpuIndices: specific GPU indices to test (empty = all GPUs).
// ctx cancellation kills the running job.
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices))
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
}
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
func (s *System) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
passes := envInt("BEE_MEMTESTER_PASSES", 1)
return runAcceptancePack(baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
})
}, logFunc)
}
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
func (s *System) RunMemoryStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_VM_STRESS_SECONDS", 300)
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
sizeArg := "80%"
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
sizeArg = fmt.Sprintf("%dM", mb)
}
return runAcceptancePack(baseDir, "memory-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stress-ng-vm.log", cmd: []string{
"stress-ng", "--vm", "1",
"--vm-bytes", sizeArg,
"--vm-method", "all",
"--timeout", fmt.Sprintf("%d", seconds),
"--metrics-brief",
}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunSATStressPack(baseDir string, logFunc func(string)) (string, error) {
seconds := envInt("BEE_SAT_STRESS_SECONDS", 300)
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
}
return runAcceptancePack(baseDir, "sat-stress", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-stressapptest.log", cmd: cmd},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
if durationSec <= 0 {
durationSec = 60
}
@@ -170,10 +254,10 @@ func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string,
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
})
}, logFunc)
}
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
func (s *System) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -205,7 +289,7 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
commands := storageSATCommands(devPath)
for cmdIndex, job := range commands {
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
out, err := runSATCommand(verboseLog, job.name, job.cmd)
out, err := runSATCommand(verboseLog, job.name, job.cmd, logFunc)
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
return "", writeErr
}
@@ -254,7 +338,7 @@ func nvidiaSATJobs() []satJob {
}
}
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
func runAcceptancePack(baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -269,11 +353,13 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
stats := satStats{}
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
for _, job := range jobs {
var out []byte
var err error
cmd := make([]string, 0, len(job.cmd))
for _, arg := range job.cmd {
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
}
out, err := runSATCommand(verboseLog, job.name, cmd)
out, err = runSATCommand(verboseLog, job.name, cmd, logFunc)
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
return "", writeErr
}
@@ -315,7 +401,7 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
}
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
if baseDir == "" {
baseDir = "/var/log/bee-sat"
}
@@ -342,9 +428,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
var err error
if job.collectGPU {
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
} else {
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
}
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
@@ -368,13 +454,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
return archive, nil
}
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if logFunc != nil {
logFunc(fmt.Sprintf("=== %s ===", name))
}
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
@@ -389,7 +478,7 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
if len(env) > 0 {
c.Env = append(os.Environ(), env...)
}
out, err := c.CombinedOutput()
out, err := streamExecOutput(c, logFunc)
rc := 0
if err != nil {
@@ -464,6 +553,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
}
text := strings.ToLower(string(out))
// No output at all means the tool failed to start (mlock limit, binary missing,
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
if len(strings.TrimSpace(text)) == 0 {
return "UNSUPPORTED", rc
}
if strings.Contains(text, "unsupported") ||
strings.Contains(text, "not supported") ||
strings.Contains(text, "invalid opcode") ||
@@ -472,19 +566,25 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
strings.Contains(text, "not available") ||
strings.Contains(text, "cuda_error_system_not_ready") ||
strings.Contains(text, "no such device") ||
// nvidia-smi on a machine with no NVIDIA GPU
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
strings.Contains(text, "no nvidia gpu") ||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
return "UNSUPPORTED", rc
}
return "FAILED", rc
}
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
start := time.Now().UTC()
resolvedCmd, err := resolveSATCommand(cmd)
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
"cmd: "+strings.Join(resolvedCmd, " "),
)
if logFunc != nil {
logFunc(fmt.Sprintf("=== %s ===", name))
}
if err != nil {
appendSATVerboseLog(verboseLog,
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
@@ -495,7 +595,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
return []byte(err.Error() + "\n"), err
}
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
rc := 0
if err != nil {
@@ -597,7 +697,7 @@ func parseStorageDevices(raw string) []string {
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
stopCh := make(chan struct{})
doneCh := make(chan struct{})
var metricRows []GPUMetricRow
@@ -625,7 +725,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
}
}()
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
close(stopCh)
<-doneCh