diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 48d1f57..518e92d 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -155,8 +155,11 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } id := newJobID("sat-" + target) j := globalJobs.create(id) + ctx, cancel := context.WithCancel(context.Background()) + j.cancel = cancel go func() { + defer cancel() j.append(fmt.Sprintf("Starting %s acceptance test...", target)) var ( archive string @@ -178,7 +181,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { case "nvidia": if len(body.GPUIndices) > 0 || body.DiagLevel > 0 { result, e := h.opts.App.RunNvidiaAcceptancePackWithOptions( - context.Background(), "", body.DiagLevel, body.GPUIndices, + ctx, "", body.DiagLevel, body.GPUIndices, ) if e != nil { err = e @@ -201,8 +204,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc { } if err != nil { - j.append("ERROR: " + err.Error()) - j.finish(err.Error()) + if ctx.Err() != nil { + j.append("Aborted.") + j.finish("aborted") + } else { + j.append("ERROR: " + err.Error()) + j.finish(err.Error()) + } return } j.append(fmt.Sprintf("Archive written: %s", archive)) @@ -223,6 +231,20 @@ func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { streamJob(w, r, j) } +func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) { + id := r.URL.Query().Get("job_id") + j, ok := globalJobs.get(id) + if !ok { + http.Error(w, "job not found", http.StatusNotFound) + return + } + if j.abort() { + writeJSON(w, map[string]string{"status": "aborted"}) + } else { + writeJSON(w, map[string]string{"status": "not_running"}) + } +} + // ── Services ────────────────────────────────────────────────────────────────── func (h *handler) handleAPIServicesList(w http.ResponseWriter, r *http.Request) { diff --git a/audit/internal/webui/jobs.go b/audit/internal/webui/jobs.go index 1992ed8..512cfa2 100644 --- 
a/audit/internal/webui/jobs.go +++ b/audit/internal/webui/jobs.go @@ -7,12 +7,23 @@ import ( // jobState holds the output lines and completion status of an async job. type jobState struct { - lines []string - done bool - err string - mu sync.Mutex - // subs is a list of channels that receive new lines as they arrive. - subs []chan string + lines []string + done bool + err string + mu sync.Mutex + subs []chan string + cancel func() // optional cancel function; nil if job is not cancellable +} + +// abort cancels the job if it has a cancel function and is not yet done. +func (j *jobState) abort() bool { + j.mu.Lock() + defer j.mu.Unlock() + if j.done || j.cancel == nil { + return false + } + j.cancel() + return true } func (j *jobState) append(line string) { diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index 654ab0c..ef04741 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -132,6 +132,7 @@ func NewHandler(opts HandlerOptions) http.Handler { mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage")) mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu")) mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream) + mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) // Services mux.HandleFunc("GET /api/services", h.handleAPIServicesList) diff --git a/bible-local/architecture/runtime-flows.md b/bible-local/architecture/runtime-flows.md index 9c056c7..0977207 100644 --- a/bible-local/architecture/runtime-flows.md +++ b/bible-local/architecture/runtime-flows.md @@ -126,7 +126,7 @@ Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks pres systemd services running, audit completed with NVIDIA enrichment, LAN reachability. 
Current validation state: -- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup +- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and Web UI startup - real hardware validation is still required before treating the ISO as release-ready ## Overlay mechanism @@ -168,33 +168,17 @@ Acceptance flows: - `BEE_MEMTESTER_SIZE_MB` - `BEE_MEMTESTER_PASSES` -## NVIDIA SAT TUI flow (v1.0.0+) +## NVIDIA SAT Web UI flow ``` -TUI: Acceptance tests → NVIDIA command pack - 1. screenNvidiaSATSetup - a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total` - b. user selects duration preset: 10 min / 1 h / 8 h / 24 h - c. user selects GPUs via checkboxes (all selected by default) - d. memory size = max(selected GPU memory) — auto-detected, not exposed to user - 2. Start → screenNvidiaSATRunning - a. CUDA_VISIBLE_DEVICES set to selected GPU indices - b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently - c. nvtop occupies full terminal; SAT result queues in background - d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress) - 3. GPU metrics collection (during bee-gpu-stress) - - background goroutine polls `nvidia-smi` every second - - per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz - - outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt - 4. After SAT completes - - result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt) - - chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU, - Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock) +Web UI: Acceptance Tests page → Run Test button + 1. POST /api/sat/nvidia/run → returns job_id + 2. GET /api/sat/stream?job_id=... (SSE) — streams stdout/stderr lines live + 3. 
After completion — archive written to /appdata/bee/export/bee-sat/ + summary.txt contains overall_status (OK / FAILED) and per-job status values ``` **Critical invariants:** -- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO). -- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel. +- `bee-gpu-stress` uses `exec.CommandContext` — killed when the job context is cancelled (e.g. via `POST /api/sat/abort?job_id=...`). - Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed). -- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation). - SVG chart is fully offline: no JS, no external CSS, pure inline SVG.