Compare commits

...

8 Commits
v5.8 ... v5.10

Author SHA1 Message Date
53455063b9 Stabilize live task detail page 2026-04-05 22:14:52 +03:00
4602f97836 Enforce sequential task orchestration 2026-04-05 22:10:42 +03:00
c65d3ae3b1 Add nomodeset to default GRUB entry — fix black screen on headless servers
Servers with NVIDIA compute GPUs (H100 etc.) have no display output,
so KMS blanks the console. nomodeset disables kernel modesetting and
lets the NVIDIA proprietary driver handle display via Xorg.

KMS variant moved to advanced submenu for cases where it is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 21:40:47 +03:00
7a21c370e4 Handle NVIDIA GSP firmware init hang with timeout fallback
- bee-nvidia-load: run insmod in background, poll /proc/devices for
  nvidiactl; if GSP init doesn't complete in 90s, kill insmod and retry
  with NVreg_EnableGpuFirmware=0. Handles EBUSY case with clear error.
- Write /run/bee-nvidia-mode (gsp-on/gsp-off/gsp-stuck) for audit layer
- Show GSP mode badge in sidebar: yellow for gsp-off, red for gsp-stuck
- Report NvidiaGSPMode in RuntimeHealth with issue entries
- Simplify GRUB menu: default (KMS+GSP), advanced submenu (GSP=off,
  nomodeset, fail-safe), remove load-to-RAM entry
- Add pcmanfm, ristretto, mupdf, mousepad to desktop packages

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 21:00:43 +03:00
a493e3ab5b Fix service control buttons: sudo, real error output, UX feedback
- services.go: use sudo systemctl so bee user can control system services
- api.go: always return 200 with output field even on error, so the
  frontend shows the actual systemctl message instead of "exit status 1"
- pages.go: button shows "..." while pending then restores label;
  output panel is full-width under the table with ✓/✗ status indicator;
  output auto-scrolls to bottom

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:25:41 +03:00
19b4803ec7 Pass exact cycle duration to GPU stress instead of 86400s sentinel
bee-gpu-burn now receives --seconds <LoadSec> so it exits naturally
when the cycle ends, rather than relying solely on context cancellation
to kill it. Process group kill (Setpgid+Cancel) is kept as a safety net
for early cancellation (user stop, context timeout). Same fix for AMD
RVS which now gets duration_ms = LoadSec * 1000.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:22:43 +03:00
1bdfb1e9ca Fix nvidia-targeted-stress failing with DCGM_ST_IN_USE (-34)
nvvs (DCGM validation suite) survives when dcgmi is killed mid-run,
leaving the GPU occupied. The next dcgmi diag invocation then fails
with "affected resource is in use".

Two-part fix:
- Add nvvs and dcgmi to KillTestWorkers patterns so they are cleaned
  up by the global cancel handler
- Call KillTestWorkers at the start of RunNvidiaTargetedStressValidatePack
  to clear any stale processes before dcgmi diag runs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:21:36 +03:00
c5d6b30177 Fix platform thermal cycling leaving GPU load running after test ends
bee-gpu-burn is a shell script that spawns bee-gpu-burn-worker children.
exec.CommandContext default cancel only kills the shell parent; the worker
processes survive and keep loading the GPU indefinitely.

Fix: set Setpgid=true and a custom Cancel that sends SIGKILL to the
entire process group (-pid), same pattern already used in runSATCommandCtx.
Applied to Nvidia, AMD, and CPU stress commands for consistency.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 20:19:20 +03:00
15 changed files with 513 additions and 155 deletions

View File

@@ -15,6 +15,10 @@ var workerPatterns = []string{
"stress-ng", "stress-ng",
"stressapptest", "stressapptest",
"memtester", "memtester",
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
"nvvs",
"dcgmi",
} }
// KilledProcess describes a process that was sent SIGKILL. // KilledProcess describes a process that was sent SIGKILL.

View File

@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
gpuCmd := buildGPUStressCmd(loadCtx, vendor) gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
if gpuCmd == nil { if gpuCmd == nil {
return return
} }
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb)) cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
} }
cmd := exec.CommandContext(ctx, path, cmdArgs...) cmd := exec.CommandContext(ctx, path, cmdArgs...)
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
if err := startLowPriorityCmd(cmd, 15); err != nil { if err := startLowPriorityCmd(cmd, 15); err != nil {
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor. // buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful). // Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd { func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
switch strings.ToLower(vendor) { switch strings.ToLower(vendor) {
case "amd": case "amd":
return buildAMDGPUStressCmd(ctx) return buildAMDGPUStressCmd(ctx, durSec)
case "nvidia": case "nvidia":
return buildNvidiaGPUStressCmd(ctx) return buildNvidiaGPUStressCmd(ctx, durSec)
} }
return nil return nil
} }
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd { func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
rvsArgs, err := resolveRVSCommand() rvsArgs, err := resolveRVSCommand()
if err != nil { if err != nil {
return nil return nil
} }
rvsPath := rvsArgs[0] rvsPath := rvsArgs[0]
cfg := `actions: cfg := fmt.Sprintf(`actions:
- name: gst_platform - name: gst_platform
device: all device: all
module: gst module: gst
parallel: true parallel: true
duration: 86400000 duration: %d`, durSec*1000) + `
copy_matrix: false copy_matrix: false
target_stress: 90 target_stress: 90
matrix_size_a: 8640 matrix_size_a: 8640
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
cfgFile := "/tmp/bee-platform-gst.conf" cfgFile := "/tmp/bee-platform-gst.conf"
_ = os.WriteFile(cfgFile, []byte(cfg), 0644) _ = os.WriteFile(cfgFile, []byte(cfg), 0644)
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile) cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = startLowPriorityCmd(cmd, 10) _ = startLowPriorityCmd(cmd, 10)
return cmd return cmd
} }
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd { func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
path, err := satLookPath("bee-gpu-burn") path, err := satLookPath("bee-gpu-burn")
if err != nil { if err != nil {
path, err = satLookPath("bee-gpu-stress") path, err = satLookPath("bee-gpu-stress")
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
if err != nil { if err != nil {
return nil return nil
} }
cmd := exec.CommandContext(ctx, path, "--seconds", "86400") // Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
// where the context is cancelled early (user stop, parent timeout).
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
cmd.Cancel = func() error {
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
}
return nil
}
cmd.Stdout = nil cmd.Stdout = nil
cmd.Stderr = nil cmd.Stderr = nil
_ = startLowPriorityCmd(cmd, 10) _ = startLowPriorityCmd(cmd, 10)

View File

@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
switch vendor { switch vendor {
case "nvidia": case "nvidia":
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
if health.NvidiaGSPMode == "gsp-stuck" {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_gsp_stuck",
Severity: "critical",
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
})
} else if health.NvidiaGSPMode == "gsp-off" {
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "nvidia_gsp_disabled",
Severity: "warning",
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
})
}
}
health.DriverReady = strings.Contains(lsmodText, "nvidia ") health.DriverReady = strings.Contains(lsmodText, "nvidia ")
if !health.DriverReady { if !health.DriverReady {
health.Issues = append(health.Issues, schema.RuntimeIssue{ health.Issues = append(health.Issues, schema.RuntimeIssue{

View File

@@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
if err != nil { if err != nil {
return "", err return "", err
} }
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
for _, p := range killed {
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{ {

View File

@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
} }
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) { func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput() // bee-web runs as the bee user; sudo is required to control system services.
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
return string(raw), err return string(raw), err
} }

View File

@@ -20,6 +20,7 @@ type RuntimeHealth struct {
ExportDir string `json:"export_dir,omitempty"` ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"` DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"` CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"` NetworkStatus string `json:"network_status,omitempty"`
Issues []RuntimeIssue `json:"issues,omitempty"` Issues []RuntimeIssue `json:"issues,omitempty"`
Tools []RuntimeToolStatus `json:"tools,omitempty"` Tools []RuntimeToolStatus `json:"tools,omitempty"`

View File

@@ -383,11 +383,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
return return
} }
result, err := h.opts.App.ServiceActionResult(req.Name, action) result, err := h.opts.App.ServiceActionResult(req.Name, action)
status := "ok"
if err != nil { if err != nil {
writeError(w, http.StatusInternalServerError, err.Error()) status = "error"
return
} }
writeJSON(w, map[string]string{"status": "ok", "output": result.Body}) // Always return 200 with output so the frontend can display the actual
// systemctl error message instead of a generic "exit status 1".
writeJSON(w, map[string]string{"status": status, "output": result.Body})
} }
// ── Network ─────────────────────────────────────────────────────────────────── // ── Network ───────────────────────────────────────────────────────────────────

View File

@@ -33,6 +33,9 @@ a{color:var(--accent);text-decoration:none}
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px} .sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px} .sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)} .sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
.nav{flex:1} .nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s} .nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)} .nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
@@ -107,6 +110,15 @@ func layoutNav(active string, buildLabel string) string {
buildLabel = "dev" buildLabel = "dev"
} }
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`) b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
gspMode := strings.TrimSpace(string(raw))
switch gspMode {
case "gsp-off":
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
case "gsp-stuck":
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
}
}
b.WriteString(`<nav class="nav">`) b.WriteString(`<nav class="nav">`)
for _, item := range items { for _, item := range items {
cls := "nav-item" cls := "nav-item"
@@ -1056,17 +1068,23 @@ func renderValidate(opts HandlerOptions) string {
`</div> `</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div> <div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="grid3"> <div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( ` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
inv.NVIDIA,
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
)) +
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`, `Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`, `<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Runs one GPU at a time. Diag level is taken from Validate Profile.`, `Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
)) + )) +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`, `Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`, `<code>dcgmi diag targeted_stress</code>`,
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`, `Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
)) + )) +
`</div> `</div>
<div class="grid3" style="margin-top:16px"> <div class="grid3" style="margin-top:16px">
@@ -1088,6 +1106,8 @@ func renderValidate(opts HandlerOptions) string {
.validate-card-body { padding:0; } .validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; } .validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; } .validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } } @media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style> </style>
<script> <script>
@@ -1116,6 +1136,59 @@ function loadSatNvidiaGPUs() {
} }
return satNvidiaGPUsPromise; return satNvidiaGPUsPromise;
} }
function satSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
.map(function(el) { return parseInt(el.value, 10); })
.filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function satUpdateGPUSelectionNote() {
const note = document.getElementById('sat-gpu-selection-note');
if (!note) return;
const selected = satSelectedGPUIndices();
if (!selected.length) {
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
return;
}
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
}
function satRenderGPUList(gpus) {
const root = document.getElementById('sat-gpu-list');
if (!root) return;
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
satUpdateGPUSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="sat-gpu-row">'
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
satUpdateGPUSelectionNote();
}
function satSelectAllGPUs() {
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
satUpdateGPUSelectionNote();
}
function satSelectNoGPUs() {
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
satUpdateGPUSelectionNote();
}
function satLoadGPUs() {
loadSatNvidiaGPUs().then(function(gpus) {
satRenderGPUList(gpus);
}).catch(function(err) {
const root = document.getElementById('sat-gpu-list');
if (root) {
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
}
satUpdateGPUSelectionNote();
});
}
function satGPUDisplayName(gpu) { function satGPUDisplayName(gpu) {
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0; const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx); const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
@@ -1137,6 +1210,36 @@ function enqueueSATTarget(target, overrides) {
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))}) return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
.then(r => r.json()); .then(r => r.json());
} }
function streamSATTask(taskId, title, resetTerminal) {
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
const term = document.getElementById('sat-terminal');
if (resetTerminal) {
term.textContent = '';
}
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
return new Promise(function(resolve) {
satES = new EventSource('/api/tasks/' + taskId + '/stream');
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
satES.addEventListener('done', function(e) {
satES.close();
satES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve({ok: !e.data, error: e.data || ''});
});
satES.onerror = function() {
if (satES) {
satES.close();
satES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve({ok: false, error: 'stream disconnected'});
};
});
}
function selectedAMDValidateTargets() { function selectedAMDValidateTargets() {
const targets = []; const targets = [];
const gpu = document.getElementById('sat-amd-target'); const gpu = document.getElementById('sat-amd-target');
@@ -1151,24 +1254,23 @@ function runSAT(target) {
return runSATWithOverrides(target, null); return runSATWithOverrides(target, null);
} }
function runSATWithOverrides(target, overrides) { function runSATWithOverrides(target, overrides) {
if (satES) { satES.close(); satES = null; } const title = (overrides && overrides.display_name) || target;
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' test...\n'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
term.textContent = 'Enqueuing ' + title + ' test...\n';
return enqueueSATTarget(target, overrides) return enqueueSATTarget(target, overrides)
.then(d => { .then(d => streamSATTask(d.task_id, title, false));
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
} }
function expandSATTarget(target) { function expandSATTarget(target) {
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') { if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
return Promise.resolve([{target: target}]); return Promise.resolve([{target: target}]);
} }
return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => ({ const selected = satSelectedGPUIndices();
if (!selected.length) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
}
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
target: target, target: target,
overrides: { overrides: {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
@@ -1179,65 +1281,61 @@ function expandSATTarget(target) {
} }
function runNvidiaValidateSet(target) { function runNvidiaValidateSet(target) {
return loadSatNvidiaGPUs().then(gpus => { return loadSatNvidiaGPUs().then(gpus => {
if (!gpus.length) return; const selected = satSelectedGPUIndices();
if (gpus.length === 1) { const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
const gpu = gpus[0]; if (!picked.length) {
throw new Error('Select at least one NVIDIA GPU.');
}
if (picked.length === 1) {
const gpu = picked[0];
return runSATWithOverrides(target, { return runSATWithOverrides(target, {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')' display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
}); });
} }
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target; document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n'; term.textContent = 'Running ' + target + ' one GPU at a time...\n';
const labelBase = satLabels()[target] || ('Validate ' + target); const labelBase = satLabels()[target] || ('Validate ' + target);
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= gpus.length) return; if (idx >= picked.length) return Promise.resolve();
const gpu = gpus[idx]; const gpu = picked[idx];
const gpuLabel = satGPUDisplayName(gpu); const gpuLabel = satGPUDisplayName(gpu);
enqueueSATTarget(target, { term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
return enqueueSATTarget(target, {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
display_name: labelBase + ' (' + gpuLabel + ')' display_name: labelBase + ' (' + gpuLabel + ')'
}).then(d => { }).then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n'; return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
if (idx === gpus.length - 1) { }).then(function() {
satES = new EventSource('/api/tasks/' + d.task_id + '/stream'); return runNext(idx + 1);
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
}); });
} }
function runAMDValidateSet() { function runAMDValidateSet() {
const targets = selectedAMDValidateTargets(); const targets = selectedAMDValidateTargets();
if (!targets.length) return; if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]); if (targets.length === 1) return runSAT(targets[0]);
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— amd'; document.getElementById('sat-title').textContent = '— amd';
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing AMD validate set...\n'; term.textContent = 'Running AMD validate set one by one...\n';
const labels = satLabels(); const labels = satLabels();
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= targets.length) return; if (idx >= targets.length) return Promise.resolve();
const target = targets[idx]; const target = targets[idx];
enqueueSATTarget(target) term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
return enqueueSATTarget(target)
.then(d => { .then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n'; return streamSATTask(d.task_id, labels[target], false);
if (idx === targets.length - 1) { }).then(function() {
satES = new EventSource('/api/tasks/'+d.task_id+'/stream'); return runNext(idx + 1);
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
} }
function runAllSAT() { function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
@@ -1259,17 +1357,17 @@ function runAllSAT() {
status.textContent = 'No tasks selected.'; status.textContent = 'No tasks selected.';
return; return;
} }
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; } if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
const item = expanded[idx]; const item = expanded[idx];
enqueueSATTarget(item.target, item.overrides) status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
return enqueueSATTarget(item.target, item.overrides)
.then(() => { .then(() => {
enqueued++; enqueued++;
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...'; return runNext(idx + 1);
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
}).catch(err => { }).catch(err => {
status.textContent = 'Error: ' + err.message; status.textContent = 'Error: ' + err.message;
}); });
@@ -1282,6 +1380,7 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected'); if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
}); });
satLoadGPUs();
function disableSATAMDOptions(reason) { function disableSATAMDOptions(reason) {
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) { ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
const cb = document.getElementById(id); const cb = document.getElementById(id);
@@ -1874,6 +1973,36 @@ function streamTask(taskId, label) {
term.scrollTop = term.scrollHeight; term.scrollTop = term.scrollHeight;
}); });
} }
function streamBurnTask(taskId, label, resetTerminal) {
if (biES) { biES.close(); biES = null; }
document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
const term = document.getElementById('bi-terminal');
if (resetTerminal) {
term.textContent = '';
}
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
return new Promise(function(resolve) {
biES = new EventSource('/api/tasks/' + taskId + '/stream');
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
biES.addEventListener('done', function(e) {
biES.close();
biES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve({ok: !e.data, error: e.data || ''});
});
biES.onerror = function() {
if (biES) {
biES.close();
biES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve({ok: false, error: 'stream disconnected'});
};
});
}
function runBurnTaskSet(tasks, statusElId) { function runBurnTaskSet(tasks, statusElId) {
const enabled = tasks.filter(function(t) { const enabled = tasks.filter(function(t) {
@@ -1886,19 +2015,33 @@ function runBurnTaskSet(tasks, statusElId) {
if (status) status.textContent = 'No tasks selected.'; if (status) status.textContent = 'No tasks selected.';
return; return;
} }
enabled.forEach(function(t) { const term = document.getElementById('bi-terminal');
enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia) document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
term.textContent = '';
const runNext = function(idx) {
if (idx >= enabled.length) {
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
return Promise.resolve();
}
const t = enabled[idx];
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
.then(function(d) { .then(function(d) {
if (status) status.textContent = enabled.length + ' task(s) queued.'; return streamBurnTask(d.task_id, t.label, false);
streamTask(d.task_id, t.label); })
.then(function() {
return runNext(idx + 1);
}) })
.catch(function(err) { .catch(function(err) {
if (status) status.textContent = 'Error: ' + err.message; if (status) status.textContent = 'Error: ' + err.message;
const term = document.getElementById('bi-terminal');
document.getElementById('bi-output').style.display = 'block'; document.getElementById('bi-output').style.display = 'block';
term.textContent += 'ERROR: ' + err.message + '\n'; term.textContent += 'ERROR: ' + err.message + '\n';
return Promise.reject(err);
}); });
}); };
return runNext(0);
} }
function runPlatformStress() { function runPlatformStress() {
@@ -2107,9 +2250,12 @@ func renderServicesInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p> return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div> <div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div> <div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:8px" class="card"> <div id="svc-out" style="display:none;margin-top:12px">
<div class="card-head">Output</div> <div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div> <span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="svc-out-status" style="font-size:12px"></span>
</div>
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div> </div>
<script> <script>
function loadServices() { function loadServices() {
@@ -2125,9 +2271,9 @@ function loadServices() {
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' + '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
'</td>' + '</td>' +
'<td style="white-space:nowrap">' + '<td style="white-space:nowrap">' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'start\')">Start</button> ' + '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'stop\')">Stop</button> ' + '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'restart\')">Restart</button>' + '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
'</td></tr>'; '</td></tr>';
}).join(''); }).join('');
document.getElementById('svc-table').innerHTML = document.getElementById('svc-table').innerHTML =
@@ -2138,16 +2284,45 @@ function toggleBody(id) {
const el = document.getElementById(id); const el = document.getElementById(id);
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none'; if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
} }
function svcAction(name, action) { function svcAction(btn, name, action) {
var label = btn.textContent;
btn.disabled = true;
btn.textContent = '...';
var out = document.getElementById('svc-out');
var term = document.getElementById('svc-terminal');
var statusEl = document.getElementById('svc-out-status');
var labelEl = document.getElementById('svc-out-label');
out.style.display = 'block';
labelEl.textContent = action + ' ' + name;
term.textContent = 'Running...';
statusEl.textContent = '';
statusEl.style.color = '';
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})}) fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
.then(r=>r.json()).then(d => { .then(r=>r.json()).then(d => {
document.getElementById('svc-out').style.display='block'; term.textContent = d.output || d.error || '(no output)';
document.getElementById('svc-terminal').textContent = d.output || d.error || action+' '+name; term.scrollTop = term.scrollHeight;
setTimeout(loadServices, 1000); if (d.status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
btn.textContent = label;
btn.disabled = false;
setTimeout(loadServices, 800);
}).catch(e => {
term.textContent = 'Request failed: ' + e;
statusEl.textContent = '✗ error';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
btn.textContent = label;
btn.disabled = false;
}); });
} }
function restartGPUDrivers() { function restartGPUDrivers() {
svcAction('bee-nvidia', 'restart'); var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
svcAction(btn, 'bee-nvidia', 'restart');
} }
loadServices(); loadServices();
</script>` </script>`

View File

@@ -601,8 +601,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
if !strings.Contains(body, `Restart GPU Drivers`) { if !strings.Contains(body, `Restart GPU Drivers`) {
t.Fatalf("tools page missing restart gpu drivers button: %s", body) t.Fatalf("tools page missing restart gpu drivers button: %s", body)
} }
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) { if !strings.Contains(body, `restartGPUDrivers()`) {
t.Fatalf("tools page missing bee-nvidia restart action: %s", body) t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
} }
if !strings.Contains(body, `id="boot-source-text"`) { if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body) t.Fatalf("tools page missing boot source field: %s", body)
@@ -649,6 +649,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
`nvidia-targeted-stress`, `nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`, `controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`, `<code>dcgmi diag targeted_stress</code>`,
`NVIDIA GPU Selection`,
`id="sat-gpu-list"`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body) t.Fatalf("validate page missing %q: %s", needle, body)

View File

@@ -106,23 +106,61 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
body.WriteString(`</div></div>`) body.WriteString(`</div></div>`)
body.WriteString(`<script> body.WriteString(`<script>
function cancelTaskDetail(id) { function cancelTaskDetail(id) {
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){ window.location.reload(); }); fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
var term = document.getElementById('task-live-log');
if (term) {
term.textContent += '\nCancel requested.\n';
term.scrollTop = term.scrollHeight;
} }
function loadTaskLiveCharts(taskId) { });
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){ }
function renderTaskLiveCharts(taskId, charts) {
const host = document.getElementById('task-live-charts'); const host = document.getElementById('task-live-charts');
if (!host) return; if (!host) return;
if (!Array.isArray(charts) || charts.length === 0) { if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...'; host.innerHTML = 'Waiting for metric samples...';
return; return;
} }
host.innerHTML = charts.map(function(chart) { const seen = {};
return '<div class="card" style="margin:0">' + charts.forEach(function(chart) {
'<div class="card-head">' + chart.title + '</div>' + seen[chart.file] = true;
'<div class="card-body" style="padding:12px">' + let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
'<img data-task-chart="1" data-base-src="/api/tasks/' + taskId + '/chart/' + chart.file + '" src="/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now() + '" style="width:100%;display:block;border-radius:6px" alt="' + chart.title + '">' + if (img) {
'</div></div>'; const card = img.closest('.card');
}).join(''); if (card) {
const title = card.querySelector('.card-head');
if (title) title.textContent = chart.title;
}
return;
}
const card = document.createElement('div');
card.className = 'card';
card.style.margin = '0';
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
card.querySelector('.card-head').textContent = chart.title;
const body = card.querySelector('.card-body');
img = document.createElement('img');
img.setAttribute('data-task-chart', '1');
img.setAttribute('data-chart-file', chart.file);
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
img.style.width = '100%';
img.style.display = 'block';
img.style.borderRadius = '6px';
img.alt = chart.title;
body.appendChild(img);
host.appendChild(card);
});
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
const file = img.getAttribute('data-chart-file') || '';
if (seen[file]) return;
const card = img.closest('.card');
if (card) card.remove();
});
}
function loadTaskLiveCharts(taskId) {
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
renderTaskLiveCharts(taskId, charts);
}).catch(function(){ }).catch(function(){
const host = document.getElementById('task-live-charts'); const host = document.getElementById('task-live-charts');
if (host) host.innerHTML = 'Task charts are unavailable.'; if (host) host.innerHTML = 'Task charts are unavailable.';
@@ -138,12 +176,31 @@ function refreshTaskLiveCharts() {
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream'); var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log'); var _taskDetailTerm = document.getElementById('task-live-log');
var _taskChartTimer = null; var _taskChartTimer = null;
var _taskChartsFrozen = false;
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; }; _taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; }; _taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); }); _taskDetailES.addEventListener('done', function(e){
_taskDetailES.onerror = function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); }; if (_taskChartTimer) clearInterval(_taskChartTimer);
_taskDetailES.close();
_taskDetailES = null;
_taskChartsFrozen = true;
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
refreshTaskLiveCharts();
});
_taskDetailES.onerror = function(){
if (_taskChartTimer) clearInterval(_taskChartTimer);
if (_taskDetailES) {
_taskDetailES.close();
_taskDetailES = null;
}
};
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
_taskChartTimer = setInterval(function(){ refreshTaskLiveCharts(); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); }, 2000); _taskChartTimer = setInterval(function(){
if (_taskChartsFrozen) return;
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
refreshTaskLiveCharts();
}, 2000);
</script>`) </script>`)
} }

View File

@@ -423,13 +423,14 @@ func (q *taskQueue) worker() {
setCPUGovernor("performance") setCPUGovernor("performance")
defer setCPUGovernor("powersave") defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
var batch []*Task
for { for {
q.mu.Lock()
t := q.nextPending() t := q.nextPending()
if t == nil { if t == nil {
break q.prune()
q.persistLocked()
q.mu.Unlock()
return
} }
now := time.Now() now := time.Now()
t.Status = TaskRunning t.Status = TaskRunning
@@ -438,29 +439,14 @@ func (q *taskQueue) worker() {
t.ErrMsg = "" t.ErrMsg = ""
j := newTaskJobState(t.LogPath, taskSerialPrefix(t)) j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
t.job = j t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked() q.persistLocked()
}
q.mu.Unlock() q.mu.Unlock()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background()) taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel j.cancel = taskCancel
wg.Add(1)
goRecoverOnce("task "+t.Target, func() {
defer wg.Done()
defer taskCancel()
q.executeTask(t, j, taskCtx) q.executeTask(t, j, taskCtx)
}) taskCancel()
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock() q.mu.Lock()
q.prune() q.prune()
q.persistLocked() q.persistLocked()

View File

@@ -15,30 +15,22 @@ menuentry "EASY-BEE" {
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE (graphics/KMS)" { submenu "EASY-BEE (advanced options) -->" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup menuentry "EASY-BEE — GSP=off" {
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (load to RAM)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (NVIDIA GSP=off)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE (graphics/KMS, GSP=off)" { menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE (fail-safe)" { menuentry "EASY-BEE fail-safe" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0 linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
}
if [ "${grub_platform}" = "efi" ]; then if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" { menuentry "Memory Test (memtest86+)" {

View File

@@ -44,23 +44,27 @@ else:
img = Image.new('RGB', (W, H), (0, 0, 0)) img = Image.new('RGB', (W, H), (0, 0, 0))
draw = ImageDraw.Draw(img) draw = ImageDraw.Draw(img)
# Measure logo block # Measure logo block line by line to avoid font ascender offset
lines = LOGO.split('\n') lines = LOGO.split('\n')
bbox = draw.textbbox((0, 0), LOGO, font=font_logo)
text_w = bbox[2] - bbox[0]
text_h = bbox[3] - bbox[1]
x = (W - text_w) // 2
y = (H - text_h) // 2
# Draw logo lines: first 6 in amber, last line (subtitle) dimmer
logo_lines = lines[:6] logo_lines = lines[:6]
sub_line = lines[6] if len(lines) > 6 else '' sub_line = lines[6] if len(lines) > 6 else ''
line_h = SIZE + 2
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
# Width: measure the widest logo line
max_w = 0
for line in logo_lines:
bb = draw.textbbox((0, 0), line, font=font_logo)
max_w = max(max_w, bb[2] - bb[0])
x = (W - max_w) // 2
y = (H - block_h) // 2
cy = y cy = y
for line in logo_lines: for line in logo_lines:
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e)) draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
cy += SIZE + 2 cy += line_h
cy += 8 cy += 8
if sub_line: if sub_line:
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18)) draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))

View File

@@ -65,6 +65,10 @@ python3-pil
xorg xorg
xterm xterm
chromium chromium
mousepad
pcmanfm
ristretto
mupdf
xserver-xorg-video-fbdev xserver-xorg-video-fbdev
xserver-xorg-video-vesa xserver-xorg-video-vesa
lightdm lightdm

View File

@@ -50,11 +50,93 @@ load_module() {
log "WARN: not found: $ko" log "WARN: not found: $ko"
return 1 return 1
fi fi
if insmod "$ko" "$@"; then if timeout 90 insmod "$ko" "$@"; then
log "loaded: $mod $*" log "loaded: $mod $*"
return 0 return 0
fi fi
log "WARN: failed to load: $mod" log "WARN: failed to load: $mod (exit $?)"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1
}
nvidia_is_functional() {
grep -q ' nvidiactl$' /proc/devices 2>/dev/null
}
load_module_with_gsp_fallback() {
ko="$NVIDIA_KO_DIR/nvidia.ko"
if [ ! -f "$ko" ]; then
log "ERROR: not found: $ko"
return 1
fi
# Run insmod in background — on some converted SXM→PCIe cards GSP enters an
# infinite crash/reload loop and insmod never returns. We check for successful
# initialization by polling /proc/devices for nvidiactl instead of waiting for
# insmod to exit.
log "loading nvidia (GSP enabled, timeout 90s)"
insmod "$ko" &
_insmod_pid=$!
_waited=0
while [ $_waited -lt 90 ]; do
if nvidia_is_functional; then
log "loaded: nvidia (GSP enabled, ${_waited}s)"
echo "gsp-on" > /run/bee-nvidia-mode
return 0
fi
# Check if insmod exited with an error before timeout
if ! kill -0 "$_insmod_pid" 2>/dev/null; then
wait "$_insmod_pid"
_rc=$?
if [ $_rc -ne 0 ]; then
log "nvidia load failed (exit $_rc)"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1
fi
# insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
sleep 2
if nvidia_is_functional; then
log "loaded: nvidia (GSP enabled, ${_waited}s)"
return 0
fi
log "insmod exited 0 but nvidiactl missing — treating as failure"
return 1
fi
sleep 1
_waited=$((_waited + 1))
done
# GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
log "nvidia GSP init timed out after 90s"
kill "$_insmod_pid" 2>/dev/null || true
wait "$_insmod_pid" 2>/dev/null || true
# Attempt to unload the partially-initialized module
if ! rmmod nvidia 2>/dev/null; then
# Module is stuck in the kernel — cannot reload with different params.
# User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
echo "gsp-stuck" > /run/bee-nvidia-mode
return 1
fi
sleep 2
log "retrying with NVreg_EnableGpuFirmware=0"
log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
if insmod "$ko" NVreg_EnableGpuFirmware=0; then
if nvidia_is_functional; then
log "loaded: nvidia (GSP disabled)"
echo "gsp-off" > /run/bee-nvidia-mode
return 0
fi
log "insmod gsp-off exited 0 but nvidiactl missing"
return 1
fi
log "nvidia load failed (GSP=off)"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1 return 1
} }
@@ -70,7 +152,7 @@ load_host_module() {
case "$nvidia_mode" in case "$nvidia_mode" in
normal|full) normal|full)
if ! load_module nvidia; then if ! load_module_with_gsp_fallback; then
exit 1 exit 1
fi fi
# nvidia-modeset on some server kernels needs ACPI video helper symbols # nvidia-modeset on some server kernels needs ACPI video helper symbols