WIP: checkpoint current tree

This commit is contained in:
2026-04-05 12:05:00 +03:00
parent a14ec8631c
commit 20abff7f90
15 changed files with 539 additions and 59 deletions

View File

@@ -91,6 +91,7 @@ func layoutNav(active string, buildLabel string) string {
{"audit", "Audit", "/audit", ""},
{"validate", "Validate", "/validate", ""},
{"burn", "Burn", "/burn", ""},
{"benchmark", "Benchmark", "/benchmark", ""},
{"tasks", "Tasks", "/tasks", ""},
{"tools", "Tools", "/tools", ""},
}
@@ -140,6 +141,10 @@ func renderPage(page string, opts HandlerOptions) string {
pageID = "burn"
title = "Burn"
body = renderBurn()
case "benchmark":
pageID = "benchmark"
title = "Benchmark"
body = renderBenchmark()
case "tasks":
pageID = "tasks"
title = "Tasks"
@@ -781,6 +786,193 @@ func renderSATCard(id, label, extra string) string {
label, extra, id, id)
}
// ── Benchmark ─────────────────────────────────────────────────────────────────
func renderBenchmark() string {
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="grid2">
<div class="card">
<div class="card-head">NVIDIA Benchmark</div>
<div class="card-body">
<div class="form-row">
<label>Profile</label>
<select id="benchmark-profile">
<option value="standard" selected>Standard — about 15 minutes</option>
<option value="stability">Stability — 1 to 2 hours</option>
<option value="overnight">Overnight — 8 hours</option>
</select>
</div>
<div class="form-row">
<label>GPU Selection</label>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
</div>
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
</div>
<label class="benchmark-cb-row">
<input type="checkbox" id="benchmark-run-nccl" checked>
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
</label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
</div>
</div>
<div class="card">
<div class="card-head">Method</div>
<div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
<table>
<tr><th>Profile</th><th>Purpose</th></tr>
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
</table>
</div>
</div>
</div>
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
</div>
<style>
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
</style>
<script>
let benchmarkES = null;
function benchmarkSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
.map(function(el) { return parseInt(el.value, 10); })
.filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices();
const btn = document.getElementById('benchmark-run-btn');
const note = document.getElementById('benchmark-selection-note');
const nccl = document.getElementById('benchmark-run-nccl');
if (!selected.length) {
btn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return;
}
btn.disabled = false;
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
if (nccl && nccl.checked && selected.length < 2) {
note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
} else if (nccl && nccl.checked) {
note.textContent += ' NCCL interconnect will use only these GPUs.';
}
}
function benchmarkRenderGPUList(gpus) {
const root = document.getElementById('benchmark-gpu-list');
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
benchmarkUpdateSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="benchmark-gpu-row">'
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
benchmarkUpdateSelectionNote();
}
function benchmarkLoadGPUs() {
const status = document.getElementById('benchmark-run-status');
status.textContent = '';
fetch('/api/gpu/nvidia').then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
}).then(function(gpus) {
benchmarkRenderGPUList(gpus);
}).catch(function(err) {
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
benchmarkUpdateSelectionNote();
});
}
function benchmarkSelectAll() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
benchmarkUpdateSelectionNote();
}
function benchmarkSelectNone() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
benchmarkUpdateSelectionNote();
}
function runNvidiaBenchmark() {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
status.textContent = 'Select at least one GPU.';
return;
}
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
display_name: 'NVIDIA Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
fetch('/api/benchmark/nvidia/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
status.textContent = 'Task ' + d.task_id + ' queued.';
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
status.textContent = e.data ? 'Failed.' : 'Completed.';
});
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
document.getElementById('benchmark-run-nccl').addEventListener('change', benchmarkUpdateSelectionNote);
benchmarkLoadGPUs();
</script>`
}
// ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string {
@@ -805,11 +997,12 @@ func renderBurn() string {
<div class="card">
<div class="card-head">GPU Stress</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">NVIDIA tools run on all discovered GPUs. DCGM is the official NVIDIA diagnostic path. NCCL exercises multi-GPU fabric and is not a full compute burn.</p>
<div id="gpu-tools-list">
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-dcgm" value="dcgm" disabled><span>DCGM Diagnostics (Official NVIDIA) <span class="cb-note" id="note-dcgm"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf (Interconnect) <span class="cb-note" id="note-nccl"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
</div>
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">&#9654; Run GPU Stress</button>
@@ -881,17 +1074,18 @@ function streamTask(taskId, label) {
}
function runGPUStress() {
const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
let last = null;
ids.filter(id => {
const el = document.getElementById(id);
const tasks = [
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
];
tasks.filter(t => {
const el = document.getElementById(t.id);
return el && el.checked && !el.disabled;
}).forEach(id => {
const target = targetMap[id];
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
}).forEach(t => {
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); });
});
}
@@ -928,13 +1122,15 @@ function runAll() {
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
// GPU tests
const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
const target = gpuTargetMap[id];
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
const gpuTasks = [
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
];
gpuTasks.filter(t => { const el = document.getElementById(t.id); return el && el.checked && !el.disabled; }).forEach(t => {
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); done(); });
});
// Compute tests
@@ -955,17 +1151,19 @@ function runAll() {
// Load GPU tool availability
fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','dcgm':'burn-gpu-dcgm','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
const noteMap = {'bee-gpu-burn':'note-bee','dcgm':'note-dcgm','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
tools.forEach(t => {
const cb = document.getElementById(nvidiaMap[t.id]);
const note = document.getElementById(noteMap[t.id]);
if (!cb) return;
if (t.available) {
cb.disabled = false;
if (t.id === 'bee-gpu-burn') cb.checked = true;
if (t.id === 'bee-gpu-burn' || t.id === 'dcgm') cb.checked = true;
} else {
const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
let reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
if (t.id === 'dcgm' && t.vendor === 'nvidia') reason = 'dcgmi not available or NVIDIA driver not running';
if (t.id === 'nccl' && t.vendor === 'nvidia') reason = 'NCCL interconnect tool unavailable or NVIDIA driver not running';
if (note) note.textContent = '— ' + reason;
}
});
@@ -1125,7 +1323,8 @@ func renderNetwork() string {
// ── Services ──────────────────────────────────────────────────────────────────
func renderServicesInline() string {
return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="svc-out" style="display:none;margin-top:8px" class="card">
<div class="card-head">Output</div>
@@ -1151,7 +1350,7 @@ function loadServices() {
'</td></tr>';
}).join('');
document.getElementById('svc-table').innerHTML =
'<table><tr><th>Service</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
'<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
});
}
function toggleBody(id) {