Files
bee/audit/internal/webui/page_validate.go
Mikhail Chusavitin 271dadda03 Restructure web UI navigation into 7 numbered workflow stages
Replace the flat menu (Dashboard, Audit, Validate, Burn, Benchmark,
Tasks, Tools) with a numbered progression that guides engineers through
a logical acceptance workflow:

  Dashboard (landing) → 1. Audit → 2. Check → 3. Load → 4. Speed
  → 5. Endurance → 6. Tools → 7. Settings

Key changes:
- layout.go: numbered nav labels, new hrefs, Tasks removed from nav
  and replaced with a persistent sidebar badge (polls /api/tasks every
  5 s, highlights amber when tasks are active)
- server.go: 301 redirects from /validate→/check, /burn→/load,
  /benchmark→/speed for backward compatibility
- pages.go: dispatch cases for all new routes; old routes kept as
  fallbacks
- page_validate.go: add renderCheck() — non-destructive check page
  with validate-mode tests only (no stress toggle, no targeted-stress/
  targeted-power/pulse cards)
- page_burn.go: add renderLoad() wrapper; update scope alert to
  reference /check instead of /validate
- page_benchmark.go: add renderSpeed() (performance focus) and
  renderEndurance() (stability/overnight focus) wrappers
- page_settings.go: new Settings page with blackbox logging toggle,
  NVIDIA driver reset, and build info
- server_test.go: update five tests to use new route names and
  content expectations

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-18 11:00:02 +03:00

953 lines
43 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package webui
import (
"encoding/json"
"fmt"
"html"
"sort"
"strings"
"bee/audit/internal/platform"
"bee/audit/internal/schema"
)
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
const (
// pciVendorNvidia is the vendor ID validateIsVendorGPU requires for the "nvidia" vendor.
pciVendorNvidia = 0x10de
// pciVendorAMD is the vendor ID validateIsVendorGPU requires for the "amd" vendor.
pciVendorAMD = 0x1002
// pciVendorAspeed devices are never classified as a vendor GPU by validateIsVendorGPU.
pciVendorAspeed = 0x1a03
)
// validateInventory holds the per-category device summaries and GPU counts
// that loadValidateInventory derives from the audit snapshot. The string
// fields are display text; when the snapshot cannot be loaded or decoded they
// all hold the placeholder "Audit snapshot not loaded." and the counts are zero.
type validateInventory struct {
// CPU is the summary text for CPUs.
CPU string
// Memory is the summary text for memory modules.
Memory string
// Storage is the summary text for storage devices.
Storage string
// NVIDIA is the summary text for PCIe devices classified as NVIDIA GPUs.
NVIDIA string
// AMD is the summary text for PCIe devices classified as AMD GPUs.
AMD string
// NvidiaGPUCount is the number of PCIe devices classified as NVIDIA GPUs.
NvidiaGPUCount int
// AMDGPUCount is the number of PCIe devices classified as AMD GPUs.
AMDGPUCount int
}
// validateFmtDur formats an estimated duration for display.
//
// Durations under two minutes are shown in seconds ("~90 s"). Longer ones are
// shown in whole minutes ("~3 min"), rounded to the nearest minute with an
// exact half minute rounding down (150 s is shown as "~2 min").
func validateFmtDur(secs int) string {
	const minuteThreshold = 120
	if secs >= minuteThreshold {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
// validateTotalValidateSec returns the estimated total duration, in seconds,
// of a full Validate-mode run. n is the NVIDIA GPU count; zero and negative
// values are treated the same.
//
// NOTE(review): n only gates the NVIDIA GPU diagnostic estimate. The NVIDIA
// interconnect and bandwidth estimates are added even when n is 0, and the
// total does not grow with n. Confirm this is intended.
func validateTotalValidateSec(n int) int {
	// Estimates added regardless of the GPU count.
	secs := platform.SATEstimatedCPUValidateSec +
		platform.SATEstimatedMemoryValidateSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	if n <= 0 {
		return secs
	}
	return secs + platform.SATEstimatedNvidiaGPUValidateSec
}
// validateTotalStressSec returns the estimated total duration, in seconds, of
// a full Stress-mode run. n is the NVIDIA GPU count; zero and negative values
// are treated the same.
//
// NOTE(review): the NVIDIA pulse, interconnect and bandwidth estimates are
// added even when n is 0, and the total does not grow with n. Confirm this is
// intended.
func validateTotalStressSec(n int) int {
	// Estimates added regardless of the GPU count.
	secs := platform.SATEstimatedCPUStressSec +
		platform.SATEstimatedMemoryStressSec +
		platform.SATEstimatedNvidiaPulseTestSec +
		platform.SATEstimatedNvidiaInterconnectSec +
		platform.SATEstimatedNvidiaBandwidthSec
	if n <= 0 {
		return secs
	}
	// Estimates added only when at least one NVIDIA GPU was counted.
	return secs +
		platform.SATEstimatedNvidiaGPUStressSec +
		platform.SATEstimatedNvidiaTargetedStressSec +
		platform.SATEstimatedNvidiaTargetedPowerSec
}
// renderValidate returns the HTML fragment for the Validate page as a single
// string: an informational alert, the Validate/Stress profile card with the
// "Validate one by one" button, SAT cards for CPU, memory and storage, the
// NVIDIA GPU selection card, the NVIDIA test cards, the AMD card, the output
// terminal, and the inline <style> and <script> blocks that drive the page.
//
// Device summaries come from loadValidateInventory(opts). Duration estimates
// are formatted with validateFmtDur from the platform.SATEstimated* values and
// from validateTotalValidateSec / validateTotalStressSec.
//
// Behavior of the embedded script, as written below:
//   - runSAT / runSATWithOverrides POST to /api/sat/<target>/run and then stream
//     /api/tasks/<id>/stream into the terminal through an EventSource.
//   - runAllSAT enqueues the active targets one after another without streaming;
//     the three stress-only NVIDIA targets are left out unless Stress mode is
//     selected, and targets whose "sat-btn-<target>" button is disabled are
//     skipped.
//   - The second script disables NVIDIA/AMD cards based on /api/gpu/presence.
//
// The script looks up buttons by the id "sat-btn-<target>"; these are
// presumably emitted by renderSATCard (not visible here, confirm).
//
// NOTE(review): satRenderGPUList concatenates gpu.name, and satLoadGPUs
// concatenates err.message, into innerHTML without escaping. Confirm these
// values are trusted or escape them.
func renderValidate(opts HandlerOptions) string {
inv := loadValidateInventory(opts)
n := inv.NvidiaGPUCount
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
gpuNote := ""
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px">
<div class="card-head">Validate Profile</div>
<div class="card-body validate-profile-body">
<div class="validate-profile-col">
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
</div>
<div class="validate-profile-col validate-profile-action">
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
<div style="margin-top:12px">
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
</div>
</div>
</div>
</div>
<div class="grid3">
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`<code>free</code>, <code>memtester</code>`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
</div>
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
</div>
</div>
<div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
)) +
`<div id="sat-card-nvidia-targeted-stress">` +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`,
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-targeted-power">` +
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
`<code>dcgmi diag targeted_power</code>`,
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-pulse">` +
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
inv.NVIDIA,
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
`<code>dcgmi diag pulse_test</code>`,
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-interconnect">` +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
)) +
`</div>` +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
)) +
`</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div>
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
</div>
<style>
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
.validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style>
<script>
let satES = null;
function satStressMode() {
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
}
function satModeChanged() {
const stress = satStressMode();
[
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
card.style.opacity = stress ? '1' : '0.5';
const hint = document.getElementById(item.hint);
if (hint) hint.style.display = stress ? 'none' : '';
}
});
}
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {
if (!satNvidiaGPUsPromise) {
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
.then(r => {
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
return r.json();
})
.then(list => Array.isArray(list) ? list : []);
}
return satNvidiaGPUsPromise;
}
function satSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
.map(function(el) { return parseInt(el.value, 10); })
.filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function satUpdateGPUSelectionNote() {
const note = document.getElementById('sat-gpu-selection-note');
if (!note) return;
const selected = satSelectedGPUIndices();
if (!selected.length) {
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
return;
}
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
}
function satRenderGPUList(gpus) {
const root = document.getElementById('sat-gpu-list');
if (!root) return;
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
satUpdateGPUSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="sat-gpu-row">'
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
satUpdateGPUSelectionNote();
}
function satSelectAllGPUs() {
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
satUpdateGPUSelectionNote();
}
function satSelectNoGPUs() {
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
satUpdateGPUSelectionNote();
}
function satLoadGPUs() {
loadSatNvidiaGPUs().then(function(gpus) {
satRenderGPUList(gpus);
}).catch(function(err) {
const root = document.getElementById('sat-gpu-list');
if (root) {
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
}
satUpdateGPUSelectionNote();
});
}
function satGPUDisplayName(gpu) {
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
return 'GPU ' + idx + ' — ' + name;
}
function satRequestBody(target, overrides) {
const body = {};
const labels = satLabels();
body.display_name = labels[target] || ('Validate ' + target);
body.stress_mode = satStressMode();
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
if (overrides) {
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
}
return body;
}
function enqueueSATTarget(target, overrides) {
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
.then(r => r.json());
}
function streamSATTask(taskId, title, resetTerminal) {
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
const term = document.getElementById('sat-terminal');
if (resetTerminal) {
term.textContent = '';
}
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
return new Promise(function(resolve) {
satES = new EventSource('/api/tasks/' + taskId + '/stream');
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
satES.addEventListener('done', function(e) {
satES.close();
satES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve({ok: !e.data, error: e.data || ''});
});
satES.onerror = function() {
if (satES) {
satES.close();
satES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve({ok: false, error: 'stream disconnected'});
};
});
}
function selectedAMDValidateTargets() {
const targets = [];
const gpu = document.getElementById('sat-amd-target');
const mem = document.getElementById('sat-amd-mem-target');
const bw = document.getElementById('sat-amd-bandwidth-target');
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
return targets;
}
function runSAT(target) {
return runSATWithOverrides(target, null);
}
function runSATWithOverrides(target, overrides) {
const title = (overrides && overrides.display_name) || target;
const term = document.getElementById('sat-terminal');
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
term.textContent = 'Enqueuing ' + title + ' test...\n';
return enqueueSATTarget(target, overrides)
.then(d => streamSATTask(d.task_id, title, false));
}
const nvidiaPerGPUTargets = [];
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
function satAllGPUIndicesForMulti() {
return Promise.resolve(satSelectedGPUIndices());
}
function expandSATTarget(target) {
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
return satAllGPUIndicesForMulti().then(function(indices) {
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
});
}
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
return Promise.resolve([{target: target}]);
}
const selected = satSelectedGPUIndices();
if (!selected.length) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
}
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
target: target,
overrides: {
gpu_indices: [Number(gpu.index)],
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
},
label: satGPUDisplayName(gpu),
})));
}
function runNvidiaFabricValidate(target) {
satAllGPUIndicesForMulti().then(function(indices) {
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
});
}
function runNvidiaValidateSet(target) {
const selected = satSelectedGPUIndices();
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
}
function runAMDValidateSet() {
const targets = selectedAMDValidateTargets();
if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]);
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— amd';
const term = document.getElementById('sat-terminal');
term.textContent = 'Running AMD validate set one by one...\n';
const labels = satLabels();
const runNext = (idx) => {
if (idx >= targets.length) return Promise.resolve();
const target = targets[idx];
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
return enqueueSATTarget(target)
.then(d => {
return streamSATTask(d.task_id, labels[target], false);
}).then(function() {
return runNext(idx + 1);
});
};
return runNext(0);
}
function runAllSAT() {
const cycles = 1;
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
const btn = document.getElementById('sat-btn-' + target);
return !(btn && btn.disabled);
});
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
const expanded = [];
for (let cycle = 0; cycle < cycles; cycle++) {
groups.forEach(group => group.forEach(item => expanded.push(item)));
}
const total = expanded.length;
let enqueued = 0;
if (!total) {
status.textContent = 'No tasks selected.';
return;
}
const runNext = (idx) => {
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
const item = expanded[idx];
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
return enqueueSATTarget(item.target, item.overrides)
.then(() => {
enqueued++;
return runNext(idx + 1);
});
};
return runNext(0);
}).catch(err => {
status.textContent = 'Error: ' + err.message;
});
}
</script>
<script>
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
});
satLoadGPUs();
function disableSATAMDOptions(reason) {
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
const cb = document.getElementById(id);
if (!cb) return;
cb.disabled = true;
cb.checked = false;
cb.title = reason;
});
}
function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id);
if (!btn) return;
btn.disabled = true;
btn.title = reason;
btn.style.opacity = '0.4';
const card = btn.closest('.card');
if (card) {
let note = card.querySelector('.sat-unavail');
if (!note) {
note = document.createElement('p');
note.className = 'sat-unavail';
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
const body = card.querySelector('.card-body');
if (body) body.insertBefore(note, body.firstChild);
}
note.textContent = reason;
}
}
</script>`
}
// loadValidateInventory builds the device summaries shown on the validate
// cards from the audit snapshot at opts.AuditPath.
//
// If the snapshot cannot be loaded or decoded, every summary is the
// placeholder "Audit snapshot not loaded." and both GPU counts stay zero.
// Entries whose Present flag is explicitly false are ignored. PCIe devices are
// attributed to NVIDIA or AMD by validateIsVendorGPU.
func loadValidateInventory(opts HandlerOptions) validateInventory {
	const placeholder = "Audit snapshot not loaded."
	inv := validateInventory{
		CPU:     placeholder,
		Memory:  placeholder,
		Storage: placeholder,
		NVIDIA:  placeholder,
		AMD:     placeholder,
	}

	raw, err := loadSnapshot(opts.AuditPath)
	if err != nil {
		return inv
	}
	var snapshot schema.HardwareIngestRequest
	if err := json.Unmarshal(raw, &snapshot); err != nil {
		return inv
	}

	// tally tracks how many devices of one category were seen, grouped by label.
	type tally struct {
		total  int
		models map[string]int
	}
	newTally := func() *tally { return &tally{models: map[string]int{}} }
	// record counts one device under its first non-empty label, or "unknown".
	record := func(t *tally, labels ...string) {
		t.total++
		addValidateModel(t.models, validateFirstNonEmpty(append(labels, "unknown")...))
	}
	summarize := func(t *tally, unit string) string {
		return formatValidateDeviceSummary(t.total, t.models, unit)
	}

	cpus := newTally()
	for _, cpu := range snapshot.Hardware.CPUs {
		if cpu.Present != nil && !*cpu.Present {
			continue
		}
		record(cpus, validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer))
	}

	dimms := newTally()
	for _, dimm := range snapshot.Hardware.Memory {
		if dimm.Present != nil && !*dimm.Present {
			continue
		}
		record(dimms, validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer))
	}

	disks := newTally()
	for _, disk := range snapshot.Hardware.Storage {
		if disk.Present != nil && !*disk.Present {
			continue
		}
		record(disks, validateTrimPtr(disk.Model), validateTrimPtr(disk.Manufacturer))
	}

	nvidia, amd := newTally(), newTally()
	for _, dev := range snapshot.Hardware.PCIeDevices {
		if dev.Present != nil && !*dev.Present {
			continue
		}
		if validateIsVendorGPU(dev, "nvidia") {
			record(nvidia, validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer))
		}
		if validateIsVendorGPU(dev, "amd") {
			record(amd, validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer))
		}
	}

	inv.CPU = summarize(cpus, "CPU")
	inv.Memory = summarize(dimms, "module")
	inv.Storage = summarize(disks, "device")
	inv.NVIDIA = summarize(nvidia, "GPU")
	inv.AMD = summarize(amd, "GPU")
	inv.NvidiaGPUCount = nvidia.total
	inv.AMDGPUCount = amd.total
	return inv
}
// renderValidateCardBody returns the four stacked sections of a validate card
// body, in order: devices, description, commands, settings. The devices and
// settings sections use the muted text color. All arguments are inserted into
// the markup verbatim, so callers must pass HTML that is already safe.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		plainStyle = "font-size:13px"
		mutedStyle = "font-size:13px;color:var(--muted)"
	)
	// section wraps content in one card section with the given inline style.
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(mutedStyle, devices) +
		section(plainStyle, description) +
		section(plainStyle, commands) +
		section(mutedStyle, settings)
}
// formatValidateDeviceSummary renders a one-line, HTML-safe summary of a
// device category.
//
// total is the device count, models maps a model label to how many devices
// carry it, and unit is the singular noun ("CPU", "module", ...). Model labels
// are listed in sorted order and HTML-escaped. Results look like:
//
//	"0 GPUs detected."           (total == 0)
//	"2 x A100 GPUs"              (a single model)
//	"3 devices: 2 x A, 1 x B"    (several models)
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	plural := unit + "s"
	if total == 0 {
		return "0 " + plural + " detected."
	}

	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = plural
	}
	switch len(entries) {
	case 1:
		return entries[0] + " " + noun
	default:
		return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
	}
}
// addValidateModel increments the count for name in counts. The name is
// trimmed of surrounding whitespace first; a blank name is counted under
// "unknown". counts must be non-nil.
func addValidateModel(counts map[string]int, name string) {
	key := strings.TrimSpace(name)
	if len(key) == 0 {
		key = "unknown"
	}
	counts[key] = counts[key] + 1
}
// validateTrimPtr dereferences value and strips surrounding whitespace.
// A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	var trimmed string
	if value != nil {
		trimmed = strings.TrimSpace(*value)
	}
	return trimmed
}
// validateFirstNonEmpty returns the first of values that is non-blank after
// trimming whitespace, in its trimmed form. It returns "" when every value is
// blank or no values are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
// validateIsVendorGPU reports whether dev is a GPU from the given vendor.
//
// vendor must be "nvidia" or "amd"; any other value yields false. A device
// qualifies when its vendor ID matches the requested vendor and its device
// class, compared case-insensitively after trimming, is one of
// "videocontroller", "processingaccelerator" or "displaycontroller". Devices
// with no vendor ID, and ASPEED devices, never qualify.
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
	if dev.VendorID == nil {
		return false
	}
	id := *dev.VendorID
	if id == pciVendorAspeed {
		return false
	}

	switch strings.ToLower(validateTrimPtr(dev.DeviceClass)) {
	case "videocontroller", "processingaccelerator", "displaycontroller":
		// GPU-capable class; fall through to the vendor match below.
	default:
		return false
	}

	switch vendor {
	case "nvidia":
		return id == pciVendorNvidia
	case "amd":
		return id == pciVendorAMD
	}
	return false
}
// renderCheck renders the non-destructive Check page (step 2).
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
func renderCheck(opts HandlerOptions) string {
inv := loadValidateInventory(opts)
n := inv.NvidiaGPUCount
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
gpuNote := ""
if n > 0 {
gpuNote = fmt.Sprintf(" (%d GPU)", n)
}
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/load">3. Load</a>.</div>
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
</div>
<div class="grid3">
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
inv.CPU,
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
)) +
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
inv.Memory,
`Runs a RAM validation pass and records memory state around the test.`,
`<code>free</code>, <code>memtester</code>`,
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
)) +
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
inv.Storage,
`Scans all storage devices and runs the matching health or self-test path for each.`,
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
</div>
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
</div>
</div>
<div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
)) +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
)) +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
)) +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
)) +
`</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div>
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
</div>
<style>
.validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
</style>
<script>
let satES = null;
function satLabels() {
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {
if (!satNvidiaGPUsPromise) {
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
return r.json();
}).then(list => Array.isArray(list) ? list : []);
}
return satNvidiaGPUsPromise;
}
function satSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
.filter(el => el.checked && !el.disabled)
.map(el => parseInt(el.value, 10))
.filter(v => !Number.isNaN(v))
.sort((a, b) => a - b);
}
function satUpdateGPUSelectionNote() {
const note = document.getElementById('sat-gpu-selection-note');
if (!note) return;
const sel = satSelectedGPUIndices();
note.textContent = sel.length
? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
: 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
}
function satRenderGPUList(gpus) {
const root = document.getElementById('sat-gpu-list');
if (!root) return;
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
satUpdateGPUSelectionNote(); return;
}
root.innerHTML = gpus.map(gpu => {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
}).join('');
satUpdateGPUSelectionNote();
}
function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
function satGPULoadInit() {
loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
const root = document.getElementById('sat-gpu-list');
if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
satUpdateGPUSelectionNote();
});
}
function satRequestBody(target, overrides) {
const body = {};
const labels = satLabels();
body.display_name = labels[target] || ('Check ' + target);
body.stress_mode = false;
if (target === 'cpu') body.duration = 60;
if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
return body;
}
function enqueueSATTarget(target, overrides) {
return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
}
function streamSATTask(taskId, title, resetTerminal) {
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display = 'block';
document.getElementById('sat-title').textContent = '— ' + title;
const term = document.getElementById('sat-terminal');
if (resetTerminal) term.textContent = '';
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
return new Promise(resolve => {
satES = new EventSource('/api/tasks/' + taskId + '/stream');
satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
satES.addEventListener('done', e => {
satES.close(); satES = null;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
resolve({ok: !e.data, error: e.data || ''});
});
satES.onerror = () => {
if (satES) { satES.close(); satES = null; }
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
resolve({ok: false, error: 'stream disconnected'});
};
});
}
function selectedAMDValidateTargets() {
const targets = [];
const gpu = document.getElementById('sat-amd-target');
const mem = document.getElementById('sat-amd-mem-target');
const bw = document.getElementById('sat-amd-bandwidth-target');
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
return targets;
}
function runSAT(target) { return runSATWithOverrides(target, null); }
function runSATWithOverrides(target, overrides) {
const title = (overrides && overrides.display_name) || target;
document.getElementById('sat-output').style.display = 'block';
document.getElementById('sat-title').textContent = '— ' + title;
const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + title + ' test...\n';
return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
}
function runNvidiaFabricValidate(target) {
const indices = satSelectedGPUIndices();
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
}
function runNvidiaValidateSet(target) {
const sel = satSelectedGPUIndices();
if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
}
function runAMDValidateSet() {
const targets = selectedAMDValidateTargets();
if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]);
const term = document.getElementById('sat-terminal');
document.getElementById('sat-output').style.display = 'block';
document.getElementById('sat-title').textContent = '— amd';
term.textContent = 'Running AMD check set...\n';
const labels = satLabels();
const runNext = idx => {
if (idx >= targets.length) return Promise.resolve();
const t = targets[idx];
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
};
return runNext(0);
}
function runAllCheckSAT() {
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const nvidiaIndices = satSelectedGPUIndices();
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
const baseTargets = ['cpu', 'memory', 'storage'];
const amdTargets = selectedAMDValidateTargets();
const expanded = [];
baseTargets.forEach(t => expanded.push({target: t}));
if (nvidiaIndices.length) {
nvidiaAllTargets.forEach(t => {
const btn = document.getElementById('sat-btn-' + t);
if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
});
}
amdTargets.forEach(t => expanded.push({target: t}));
if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
const total = expanded.length;
const runNext = idx => {
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
const item = expanded[idx];
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
};
runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
}
function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id);
if (!btn) return;
btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
const card = btn.closest('.card');
if (card) {
let note = card.querySelector('.sat-unavail');
if (!note) {
note = document.createElement('p');
note.className = 'sat-unavail';
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
const body = card.querySelector('.card-body');
if (body) body.insertBefore(note, body.firstChild);
}
note.textContent = reason;
}
}
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
if (!gp.amd) {
disableSATCard('amd', 'No AMD GPU detected');
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
const cb = document.getElementById(id);
if (cb) { cb.disabled = true; cb.checked = false; }
});
}
});
satGPULoadInit();
</script>`
}
// renderSATCard returns the HTML for a single check card.
//
// The card header shows label next to a "Run" button whose element id is
// "sat-btn-"+id and whose onclick handler is runAction. When headerActions
// contains anything other than whitespace it is appended, untrimmed, right
// after the Run button. body becomes the content of the card body.
//
// Every argument is inserted into the markup verbatim; nothing is
// HTML-escaped here, so callers must pass trusted or pre-escaped strings.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	const cardTemplate = `<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`

	var buttons strings.Builder
	buttons.WriteString(`<button id="sat-btn-`)
	buttons.WriteString(id)
	buttons.WriteString(`" class="btn btn-primary btn-sm" onclick="`)
	buttons.WriteString(runAction)
	buttons.WriteString(`">Run</button>`)

	// Blank extra actions are dropped entirely; non-blank ones keep their
	// original surrounding whitespace.
	hasExtraActions := strings.TrimSpace(headerActions) != ""
	if hasExtraActions {
		buttons.WriteString(headerActions)
	}

	return fmt.Sprintf(cardTemplate, label, buttons.String(), body)
}