717 lines
31 KiB
Go
717 lines
31 KiB
Go
package webui
|
||
|
||
import (
|
||
"encoding/json"
|
||
"fmt"
|
||
"html"
|
||
"sort"
|
||
"strings"
|
||
|
||
"bee/audit/internal/platform"
|
||
"bee/audit/internal/schema"
|
||
)
|
||
|
||
// validateInventory carries the device information shown on the Validate page.
type validateInventory struct {
	// Per-component device summary text, embedded into the page's cards.
	CPU, Memory, Storage, NVIDIA, AMD string
	// Number of GPUs counted per vendor; the NVIDIA count feeds the duration
	// estimates rendered by renderValidate.
	NvidiaGPUCount, AMDGPUCount int
}
|
||
|
||
// validateFmtDur formats an estimated duration for display. Values below two
// minutes are shown in seconds ("~90 s"); anything else is shown in minutes
// ("~5 min"). Minutes are computed as (secs+29)/60, so 150 s yields "~2 min"
// and 151 s yields "~3 min".
func validateFmtDur(secs int) string {
	const minuteThreshold = 120
	if secs >= minuteThreshold {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||
|
||
func validateTotalValidateSec(n int) int {
|
||
if n < 0 {
|
||
n = 0
|
||
}
|
||
total := platform.SATEstimatedCPUValidateSec +
|
||
platform.SATEstimatedMemoryValidateSec +
|
||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||
platform.SATEstimatedNvidiaInterconnectSec +
|
||
platform.SATEstimatedNvidiaBandwidthSec
|
||
return total
|
||
}
|
||
|
||
func validateTotalStressSec(n int) int {
|
||
if n < 0 {
|
||
n = 0
|
||
}
|
||
total := platform.SATEstimatedCPUStressSec +
|
||
platform.SATEstimatedMemoryStressSec +
|
||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||
platform.SATEstimatedNvidiaPulseTestSec +
|
||
platform.SATEstimatedNvidiaInterconnectSec +
|
||
platform.SATEstimatedNvidiaBandwidthSec
|
||
return total
|
||
}
|
||
|
||
// renderValidate returns the HTML for the Validate page as a single string:
// the Validate/Stress mode selector, one SAT card per component (CPU, memory,
// storage, the NVIDIA tests and AMD GPU), the NVIDIA GPU selection list, and
// the inline CSS and JavaScript that enqueue tasks via /api/sat/<target>/run
// and stream their logs from /api/tasks/<id>/stream.
//
// Device summaries and the NVIDIA GPU count come from loadValidateInventory;
// the duration estimates are built from the platform.SATEstimated* values and
// scale with that GPU count.
//
// NOTE(review): the embedded script writes gpu.name and err.message into
// innerHTML without escaping — confirm those values are trusted or escape them.
func renderValidate(opts HandlerOptions) string {
|
||
inv := loadValidateInventory(opts)
|
||
n := inv.NvidiaGPUCount
|
||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||
gpuNote := ""
|
||
if n > 0 {
|
||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||
}
|
||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||
|
||
<div class="card" style="margin-bottom:16px">
|
||
<div class="card-head">Validate Profile</div>
|
||
<div class="card-body validate-profile-body">
|
||
<div class="validate-profile-col">
|
||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||
</div>
|
||
<div class="validate-profile-col validate-profile-action">
|
||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||
<div style="margin-top:12px">
|
||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="grid3">
|
||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||
inv.CPU,
|
||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||
)) +
|
||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||
inv.Memory,
|
||
`Runs a RAM validation pass and records memory state around the test.`,
|
||
`<code>free</code>, <code>memtester</code>`,
|
||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||
)) +
|
||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||
inv.Storage,
|
||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||
)) +
|
||
`</div>
|
||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||
<div class="card" style="margin-bottom:16px">
|
||
<div class="card-head">NVIDIA GPU Selection</div>
|
||
<div class="card-body">
|
||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||
</div>
|
||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||
</div>
|
||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="grid3">
|
||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||
func() string {
|
||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||
if n > 0 {
|
||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||
}
|
||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||
validateFmtDur(perV), validateFmtDur(perS))
|
||
}(),
|
||
)) +
|
||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||
`<code>dcgmi diag targeted_stress</code>`,
|
||
func() string {
|
||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||
s := "Skipped in Validate. "
|
||
if n > 0 {
|
||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||
} else {
|
||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||
}
|
||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||
}(),
|
||
)) +
|
||
`</div>` +
|
||
`<div id="sat-card-nvidia-targeted-power">` +
|
||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||
`<code>dcgmi diag targeted_power</code>`,
|
||
func() string {
|
||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||
s := "Skipped in Validate. "
|
||
if n > 0 {
|
||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||
} else {
|
||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||
}
|
||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||
}(),
|
||
)) +
|
||
`</div>` +
|
||
`<div id="sat-card-nvidia-pulse">` +
|
||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||
`<code>dcgmi diag pulse_test</code>`,
|
||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||
)) +
|
||
`</div>` +
|
||
`<div id="sat-card-nvidia-interconnect">` +
|
||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||
)) +
|
||
`</div>` +
|
||
`<div id="sat-card-nvidia-bandwidth">` +
|
||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||
inv.NVIDIA,
|
||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||
`<code>nvbandwidth</code>`,
|
||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||
)) +
|
||
`</div>` +
|
||
`</div>
|
||
<div class="grid3" style="margin-top:16px">
|
||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||
inv.AMD,
|
||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||
)) +
|
||
`</div>
|
||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||
</div>
|
||
<style>
|
||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||
.validate-card-body { padding:0; }
|
||
.validate-card-section { padding:12px 16px 0; }
|
||
.validate-card-section:last-child { padding-bottom:16px; }
|
||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||
</style>
|
||
<script>
|
||
let satES = null;
|
||
function satStressMode() {
|
||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||
}
|
||
function satModeChanged() {
|
||
const stress = satStressMode();
|
||
[
|
||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||
].forEach(function(item) {
|
||
const card = document.getElementById(item.card);
|
||
if (card) {
|
||
card.style.opacity = stress ? '1' : '0.5';
|
||
const hint = document.getElementById(item.hint);
|
||
if (hint) hint.style.display = stress ? 'none' : '';
|
||
}
|
||
});
|
||
}
|
||
function satLabels() {
|
||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||
}
|
||
let satNvidiaGPUsPromise = null;
|
||
function loadSatNvidiaGPUs() {
|
||
if (!satNvidiaGPUsPromise) {
|
||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||
.then(r => {
|
||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||
return r.json();
|
||
})
|
||
.then(list => Array.isArray(list) ? list : []);
|
||
}
|
||
return satNvidiaGPUsPromise;
|
||
}
|
||
function satSelectedGPUIndices() {
|
||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||
.filter(function(el) { return el.checked && !el.disabled; })
|
||
.map(function(el) { return parseInt(el.value, 10); })
|
||
.filter(function(v) { return !Number.isNaN(v); })
|
||
.sort(function(a, b) { return a - b; });
|
||
}
|
||
function satUpdateGPUSelectionNote() {
|
||
const note = document.getElementById('sat-gpu-selection-note');
|
||
if (!note) return;
|
||
const selected = satSelectedGPUIndices();
|
||
if (!selected.length) {
|
||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||
return;
|
||
}
|
||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||
}
|
||
function satRenderGPUList(gpus) {
|
||
const root = document.getElementById('sat-gpu-list');
|
||
if (!root) return;
|
||
if (!gpus || !gpus.length) {
|
||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||
satUpdateGPUSelectionNote();
|
||
return;
|
||
}
|
||
root.innerHTML = gpus.map(function(gpu) {
|
||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||
return '<label class="sat-gpu-row">'
|
||
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||
+ '</label>';
|
||
}).join('');
|
||
satUpdateGPUSelectionNote();
|
||
}
|
||
function satSelectAllGPUs() {
|
||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||
satUpdateGPUSelectionNote();
|
||
}
|
||
function satSelectNoGPUs() {
|
||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||
satUpdateGPUSelectionNote();
|
||
}
|
||
function satLoadGPUs() {
|
||
loadSatNvidiaGPUs().then(function(gpus) {
|
||
satRenderGPUList(gpus);
|
||
}).catch(function(err) {
|
||
const root = document.getElementById('sat-gpu-list');
|
||
if (root) {
|
||
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||
}
|
||
satUpdateGPUSelectionNote();
|
||
});
|
||
}
|
||
function satGPUDisplayName(gpu) {
|
||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||
return 'GPU ' + idx + ' — ' + name;
|
||
}
|
||
function satRequestBody(target, overrides) {
|
||
const body = {};
|
||
const labels = satLabels();
|
||
body.display_name = labels[target] || ('Validate ' + target);
|
||
body.stress_mode = satStressMode();
|
||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||
if (overrides) {
|
||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||
}
|
||
return body;
|
||
}
|
||
function enqueueSATTarget(target, overrides) {
|
||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||
.then(r => r.json());
|
||
}
|
||
function streamSATTask(taskId, title, resetTerminal) {
|
||
if (satES) { satES.close(); satES = null; }
|
||
document.getElementById('sat-output').style.display='block';
|
||
document.getElementById('sat-title').textContent = '— ' + title;
|
||
const term = document.getElementById('sat-terminal');
|
||
if (resetTerminal) {
|
||
term.textContent = '';
|
||
}
|
||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||
return new Promise(function(resolve) {
|
||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||
satES.addEventListener('done', function(e) {
|
||
satES.close();
|
||
satES = null;
|
||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||
term.scrollTop = term.scrollHeight;
|
||
resolve({ok: !e.data, error: e.data || ''});
|
||
});
|
||
satES.onerror = function() {
|
||
if (satES) {
|
||
satES.close();
|
||
satES = null;
|
||
}
|
||
term.textContent += '\nERROR: stream disconnected.\n';
|
||
term.scrollTop = term.scrollHeight;
|
||
resolve({ok: false, error: 'stream disconnected'});
|
||
};
|
||
});
|
||
}
|
||
function selectedAMDValidateTargets() {
|
||
const targets = [];
|
||
const gpu = document.getElementById('sat-amd-target');
|
||
const mem = document.getElementById('sat-amd-mem-target');
|
||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||
return targets;
|
||
}
|
||
function runSAT(target) {
|
||
return runSATWithOverrides(target, null);
|
||
}
|
||
function runSATWithOverrides(target, overrides) {
|
||
const title = (overrides && overrides.display_name) || target;
|
||
const term = document.getElementById('sat-terminal');
|
||
document.getElementById('sat-output').style.display='block';
|
||
document.getElementById('sat-title').textContent = '— ' + title;
|
||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||
return enqueueSATTarget(target, overrides)
|
||
.then(d => streamSATTask(d.task_id, title, false));
|
||
}
|
||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||
function satAllGPUIndicesForMulti() {
|
||
return Promise.resolve(satSelectedGPUIndices());
|
||
}
|
||
function expandSATTarget(target) {
|
||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||
});
|
||
}
|
||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||
return Promise.resolve([{target: target}]);
|
||
}
|
||
const selected = satSelectedGPUIndices();
|
||
if (!selected.length) {
|
||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||
}
|
||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||
target: target,
|
||
overrides: {
|
||
gpu_indices: [Number(gpu.index)],
|
||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||
},
|
||
label: satGPUDisplayName(gpu),
|
||
})));
|
||
}
|
||
function runNvidiaFabricValidate(target) {
|
||
satAllGPUIndicesForMulti().then(function(indices) {
|
||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||
});
|
||
}
|
||
function runNvidiaValidateSet(target) {
|
||
return loadSatNvidiaGPUs().then(gpus => {
|
||
const selected = satSelectedGPUIndices();
|
||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||
if (!picked.length) {
|
||
throw new Error('Select at least one NVIDIA GPU.');
|
||
}
|
||
if (picked.length === 1) {
|
||
const gpu = picked[0];
|
||
return runSATWithOverrides(target, {
|
||
gpu_indices: [Number(gpu.index)],
|
||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
||
});
|
||
}
|
||
document.getElementById('sat-output').style.display='block';
|
||
document.getElementById('sat-title').textContent = '— ' + target;
|
||
const term = document.getElementById('sat-terminal');
|
||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||
const runNext = (idx) => {
|
||
if (idx >= picked.length) return Promise.resolve();
|
||
const gpu = picked[idx];
|
||
const gpuLabel = satGPUDisplayName(gpu);
|
||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||
return enqueueSATTarget(target, {
|
||
gpu_indices: [Number(gpu.index)],
|
||
display_name: labelBase + ' (' + gpuLabel + ')',
|
||
}).then(d => {
|
||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||
}).then(function() {
|
||
return runNext(idx + 1);
|
||
});
|
||
};
|
||
return runNext(0);
|
||
});
|
||
}
|
||
function runAMDValidateSet() {
|
||
const targets = selectedAMDValidateTargets();
|
||
if (!targets.length) return;
|
||
if (targets.length === 1) return runSAT(targets[0]);
|
||
document.getElementById('sat-output').style.display='block';
|
||
document.getElementById('sat-title').textContent = '— amd';
|
||
const term = document.getElementById('sat-terminal');
|
||
term.textContent = 'Running AMD validate set one by one...\n';
|
||
const labels = satLabels();
|
||
const runNext = (idx) => {
|
||
if (idx >= targets.length) return Promise.resolve();
|
||
const target = targets[idx];
|
||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||
return enqueueSATTarget(target)
|
||
.then(d => {
|
||
return streamSATTask(d.task_id, labels[target], false);
|
||
}).then(function() {
|
||
return runNext(idx + 1);
|
||
});
|
||
};
|
||
return runNext(0);
|
||
}
|
||
function runAllSAT() {
|
||
const cycles = 1;
|
||
const status = document.getElementById('sat-all-status');
|
||
status.textContent = 'Enqueuing...';
|
||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||
const activeTargets = baseTargets.filter(target => {
|
||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||
const btn = document.getElementById('sat-btn-' + target);
|
||
return !(btn && btn.disabled);
|
||
});
|
||
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||
const expanded = [];
|
||
for (let cycle = 0; cycle < cycles; cycle++) {
|
||
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||
}
|
||
const total = expanded.length;
|
||
let enqueued = 0;
|
||
if (!total) {
|
||
status.textContent = 'No tasks selected.';
|
||
return;
|
||
}
|
||
const runNext = (idx) => {
|
||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||
const item = expanded[idx];
|
||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||
return enqueueSATTarget(item.target, item.overrides)
|
||
.then(() => {
|
||
enqueued++;
|
||
return runNext(idx + 1);
|
||
});
|
||
};
|
||
return runNext(0);
|
||
}).catch(err => {
|
||
status.textContent = 'Error: ' + err.message;
|
||
});
|
||
}
|
||
</script>
|
||
<script>
|
||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||
});
|
||
satLoadGPUs();
|
||
function disableSATAMDOptions(reason) {
|
||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||
const cb = document.getElementById(id);
|
||
if (!cb) return;
|
||
cb.disabled = true;
|
||
cb.checked = false;
|
||
cb.title = reason;
|
||
});
|
||
}
|
||
function disableSATCard(id, reason) {
|
||
const btn = document.getElementById('sat-btn-' + id);
|
||
if (!btn) return;
|
||
btn.disabled = true;
|
||
btn.title = reason;
|
||
btn.style.opacity = '0.4';
|
||
const card = btn.closest('.card');
|
||
if (card) {
|
||
let note = card.querySelector('.sat-unavail');
|
||
if (!note) {
|
||
note = document.createElement('p');
|
||
note.className = 'sat-unavail';
|
||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||
const body = card.querySelector('.card-body');
|
||
if (body) body.insertBefore(note, body.firstChild);
|
||
}
|
||
note.textContent = reason;
|
||
}
|
||
}
|
||
</script>`
|
||
}
|
||
|
||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||
unknown := "Audit snapshot not loaded."
|
||
out := validateInventory{
|
||
CPU: unknown,
|
||
Memory: unknown,
|
||
Storage: unknown,
|
||
NVIDIA: unknown,
|
||
AMD: unknown,
|
||
}
|
||
data, err := loadSnapshot(opts.AuditPath)
|
||
if err != nil {
|
||
return out
|
||
}
|
||
var snap schema.HardwareIngestRequest
|
||
if err := json.Unmarshal(data, &snap); err != nil {
|
||
return out
|
||
}
|
||
|
||
cpuCounts := map[string]int{}
|
||
cpuTotal := 0
|
||
for _, cpu := range snap.Hardware.CPUs {
|
||
if cpu.Present != nil && !*cpu.Present {
|
||
continue
|
||
}
|
||
cpuTotal++
|
||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||
}
|
||
|
||
memCounts := map[string]int{}
|
||
memTotal := 0
|
||
for _, dimm := range snap.Hardware.Memory {
|
||
if dimm.Present != nil && !*dimm.Present {
|
||
continue
|
||
}
|
||
memTotal++
|
||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||
}
|
||
|
||
storageCounts := map[string]int{}
|
||
storageTotal := 0
|
||
for _, dev := range snap.Hardware.Storage {
|
||
if dev.Present != nil && !*dev.Present {
|
||
continue
|
||
}
|
||
storageTotal++
|
||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||
}
|
||
|
||
nvidiaCounts := map[string]int{}
|
||
nvidiaTotal := 0
|
||
amdCounts := map[string]int{}
|
||
amdTotal := 0
|
||
for _, dev := range snap.Hardware.PCIeDevices {
|
||
if dev.Present != nil && !*dev.Present {
|
||
continue
|
||
}
|
||
if validateIsVendorGPU(dev, "nvidia") {
|
||
nvidiaTotal++
|
||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||
}
|
||
if validateIsVendorGPU(dev, "amd") {
|
||
amdTotal++
|
||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||
}
|
||
}
|
||
|
||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||
out.NvidiaGPUCount = nvidiaTotal
|
||
out.AMDGPUCount = amdTotal
|
||
return out
|
||
}
|
||
|
||
// renderValidateCardBody builds the body of a validate card from four HTML
// fragments, each wrapped in its own "validate-card-section": the device
// summary and the settings text use the muted colour, the description and the
// command list use the default colour. The fragments are inserted verbatim.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		mutedOpen = `<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">`
		plainOpen = `<div class="validate-card-section"><div style="font-size:13px">`
		closeTags = `</div></div>`
	)
	return strings.Join([]string{
		mutedOpen, devices, closeTags,
		plainOpen, description, closeTags,
		plainOpen, commands, closeTags,
		mutedOpen, settings, closeTags,
	}, "")
}
|
||
|
||
// formatValidateDeviceSummary describes a set of devices as text.
//
// total is the number of devices, models maps a model name to how many devices
// have it, and unit is the singular noun ("CPU", "GPU", ...), which gets an "s"
// appended whenever total is not 1. Model names are HTML-escaped and listed in
// sorted order. With exactly one model the result is "<n> x <model> <unit>";
// otherwise it is "<total> <unit>: <n> x <model>, ...". A total of zero yields
// "0 <unit>s detected.".
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return fmt.Sprintf("0 %ss detected.", unit)
	}

	// Map iteration order is random, so sort the names for stable output.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit
	if total != 1 {
		noun = unit + "s"
	}

	if len(entries) == 1 {
		return entries[0] + " " + noun
	}
	return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
}
|
||
|
||
// addValidateModel increments the counter for a model name in counts. The name
// is trimmed of surrounding whitespace first; a blank name is counted under
// "unknown". counts must be a non-nil map.
func addValidateModel(counts map[string]int, name string) {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		counts[trimmed]++
		return
	}
	counts["unknown"]++
}
|
||
|
||
// validateTrimPtr dereferences an optional string and strips surrounding
// whitespace. A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||
|
||
// validateFirstNonEmpty returns the first argument that is not blank, with
// surrounding whitespace removed. It returns "" when every argument is blank
// or when called without arguments.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||
|
||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||
return false
|
||
}
|
||
switch vendor {
|
||
case "nvidia":
|
||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||
case "amd":
|
||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||
default:
|
||
return false
|
||
}
|
||
}
|
||
|
||
// renderSATCard renders one SAT card as HTML.
//
// id is used to build the Run button's element id ("sat-btn-<id>"), label is
// the card title, runAction is the JavaScript placed in the button's onclick
// attribute, headerActions is optional extra HTML appended after the Run
// button (ignored when blank), and body is the card body HTML. All arguments
// are inserted verbatim, without escaping.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	buttons := fmt.Sprintf(`<button id="sat-btn-%s" class="btn btn-primary btn-sm" onclick="%s">Run</button>`, id, runAction)
	if strings.TrimSpace(headerActions) != "" {
		buttons += headerActions
	}
	return `<div class="card"><div class="card-head card-head-actions"><span>` + label +
		`</span><div class="card-head-buttons">` + buttons +
		`</div></div><div class="card-body validate-card-body">` + body +
		`</div></div>`
}
|