Move NCCL interconnect and NVBandwidth tests to validate/stress

nvidia-interconnect (NCCL all_reduce_perf) and nvidia-bandwidth
(NVBandwidth) verify fabric connectivity and bandwidth — they are
not sustained burn loads. Move both from the Burn section to the
Validate section under the stress-mode toggle, alongside the other
DCGM diagnostic tests moved in the previous commit.

- Add sat-card-nvidia-interconnect and sat-card-nvidia-bandwidth
  validate cards (stress-only, all selected GPUs at once)
- Add runNvidiaFabricValidate() for all-GPU-at-once dispatch
- Add nvidiaAllGPUTargets handling in expandSATTarget/runAllSAT
- Remove Interconnect / Bandwidth card from Burn section
- Remove nvidia-interconnect and nvidia-bandwidth from runAllBurnTasks
  and the gpu/tools availability map

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 00:16:42 +03:00
parent d1a22d782d
commit b9be93c213

View File

@@ -1119,6 +1119,22 @@ func renderValidate(opts HandlerOptions) string {
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-interconnect">` +
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
inv.NVIDIA,
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
`<code>all_reduce_perf</code> (NCCL tests)`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`<div id="sat-card-nvidia-bandwidth">` +
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
inv.NVIDIA,
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
`<code>nvbandwidth</code>`,
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
@@ -1154,6 +1170,8 @@ function satModeChanged() {
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
@@ -1164,7 +1182,7 @@ function satModeChanged() {
});
}
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {
@@ -1304,7 +1322,16 @@ function runSATWithOverrides(target, overrides) {
.then(d => streamSATTask(d.task_id, title, false));
}
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const nvidiaAllGPUTargets = ['nvidia-interconnect', 'nvidia-bandwidth'];
function expandSATTarget(target) {
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
const selected = satSelectedGPUIndices();
if (!selected.length) return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
return Promise.resolve([{
target: target,
overrides: {gpu_indices: selected, display_name: satLabels()[target] || target}
}]);
}
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
return Promise.resolve([{target: target}]);
}
@@ -1321,6 +1348,14 @@ function expandSATTarget(target) {
label: satGPUDisplayName(gpu)
})));
}
function runNvidiaFabricValidate(target) {
const selected = satSelectedGPUIndices();
if (!selected.length) {
alert('Select at least one NVIDIA GPU.');
return;
}
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
}
function runNvidiaValidateSet(target) {
return loadSatNvidiaGPUs().then(gpus => {
const selected = satSelectedGPUIndices();
@@ -1383,8 +1418,8 @@ function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
const btn = document.getElementById('sat-btn-' + target);
@@ -1423,6 +1458,8 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
});
@@ -2093,14 +2130,6 @@ func renderBurn() string {
<div class="burn-section">GPU-Specific Tests</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card burn-card">
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
</div>
</div>
</div>
<div id="bi-output" style="display:none;margin-top:16px" class="card">
@@ -2352,8 +2381,6 @@ function runAllBurnTasks() {
const status = document.getElementById('burn-all-status');
const all = [
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
@@ -2368,8 +2395,6 @@ function runAllBurnTasks() {
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
const map = {
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},