diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index cd5df2c..9b75d97 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1119,6 +1119,22 @@ func renderValidate(opts HandlerOptions) string { `Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, )) + `` + + `
` + + renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody( + inv.NVIDIA, + `Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`, + `all_reduce_perf (NCCL tests)`, + `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + )) + + `
` + + `
` + + renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody( + inv.NVIDIA, + `Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`, + `nvbandwidth`, + `Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.

Only runs in Stress mode. Switch mode above to enable in Run All.

`, + )) + + `
` + `
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody( @@ -1154,6 +1170,8 @@ function satModeChanged() { {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'}, {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'}, {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'}, + {card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'}, + {card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'}, ].forEach(function(item) { const card = document.getElementById(item.card); if (card) { @@ -1164,7 +1182,7 @@ function satModeChanged() { }); } function satLabels() { - return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; + return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; } let satNvidiaGPUsPromise = null; function loadSatNvidiaGPUs() { @@ -1304,7 +1322,16 @@ function runSATWithOverrides(target, overrides) { .then(d => streamSATTask(d.task_id, title, false)); } const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse']; +const nvidiaAllGPUTargets = ['nvidia-interconnect', 'nvidia-bandwidth']; function expandSATTarget(target) { + if (nvidiaAllGPUTargets.indexOf(target) >= 0) { + const selected = satSelectedGPUIndices(); + if (!selected.length) return Promise.reject(new Error('Select at least one NVIDIA GPU.')); + return Promise.resolve([{ + target: target, + overrides: {gpu_indices: selected, display_name: satLabels()[target] || target} + }]); + } if (nvidiaPerGPUTargets.indexOf(target) < 0) { return Promise.resolve([{target: target}]); } @@ -1321,6 +1348,14 @@ function expandSATTarget(target) { label: satGPUDisplayName(gpu) }))); } +function runNvidiaFabricValidate(target) { + const selected = satSelectedGPUIndices(); + if (!selected.length) { + alert('Select at least one NVIDIA GPU.'); + return; + } + return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target}); +} function runNvidiaValidateSet(target) { return loadSatNvidiaGPUs().then(gpus => { const selected = satSelectedGPUIndices(); @@ -1383,8 +1418,8 @@ function runAllSAT() { const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const status = document.getElementById('sat-all-status'); status.textContent = 'Enqueuing...'; - const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse']; - const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','memory','storage','cpu'].concat(selectedAMDValidateTargets()); + const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; + const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const activeTargets = baseTargets.filter(target => { if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; const btn = document.getElementById('sat-btn-' + target); @@ -1423,6 +1458,8 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => { if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected'); if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected'); if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected'); + if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected'); + if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected'); if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected'); if (!gp.amd) disableSATAMDOptions('No AMD GPU detected'); }); @@ -2093,14 +2130,6 @@ func renderBurn() string {
GPU-Specific Tests
-
-
Interconnect / Bandwidth
-
-

Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.

- - -
-