Move power diag tests to validate/stress; fix GPU burn power saturation
- bee-gpu-stress.c: remove per-wave cuCtxSynchronize barrier in both cuBLASLt and PTX hot loops; sync at most once/sec so the GPU queue stays continuously full — eliminates the CPU↔GPU ping-pong that prevented reaching full TDP
- sat_fan_stress.go: default SizeMB 0 (auto = 95% VRAM) instead of hardcoded 64 MB; tiny matrices caused <0.1 ms kernels where CPU re-queue overhead dominated
- pages.go: move nvidia-targeted-power and nvidia-pulse from Burn → Validate stress section alongside nvidia-targeted-stress; these are DCGM pass/fail diagnostics, not sustained burn loads; remove the Power Delivery / Power Budget card from Burn entirely

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
|||||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||||
PauseSec int // pause between the two load phases (default 60)
|
PauseSec int // pause between the two load phases (default 60)
|
||||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
|||||||
if opts.Phase2DurSec <= 0 {
|
if opts.Phase2DurSec <= 0 {
|
||||||
opts.Phase2DurSec = 300
|
opts.Phase2DurSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||||
opts.SizeMB = 64
|
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
|
|||||||
@@ -1036,10 +1036,12 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="card-body validate-profile-body">
|
<div class="card-body validate-profile-body">
|
||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col"></div>
|
<div class="validate-profile-col"></div>
|
||||||
@@ -1054,19 +1056,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.CPU,
|
inv.CPU,
|
||||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
|
`60s in Validate, 30 min in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
inv.Memory,
|
inv.Memory,
|
||||||
`Runs a short RAM validation pass and records memory state around the test.`,
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
`<code>free</code>, <code>memtester</code>`,
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
`No extra settings.`,
|
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
inv.Storage,
|
inv.Storage,
|
||||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
`No extra settings.`,
|
`Short self-test in Validate, extended self-test in Stress.`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
@@ -1091,14 +1093,32 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
||||||
)) +
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
)) +
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA Pulse Test", "runNvidiaValidateSet('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies GPU transient power response using DCGM pulse load. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
@@ -1125,17 +1145,26 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
</style>
|
</style>
|
||||||
<script>
|
<script>
|
||||||
let satES = null;
|
let satES = null;
|
||||||
function satDiagLevel() {
|
function satStressMode() {
|
||||||
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
|
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||||
}
|
}
|
||||||
function satCPUDurationFromDiagLevel() {
|
function satModeChanged() {
|
||||||
const level = satDiagLevel();
|
const stress = satStressMode();
|
||||||
if (level === 1) return 60;
|
[
|
||||||
if (level === 2) return 5 * 60;
|
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||||
return 60 * 60;
|
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||||
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
|
].forEach(function(item) {
|
||||||
|
const card = document.getElementById(item.card);
|
||||||
|
if (card) {
|
||||||
|
card.style.opacity = stress ? '1' : '0.5';
|
||||||
|
const hint = document.getElementById(item.hint);
|
||||||
|
if (hint) hint.style.display = stress ? 'none' : '';
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
}
|
}
|
||||||
let satNvidiaGPUsPromise = null;
|
let satNvidiaGPUsPromise = null;
|
||||||
function loadSatNvidiaGPUs() {
|
function loadSatNvidiaGPUs() {
|
||||||
@@ -1211,9 +1240,8 @@ function satRequestBody(target, overrides) {
|
|||||||
const body = {};
|
const body = {};
|
||||||
const labels = satLabels();
|
const labels = satLabels();
|
||||||
body.display_name = labels[target] || ('Validate ' + target);
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
body.stress_mode = satStressMode();
|
||||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
|
||||||
if (overrides) {
|
if (overrides) {
|
||||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||||
}
|
}
|
||||||
@@ -1275,8 +1303,9 @@ function runSATWithOverrides(target, overrides) {
|
|||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => streamSATTask(d.task_id, title, false));
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
}
|
}
|
||||||
|
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
function expandSATTarget(target) {
|
function expandSATTarget(target) {
|
||||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
return Promise.resolve([{target: target}]);
|
return Promise.resolve([{target: target}]);
|
||||||
}
|
}
|
||||||
const selected = satSelectedGPUIndices();
|
const selected = satSelectedGPUIndices();
|
||||||
@@ -1354,8 +1383,10 @@ function runAllSAT() {
|
|||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
const activeTargets = baseTargets.filter(target => {
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
return !(btn && btn.disabled);
|
return !(btn && btn.disabled);
|
||||||
});
|
});
|
||||||
@@ -1390,6 +1421,8 @@ function runAllSAT() {
|
|||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
});
|
});
|
||||||
@@ -1587,6 +1620,7 @@ type benchmarkHistoryColumn struct {
|
|||||||
label string
|
label string
|
||||||
name string
|
name string
|
||||||
index int
|
index int
|
||||||
|
parallel bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkHistoryCell struct {
|
type benchmarkHistoryCell struct {
|
||||||
@@ -1894,30 +1928,44 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
cells: make(map[string]benchmarkHistoryCell),
|
cells: make(map[string]benchmarkHistoryCell),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count how many GPUs of each model appear in this run (for the label).
|
if result.ParallelGPUs {
|
||||||
|
// All GPUs ran simultaneously — one column per server, score = avg composite.
|
||||||
gpuModelCount := make(map[string]int)
|
gpuModelCount := make(map[string]int)
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||||
}
|
}
|
||||||
|
scoreSum := make(map[string]float64)
|
||||||
// Track best composite score per column key within this run.
|
scoreCnt := make(map[string]int)
|
||||||
runBest := make(map[string]float64)
|
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
|
key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
|
||||||
|
scoreSum[key] += gpu.Scores.CompositeScore
|
||||||
|
scoreCnt[key]++
|
||||||
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||||
columnByKey[key] = benchmarkHistoryColumn{
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
key: key,
|
key: key,
|
||||||
label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
|
label: benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
|
||||||
|
name: strings.TrimSpace(gpu.Name),
|
||||||
|
index: -1,
|
||||||
|
parallel: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, sum := range scoreSum {
|
||||||
|
run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Each GPU ran independently — one column per GPU index.
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
|
||||||
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
|
key: key,
|
||||||
|
label: benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
|
||||||
name: strings.TrimSpace(gpu.Name),
|
name: strings.TrimSpace(gpu.Name),
|
||||||
index: gpu.Index,
|
index: gpu.Index,
|
||||||
|
parallel: false,
|
||||||
}
|
}
|
||||||
if gpu.Scores.CompositeScore > runBest[key] {
|
run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
|
||||||
runBest[key] = gpu.Scores.CompositeScore
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for key, score := range runBest {
|
|
||||||
run.cells[key] = benchmarkHistoryCell{score: score, present: true}
|
|
||||||
}
|
|
||||||
runs = append(runs, run)
|
runs = append(runs, run)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1925,13 +1973,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
for _, col := range columnByKey {
|
for _, col := range columnByKey {
|
||||||
columns = append(columns, col)
|
columns = append(columns, col)
|
||||||
}
|
}
|
||||||
|
// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
|
||||||
sort.Slice(columns, func(i, j int) bool {
|
sort.Slice(columns, func(i, j int) bool {
|
||||||
|
if columns[i].parallel != columns[j].parallel {
|
||||||
|
return !columns[i].parallel // sequential first
|
||||||
|
}
|
||||||
|
if columns[i].parallel {
|
||||||
li := strings.ToLower(columns[i].label)
|
li := strings.ToLower(columns[i].label)
|
||||||
lj := strings.ToLower(columns[j].label)
|
lj := strings.ToLower(columns[j].label)
|
||||||
if li != lj {
|
if li != lj {
|
||||||
return li < lj
|
return li < lj
|
||||||
}
|
}
|
||||||
return columns[i].key < columns[j].key
|
return columns[i].key < columns[j].key
|
||||||
|
}
|
||||||
|
// Sequential: sort by GPU index, then name.
|
||||||
|
if columns[i].index != columns[j].index {
|
||||||
|
return columns[i].index < columns[j].index
|
||||||
|
}
|
||||||
|
return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
|
||||||
})
|
})
|
||||||
sort.Slice(runs, func(i, j int) bool {
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
@@ -1939,25 +1998,28 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
return columns, runs
|
return columns, runs
|
||||||
}
|
}
|
||||||
|
|
||||||
// benchmarkHistoryColumnKey groups results by server model + GPU model so that
|
// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
|
||||||
// runs on the same hardware produce one column regardless of individual GPU index.
|
func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
|
||||||
func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
|
gpuName = strings.TrimSpace(gpuName)
|
||||||
return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
|
if gpuName == "" {
|
||||||
|
gpuName = "Unknown GPU"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU #%d — %s", index, gpuName)
|
||||||
}
|
}
|
||||||
|
|
||||||
// benchmarkHistoryColumnLabel formats the column header as
|
// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
|
||||||
// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
|
// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
|
||||||
func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
|
func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
|
||||||
serverModel = strings.TrimSpace(serverModel)
|
serverModel = strings.TrimSpace(serverModel)
|
||||||
gpuName = strings.TrimSpace(gpuName)
|
gpuName = strings.TrimSpace(gpuName)
|
||||||
if gpuName == "" {
|
if gpuName == "" {
|
||||||
gpuName = "Unknown GPU"
|
gpuName = "Unknown GPU"
|
||||||
}
|
}
|
||||||
gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
|
gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
|
||||||
if serverModel == "" {
|
if serverModel == "" {
|
||||||
return gpuPart
|
return gpuPart
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
|
return fmt.Sprintf("%s — %s", serverModel, gpuPart)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
@@ -2031,15 +2093,6 @@ func renderBurn() string {
|
|||||||
|
|
||||||
<div class="burn-section">GPU-Specific Tests</div>
|
<div class="burn-section">GPU-Specific Tests</div>
|
||||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
<div class="card burn-card">
|
|
||||||
<div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
|
|
||||||
<div class="card-body burn-card-body">
|
|
||||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
|
|
||||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="card burn-card">
|
<div class="card burn-card">
|
||||||
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
|
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
|
||||||
<div class="card-body burn-card-body">
|
<div class="card-body burn-card-body">
|
||||||
@@ -2299,8 +2352,6 @@ function runAllBurnTasks() {
|
|||||||
const status = document.getElementById('burn-all-status');
|
const status = document.getElementById('burn-all-status');
|
||||||
const all = [
|
const all = [
|
||||||
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||||
{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
|
|
||||||
{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
|
|
||||||
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
|
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
|
||||||
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
|
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
|
||||||
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||||
@@ -2317,8 +2368,6 @@ function runAllBurnTasks() {
|
|||||||
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||||
const map = {
|
const map = {
|
||||||
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||||
'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
|
|
||||||
'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
|
|
||||||
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
|
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
|
||||||
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
|
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
|
||||||
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||||
|
|||||||
@@ -36,7 +36,6 @@ typedef void *CUstream;
|
|||||||
#define MAX_CUBLAS_PROFILES 5
|
#define MAX_CUBLAS_PROFILES 5
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
#define STRESS_LAUNCH_DEPTH 8
|
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int launches_per_wave = 0;
|
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -419,12 +417,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
|
|
||||||
double start = now_seconds();
|
double deadline = now_seconds() + (double)seconds;
|
||||||
double deadline = start + (double)seconds;
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
launches_per_wave = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
|
||||||
int launched_this_batch = 0;
|
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
if (!check_rc(api,
|
if (!check_rc(api,
|
||||||
@@ -442,21 +438,21 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
NULL))) {
|
NULL))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
launches_per_wave++;
|
launched++;
|
||||||
launched_this_batch++;
|
iterations++;
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (launches_per_wave <= 0) {
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
double now = now_seconds();
|
||||||
|
if (now >= next_sync || now >= deadline) {
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
iterations += (unsigned long)launches_per_wave;
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
api->cuCtxSynchronize();
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
bytes_per_stream[0] / (1024u * 1024u),
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
int wave_launches = 0;
|
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
mp_count,
|
mp_count,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
@@ -1260,11 +1253,15 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Keep the GPU queue continuously full by submitting kernels without
|
||||||
|
* synchronizing after every wave. A sync barrier after each small batch
|
||||||
|
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||||
|
* especially when individual kernels are short. Instead we sync at most
|
||||||
|
* once per second (for error detection) and once at the very end. */
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
wave_launches = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
|
||||||
int launched_this_batch = 0;
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
continue;
|
continue;
|
||||||
@@ -1284,16 +1281,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
prepared[i].iterations++;
|
prepared[i].iterations++;
|
||||||
report->iterations++;
|
report->iterations++;
|
||||||
wave_launches++;
|
launched++;
|
||||||
launched_this_batch++;
|
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (wave_launches <= 0) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
double now = now_seconds();
|
||||||
|
if (now >= next_sync || now >= deadline) {
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
@@ -1303,7 +1297,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
/* Final drain — ensure all queued work finishes before we read results. */
|
||||||
|
cuda->cuCtxSynchronize();
|
||||||
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
|
|||||||
Reference in New Issue
Block a user