// Package webui renders the benchmark page of the bee audit web UI.
package webui
|
||
|
||
import (
|
||
"encoding/json"
|
||
"fmt"
|
||
"html"
|
||
"os"
|
||
"path/filepath"
|
||
"sort"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
|
||
"bee/audit/internal/app"
|
||
"bee/audit/internal/platform"
|
||
)
|
||
|
||
// benchmarkHistoryRun is one saved performance-benchmark run, loaded from a
// result.json bundle and reduced to the fields the history table renders.
type benchmarkHistoryRun struct {
	generatedAt   time.Time       // run timestamp from the result bundle; used for newest-first sorting
	displayTime   string          // generatedAt pre-formatted in local time for display
	gpuScores     map[int]float64 // composite score keyed by GPU index
	gpuStatuses   map[int]string  // per-GPU status string keyed by GPU index (e.g. "OK", "FAILED")
	overallStatus string          // run-level status; empty renders as "OK"
}
|
||
|
||
func renderBenchmark(opts HandlerOptions) string {
|
||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||
|
||
<div class="grid2">
|
||
<div class="card">
|
||
<div class="card-head">Benchmark Setup</div>
|
||
<div class="card-body">
|
||
<div class="form-row">
|
||
<label>Profile</label>
|
||
<select id="benchmark-profile">
|
||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||
</select>
|
||
</div>
|
||
<div class="form-row">
|
||
<label>GPU Selection</label>
|
||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||
</div>
|
||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||
</div>
|
||
</div>
|
||
<label class="benchmark-cb-row">
|
||
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||
<span>Sequential — one GPU at a time</span>
|
||
</label>
|
||
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||
<span>Parallel — all selected GPUs simultaneously</span>
|
||
</label>
|
||
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||
</label>
|
||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||
</div>
|
||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||
</div>
|
||
</div>
|
||
|
||
<div class="card">
|
||
<div class="card-head">Method Split</div>
|
||
<div class="card-body">
|
||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||
<table>
|
||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||
</table>
|
||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||
|
||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||
</div>
|
||
|
||
<style>
|
||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||
</style>
|
||
|
||
<script>
|
||
let benchmarkES = null;
|
||
function benchmarkTaskIDs(payload) {
|
||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||
if (payload && payload.task_id) return [payload.task_id];
|
||
return [];
|
||
}
|
||
function benchmarkSelectedGPUIndices() {
|
||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||
.filter(function(el) { return el.checked && !el.disabled; })
|
||
.map(function(el) { return parseInt(el.value, 10); })
|
||
.filter(function(v) { return !Number.isNaN(v); })
|
||
.sort(function(a, b) { return a - b; });
|
||
}
|
||
function benchmarkMode() {
|
||
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||
return el ? el.value : 'sequential';
|
||
}
|
||
function benchmarkUpdateSelectionNote() {
|
||
const selected = benchmarkSelectedGPUIndices();
|
||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||
const note = document.getElementById('benchmark-selection-note');
|
||
if (!selected.length) {
|
||
perfBtn.disabled = true;
|
||
fitBtn.disabled = true;
|
||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||
return;
|
||
}
|
||
perfBtn.disabled = false;
|
||
fitBtn.disabled = false;
|
||
const mode = benchmarkMode();
|
||
if (mode === 'ramp-up') {
|
||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||
} else if (mode === 'parallel') {
|
||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||
} else {
|
||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||
}
|
||
}
|
||
function benchmarkRenderGPUList(gpus) {
|
||
const root = document.getElementById('benchmark-gpu-list');
|
||
if (!gpus || !gpus.length) {
|
||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||
benchmarkUpdateSelectionNote();
|
||
return;
|
||
}
|
||
root.innerHTML = gpus.map(function(gpu) {
|
||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||
return '<label class="benchmark-gpu-row">'
|
||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||
+ '</label>';
|
||
}).join('');
|
||
benchmarkApplyMultiGPUState(gpus.length);
|
||
benchmarkUpdateSelectionNote();
|
||
}
|
||
function benchmarkApplyMultiGPUState(gpuCount) {
|
||
var multiValues = ['parallel', 'ramp-up'];
|
||
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||
radios.forEach(function(el) {
|
||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||
if (gpuCount < 2 && isMulti) {
|
||
el.disabled = true;
|
||
if (el.checked) {
|
||
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||
if (seq) seq.checked = true;
|
||
}
|
||
var label = el.closest('label');
|
||
if (label) label.style.opacity = '0.4';
|
||
} else {
|
||
el.disabled = false;
|
||
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||
var label = el.closest('label');
|
||
if (label) label.style.opacity = '';
|
||
}
|
||
});
|
||
benchmarkUpdateSelectionNote();
|
||
}
|
||
function benchmarkLoadGPUs() {
|
||
const status = document.getElementById('benchmark-run-status');
|
||
status.textContent = '';
|
||
fetch('/api/gpu/nvidia').then(function(r) {
|
||
return r.json().then(function(body) {
|
||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||
return body;
|
||
});
|
||
}).then(function(gpus) {
|
||
benchmarkRenderGPUList(gpus);
|
||
}).catch(function(err) {
|
||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||
benchmarkUpdateSelectionNote();
|
||
});
|
||
}
|
||
function benchmarkSelectAll() {
|
||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||
benchmarkUpdateSelectionNote();
|
||
}
|
||
function benchmarkSelectNone() {
|
||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||
benchmarkUpdateSelectionNote();
|
||
}
|
||
function runNvidiaBenchmark(kind) {
|
||
const selected = benchmarkSelectedGPUIndices();
|
||
const status = document.getElementById('benchmark-run-status');
|
||
if (!selected.length) {
|
||
status.textContent = 'Select at least one GPU.';
|
||
return;
|
||
}
|
||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||
const mode = benchmarkMode();
|
||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||
if (kind === 'power-fit' && mode === 'parallel') {
|
||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||
return;
|
||
}
|
||
const body = {
|
||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||
gpu_indices: selected,
|
||
run_nccl: kind === 'performance' && selected.length > 1,
|
||
parallel_gpus: parallelGPUs,
|
||
ramp_up: rampUp,
|
||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||
};
|
||
document.getElementById('benchmark-output').style.display = 'block';
|
||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||
const term = document.getElementById('benchmark-terminal');
|
||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||
status.textContent = 'Queueing...';
|
||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||
fetch(endpoint, {
|
||
method: 'POST',
|
||
headers: {'Content-Type':'application/json'},
|
||
body: JSON.stringify(body)
|
||
}).then(function(r) {
|
||
return r.json().then(function(payload) {
|
||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||
return payload;
|
||
});
|
||
}).then(function(d) {
|
||
const taskIds = benchmarkTaskIDs(d);
|
||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||
const streamNext = function(idx, failures) {
|
||
if (idx >= taskIds.length) {
|
||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||
return;
|
||
}
|
||
const taskId = taskIds[idx];
|
||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||
benchmarkES.addEventListener('done', function(e) {
|
||
benchmarkES.close();
|
||
benchmarkES = null;
|
||
if (e.data) failures += 1;
|
||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||
term.scrollTop = term.scrollHeight;
|
||
const isLast = (idx + 1 >= taskIds.length);
|
||
streamNext(idx + 1, failures);
|
||
if (isLast) { benchmarkRefreshResults(); }
|
||
});
|
||
benchmarkES.onerror = function() {
|
||
if (benchmarkES) {
|
||
benchmarkES.close();
|
||
benchmarkES = null;
|
||
}
|
||
term.textContent += '\nERROR: stream disconnected.\n';
|
||
term.scrollTop = term.scrollHeight;
|
||
streamNext(idx + 1, failures + 1);
|
||
};
|
||
};
|
||
streamNext(0, 0);
|
||
}).catch(function(err) {
|
||
status.textContent = 'Error.';
|
||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||
});
|
||
}
|
||
function benchmarkRenderAutotuneStatus(payload) {
|
||
const el = document.getElementById('benchmark-autotune-status');
|
||
if (!el) return;
|
||
if (!payload || !payload.configured || !payload.config) {
|
||
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||
return;
|
||
}
|
||
const cfg = payload.config || {};
|
||
const decision = payload.decision || {};
|
||
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||
}
|
||
function loadBenchmarkAutotuneStatus() {
|
||
fetch('/api/bee-bench/nvidia/autotune/status')
|
||
.then(function(r) {
|
||
return r.json().then(function(body) {
|
||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||
return body;
|
||
});
|
||
})
|
||
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||
.catch(function(err) {
|
||
const el = document.getElementById('benchmark-autotune-status');
|
||
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||
});
|
||
}
|
||
function runBenchmarkAutotune() {
|
||
const selected = benchmarkSelectedGPUIndices();
|
||
const status = document.getElementById('benchmark-run-status');
|
||
const term = document.getElementById('benchmark-terminal');
|
||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||
document.getElementById('benchmark-output').style.display = 'block';
|
||
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||
status.textContent = 'Queueing autotune...';
|
||
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||
method: 'POST',
|
||
headers: {'Content-Type':'application/json'},
|
||
body: JSON.stringify({
|
||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||
gpu_indices: selected
|
||
})
|
||
}).then(function(r) {
|
||
return r.json().then(function(payload) {
|
||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||
return payload;
|
||
});
|
||
}).then(function(d) {
|
||
const taskIds = benchmarkTaskIDs(d);
|
||
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||
const taskId = taskIds[0];
|
||
status.textContent = 'Autotune queued: ' + taskId;
|
||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||
benchmarkES.addEventListener('done', function(e) {
|
||
if (benchmarkES) {
|
||
benchmarkES.close();
|
||
benchmarkES = null;
|
||
}
|
||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||
loadBenchmarkAutotuneStatus();
|
||
});
|
||
}).catch(function(err) {
|
||
status.textContent = 'Autotune error.';
|
||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||
});
|
||
}
|
||
benchmarkLoadGPUs();
|
||
loadBenchmarkAutotuneStatus();
|
||
function benchmarkRefreshResults() {
|
||
fetch('/api/benchmark/results')
|
||
.then(function(r) { return r.text(); })
|
||
.then(function(html) {
|
||
const el = document.getElementById('benchmark-results-section');
|
||
if (el) el.innerHTML = html;
|
||
})
|
||
.catch(function() {});
|
||
}
|
||
</script>`
|
||
}
|
||
|
||
func renderBenchmarkResultsCard(exportDir string) string {
|
||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||
perf := renderBenchmarkResultsCardFromRuns(
|
||
"Perf Results",
|
||
"Composite score by saved benchmark run and GPU.",
|
||
"No saved performance benchmark runs yet.",
|
||
maxIdx,
|
||
runs,
|
||
)
|
||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||
return perf + "\n" + power
|
||
}
|
||
|
||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||
if len(runs) == 0 {
|
||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||
}
|
||
var b strings.Builder
|
||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||
if strings.TrimSpace(description) != "" {
|
||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||
}
|
||
b.WriteString(`<div style="overflow-x:auto">`)
|
||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||
for i := 0; i <= maxGPUIndex; i++ {
|
||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||
}
|
||
b.WriteString(`</tr></thead><tbody>`)
|
||
for i, run := range runs {
|
||
b.WriteString(`<tr>`)
|
||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||
overallColor := "var(--ok)"
|
||
overallLabel := run.overallStatus
|
||
if overallLabel == "" {
|
||
overallLabel = "OK"
|
||
}
|
||
if overallLabel == "FAILED" {
|
||
overallColor = "var(--crit-fg,#9f3a38)"
|
||
} else if overallLabel != "OK" {
|
||
overallColor = "var(--warn)"
|
||
}
|
||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||
score, ok := run.gpuScores[idx]
|
||
if !ok {
|
||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||
continue
|
||
}
|
||
gpuStatus := run.gpuStatuses[idx]
|
||
scoreColor := ""
|
||
switch gpuStatus {
|
||
case "FAILED":
|
||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||
case "WARNING", "PARTIAL":
|
||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||
case "", "OK":
|
||
default:
|
||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||
}
|
||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||
}
|
||
b.WriteString(`</tr>`)
|
||
}
|
||
b.WriteString(`</tbody></table></div></div></div>`)
|
||
return b.String()
|
||
}
|
||
|
||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||
baseDir := app.DefaultBeeBenchPerfDir
|
||
if strings.TrimSpace(exportDir) != "" {
|
||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||
}
|
||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||
if err != nil || len(paths) == 0 {
|
||
return -1, nil
|
||
}
|
||
sort.Strings(paths)
|
||
return loadBenchmarkHistoryFromPaths(paths)
|
||
}
|
||
|
||
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||
maxGPUIndex := -1
|
||
for _, path := range paths {
|
||
raw, err := os.ReadFile(path)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
var result platform.NvidiaBenchmarkResult
|
||
if err := json.Unmarshal(raw, &result); err != nil {
|
||
continue
|
||
}
|
||
run := benchmarkHistoryRun{
|
||
generatedAt: result.GeneratedAt,
|
||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||
gpuScores: make(map[int]float64),
|
||
gpuStatuses: make(map[int]string),
|
||
overallStatus: result.OverallStatus,
|
||
}
|
||
for _, gpu := range result.GPUs {
|
||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||
if gpu.Index > maxGPUIndex {
|
||
maxGPUIndex = gpu.Index
|
||
}
|
||
}
|
||
runs = append(runs, run)
|
||
}
|
||
sort.Slice(runs, func(i, j int) bool {
|
||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||
})
|
||
return maxGPUIndex, runs
|
||
}
|
||
|
||
// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
// card from saved power-*/result.json bundles. The latest run gets a full
// per-GPU table (nominal/single-card/multi-GPU power limits and observed
// power); older runs are summarized in a collapsible <details> list. Returns
// an empty-state card when no bundles exist.
func renderPowerBenchmarkResultsCard(exportDir string) string {
	// Resolve the bundle directory: exportDir override or the default.
	baseDir := app.DefaultBeeBenchPowerDir
	if strings.TrimSpace(exportDir) != "" {
		baseDir = filepath.Join(exportDir, "bee-bench", "power")
	}
	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
	if err != nil || len(paths) == 0 {
		return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
	}
	sort.Strings(paths)

	// Local view of one parsed run; displayTime is pre-formatted local time.
	type powerRun struct {
		generatedAt time.Time
		displayTime string
		result      platform.NvidiaPowerBenchResult
	}
	var runs []powerRun
	for _, path := range paths {
		raw, err := os.ReadFile(path)
		if err != nil {
			// Best-effort: skip unreadable bundles.
			continue
		}
		var r platform.NvidiaPowerBenchResult
		if err := json.Unmarshal(raw, &r); err != nil {
			// Best-effort: skip malformed JSON.
			continue
		}
		runs = append(runs, powerRun{
			generatedAt: r.GeneratedAt,
			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
			result:      r,
		})
	}
	// Newest first; runs[0] becomes the "latest" run below.
	sort.Slice(runs, func(i, j int) bool {
		return runs[i].generatedAt.After(runs[j].generatedAt)
	})

	var b strings.Builder
	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)

	// Header line: latest run time, optional hostname, optional status badge.
	latest := runs[0].result
	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
	if latest.Hostname != "" {
		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
	}
	if latest.OverallStatus != "" {
		statusColor := "var(--ok)"
		if latest.OverallStatus != "OK" {
			statusColor = "var(--warn)"
		}
		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
	}
	b.WriteString(`</p>`)

	// Per-GPU table for the latest run only.
	if len(latest.GPUs) > 0 {
		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
		// NOTE(review): the "P95 Observed W" column is populated from
		// MaxObservedPowerW below — confirm whether that field holds a P95
		// value or a true maximum.
		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
		b.WriteString(`</tr></thead><tbody>`)
		for _, gpu := range latest.GPUs {
			// Effective final limit: stable (multi-GPU) limit if set, else the
			// applied single-card limit.
			finalLimitW := gpu.StablePowerLimitW
			if finalLimitW <= 0 {
				finalLimitW = gpu.AppliedPowerLimitW
			}
			// Derated when flagged explicitly or when the final limit sits more
			// than 1 W below the default limit (1 W slack avoids float noise).
			derated := gpu.Derated ||
				(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
			rowStyle := ""
			finalStyle := ""
			if derated {
				// Highlight derated rows and the multi-GPU limit cell in amber.
				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
				finalStyle = ` style="color:#e6a000;font-weight:600"`
			}
			// Status: empty → "OK" (green); FAILED → red; anything else → amber.
			statusLabel := gpu.Status
			if statusLabel == "" {
				statusLabel = "OK"
			}
			statusColor := "var(--ok)"
			if statusLabel == "FAILED" {
				statusColor = "var(--crit-fg,#9f3a38)"
			} else if statusLabel != "OK" {
				statusColor = "var(--warn)"
			}
			// Non-positive wattages render as "-".
			nominalStr := "-"
			if gpu.DefaultPowerLimitW > 0 {
				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
			}
			singleStr := "-"
			if gpu.AppliedPowerLimitW > 0 {
				singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
			}
			multiStr := "-"
			if gpu.StablePowerLimitW > 0 {
				multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
			}
			p95Str := "-"
			if gpu.MaxObservedPowerW > 0 {
				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
			}
			b.WriteString(`<tr` + rowStyle + `>`)
			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
			b.WriteString(`<td>` + nominalStr + `</td>`)
			b.WriteString(`<td>` + singleStr + `</td>`)
			b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
			b.WriteString(`<td>` + p95Str + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div>`)
	}

	// Collapsible summary of all runs (including the latest) when there is
	// more than one.
	if len(runs) > 1 {
		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
		for i, run := range runs {
			statusColor := "var(--ok)"
			if run.result.OverallStatus != "OK" {
				statusColor = "var(--warn)"
			}
			b.WriteString(`<tr>`)
			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div></details>`)
	}

	b.WriteString(`</div></div>`)
	return b.String()
}
|