Files
bee/audit/internal/webui/page_benchmark.go

614 lines
26 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package webui
import (
"encoding/json"
"fmt"
"html"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
)
type benchmarkHistoryRun struct {
generatedAt time.Time
displayTime string
gpuScores map[int]float64
gpuStatuses map[int]string
overallStatus string
}
func renderBenchmark(opts HandlerOptions) string {
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="grid2">
<div class="card">
<div class="card-head">Benchmark Setup</div>
<div class="card-body">
<div class="form-row">
<label>Profile</label>
<select id="benchmark-profile">
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
</select>
</div>
<div class="form-row">
<label>GPU Selection</label>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
</div>
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
</div>
<label class="benchmark-cb-row">
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
<span>Sequential — one GPU at a time</span>
</label>
<label class="benchmark-cb-row" id="benchmark-parallel-label">
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
<span>Parallel — all selected GPUs simultaneously</span>
</label>
<label class="benchmark-cb-row" id="benchmark-ramp-label">
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
</label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
</div>
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
</div>
</div>
<div class="card">
<div class="card-head">Method Split</div>
<div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table>
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
</table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 48 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div>
</div>
</div>
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
</div>
<style>
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
</style>
<script>
let benchmarkES = null;
function benchmarkTaskIDs(payload) {
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
if (payload && payload.task_id) return [payload.task_id];
return [];
}
function benchmarkSelectedGPUIndices() {
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
.filter(function(el) { return el.checked && !el.disabled; })
.map(function(el) { return parseInt(el.value, 10); })
.filter(function(v) { return !Number.isNaN(v); })
.sort(function(a, b) { return a - b; });
}
function benchmarkMode() {
const el = document.querySelector('input[name="benchmark-mode"]:checked');
return el ? el.value : 'sequential';
}
function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices();
const perfBtn = document.getElementById('benchmark-run-performance-btn');
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
const note = document.getElementById('benchmark-selection-note');
if (!selected.length) {
perfBtn.disabled = true;
fitBtn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return;
}
perfBtn.disabled = false;
fitBtn.disabled = false;
const mode = benchmarkMode();
if (mode === 'ramp-up') {
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
} else if (mode === 'parallel') {
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
} else {
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
}
}
function benchmarkRenderGPUList(gpus) {
const root = document.getElementById('benchmark-gpu-list');
if (!gpus || !gpus.length) {
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
benchmarkUpdateSelectionNote();
return;
}
root.innerHTML = gpus.map(function(gpu) {
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
return '<label class="benchmark-gpu-row">'
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
+ '</label>';
}).join('');
benchmarkApplyMultiGPUState(gpus.length);
benchmarkUpdateSelectionNote();
}
function benchmarkApplyMultiGPUState(gpuCount) {
var multiValues = ['parallel', 'ramp-up'];
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
radios.forEach(function(el) {
var isMulti = multiValues.indexOf(el.value) >= 0;
if (gpuCount < 2 && isMulti) {
el.disabled = true;
if (el.checked) {
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
if (seq) seq.checked = true;
}
var label = el.closest('label');
if (label) label.style.opacity = '0.4';
} else {
el.disabled = false;
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
var label = el.closest('label');
if (label) label.style.opacity = '';
}
});
benchmarkUpdateSelectionNote();
}
function benchmarkLoadGPUs() {
const status = document.getElementById('benchmark-run-status');
status.textContent = '';
fetch('/api/gpu/nvidia').then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
}).then(function(gpus) {
benchmarkRenderGPUList(gpus);
}).catch(function(err) {
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
benchmarkUpdateSelectionNote();
});
}
function benchmarkSelectAll() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
benchmarkUpdateSelectionNote();
}
function benchmarkSelectNone() {
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
benchmarkUpdateSelectionNote();
}
function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
status.textContent = 'Select at least one GPU.';
return;
}
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode();
const rampUp = mode === 'ramp-up' && selected.length > 1;
const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No benchmark task was queued.');
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
return;
}
const taskId = taskIds[idx];
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
});
benchmarkES.onerror = function() {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures + 1);
};
};
streamNext(0, 0);
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
function benchmarkRenderAutotuneStatus(payload) {
const el = document.getElementById('benchmark-autotune-status');
if (!el) return;
if (!payload || !payload.configured || !payload.config) {
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
return;
}
const cfg = payload.config || {};
const decision = payload.decision || {};
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
}
function loadBenchmarkAutotuneStatus() {
fetch('/api/bee-bench/nvidia/autotune/status')
.then(function(r) {
return r.json().then(function(body) {
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
return body;
});
})
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
.catch(function(err) {
const el = document.getElementById('benchmark-autotune-status');
if (el) el.textContent = 'Autotune status error: ' + err.message;
});
}
function runBenchmarkAutotune() {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
const term = document.getElementById('benchmark-terminal');
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
term.textContent = 'Enqueuing benchmark autotune...\n';
status.textContent = 'Queueing autotune...';
fetch('/api/bee-bench/nvidia/autotune/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify({
profile: document.getElementById('benchmark-profile').value || 'standard',
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
gpu_indices: selected
})
}).then(function(r) {
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No autotune task was queued.');
const taskId = taskIds[0];
status.textContent = 'Autotune queued: ' + taskId;
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
benchmarkES.addEventListener('done', function(e) {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
loadBenchmarkAutotuneStatus();
});
}).catch(function(err) {
status.textContent = 'Autotune error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
benchmarkLoadGPUs();
loadBenchmarkAutotuneStatus();
function benchmarkRefreshResults() {
fetch('/api/benchmark/results')
.then(function(r) { return r.text(); })
.then(function(html) {
const el = document.getElementById('benchmark-results-section');
if (el) el.innerHTML = html;
})
.catch(function() {});
}
</script>`
}
func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir)
perf := renderBenchmarkResultsCardFromRuns(
"Perf Results",
"Composite score by saved benchmark run and GPU.",
"No saved performance benchmark runs yet.",
maxIdx,
runs,
)
power := renderPowerBenchmarkResultsCard(exportDir)
return perf + "\n" + power
}
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
if len(runs) == 0 {
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
if strings.TrimSpace(description) != "" {
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
}
b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
}
b.WriteString(`</tr></thead><tbody>`)
for i, run := range runs {
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
overallColor := "var(--ok)"
overallLabel := run.overallStatus
if overallLabel == "" {
overallLabel = "OK"
}
if overallLabel == "FAILED" {
overallColor = "var(--crit-fg,#9f3a38)"
} else if overallLabel != "OK" {
overallColor = "var(--warn)"
}
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx]
if !ok {
b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue
}
gpuStatus := run.gpuStatuses[idx]
scoreColor := ""
switch gpuStatus {
case "FAILED":
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
case "WARNING", "PARTIAL":
scoreColor = ` style="color:var(--warn);font-weight:600"`
case "", "OK":
default:
scoreColor = ` style="color:var(--warn);font-weight:600"`
}
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
}
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></div></div>`)
return b.String()
}
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBeeBenchPerfDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
if err != nil || len(paths) == 0 {
return -1, nil
}
sort.Strings(paths)
return loadBenchmarkHistoryFromPaths(paths)
}
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
runs := make([]benchmarkHistoryRun, 0, len(paths))
maxGPUIndex := -1
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var result platform.NvidiaBenchmarkResult
if err := json.Unmarshal(raw, &result); err != nil {
continue
}
run := benchmarkHistoryRun{
generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
gpuScores: make(map[int]float64),
gpuStatuses: make(map[int]string),
overallStatus: result.OverallStatus,
}
for _, gpu := range result.GPUs {
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
run.gpuStatuses[gpu.Index] = gpu.Status
if gpu.Index > maxGPUIndex {
maxGPUIndex = gpu.Index
}
}
runs = append(runs, run)
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
return maxGPUIndex, runs
}
func renderPowerBenchmarkResultsCard(exportDir string) string {
baseDir := app.DefaultBeeBenchPowerDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-bench", "power")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
if err != nil || len(paths) == 0 {
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
}
sort.Strings(paths)
type powerRun struct {
generatedAt time.Time
displayTime string
result platform.NvidiaPowerBenchResult
}
var runs []powerRun
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
continue
}
var r platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &r); err != nil {
continue
}
runs = append(runs, powerRun{
generatedAt: r.GeneratedAt,
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
result: r,
})
}
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
var b strings.Builder
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
latest := runs[0].result
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
if latest.Hostname != "" {
b.WriteString(`` + html.EscapeString(latest.Hostname))
}
if latest.OverallStatus != "" {
statusColor := "var(--ok)"
if latest.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
}
b.WriteString(`</p>`)
if len(latest.GPUs) > 0 {
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
b.WriteString(`</tr></thead><tbody>`)
for _, gpu := range latest.GPUs {
finalLimitW := gpu.StablePowerLimitW
if finalLimitW <= 0 {
finalLimitW = gpu.AppliedPowerLimitW
}
derated := gpu.Derated ||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
rowStyle := ""
finalStyle := ""
if derated {
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
finalStyle = ` style="color:#e6a000;font-weight:600"`
}
statusLabel := gpu.Status
if statusLabel == "" {
statusLabel = "OK"
}
statusColor := "var(--ok)"
if statusLabel == "FAILED" {
statusColor = "var(--crit-fg,#9f3a38)"
} else if statusLabel != "OK" {
statusColor = "var(--warn)"
}
nominalStr := "-"
if gpu.DefaultPowerLimitW > 0 {
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
}
singleStr := "-"
if gpu.AppliedPowerLimitW > 0 {
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
}
multiStr := "-"
if gpu.StablePowerLimitW > 0 {
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
}
p95Str := "-"
if gpu.MaxObservedPowerW > 0 {
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
}
b.WriteString(`<tr` + rowStyle + `>`)
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
b.WriteString(`<td>` + nominalStr + `</td>`)
b.WriteString(`<td>` + singleStr + `</td>`)
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
b.WriteString(`<td>` + p95Str + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div>`)
}
if len(runs) > 1 {
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
for i, run := range runs {
statusColor := "var(--ok)"
if run.result.OverallStatus != "OK" {
statusColor = "var(--warn)"
}
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></details>`)
}
b.WriteString(`</div></div>`)
return b.String()
}