Add power results table, fix benchmark results refresh, bound memtester
- Benchmark page now shows two result sections: Performance (scores) and Power / Thermal Fit (slot table). After the last task of a benchmark run completes, the results section auto-refreshes via GET /api/benchmark/results without a full page reload.
- Power results table shows each GPU slot with nominal TDP, achieved stable power limit, and P95 observed power. Rows with derated cards are highlighted amber so under-performing slots stand out at a glance. Older runs are collapsed in a <details> summary.
- memtester is now wrapped with timeout(1) so a stuck memory controller cannot cause Validate Memory to hang indefinitely. Wall-clock limit is ~2.5 min per 100 MB per pass plus a 2-minute buffer.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
|
|||||||
if passes <= 0 {
|
if passes <= 0 {
|
||||||
passes = 1
|
passes = 1
|
||||||
}
|
}
|
||||||
|
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
|
||||||
|
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
|
||||||
|
// controller can cause memtester to spin forever on a single subtest.
|
||||||
|
timeoutSec := sizeMB*passes*150/100 + 120
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1529,6 +1529,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
|
|||||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) rollbackPendingNetworkChange() error {
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
|
|||||||
@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
` + renderBenchmarkResultsCard(opts.ExportDir) + `
|
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
|
||||||
|
|
||||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
|
|||||||
if (e.data) failures += 1;
|
if (e.data) failures += 1;
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
term.scrollTop = term.scrollHeight;
|
term.scrollTop = term.scrollHeight;
|
||||||
|
const isLast = (idx + 1 >= taskIds.length);
|
||||||
streamNext(idx + 1, failures);
|
streamNext(idx + 1, failures);
|
||||||
|
if (isLast) { benchmarkRefreshResults(); }
|
||||||
});
|
});
|
||||||
benchmarkES.onerror = function() {
|
benchmarkES.onerror = function() {
|
||||||
if (benchmarkES) {
|
if (benchmarkES) {
|
||||||
@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
benchmarkLoadGPUs();
|
benchmarkLoadGPUs();
|
||||||
|
|
||||||
|
// Re-fetches the rendered benchmark results markup and swaps it into the
// #benchmark-results-section container without reloading the page.
// Best effort: on a network error or a non-2xx response the results that are
// already on screen are left untouched.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) {
      // Without this check an error body (e.g. a 404/500 page) would be
      // injected in place of the results.
      if (!r.ok) throw new Error('HTTP ' + r.status);
      return r.text();
    })
    .then(function(html) {
      const el = document.getElementById('benchmark-results-section');
      if (el) el.innerHTML = html;
    })
    .catch(function() {});
}
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmarkResultsCard(exportDir string) string {
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
return renderBenchmarkResultsCardFromRuns(
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
"Perf Results",
|
"Performance Results",
|
||||||
"Composite score by saved benchmark run and GPU.",
|
"Composite score by saved benchmark run and GPU.",
|
||||||
"No saved benchmark runs yet.",
|
"No saved performance benchmark runs yet.",
|
||||||
maxIdx,
|
maxIdx,
|
||||||
runs,
|
runs,
|
||||||
)
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||||
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
|||||||
return maxGPUIndex, runs
|
return maxGPUIndex, runs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Show only the most recent run's GPU slot table, plus a run history summary.
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
achievedStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
achievedStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
achievedStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
|
|||||||
@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
|
|||||||
Reference in New Issue
Block a user