Compare commits

..

4 Commits
v8.6 ... v8.7

Author SHA1 Message Date
c80a39e7ac Add power results table, fix benchmark results refresh, bound memtester
- Benchmark page now shows two result sections: Performance (scores) and
  Power / Thermal Fit (slot table). After any benchmark task completes
  the results section auto-refreshes via GET /api/benchmark/results
  without a full page reload.
- Power results table shows each GPU slot with nominal TDP, achieved
  stable power limit, and P95 observed power. Rows with derated cards
  are highlighted amber so under-performing slots stand out at a glance.
  Older runs are collapsed in a <details> summary.
- memtester is now wrapped with timeout(1) so a stuck memory controller
  cannot cause Validate Memory to hang indefinitely. Wall-clock limit is
  ~2.5 min per 100 MB per pass plus a 2-minute buffer.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 07:16:18 +03:00
a5e0261ff2 Refactor power ramp to use true single-card baselines
Phase 1 now calibrates each GPU individually (sequentially) so that
PowerRealizationPct reflects real degradation from neighbour thermals and
shared power rails. Previously the baseline came from an all-GPU-together
run, making realization always ≈100% at the final ramp step.

Ramp step 1 reuses single-card calibration results (no extra run); steps
2..N run targeted_power on the growing GPU subset with derating active.

Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper —
they were compensation for the old all-GPU calibration approach.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 23:47:57 +03:00
ee422ede3c Revert "Add raster Easy Bee branding assets"
This reverts commit d560b2fead.
2026-04-14 23:00:15 +03:00
d560b2fead Add raster Easy Bee branding assets 2026-04-14 22:39:25 +03:00
6 changed files with 183 additions and 33 deletions

View File

@@ -2831,15 +2831,6 @@ func powerBenchDurationSec(profile string) int {
}
}
// occupiedSlots returns every entry of indices that differs from current,
// preserving the input order. The result is always a fresh, non-nil slice, so
// the caller's slice is never aliased.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for i := range indices {
		if indices[i] == current {
			continue
		}
		others = append(others, indices[i])
	}
	return others
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
@@ -2887,9 +2878,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
@@ -2955,10 +2944,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
var allRestoreActions []benchmarkRestoreAction
for _, idx := range selected {
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
_ = os.MkdirAll(singleDir, 0755)
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
allRestoreActions = append(allRestoreActions, restore...)
if r, ok := c[idx]; ok {
calibByIndex[idx] = r
}
}
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
for i := len(allRestoreActions) - 1; i >= 0; i-- {
allRestoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
@@ -2975,11 +2978,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,
@@ -2991,8 +2989,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
CoolingWarning: calib.CoolingWarning,
})
@@ -3032,14 +3028,26 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
// targeted_power with derating if degradation is detected.
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
var stepCalib map[int]benchmarkPowerCalibrationResult
if step == 1 {
// Single-GPU step — already measured in phase 1; reuse directly.
stepCalib = calibByIndex
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
} else {
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
var stepRestore []benchmarkRestoreAction
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,

View File

@@ -280,8 +280,6 @@ type NvidiaPowerBenchGPU struct {
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
CoolingWarning string `json:"cooling_warning,omitempty"`

View File

@@ -552,9 +552,13 @@ func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, si
if passes <= 0 {
passes = 1
}
// Bound memtester with a hard wall-clock timeout: ~2.5 min per 100 MB per
// pass, plus a fixed 2-minute buffer. Without this, a stuck memory
// controller can cause memtester to spin forever on a single subtest.
timeoutSec := sizeMB*passes*150/100 + 120
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
}, logFunc)
}

View File

@@ -1529,6 +1529,11 @@ func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Reques
writeJSON(w, map[string]string{"status": "rolled back"})
}
// handleAPIBenchmarkResults writes the benchmark results cards, rendered from
// the handler's configured export directory, as a UTF-8 HTML fragment. The
// request itself is not inspected.
func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, _ *http.Request) {
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	fragment := renderBenchmarkResultsCard(h.opts.ExportDir)
	fmt.Fprint(w, fragment)
}
func (h *handler) rollbackPendingNetworkChange() error {
h.pendingNetMu.Lock()
pnc := h.pendingNet

View File

@@ -2002,7 +2002,7 @@ func renderBenchmark(opts HandlerOptions) string {
</div>
</div>
` + renderBenchmarkResultsCard(opts.ExportDir) + `
`+`<div id="benchmark-results-section">`+renderBenchmarkResultsCard(opts.ExportDir)+`</div>`+`
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
@@ -2188,7 +2188,9 @@ function runNvidiaBenchmark(kind) {
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
if (isLast) { benchmarkRefreshResults(); }
});
benchmarkES.onerror = function() {
if (benchmarkES) {
@@ -2208,18 +2210,30 @@ function runNvidiaBenchmark(kind) {
}
benchmarkLoadGPUs();
// Re-fetches the rendered results cards and swaps them into the
// #benchmark-results-section container without reloading the page.
// Any failure is swallowed on purpose: the previously rendered results
// simply stay on screen.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) {
      // A non-2xx body is an error page, not results markup; reject so the
      // existing cards are left untouched instead of being overwritten.
      if (!r.ok) throw new Error('HTTP ' + r.status);
      return r.text();
    })
    .then(function(html) {
      const el = document.getElementById('benchmark-results-section');
      if (el) el.innerHTML = html;
    })
    .catch(function() {});
}
</script>`
}
func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns(
"Perf Results",
perf := renderBenchmarkResultsCardFromRuns(
"Performance Results",
"Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.",
"No saved performance benchmark runs yet.",
maxIdx,
runs,
)
power := renderPowerBenchmarkResultsCard(exportDir)
return perf + "\n" + power
}
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
@@ -2299,6 +2313,126 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs
}
// renderPowerBenchmarkResultsCard renders the "Power / Thermal Fit Results"
// card as an HTML fragment.
//
// It globs power-*/result.json under <exportDir>/bee-bench/power (or under
// app.DefaultBeeBenchPowerDir when exportDir is blank), decodes each file as a
// platform.NvidiaPowerBenchResult and orders the runs newest first by
// GeneratedAt. The newest run is shown as a per-GPU table; when more than one
// run was loaded, a collapsed <details> table lists all of them.
//
// Files that cannot be read or decoded are skipped. If no run can be loaded at
// all, the same "no saved runs" placeholder card is returned as when no result
// files exist.
func renderPowerBenchmarkResultsCard(exportDir string) string {
	const emptyCard = `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`

	baseDir := app.DefaultBeeBenchPowerDir
	if strings.TrimSpace(exportDir) != "" {
		baseDir = filepath.Join(exportDir, "bee-bench", "power")
	}
	paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
	if err != nil || len(paths) == 0 {
		return emptyCard
	}
	sort.Strings(paths)

	type powerRun struct {
		generatedAt time.Time
		displayTime string
		result      platform.NvidiaPowerBenchResult
	}
	runs := make([]powerRun, 0, len(paths))
	for _, path := range paths {
		raw, err := os.ReadFile(path)
		if err != nil {
			continue
		}
		var r platform.NvidiaPowerBenchResult
		if err := json.Unmarshal(raw, &r); err != nil {
			continue
		}
		runs = append(runs, powerRun{
			generatedAt: r.GeneratedAt,
			displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
			result:      r,
		})
	}
	// Every matched file may have been unreadable or malformed; without this
	// guard the runs[0] access below would panic.
	if len(runs) == 0 {
		return emptyCard
	}
	// Newest first. The stable sort keeps runs with identical timestamps in
	// path order so the output is deterministic.
	sort.SliceStable(runs, func(i, j int) bool {
		return runs[i].generatedAt.After(runs[j].generatedAt)
	})

	// Show only the most recent run's GPU slot table, plus a run history summary.
	var b strings.Builder
	b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)

	latest := runs[0].result
	b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
	if latest.Hostname != "" {
		// Separate the hostname from the timestamp; previously the two were
		// concatenated with nothing in between.
		b.WriteString(` — ` + html.EscapeString(latest.Hostname))
	}
	if latest.OverallStatus != "" {
		statusColor := "var(--ok)"
		if latest.OverallStatus != "OK" {
			statusColor = "var(--warn)"
		}
		b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
	}
	b.WriteString(`</p>`)

	if len(latest.GPUs) > 0 {
		b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
		b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
		b.WriteString(`</tr></thead><tbody>`)
		for _, gpu := range latest.GPUs {
			// A row counts as derated when the result says so, or when a known
			// applied limit sits more than 1 W below the nominal limit. An
			// unrecorded (zero) applied limit is rendered as "-" and must not
			// be highlighted as a derate on its own.
			derated := gpu.Derated ||
				(gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW > 0 &&
					gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
			rowStyle := ""
			achievedStyle := ""
			if derated {
				rowStyle = ` style="background:rgba(255,180,0,0.08)"`
				achievedStyle = ` style="color:#e6a000;font-weight:600"`
			}
			statusLabel := gpu.Status
			if statusLabel == "" {
				statusLabel = "OK"
			}
			statusColor := "var(--ok)"
			if statusLabel != "OK" {
				statusColor = "var(--warn)"
			}
			nominalStr := "-"
			if gpu.DefaultPowerLimitW > 0 {
				nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
			}
			achievedStr := "-"
			if gpu.AppliedPowerLimitW > 0 {
				achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
			}
			// NOTE(review): the column header says "P95" but the value shown is
			// MaxObservedPowerW — confirm which statistic that field holds.
			p95Str := "-"
			if gpu.MaxObservedPowerW > 0 {
				p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
			}
			b.WriteString(`<tr` + rowStyle + `>`)
			b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
			b.WriteString(`<td>` + nominalStr + `</td>`)
			b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
			b.WriteString(`<td>` + p95Str + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div>`)
	}

	if len(runs) > 1 {
		b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
		b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
		for i, run := range runs {
			statusColor := "var(--ok)"
			if run.result.OverallStatus != "OK" {
				statusColor = "var(--warn)"
			}
			b.WriteString(`<tr>`)
			b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
			b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
			b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
			b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
			b.WriteString(`</tr>`)
		}
		b.WriteString(`</tbody></table></div></details>`)
	}

	b.WriteString(`</div></div>`)
	return b.String()
}
// ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string {

View File

@@ -263,6 +263,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
// Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)