Compare commits

...

5 Commits
v5.9 ... v5.12

Author SHA1 Message Date
Mikhail Chusavitin
fc5c100a29 Fix NVIDIA persistence mode and add benchmark results table 2026-04-06 10:47:07 +03:00
6e94216f3b Hide task charts while pending 2026-04-05 22:34:34 +03:00
53455063b9 Stabilize live task detail page 2026-04-05 22:14:52 +03:00
4602f97836 Enforce sequential task orchestration 2026-04-05 22:10:42 +03:00
c65d3ae3b1 Add nomodeset to default GRUB entry — fix black screen on headless servers
Servers with NVIDIA compute GPUs (H100 etc.) have no display output,
so KMS blanks the console. nomodeset disables kernel modesetting and
lets the NVIDIA proprietary driver handle display via Xorg.

KMS variant moved to advanced submenu for cases where it is needed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 21:40:47 +03:00
10 changed files with 586 additions and 169 deletions

View File

@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{ return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}}, satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
job, job,
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func nvidiaStressArchivePrefix(loader string) string { func nvidiaStressArchivePrefix(loader string) string {

View File

@@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
if gpuCount < 1 { if gpuCount < 1 {
gpuCount = 1 gpuCount = 1
} }
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-all-reduce-perf.log", cmd: []string{ satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20", "-g", strconv.Itoa(gpuCount), "--iters", "20",
}}, }},
}, logFunc) ), logFunc)
} }
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
if err != nil { if err != nil {
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
{ satJob{
name: "03-dcgmproftester.log", name: "03-dcgmproftester.log",
cmd: profCmd, cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected), env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
if err != nil { if err != nil {
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{ satJob{
name: "02-dcgmi-targeted-power.log", name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected), cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
if err != nil { if err != nil {
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{ satJob{
name: "02-dcgmi-pulse-test.log", name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected), cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
if err != nil { if err != nil {
return "", err return "", err
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{ satJob{
name: "02-dcgmi-nvbandwidth.log", name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected), cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -389,16 +389,16 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
} }
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{ satJob{
name: "02-dcgmi-targeted-stress.log", name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected), cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
}, logFunc) ), logFunc)
} }
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
@@ -568,14 +568,24 @@ type satStats struct {
Unsupported int Unsupported int
} }
// withNvidiaPersistenceMode returns a fresh job list that starts with an
// "nvidia-smi -pm 1" preflight job (logged as
// 00-nvidia-smi-persistence-mode.log) followed by the given jobs in their
// original order. The caller's slice is copied, never aliased.
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
	preflight := satJob{
		name: "00-nvidia-smi-persistence-mode.log",
		cmd:  []string{"nvidia-smi", "-pm", "1"},
	}
	// Exactly one extra slot: the preflight job goes first, the rest follow.
	all := make([]satJob, len(jobs)+1)
	all[0] = preflight
	copy(all[1:], jobs)
	return all
}
func nvidiaSATJobs() []satJob { func nvidiaSATJobs() []satJob {
return []satJob{ return withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}}, satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
} )
} }
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob { func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -590,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
} }
diagArgs = append(diagArgs, "-i", strings.Join(ids, ",")) diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
} }
return []satJob{ return withNvidiaPersistenceMode(
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
{name: "04-dcgmi-diag.log", cmd: diagArgs}, satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
} )
} }
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string { func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {

View File

@@ -28,13 +28,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
jobs := nvidiaSATJobs() jobs := nvidiaSATJobs()
if len(jobs) != 5 { if len(jobs) != 6 {
t.Fatalf("jobs=%d want 5", len(jobs)) t.Fatalf("jobs=%d want 6", len(jobs))
} }
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" { if got := jobs[0].cmd[0]; got != "nvidia-smi" {
t.Fatalf("preflight command=%q want nvidia-smi", got)
}
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
}
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got) t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
} }
if got := jobs[3].cmd[1]; got != "--output-file" { if got := jobs[4].cmd[1]; got != "--output-file" {
t.Fatalf("bug report flag=%q want --output-file", got) t.Fatalf("bug report flag=%q want --output-file", got)
} }
} }
@@ -82,7 +88,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) { func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
jobs := nvidiaSATJobs() jobs := nvidiaSATJobs()
got := jobs[4].cmd got := jobs[5].cmd
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"} want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
if len(got) != len(want) { if len(got) != len(want) {
t.Fatalf("cmd len=%d want %d", len(got), len(want)) t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +100,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
} }
} }
// TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag verifies that the DCGM job
// list has five entries, starts with the "nvidia-smi -pm 1" preflight job, and
// ends with the dcgmi diag command carrying the requested level and GPU ids in
// the order they were passed.
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
	const (
		wantPreflight = "nvidia-smi -pm 1"
		wantDiag      = "dcgmi diag -r 3 -i 2,0"
	)

	jobs := nvidiaDCGMJobs(3, []int{2, 0})
	if n := len(jobs); n != 5 {
		t.Fatalf("jobs=%d want 5", n)
	}

	preflight := strings.Join(jobs[0].cmd, " ")
	if preflight != wantPreflight {
		t.Fatalf("preflight=%q want %q", preflight, wantPreflight)
	}

	diag := strings.Join(jobs[4].cmd, " ")
	if diag != wantDiag {
		t.Fatalf("diag=%q want %q", diag, wantDiag)
	}
}
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) { func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -8,9 +8,12 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"sort" "sort"
"strconv"
"strings" "strings"
"time"
"bee/audit/internal/app" "bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/schema" "bee/audit/internal/schema"
) )
@@ -161,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string {
case "benchmark": case "benchmark":
pageID = "benchmark" pageID = "benchmark"
title = "Benchmark" title = "Benchmark"
body = renderBenchmark() body = renderBenchmark(opts)
case "tasks": case "tasks":
pageID = "tasks" pageID = "tasks"
title = "Tasks" title = "Tasks"
@@ -1068,17 +1071,23 @@ func renderValidate(opts HandlerOptions) string {
`</div> `</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div> <div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="grid3"> <div class="grid3">
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( ` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`, `Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`, `<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
`Runs one GPU at a time. Diag level is taken from Validate Profile.`, `<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
)) + )) +
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs NVIDIA diagnostics and board inventory checks.`,
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
)) +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA, inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`, `Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
`<code>dcgmi diag targeted_stress</code>`, `<code>dcgmi diag targeted_stress</code>`,
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`, `Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
)) + )) +
`</div> `</div>
<div class="grid3" style="margin-top:16px"> <div class="grid3" style="margin-top:16px">
@@ -1100,6 +1109,8 @@ func renderValidate(opts HandlerOptions) string {
.validate-card-body { padding:0; } .validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; } .validate-card-section { padding:12px 16px 0; }
.validate-card-section:last-child { padding-bottom:16px; } .validate-card-section:last-child { padding-bottom:16px; }
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } } @media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
</style> </style>
<script> <script>
@@ -1128,6 +1139,59 @@ function loadSatNvidiaGPUs() {
} }
return satNvidiaGPUsPromise; return satNvidiaGPUsPromise;
} }
// Returns the numeric indices of all NVIDIA GPU checkboxes that are checked
// and not disabled, sorted ascending. Checkbox values that do not parse as a
// base-10 integer are ignored.
function satSelectedGPUIndices() {
  const indices = [];
  document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(box) {
    if (!box.checked || box.disabled) return;
    const idx = parseInt(box.value, 10);
    if (Number.isNaN(idx)) return;
    indices.push(idx);
  });
  // Numeric comparator: the default sort would order 10 before 2.
  indices.sort(function(a, b) { return a - b; });
  return indices;
}
// Refreshes the text under the GPU selection list: either the list of
// currently selected GPU indices or a prompt to select at least one GPU.
// Does nothing when the note element is not on the page.
function satUpdateGPUSelectionNote() {
  const noteEl = document.getElementById('sat-gpu-selection-note');
  if (!noteEl) return;
  const indices = satSelectedGPUIndices();
  noteEl.textContent = indices.length
    ? 'Selected NVIDIA GPUs: ' + indices.join(', ') + '.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
}
// Renders one pre-checked checkbox row per GPU into #sat-gpu-list and then
// refreshes the selection note. Shows a "No NVIDIA GPUs detected." message
// when gpus is empty or missing. Does nothing when the list container is not
// on the page.
//
// gpus: array of objects with index, name and memory_mb fields.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  // The rows are assembled as an HTML string, so every value coming from the
  // GPU list must be escaped; otherwise a name containing <, & or quotes
  // would break the markup or inject elements.
  const esc = function(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };
  root.innerHTML = gpus.map(function(gpu) {
    const idx = esc(gpu.index);
    const mem = gpu.memory_mb > 0 ? ' · ' + esc(gpu.memory_mb) + ' MiB' : '';
    return '<label class="sat-gpu-row">'
      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + idx + '" checked onchange="satUpdateGPUSelectionNote()">'
      + '<span><strong>GPU ' + idx + '</strong> — ' + esc(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  satUpdateGPUSelectionNote();
}
// Checks every NVIDIA GPU checkbox on the page and refreshes the selection note.
function satSelectAllGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  for (let i = 0; i < boxes.length; i++) {
    boxes[i].checked = true;
  }
  satUpdateGPUSelectionNote();
}
// Unchecks every NVIDIA GPU checkbox on the page and refreshes the selection note.
function satSelectNoGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  for (let i = 0; i < boxes.length; i++) {
    boxes[i].checked = false;
  }
  satUpdateGPUSelectionNote();
}
// Loads the NVIDIA GPU list and renders the selection checkboxes. If loading
// or rendering fails, the list area shows the error text instead, and the
// selection note is refreshed either way.
function satLoadGPUs() {
  loadSatNvidiaGPUs().then(function(gpus) {
    satRenderGPUList(gpus);
  }).catch(function(err) {
    const root = document.getElementById('sat-gpu-list');
    if (root) {
      // Build the message with textContent so the error text is shown
      // literally and never parsed as markup. Fall back to the raw rejection
      // value when it carries no message property.
      const reason = (err && err.message) ? err.message : String(err);
      const msg = document.createElement('p');
      msg.style.cssText = 'color:var(--crit-fg);font-size:13px';
      msg.textContent = 'Error: ' + reason;
      root.innerHTML = '';
      root.appendChild(msg);
    }
    satUpdateGPUSelectionNote();
  });
}
function satGPUDisplayName(gpu) { function satGPUDisplayName(gpu) {
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0; const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx); const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
@@ -1149,6 +1213,36 @@ function enqueueSATTarget(target, overrides) {
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))}) return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
.then(r => r.json()); .then(r => r.json());
} }
// Shows the SAT output panel and streams the log of one task into the
// terminal element through an EventSource on /api/tasks/<id>/stream.
//
// taskId:        id of the task whose stream is opened.
// title:         text shown after the dash in the output panel title.
// resetTerminal: when truthy, the terminal is cleared before streaming.
//
// Returns a Promise that resolves (it never rejects) with {ok, error}:
//   - on a 'done' event, ok is true when the event carries no data,
//     otherwise error holds that data;
//   - on a stream error, ok is false and error is 'stream disconnected'.
function streamSATTask(taskId, title, resetTerminal) {
  // Only one stream feeds the terminal at a time; close any previous one.
  if (satES) {
    satES.close();
    satES = null;
  }
  document.getElementById('sat-output').style.display = 'block';
  document.getElementById('sat-title').textContent = '— ' + title;
  const term = document.getElementById('sat-terminal');
  if (resetTerminal) {
    term.textContent = '';
  }
  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';

  const scrollToEnd = function() {
    term.scrollTop = term.scrollHeight;
  };

  return new Promise(function(resolve) {
    satES = new EventSource('/api/tasks/' + taskId + '/stream');

    satES.onmessage = function(ev) {
      term.textContent += ev.data + '\n';
      scrollToEnd();
    };

    satES.addEventListener('done', function(ev) {
      satES.close();
      satES = null;
      // A non-empty payload on the done event is the task's error text.
      const failure = ev.data;
      term.textContent += (failure ? '\nERROR: ' + failure : '\nCompleted.') + '\n';
      scrollToEnd();
      resolve({ok: !failure, error: failure || ''});
    });

    satES.onerror = function() {
      // Close explicitly so the stream is not left open after an error.
      if (satES) {
        satES.close();
        satES = null;
      }
      term.textContent += '\nERROR: stream disconnected.\n';
      scrollToEnd();
      resolve({ok: false, error: 'stream disconnected'});
    };
  });
}
function selectedAMDValidateTargets() { function selectedAMDValidateTargets() {
const targets = []; const targets = [];
const gpu = document.getElementById('sat-amd-target'); const gpu = document.getElementById('sat-amd-target');
@@ -1163,24 +1257,23 @@ function runSAT(target) {
return runSATWithOverrides(target, null); return runSATWithOverrides(target, null);
} }
function runSATWithOverrides(target, overrides) { function runSATWithOverrides(target, overrides) {
if (satES) { satES.close(); satES = null; } const title = (overrides && overrides.display_name) || target;
document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' test...\n'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + title;
term.textContent = 'Enqueuing ' + title + ' test...\n';
return enqueueSATTarget(target, overrides) return enqueueSATTarget(target, overrides)
.then(d => { .then(d => streamSATTask(d.task_id, title, false));
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
});
} }
function expandSATTarget(target) { function expandSATTarget(target) {
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') { if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
return Promise.resolve([{target: target}]); return Promise.resolve([{target: target}]);
} }
return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => ({ const selected = satSelectedGPUIndices();
if (!selected.length) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
}
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
target: target, target: target,
overrides: { overrides: {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
@@ -1191,65 +1284,61 @@ function expandSATTarget(target) {
} }
function runNvidiaValidateSet(target) { function runNvidiaValidateSet(target) {
return loadSatNvidiaGPUs().then(gpus => { return loadSatNvidiaGPUs().then(gpus => {
if (!gpus.length) return; const selected = satSelectedGPUIndices();
if (gpus.length === 1) { const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
const gpu = gpus[0]; if (!picked.length) {
throw new Error('Select at least one NVIDIA GPU.');
}
if (picked.length === 1) {
const gpu = picked[0];
return runSATWithOverrides(target, { return runSATWithOverrides(target, {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')' display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
}); });
} }
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— ' + target; document.getElementById('sat-title').textContent = '— ' + target;
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n'; term.textContent = 'Running ' + target + ' one GPU at a time...\n';
const labelBase = satLabels()[target] || ('Validate ' + target); const labelBase = satLabels()[target] || ('Validate ' + target);
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= gpus.length) return; if (idx >= picked.length) return Promise.resolve();
const gpu = gpus[idx]; const gpu = picked[idx];
const gpuLabel = satGPUDisplayName(gpu); const gpuLabel = satGPUDisplayName(gpu);
enqueueSATTarget(target, { term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
return enqueueSATTarget(target, {
gpu_indices: [Number(gpu.index)], gpu_indices: [Number(gpu.index)],
display_name: labelBase + ' (' + gpuLabel + ')' display_name: labelBase + ' (' + gpuLabel + ')'
}).then(d => { }).then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n'; return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
if (idx === gpus.length - 1) { }).then(function() {
satES = new EventSource('/api/tasks/' + d.task_id + '/stream'); return runNext(idx + 1);
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
}); });
} }
function runAMDValidateSet() { function runAMDValidateSet() {
const targets = selectedAMDValidateTargets(); const targets = selectedAMDValidateTargets();
if (!targets.length) return; if (!targets.length) return;
if (targets.length === 1) return runSAT(targets[0]); if (targets.length === 1) return runSAT(targets[0]);
if (satES) { satES.close(); satES = null; }
document.getElementById('sat-output').style.display='block'; document.getElementById('sat-output').style.display='block';
document.getElementById('sat-title').textContent = '— amd'; document.getElementById('sat-title').textContent = '— amd';
const term = document.getElementById('sat-terminal'); const term = document.getElementById('sat-terminal');
term.textContent = 'Enqueuing AMD validate set...\n'; term.textContent = 'Running AMD validate set one by one...\n';
const labels = satLabels(); const labels = satLabels();
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= targets.length) return; if (idx >= targets.length) return Promise.resolve();
const target = targets[idx]; const target = targets[idx];
enqueueSATTarget(target) term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
return enqueueSATTarget(target)
.then(d => { .then(d => {
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n'; return streamSATTask(d.task_id, labels[target], false);
if (idx === targets.length - 1) { }).then(function() {
satES = new EventSource('/api/tasks/'+d.task_id+'/stream'); return runNext(idx + 1);
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
}
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
} }
function runAllSAT() { function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
@@ -1271,17 +1360,17 @@ function runAllSAT() {
status.textContent = 'No tasks selected.'; status.textContent = 'No tasks selected.';
return; return;
} }
const enqueueNext = (idx) => { const runNext = (idx) => {
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; } if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
const item = expanded[idx]; const item = expanded[idx];
enqueueSATTarget(item.target, item.overrides) status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
return enqueueSATTarget(item.target, item.overrides)
.then(() => { .then(() => {
enqueued++; enqueued++;
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...'; return runNext(idx + 1);
enqueueNext(idx + 1);
}); });
}; };
enqueueNext(0); return runNext(0);
}).catch(err => { }).catch(err => {
status.textContent = 'Error: ' + err.message; status.textContent = 'Error: ' + err.message;
}); });
@@ -1294,6 +1383,7 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected'); if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected'); if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
}); });
satLoadGPUs();
function disableSATAMDOptions(reason) { function disableSATAMDOptions(reason) {
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) { ['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
const cb = document.getElementById(id); const cb = document.getElementById(id);
@@ -1482,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ───────────────────────────────────────────────────────────────── // ── Benchmark ─────────────────────────────────────────────────────────────────
func renderBenchmark() string { type benchmarkHistoryColumn struct {
key string
label string
name string
index int
}
// benchmarkHistoryCell is one cell of the benchmark results table: the value
// recorded for a single GPU column within a single saved benchmark run.
type benchmarkHistoryCell struct {
// score is the GPU's composite score copied from the saved result.
score float64
// present is set when the run contained this GPU; cells that are missing
// or not present are rendered as "-".
present bool
}
// benchmarkHistoryRun is one row of the benchmark results table, built from a
// single saved result.json file.
type benchmarkHistoryRun struct {
// generatedAt is the timestamp stored in the result; rows are ordered by it,
// newest first.
generatedAt time.Time
// displayTime is generatedAt converted to local time and formatted for display.
displayTime string
// cells maps a column key (see benchmarkHistoryColumnKey) to that GPU's cell.
cells map[string]benchmarkHistoryCell
}
func renderBenchmark(opts HandlerOptions) string {
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="grid2"> <div class="grid2">
@@ -1531,6 +1639,8 @@ func renderBenchmark() string {
</div> </div>
</div> </div>
` + renderBenchmarkResultsCard(opts.ExportDir) + `
<div id="benchmark-output" style="display:none;margin-top:16px" class="card"> <div id="benchmark-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div> <div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div> <div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
@@ -1667,6 +1777,115 @@ benchmarkLoadGPUs();
</script>` </script>`
} }
// renderBenchmarkResultsCard returns the HTML for the "Benchmark Results" card.
//
// The card contains one table row per saved benchmark run and one column per
// GPU seen in any run, as reported by loadBenchmarkHistory for exportDir. When
// no runs are found a short placeholder card is returned instead. GPU labels
// and timestamps are HTML-escaped; a GPU that is absent from a run is shown as
// a muted "-".
func renderBenchmarkResultsCard(exportDir string) string {
	columns, runs := loadBenchmarkHistory(exportDir)
	if len(runs) == 0 {
		return `<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved benchmark runs yet.</p></div></div>`
	}

	var out strings.Builder

	// Card chrome and table header.
	out.WriteString(`<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body">`)
	out.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">Composite score by saved benchmark run and GPU.</p>`)
	out.WriteString(`<div style="overflow-x:auto">`)
	out.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
	for _, column := range columns {
		out.WriteString(`<th>` + html.EscapeString(column.label) + `</th>`)
	}
	out.WriteString(`</tr></thead><tbody>`)

	// One row per run; rows are numbered in the order they were returned.
	for rowIdx := range runs {
		row := runs[rowIdx]
		out.WriteString(`<tr>`)
		out.WriteString(`<td>#` + strconv.Itoa(rowIdx+1) + `</td>`)
		out.WriteString(`<td>` + html.EscapeString(row.displayTime) + `</td>`)
		for _, column := range columns {
			if cell, found := row.cells[column.key]; found && cell.present {
				fmt.Fprintf(&out, "<td>%.2f</td>", cell.score)
			} else {
				out.WriteString(`<td style="color:var(--muted)">-</td>`)
			}
		}
		out.WriteString(`</tr>`)
	}

	out.WriteString(`</tbody></table></div></div></div>`)
	return out.String()
}
// loadBenchmarkHistory reads every saved GPU benchmark result and returns the
// table columns (one per distinct GPU name/index pair) and rows (one per run).
//
// Results are looked up as <base>/gpu-benchmark-*/result.json, where <base> is
// <exportDir>/bee-benchmark when exportDir is non-blank and
// app.DefaultBenchmarkBaseDir otherwise. Files that cannot be read or decoded
// are skipped so that one bad result does not hide the rest of the history.
//
// Columns are ordered by case-insensitive GPU name, then index, then key.
// Runs are ordered newest first; runs with equal timestamps keep the
// lexical order of their paths, so the output is deterministic.
// Both results are nil when no result files are found.
func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
	baseDir := app.DefaultBenchmarkBaseDir
	if strings.TrimSpace(exportDir) != "" {
		baseDir = filepath.Join(exportDir, "bee-benchmark")
	}
	paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
	if err != nil || len(paths) == 0 {
		return nil, nil
	}
	// Sorting the paths fixes the order in which runs are loaded; the stable
	// sort below relies on this to break timestamp ties deterministically.
	sort.Strings(paths)

	columnByKey := make(map[string]benchmarkHistoryColumn)
	runs := make([]benchmarkHistoryRun, 0, len(paths))
	for _, path := range paths {
		raw, err := os.ReadFile(path)
		if err != nil {
			continue // unreadable result: best-effort, skip it
		}
		var result platform.NvidiaBenchmarkResult
		if err := json.Unmarshal(raw, &result); err != nil {
			continue // malformed result: best-effort, skip it
		}
		run := benchmarkHistoryRun{
			generatedAt: result.GeneratedAt,
			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
			cells:       make(map[string]benchmarkHistoryCell),
		}
		for _, gpu := range result.GPUs {
			key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
			columnByKey[key] = benchmarkHistoryColumn{
				key:   key,
				label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
				name:  strings.TrimSpace(gpu.Name),
				index: gpu.Index,
			}
			run.cells[key] = benchmarkHistoryCell{
				score:   gpu.Scores.CompositeScore,
				present: true,
			}
		}
		runs = append(runs, run)
	}

	columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
	for _, col := range columnByKey {
		columns = append(columns, col)
	}
	// Keys are unique, so the final tie-break makes this a total order and the
	// result does not depend on map iteration order.
	sort.Slice(columns, func(i, j int) bool {
		leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
		rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
		if leftName != rightName {
			return leftName < rightName
		}
		if columns[i].index != columns[j].index {
			return columns[i].index < columns[j].index
		}
		return columns[i].key < columns[j].key
	})
	// sort.Slice is not stable; use SliceStable so runs sharing a timestamp
	// (for example a zero GeneratedAt) do not change order between page loads.
	sort.SliceStable(runs, func(i, j int) bool {
		return runs[i].generatedAt.After(runs[j].generatedAt)
	})
	return columns, runs
}
// benchmarkHistoryColumnKey returns the map key identifying one GPU column:
// the whitespace-trimmed GPU name and the decimal GPU index joined by "|".
func benchmarkHistoryColumnKey(name string, index int) string {
	parts := []string{strings.TrimSpace(name), strconv.Itoa(index)}
	return strings.Join(parts, "|")
}
// benchmarkHistoryColumnLabel returns the table header text for one GPU
// column: "<name> / GPU <index>", or just "GPU <index>" when the name is
// empty after trimming surrounding whitespace.
func benchmarkHistoryColumnLabel(name string, index int) string {
	if trimmed := strings.TrimSpace(name); trimmed != "" {
		return fmt.Sprintf("%s / GPU %d", trimmed, index)
	}
	return fmt.Sprintf("GPU %d", index)
}
// ── Burn ────────────────────────────────────────────────────────────────────── // ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string { func renderBurn() string {
@@ -1886,6 +2105,36 @@ function streamTask(taskId, label) {
term.scrollTop = term.scrollHeight; term.scrollTop = term.scrollHeight;
}); });
} }
// Streams the output of one burn task into the shared burn terminal.
//
// taskId        - task whose /api/tasks/<id>/stream endpoint is opened
// label         - text shown in the output title next to the burn profile
// resetTerminal - when truthy, the terminal is cleared before streaming
//
// Returns a Promise that is only ever resolved, never rejected, with
// {ok, error}: ok is false when the 'done' event carries data (treated as an
// error message) or when the stream reports an error.
function streamBurnTask(taskId, label, resetTerminal) {
  // Only one burn stream is tracked at a time: drop any previous connection.
  if (biES) {
    biES.close();
    biES = null;
  }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const out = document.getElementById('bi-terminal');
  if (resetTerminal) out.textContent = '';
  out.textContent += 'Task ' + taskId + ' queued. Streaming...\n';

  // Appends text to the terminal and keeps the newest output in view.
  const emit = function(text) {
    out.textContent += text;
    out.scrollTop = out.scrollHeight;
  };

  return new Promise(function(resolve) {
    biES = new EventSource('/api/tasks/' + taskId + '/stream');
    biES.onmessage = function(ev) {
      emit(ev.data + '\n');
    };
    biES.addEventListener('done', function(ev) {
      biES.close();
      biES = null;
      const failure = ev.data;
      emit((failure ? '\nERROR: ' + failure : '\nCompleted.') + '\n');
      resolve({ok: !failure, error: failure || ''});
    });
    biES.onerror = function() {
      // Close explicitly so the browser does not keep reconnecting.
      if (biES) {
        biES.close();
        biES = null;
      }
      emit('\nERROR: stream disconnected.\n');
      resolve({ok: false, error: 'stream disconnected'});
    };
  });
}
function runBurnTaskSet(tasks, statusElId) { function runBurnTaskSet(tasks, statusElId) {
const enabled = tasks.filter(function(t) { const enabled = tasks.filter(function(t) {
@@ -1898,19 +2147,33 @@ function runBurnTaskSet(tasks, statusElId) {
if (status) status.textContent = 'No tasks selected.'; if (status) status.textContent = 'No tasks selected.';
return; return;
} }
enabled.forEach(function(t) { const term = document.getElementById('bi-terminal');
enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia) document.getElementById('bi-output').style.display = 'block';
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
term.textContent = '';
const runNext = function(idx) {
if (idx >= enabled.length) {
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
return Promise.resolve();
}
const t = enabled[idx];
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
.then(function(d) { .then(function(d) {
if (status) status.textContent = enabled.length + ' task(s) queued.'; return streamBurnTask(d.task_id, t.label, false);
streamTask(d.task_id, t.label); })
.then(function() {
return runNext(idx + 1);
}) })
.catch(function(err) { .catch(function(err) {
if (status) status.textContent = 'Error: ' + err.message; if (status) status.textContent = 'Error: ' + err.message;
const term = document.getElementById('bi-terminal');
document.getElementById('bi-output').style.display = 'block'; document.getElementById('bi-output').style.display = 'block';
term.textContent += 'ERROR: ' + err.message + '\n'; term.textContent += 'ERROR: ' + err.message + '\n';
return Promise.reject(err);
}); });
}); };
return runNext(0);
} }
function runPlatformStress() { function runPlatformStress() {

View File

@@ -1,6 +1,7 @@
package webui package webui
import ( import (
"encoding/json"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
@@ -601,8 +602,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
if !strings.Contains(body, `Restart GPU Drivers`) { if !strings.Contains(body, `Restart GPU Drivers`) {
t.Fatalf("tools page missing restart gpu drivers button: %s", body) t.Fatalf("tools page missing restart gpu drivers button: %s", body)
} }
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) { if !strings.Contains(body, `restartGPUDrivers()`) {
t.Fatalf("tools page missing bee-nvidia restart action: %s", body) t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
} }
if !strings.Contains(body, `id="boot-source-text"`) { if !strings.Contains(body, `id="boot-source-text"`) {
t.Fatalf("tools page missing boot source field: %s", body) t.Fatalf("tools page missing boot source field: %s", body)
@@ -636,6 +637,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
} }
} }
// TestBenchmarkPageRendersSavedResultsTable stores one benchmark result with
// two GPUs under the export directory and verifies that the /benchmark page
// renders the results card with a column per GPU, the run number, the
// locally formatted timestamp and both composite scores.
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
	exportDir := filepath.Join(t.TempDir(), "export")
	runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}

	generatedAt := time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC)
	gpus := []platform.BenchmarkGPUResult{
		{
			Index:  0,
			Name:   "NVIDIA H100 PCIe",
			Scores: platform.BenchmarkScorecard{CompositeScore: 1176.25},
		},
		{
			Index:  1,
			Name:   "NVIDIA H100 PCIe",
			Scores: platform.BenchmarkScorecard{CompositeScore: 1168.50},
		},
	}
	result := platform.NvidiaBenchmarkResult{
		GeneratedAt:      generatedAt,
		BenchmarkProfile: "standard",
		OverallStatus:    "OK",
		GPUs:             gpus,
	}

	raw, err := json.Marshal(result)
	if err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
		t.Fatal(err)
	}

	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/benchmark", nil)
	NewHandler(HandlerOptions{ExportDir: exportDir}).ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d", rec.Code)
	}

	body := rec.Body.String()
	// The page formats the timestamp in local time, so compute it the same way.
	wantTime := generatedAt.Local().Format("2006-01-02 15:04:05")
	needles := []string{
		`Benchmark Results`,
		`Composite score by saved benchmark run and GPU.`,
		`NVIDIA H100 PCIe / GPU 0`,
		`NVIDIA H100 PCIe / GPU 1`,
		`#1`,
		wantTime,
		`1176.25`,
		`1168.50`,
	}
	for _, needle := range needles {
		if !strings.Contains(body, needle) {
			t.Fatalf("benchmark page missing %q: %s", needle, body)
		}
	}
}
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) { func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
handler := NewHandler(HandlerOptions{}) handler := NewHandler(HandlerOptions{})
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
@@ -649,6 +710,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
`nvidia-targeted-stress`, `nvidia-targeted-stress`,
`controlled NVIDIA DCGM load`, `controlled NVIDIA DCGM load`,
`<code>dcgmi diag targeted_stress</code>`, `<code>dcgmi diag targeted_stress</code>`,
`NVIDIA GPU Selection`,
`id="sat-gpu-list"`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("validate page missing %q: %s", needle, body) t.Fatalf("validate page missing %q: %s", needle, body)

View File

@@ -97,32 +97,73 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
body.WriteString(`</div></div>`) body.WriteString(`</div></div>`)
} }
if task.Status == TaskRunning || task.Status == TaskPending { if task.Status == TaskRunning {
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`) body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`) body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
body.WriteString(`</div></div>`) body.WriteString(`</div></div>`)
}
if task.Status == TaskRunning || task.Status == TaskPending {
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`) body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`) body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
body.WriteString(`</div></div>`) body.WriteString(`</div></div>`)
body.WriteString(`<script> body.WriteString(`<script>
function cancelTaskDetail(id) { function cancelTaskDetail(id) {
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){ window.location.reload(); }); fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
var term = document.getElementById('task-live-log');
if (term) {
term.textContent += '\nCancel requested.\n';
term.scrollTop = term.scrollHeight;
}
});
}
// Reconciles the live chart cards inside #task-live-charts with the given
// chart list instead of rebuilding the markup on every poll.
//
// taskId - task id used to build each chart image URL
// charts - array of {file, title}; anything else, or an empty array, shows
//          the "Waiting for metric samples..." placeholder
//
// Existing images are kept (only their card title is refreshed), images for
// new files are appended, and cards whose file is no longer listed are removed.
function renderTaskLiveCharts(taskId, charts) {
  const host = document.getElementById('task-live-charts');
  if (!host) return;
  if (!Array.isArray(charts) || charts.length === 0) {
    host.innerHTML = 'Waiting for metric samples...';
    return;
  }
  // Remove placeholder text ("Loading charts...", "Waiting for metric
  // samples...", "Task charts are unavailable.") left in the host; otherwise
  // it stays visible above the chart cards once they are appended.
  Array.from(host.childNodes).forEach(function(node) {
    // nodeType 1 is an element node.
    const isCard = node.nodeType === 1 && node.classList.contains('card');
    if (!isCard) host.removeChild(node);
  });
  // Index current images by file name rather than splicing chart.file into a
  // CSS selector, which breaks for names containing quotes or backslashes.
  const existing = {};
  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
    const file = img.getAttribute('data-chart-file') || '';
    if (!(file in existing)) existing[file] = img;
  });
  const seen = {};
  charts.forEach(function(chart) {
    seen[chart.file] = true;
    let img = existing[chart.file];
    if (img) {
      const card = img.closest('.card');
      if (card) {
        const title = card.querySelector('.card-head');
        if (title) title.textContent = chart.title;
      }
      return;
    }
    const card = document.createElement('div');
    card.className = 'card';
    card.style.margin = '0';
    card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
    // textContent, not innerHTML, so the title is never interpreted as markup.
    card.querySelector('.card-head').textContent = chart.title;
    const body = card.querySelector('.card-body');
    img = document.createElement('img');
    img.setAttribute('data-task-chart', '1');
    img.setAttribute('data-chart-file', chart.file);
    img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
    // Timestamp query defeats caching of the first image load.
    img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
    img.style.width = '100%';
    img.style.display = 'block';
    img.style.borderRadius = '6px';
    img.alt = chart.title;
    body.appendChild(img);
    host.appendChild(card);
    existing[chart.file] = img;
  });
  // Drop cards for charts that are no longer reported.
  Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
    const file = img.getAttribute('data-chart-file') || '';
    if (seen[file]) return;
    const card = img.closest('.card');
    if (card) card.remove();
  });
}
function loadTaskLiveCharts(taskId) { function loadTaskLiveCharts(taskId) {
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){ fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
const host = document.getElementById('task-live-charts'); renderTaskLiveCharts(taskId, charts);
if (!host) return;
if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...';
return;
}
host.innerHTML = charts.map(function(chart) {
return '<div class="card" style="margin:0">' +
'<div class="card-head">' + chart.title + '</div>' +
'<div class="card-body" style="padding:12px">' +
'<img data-task-chart="1" data-base-src="/api/tasks/' + taskId + '/chart/' + chart.file + '" src="/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now() + '" style="width:100%;display:block;border-radius:6px" alt="' + chart.title + '">' +
'</div></div>';
}).join('');
}).catch(function(){ }).catch(function(){
const host = document.getElementById('task-live-charts'); const host = document.getElementById('task-live-charts');
if (host) host.innerHTML = 'Task charts are unavailable.'; if (host) host.innerHTML = 'Task charts are unavailable.';
@@ -138,12 +179,31 @@ function refreshTaskLiveCharts() {
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream'); var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log'); var _taskDetailTerm = document.getElementById('task-live-log');
var _taskChartTimer = null; var _taskChartTimer = null;
var _taskChartsFrozen = false;
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; }; _taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; }; _taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); }); _taskDetailES.addEventListener('done', function(e){
_taskDetailES.onerror = function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); }; if (_taskChartTimer) clearInterval(_taskChartTimer);
_taskDetailES.close();
_taskDetailES = null;
_taskChartsFrozen = true;
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
refreshTaskLiveCharts();
});
_taskDetailES.onerror = function(){
if (_taskChartTimer) clearInterval(_taskChartTimer);
if (_taskDetailES) {
_taskDetailES.close();
_taskDetailES = null;
}
};
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
_taskChartTimer = setInterval(function(){ refreshTaskLiveCharts(); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); }, 2000); _taskChartTimer = setInterval(function(){
if (_taskChartsFrozen) return;
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
refreshTaskLiveCharts();
}, 2000);
</script>`) </script>`)
} }

View File

@@ -423,13 +423,14 @@ func (q *taskQueue) worker() {
setCPUGovernor("performance") setCPUGovernor("performance")
defer setCPUGovernor("powersave") defer setCPUGovernor("powersave")
// Drain all pending tasks and start them in parallel.
q.mu.Lock()
var batch []*Task
for { for {
q.mu.Lock()
t := q.nextPending() t := q.nextPending()
if t == nil { if t == nil {
break q.prune()
q.persistLocked()
q.mu.Unlock()
return
} }
now := time.Now() now := time.Now()
t.Status = TaskRunning t.Status = TaskRunning
@@ -438,29 +439,14 @@ func (q *taskQueue) worker() {
t.ErrMsg = "" t.ErrMsg = ""
j := newTaskJobState(t.LogPath, taskSerialPrefix(t)) j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
t.job = j t.job = j
batch = append(batch, t)
}
if len(batch) > 0 {
q.persistLocked() q.persistLocked()
} q.mu.Unlock()
q.mu.Unlock()
var wg sync.WaitGroup
for _, t := range batch {
t := t
j := t.job
taskCtx, taskCancel := context.WithCancel(context.Background()) taskCtx, taskCancel := context.WithCancel(context.Background())
j.cancel = taskCancel j.cancel = taskCancel
wg.Add(1) q.executeTask(t, j, taskCtx)
goRecoverOnce("task "+t.Target, func() { taskCancel()
defer wg.Done()
defer taskCancel()
q.executeTask(t, j, taskCtx)
})
}
wg.Wait()
if len(batch) > 0 {
q.mu.Lock() q.mu.Lock()
q.prune() q.prune()
q.persistLocked() q.persistLocked()

View File

@@ -11,18 +11,18 @@ echo " Hardware Audit LiveCD"
echo "" echo ""
menuentry "EASY-BEE" { menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
submenu "EASY-BEE (advanced options) -->" { submenu "EASY-BEE (advanced options) -->" {
menuentry "EASY-BEE — GSP=off" { menuentry "EASY-BEE — GSP=off" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }
menuentry "EASY-BEE — nomodeset" { menuentry "EASY-BEE — KMS (no nomodeset)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@ initrd @INITRD_LIVE@
} }

View File

@@ -44,23 +44,27 @@ else:
img = Image.new('RGB', (W, H), (0, 0, 0)) img = Image.new('RGB', (W, H), (0, 0, 0))
draw = ImageDraw.Draw(img) draw = ImageDraw.Draw(img)
# Measure logo block # Measure logo block line by line to avoid font ascender offset
lines = LOGO.split('\n') lines = LOGO.split('\n')
bbox = draw.textbbox((0, 0), LOGO, font=font_logo) logo_lines = lines[:6]
text_w = bbox[2] - bbox[0] sub_line = lines[6] if len(lines) > 6 else ''
text_h = bbox[3] - bbox[1]
x = (W - text_w) // 2 line_h = SIZE + 2
y = (H - text_h) // 2 block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
# Draw logo lines: first 6 in amber, last line (subtitle) dimmer # Width: measure the widest logo line
logo_lines = lines[:6] max_w = 0
sub_line = lines[6] if len(lines) > 6 else '' for line in logo_lines:
bb = draw.textbbox((0, 0), line, font=font_logo)
max_w = max(max_w, bb[2] - bb[0])
x = (W - max_w) // 2
y = (H - block_h) // 2
cy = y cy = y
for line in logo_lines: for line in logo_lines:
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e)) draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
cy += SIZE + 2 cy += line_h
cy += 8 cy += 8
if sub_line: if sub_line:
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18)) draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))

View File

@@ -209,6 +209,18 @@ fi
ldconfig 2>/dev/null || true ldconfig 2>/dev/null || true
log "ldconfig refreshed" log "ldconfig refreshed"
# Turn on NVIDIA persistence mode for the whole session so that dcgmi and the
# stress tools do not report deployment warnings on otherwise healthy GPUs.
# Failure here is logged as a warning only; it does not abort the script.
if ! command -v nvidia-smi >/dev/null 2>&1; then
    log "WARN: nvidia-smi not found — cannot enable persistence mode"
elif nvidia-smi -pm 1 >/dev/null 2>&1; then
    log "enabled NVIDIA persistence mode"
else
    log "WARN: failed to enable NVIDIA persistence mode"
fi
# Start DCGM host engine so dcgmi can discover GPUs. # Start DCGM host engine so dcgmi can discover GPUs.
# nv-hostengine must run after the NVIDIA modules and device nodes are ready. # nv-hostengine must run after the NVIDIA modules and device nodes are ready.
# If it started too early (for example via systemd before bee-nvidia-load), it can # If it started too early (for example via systemd before bee-nvidia-load), it can