Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 |
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
|
||||
@@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-power.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-pulse-test.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-nvbandwidth.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -389,16 +389,16 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-stress.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
@@ -568,14 +568,24 @@ type satStats struct {
|
||||
Unsupported int
|
||||
}
|
||||
|
||||
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||
out := make([]satJob, 0, len(jobs)+1)
|
||||
out = append(out, satJob{
|
||||
name: "00-nvidia-smi-persistence-mode.log",
|
||||
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||
})
|
||||
out = append(out, jobs...)
|
||||
return out
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
@@ -590,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||
|
||||
@@ -28,13 +28,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
if len(jobs) != 6 {
|
||||
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
@@ -82,7 +88,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
got := jobs[5].cmd
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
@@ -94,6 +100,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -8,9 +8,12 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -161,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "Benchmark"
|
||||
body = renderBenchmark()
|
||||
body = renderBenchmark(opts)
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
@@ -1068,17 +1071,23 @@ func renderValidate(opts HandlerOptions) string {
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Runs one GPU at a time. Diag level is taken from Validate Profile.`,
|
||||
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
|
||||
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
|
||||
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
|
||||
)) +
|
||||
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
@@ -1100,6 +1109,8 @@ func renderValidate(opts HandlerOptions) string {
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
@@ -1128,6 +1139,59 @@ function loadSatNvidiaGPUs() {
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row">'
|
||||
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectNoGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satLoadGPUs() {
|
||||
loadSatNvidiaGPUs().then(function(gpus) {
|
||||
satRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) {
|
||||
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
}
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satGPUDisplayName(gpu) {
|
||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||
@@ -1149,6 +1213,36 @@ function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||
.then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(function(resolve) {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', function(e) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = function() {
|
||||
if (satES) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
@@ -1163,24 +1257,23 @@ function runSAT(target) {
|
||||
return runSATWithOverrides(target, null);
|
||||
}
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
});
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => ({
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||
target: target,
|
||||
overrides: {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
@@ -1191,65 +1284,61 @@ function expandSATTarget(target) {
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
return loadSatNvidiaGPUs().then(gpus => {
|
||||
if (!gpus.length) return;
|
||||
if (gpus.length === 1) {
|
||||
const gpu = gpus[0];
|
||||
const selected = satSelectedGPUIndices();
|
||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||||
if (!picked.length) {
|
||||
throw new Error('Select at least one NVIDIA GPU.');
|
||||
}
|
||||
if (picked.length === 1) {
|
||||
const gpu = picked[0];
|
||||
return runSATWithOverrides(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||
});
|
||||
}
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n';
|
||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||||
const enqueueNext = (idx) => {
|
||||
if (idx >= gpus.length) return;
|
||||
const gpu = gpus[idx];
|
||||
const runNext = (idx) => {
|
||||
if (idx >= picked.length) return Promise.resolve();
|
||||
const gpu = picked[idx];
|
||||
const gpuLabel = satGPUDisplayName(gpu);
|
||||
enqueueSATTarget(target, {
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||||
return enqueueSATTarget(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: labelBase + ' (' + gpuLabel + ')'
|
||||
}).then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n';
|
||||
if (idx === gpus.length - 1) {
|
||||
satES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
}
|
||||
enqueueNext(idx + 1);
|
||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
enqueueNext(0);
|
||||
return runNext(0);
|
||||
});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing AMD validate set...\n';
|
||||
term.textContent = 'Running AMD validate set one by one...\n';
|
||||
const labels = satLabels();
|
||||
const enqueueNext = (idx) => {
|
||||
if (idx >= targets.length) return;
|
||||
const runNext = (idx) => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const target = targets[idx];
|
||||
enqueueSATTarget(target)
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n';
|
||||
if (idx === targets.length - 1) {
|
||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
}
|
||||
enqueueNext(idx + 1);
|
||||
return streamSATTask(d.task_id, labels[target], false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
enqueueNext(0);
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
@@ -1271,17 +1360,17 @@ function runAllSAT() {
|
||||
status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const enqueueNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; }
|
||||
const runNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
enqueueSATTarget(item.target, item.overrides)
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides)
|
||||
.then(() => {
|
||||
enqueued++;
|
||||
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...';
|
||||
enqueueNext(idx + 1);
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
enqueueNext(0);
|
||||
return runNext(0);
|
||||
}).catch(err => {
|
||||
status.textContent = 'Error: ' + err.message;
|
||||
});
|
||||
@@ -1294,6 +1383,7 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
satLoadGPUs();
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
@@ -1482,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBenchmark() string {
|
||||
type benchmarkHistoryColumn struct {
|
||||
key string
|
||||
label string
|
||||
name string
|
||||
index int
|
||||
}
|
||||
|
||||
type benchmarkHistoryCell struct {
|
||||
score float64
|
||||
present bool
|
||||
}
|
||||
|
||||
type benchmarkHistoryRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
cells map[string]benchmarkHistoryCell
|
||||
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
@@ -1531,6 +1639,8 @@ func renderBenchmark() string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + renderBenchmarkResultsCard(opts.ExportDir) + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
@@ -1667,6 +1777,115 @@ benchmarkLoadGPUs();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
columns, runs := loadBenchmarkHistory(exportDir)
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body">`)
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">Composite score by saved benchmark run and GPU.</p>`)
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
|
||||
for _, col := range columns {
|
||||
b.WriteString(`<th>` + html.EscapeString(col.label) + `</th>`)
|
||||
}
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
for _, col := range columns {
|
||||
cell, ok := run.cells[col.key]
|
||||
if !ok || !cell.present {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
b.WriteString(`<td>` + fmt.Sprintf("%.2f", cell.score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBenchmarkBaseDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-benchmark")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
columnByKey := make(map[string]benchmarkHistoryColumn)
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var result platform.NvidiaBenchmarkResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
cells: make(map[string]benchmarkHistoryCell),
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
}
|
||||
run.cells[key] = benchmarkHistoryCell{
|
||||
score: gpu.Scores.CompositeScore,
|
||||
present: true,
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
|
||||
columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
|
||||
for _, col := range columnByKey {
|
||||
columns = append(columns, col)
|
||||
}
|
||||
sort.Slice(columns, func(i, j int) bool {
|
||||
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
||||
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
||||
if leftName != rightName {
|
||||
return leftName < rightName
|
||||
}
|
||||
if columns[i].index != columns[j].index {
|
||||
return columns[i].index < columns[j].index
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
})
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
return columns, runs
|
||||
}
|
||||
|
||||
func benchmarkHistoryColumnKey(name string, index int) string {
|
||||
return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
|
||||
}
|
||||
|
||||
func benchmarkHistoryColumnLabel(name string, index int) string {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" {
|
||||
return fmt.Sprintf("GPU %d", index)
|
||||
}
|
||||
return fmt.Sprintf("%s / GPU %d", name, index)
|
||||
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBurn() string {
|
||||
@@ -1886,6 +2105,36 @@ function streamTask(taskId, label) {
|
||||
term.scrollTop = term.scrollHeight;
|
||||
});
|
||||
}
|
||||
function streamBurnTask(taskId, label, resetTerminal) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||
const term = document.getElementById('bi-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
|
||||
return new Promise(function(resolve) {
|
||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', function(e) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
biES.onerror = function() {
|
||||
if (biES) {
|
||||
biES.close();
|
||||
biES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function runBurnTaskSet(tasks, statusElId) {
|
||||
const enabled = tasks.filter(function(t) {
|
||||
@@ -1898,19 +2147,33 @@ function runBurnTaskSet(tasks, statusElId) {
|
||||
if (status) status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
enabled.forEach(function(t) {
|
||||
enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||
const term = document.getElementById('bi-terminal');
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||
term.textContent = '';
|
||||
const runNext = function(idx) {
|
||||
if (idx >= enabled.length) {
|
||||
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||
return Promise.resolve();
|
||||
}
|
||||
const t = enabled[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||
.then(function(d) {
|
||||
if (status) status.textContent = enabled.length + ' task(s) queued.';
|
||||
streamTask(d.task_id, t.label);
|
||||
return streamBurnTask(d.task_id, t.label, false);
|
||||
})
|
||||
.then(function() {
|
||||
return runNext(idx + 1);
|
||||
})
|
||||
.catch(function(err) {
|
||||
if (status) status.textContent = 'Error: ' + err.message;
|
||||
const term = document.getElementById('bi-terminal');
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
return Promise.reject(err);
|
||||
});
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
|
||||
function runPlatformStress() {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -601,8 +602,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
if !strings.Contains(body, `restartGPUDrivers()`) {
|
||||
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
@@ -636,6 +637,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
{
|
||||
Index: 1,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1168.50,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`NVIDIA H100 PCIe / GPU 1`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
`1168.50`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -649,6 +710,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`id="sat-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
|
||||
@@ -97,32 +97,73 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
if task.Status == TaskRunning {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
body.WriteString(`<script>
|
||||
function cancelTaskDetail(id) {
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){ window.location.reload(); });
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||
var term = document.getElementById('task-live-log');
|
||||
if (term) {
|
||||
term.textContent += '\nCancel requested.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
});
|
||||
}
|
||||
function renderTaskLiveCharts(taskId, charts) {
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
const seen = {};
|
||||
charts.forEach(function(chart) {
|
||||
seen[chart.file] = true;
|
||||
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||
if (img) {
|
||||
const card = img.closest('.card');
|
||||
if (card) {
|
||||
const title = card.querySelector('.card-head');
|
||||
if (title) title.textContent = chart.title;
|
||||
}
|
||||
return;
|
||||
}
|
||||
const card = document.createElement('div');
|
||||
card.className = 'card';
|
||||
card.style.margin = '0';
|
||||
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||
card.querySelector('.card-head').textContent = chart.title;
|
||||
const body = card.querySelector('.card-body');
|
||||
img = document.createElement('img');
|
||||
img.setAttribute('data-task-chart', '1');
|
||||
img.setAttribute('data-chart-file', chart.file);
|
||||
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||
img.style.width = '100%';
|
||||
img.style.display = 'block';
|
||||
img.style.borderRadius = '6px';
|
||||
img.alt = chart.title;
|
||||
body.appendChild(img);
|
||||
host.appendChild(card);
|
||||
});
|
||||
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||
const file = img.getAttribute('data-chart-file') || '';
|
||||
if (seen[file]) return;
|
||||
const card = img.closest('.card');
|
||||
if (card) card.remove();
|
||||
});
|
||||
}
|
||||
function loadTaskLiveCharts(taskId) {
|
||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
host.innerHTML = charts.map(function(chart) {
|
||||
return '<div class="card" style="margin:0">' +
|
||||
'<div class="card-head">' + chart.title + '</div>' +
|
||||
'<div class="card-body" style="padding:12px">' +
|
||||
'<img data-task-chart="1" data-base-src="/api/tasks/' + taskId + '/chart/' + chart.file + '" src="/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now() + '" style="width:100%;display:block;border-radius:6px" alt="' + chart.title + '">' +
|
||||
'</div></div>';
|
||||
}).join('');
|
||||
renderTaskLiveCharts(taskId, charts);
|
||||
}).catch(function(){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||
@@ -138,12 +179,31 @@ function refreshTaskLiveCharts() {
|
||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||
var _taskChartTimer = null;
|
||||
var _taskChartsFrozen = false;
|
||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||
_taskDetailES.addEventListener('done', function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
|
||||
_taskDetailES.onerror = function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); };
|
||||
_taskDetailES.addEventListener('done', function(e){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
_taskChartsFrozen = true;
|
||||
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||
refreshTaskLiveCharts();
|
||||
});
|
||||
_taskDetailES.onerror = function(){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
if (_taskDetailES) {
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
}
|
||||
};
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
_taskChartTimer = setInterval(function(){ refreshTaskLiveCharts(); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); }, 2000);
|
||||
_taskChartTimer = setInterval(function(){
|
||||
if (_taskChartsFrozen) return;
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
refreshTaskLiveCharts();
|
||||
}, 2000);
|
||||
</script>`)
|
||||
}
|
||||
|
||||
|
||||
@@ -423,13 +423,14 @@ func (q *taskQueue) worker() {
|
||||
setCPUGovernor("performance")
|
||||
defer setCPUGovernor("powersave")
|
||||
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
q.mu.Lock()
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
break
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
@@ -438,29 +439,14 @@ func (q *taskQueue) worker() {
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
q.mu.Unlock()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
goRecoverOnce("task "+t.Target, func() {
|
||||
defer wg.Done()
|
||||
defer taskCancel()
|
||||
q.executeTask(t, j, taskCtx)
|
||||
})
|
||||
}
|
||||
wg.Wait()
|
||||
q.executeTask(t, j, taskCtx)
|
||||
taskCancel()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
|
||||
@@ -11,18 +11,18 @@ echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — nomodeset" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
@@ -44,23 +44,27 @@ else:
|
||||
img = Image.new('RGB', (W, H), (0, 0, 0))
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Measure logo block
|
||||
# Measure logo block line by line to avoid font ascender offset
|
||||
lines = LOGO.split('\n')
|
||||
bbox = draw.textbbox((0, 0), LOGO, font=font_logo)
|
||||
text_w = bbox[2] - bbox[0]
|
||||
text_h = bbox[3] - bbox[1]
|
||||
logo_lines = lines[:6]
|
||||
sub_line = lines[6] if len(lines) > 6 else ''
|
||||
|
||||
x = (W - text_w) // 2
|
||||
y = (H - text_h) // 2
|
||||
line_h = SIZE + 2
|
||||
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
|
||||
|
||||
# Draw logo lines: first 6 in amber, last line (subtitle) dimmer
|
||||
logo_lines = lines[:6]
|
||||
sub_line = lines[6] if len(lines) > 6 else ''
|
||||
# Width: measure the widest logo line
|
||||
max_w = 0
|
||||
for line in logo_lines:
|
||||
bb = draw.textbbox((0, 0), line, font=font_logo)
|
||||
max_w = max(max_w, bb[2] - bb[0])
|
||||
|
||||
x = (W - max_w) // 2
|
||||
y = (H - block_h) // 2
|
||||
|
||||
cy = y
|
||||
for line in logo_lines:
|
||||
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
||||
cy += SIZE + 2
|
||||
cy += line_h
|
||||
cy += 8
|
||||
if sub_line:
|
||||
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
||||
|
||||
@@ -209,6 +209,18 @@ fi
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
# Keep persistence mode enabled across the session so dcgmi / stress tools do
|
||||
# not fail with deployment warnings on otherwise healthy GPUs.
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||
log "enabled NVIDIA persistence mode"
|
||||
else
|
||||
log "WARN: failed to enable NVIDIA persistence mode"
|
||||
fi
|
||||
else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
Reference in New Issue
Block a user