refactor(webui): redesign Burn tab and fix gpu-burn memory defaults
- Burn tab: replace 6 flat cards with 3 grouped cards (GPU Stress, Compute Stress, Platform Thermal Cycling) + global Burn Profile - Run All button at top enqueues all enabled tests across all cards - GPU Stress: tool checkboxes enabled/disabled via new /api/gpu/tools endpoint based on driver status (/dev/nvidia0, /dev/kfd) - Compute Stress: checkboxes for cpu/memory-stress/stressapptest - Platform Thermal Cycling: component checkboxes (cpu/nvidia/amd) with platform_components param wired through to PlatformStressOptions - bee-gpu-burn: default size-mb changed from 64 to 0 (auto); script now queries nvidia-smi memory.total per GPU and uses 95% of it - platform_stress: removed hardcoded --size-mb 64; respects Components field to selectively run CPU and/or GPU load goroutines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -95,9 +95,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
|
||||
@@ -26,7 +26,8 @@ type PlatformStressCycle struct {
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
Cycles []PlatformStressCycle
|
||||
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
@@ -68,8 +69,11 @@ func (s *System) RunPlatformStress(
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
@@ -88,27 +92,31 @@ func (s *System) RunPlatformStress(
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
if hasCPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
}
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
if hasGPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
@@ -439,7 +447,7 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
@@ -486,6 +494,15 @@ func platformStressMemoryMB() int {
|
||||
return mb
|
||||
}
|
||||
|
||||
func containsComponent(components []string, name string) bool {
|
||||
for _, c := range components {
|
||||
if c == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
|
||||
@@ -181,13 +181,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
@@ -204,13 +205,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
@@ -512,6 +514,26 @@ func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
})
|
||||
}
|
||||
|
||||
// ── GPU tools ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
|
||||
type toolEntry struct {
|
||||
ID string `json:"id"`
|
||||
Available bool `json:"available"`
|
||||
Vendor string `json:"vendor"` // "nvidia" | "amd"
|
||||
}
|
||||
_, nvidiaErr := os.Stat("/dev/nvidia0")
|
||||
_, amdErr := os.Stat("/dev/kfd")
|
||||
nvidiaUp := nvidiaErr == nil
|
||||
amdUp := amdErr == nil
|
||||
writeJSON(w, []toolEntry{
|
||||
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
||||
})
|
||||
}
|
||||
|
||||
// ── System ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -657,96 +657,210 @@ func renderSATCard(id, label, extra string) string {
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
||||
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke" selected>Smoke: quick check (~5 min CPU / DCGM level 1)</option><option value="acceptance">Acceptance: 1 hour (DCGM level 3)</option><option value="overnight">Overnight: 8 hours (DCGM level 4)</option></select></div>
|
||||
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA SAT on the Validate page still uses DCGM. NVIDIA GPU Stress on this page uses the selected stress loader for the preset duration.</p>
|
||||
</div></div>
|
||||
<div class="grid3">
|
||||
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
||||
<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="nccl">NCCL all_reduce_perf</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
|
||||
<div class="form-row"><label>Exclude GPU indices</label><input type="text" id="nvidia-stress-exclude" placeholder="e.g. 1,3"></div>
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. <code>NCCL all_reduce_perf</code> is useful for multi-GPU / interconnect load. Use exclusions only when one or more cards must be skipped.</p>
|
||||
<button id="sat-btn-nvidia-stress" class="btn btn-primary" onclick="runBurnIn('nvidia-stress')">▶ Start NVIDIA Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">stress-ng --vm writes and verifies memory patterns across all of RAM. Env: <code>BEE_VM_STRESS_SECONDS</code> (default 300), <code>BEE_VM_STRESS_SIZE_MB</code> (default 80%).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('memory-stress')">▶ Start Memory Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">SAT Stress (stressapptest)</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||
</div></div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Burn Profile</div>
|
||||
<div class="card-body" style="display:flex;align-items:center;gap:16px;flex-wrap:wrap">
|
||||
<div class="form-row" style="margin:0;max-width:380px"><label>Preset</label><select id="burn-profile">
|
||||
<option value="smoke" selected>Smoke — quick check (~5 min)</option>
|
||||
<option value="acceptance">Acceptance — 1 hour</option>
|
||||
<option value="overnight">Overnight — 8 hours</option>
|
||||
</select></div>
|
||||
<button class="btn btn-primary" onclick="runAll()">▶ Run All</button>
|
||||
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3" style="margin-bottom:16px">
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
|
||||
<div id="gpu-tools-list">
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||
</div>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">▶ Run GPU Stress</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Compute Stress</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runComputeStress()">▶ Run Compute Stress</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Platform Thermal Cycling</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Repeated load+idle cycles. Detects cooling recovery failures and GPU throttle. Smoke: 2×90s. Acceptance: 4×300s.</p>
|
||||
<p style="font-size:12px;font-weight:600;margin:0 0 6px">Load components:</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-cpu" checked><span>CPU (stressapptest)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-nvidia" disabled><span>NVIDIA GPU <span class="cb-note" id="note-pt-nvidia"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-amd" disabled><span>AMD GPU <span class="cb-note" id="note-pt-amd"></span></span></label>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runPlatformStress()">▶ Run Thermal Cycling</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.cb-row { display:flex; align-items:center; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; flex-shrink:0; }
|
||||
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let biES = null;
|
||||
function parseGPUIndexList(raw) {
|
||||
return (raw || '')
|
||||
.split(',')
|
||||
.map(v => v.trim())
|
||||
.filter(v => v !== '')
|
||||
.map(v => Number(v))
|
||||
.filter(v => Number.isInteger(v) && v >= 0);
|
||||
|
||||
function profile() { return document.getElementById('burn-profile').value || 'smoke'; }
|
||||
|
||||
function enqueueTask(target, extra) {
|
||||
const body = Object.assign({ profile: profile() }, extra || {});
|
||||
return fetch('/api/sat/'+target+'/run', {
|
||||
method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify(body)
|
||||
}).then(r => r.json());
|
||||
}
|
||||
function runBurnIn(target) {
|
||||
|
||||
function streamTask(taskId, label) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
||||
if (target === 'nvidia-stress') {
|
||||
body.loader = document.getElementById('nvidia-stress-loader').value || 'builtin';
|
||||
body.exclude_gpu_indices = parseGPUIndexList(document.getElementById('nvidia-stress-exclude').value);
|
||||
}
|
||||
document.getElementById('bi-output').style.display='block';
|
||||
const loaderLabel = body.loader ? ' / ' + body.loader : '';
|
||||
document.getElementById('bi-title').textContent = '— ' + target + loaderLabel + ' [' + body.profile + ']';
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + profile() + ']';
|
||||
const term = document.getElementById('bi-terminal');
|
||||
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r => r.json())
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued.\n';
|
||||
biES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
biES.addEventListener('done', e => { biES.close(); biES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
});
|
||||
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||
biES = new EventSource('/api/tasks/'+taskId+'/stream');
|
||||
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', e => {
|
||||
biES.close(); biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n';
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
|
||||
});
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
|
||||
btn.parentNode.insertBefore(note, btn.nextSibling);
|
||||
}
|
||||
note.textContent = reason;
|
||||
|
||||
function runGPUStress() {
|
||||
const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
let last = null;
|
||||
ids.filter(id => {
|
||||
const el = document.getElementById(id);
|
||||
return el && el.checked && !el.disabled;
|
||||
}).forEach(id => {
|
||||
const target = targetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
|
||||
});
|
||||
}
|
||||
|
||||
function runComputeStress() {
|
||||
const tasks = [
|
||||
{id:'burn-cpu', target:'cpu'},
|
||||
{id:'burn-mem-stress', target:'memory-stress'},
|
||||
{id:'burn-sat-stress', target:'sat-stress'},
|
||||
];
|
||||
let last = null;
|
||||
tasks.filter(t => {
|
||||
const el = document.getElementById(t.id);
|
||||
return el && el.checked;
|
||||
}).forEach(t => {
|
||||
enqueueTask(t.target).then(d => { last = d; streamTask(d.task_id, t.target); });
|
||||
});
|
||||
}
|
||||
|
||||
function runPlatformStress() {
|
||||
const comps = [];
|
||||
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||
const nv = document.getElementById('burn-pt-nvidia');
|
||||
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||
const am = document.getElementById('burn-pt-amd');
|
||||
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||
enqueueTask('platform-stress', extra).then(d => streamTask(d.task_id, 'platform-stress'));
|
||||
}
|
||||
|
||||
function runAll() {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
let count = 0;
|
||||
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
|
||||
|
||||
// GPU tests
|
||||
const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
|
||||
const target = gpuTargetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
|
||||
});
|
||||
|
||||
// Compute tests
|
||||
[{id:'burn-cpu',target:'cpu'},{id:'burn-mem-stress',target:'memory-stress'},{id:'burn-sat-stress',target:'sat-stress'}]
|
||||
.filter(t => { const el = document.getElementById(t.id); return el && el.checked; })
|
||||
.forEach(t => enqueueTask(t.target).then(d => { streamTask(d.task_id, t.target); done(); }));
|
||||
|
||||
// Platform
|
||||
const comps = [];
|
||||
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||
const nv = document.getElementById('burn-pt-nvidia');
|
||||
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||
const am = document.getElementById('burn-pt-amd');
|
||||
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||
const ptExtra = comps.length > 0 ? {platform_components: comps} : {};
|
||||
enqueueTask('platform-stress', ptExtra).then(d => { streamTask(d.task_id, 'platform-stress'); done(); });
|
||||
}
|
||||
|
||||
// Load GPU tool availability
|
||||
fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
|
||||
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
|
||||
const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
|
||||
tools.forEach(t => {
|
||||
const cb = document.getElementById(nvidiaMap[t.id]);
|
||||
const note = document.getElementById(noteMap[t.id]);
|
||||
if (!cb) return;
|
||||
if (t.available) {
|
||||
cb.disabled = false;
|
||||
if (t.id === 'bee-gpu-burn') cb.checked = true;
|
||||
} else {
|
||||
const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
|
||||
if (note) note.textContent = '— ' + reason;
|
||||
}
|
||||
}
|
||||
});
|
||||
}).catch(() => {});
|
||||
|
||||
// Load GPU presence for platform thermal cycling
|
||||
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||
const nvCb = document.getElementById('burn-pt-nvidia');
|
||||
const amCb = document.getElementById('burn-pt-amd');
|
||||
const nvNote = document.getElementById('note-pt-nvidia');
|
||||
const amNote = document.getElementById('note-pt-amd');
|
||||
if (gp.nvidia) {
|
||||
nvCb.disabled = false;
|
||||
nvCb.checked = true;
|
||||
} else {
|
||||
if (nvNote) nvNote.textContent = '— NVIDIA driver not running';
|
||||
}
|
||||
if (gp.amd) {
|
||||
amCb.disabled = false;
|
||||
amCb.checked = true;
|
||||
} else {
|
||||
if (amNote) amNote.textContent = '— AMD driver not running';
|
||||
}
|
||||
}).catch(() => {});
|
||||
</script>`
|
||||
}
|
||||
|
||||
|
||||
@@ -255,8 +255,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||
|
||||
// GPU presence
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
@@ -516,6 +517,7 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
vPower, l := h.ringPower.snapshot()
|
||||
vPower = normalizePowerSeries(vPower)
|
||||
labels = l
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
@@ -728,9 +730,11 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
@@ -928,6 +932,27 @@ func coalesceDataset(ds []float64, n int) []float64 {
|
||||
return make([]float64, n)
|
||||
}
|
||||
|
||||
func normalizePowerSeries(ds []float64) []float64 {
|
||||
if len(ds) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]float64, len(ds))
|
||||
copy(out, ds)
|
||||
last := 0.0
|
||||
haveLast := false
|
||||
for i, v := range out {
|
||||
if v > 0 {
|
||||
last = v
|
||||
haveLast = true
|
||||
continue
|
||||
}
|
||||
if haveLast {
|
||||
out[i] = last
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// floatPtr returns a pointer to a float64 value.
|
||||
func floatPtr(v float64) *float64 { return &v }
|
||||
|
||||
|
||||
@@ -106,9 +106,10 @@ type taskParams struct {
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
@@ -550,6 +551,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
break
|
||||
}
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
opts.Components = t.params.PlatformComponents
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
if a == nil {
|
||||
|
||||
Reference in New Issue
Block a user