WIP: checkpoint current tree
This commit is contained in:
@@ -274,9 +274,6 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
if !opts.RunNCCL {
|
||||
opts.RunNCCL = true
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
|
||||
@@ -41,6 +41,21 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||
Profile: "stability",
|
||||
RunNCCL: false,
|
||||
})
|
||||
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||
}
|
||||
if opts.RunNCCL {
|
||||
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
}
|
||||
|
||||
log("Verifying live medium now served from RAM...")
|
||||
status := s.LiveBootSource()
|
||||
if err := verifyInstallToRAMStatus(status); err != nil {
|
||||
return err
|
||||
}
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
||||
}
|
||||
|
||||
func describeLiveBootSource(status LiveBootSource) string {
|
||||
source := strings.TrimSpace(status.Device)
|
||||
if source == "" {
|
||||
source = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if source == "" {
|
||||
source = "unknown source"
|
||||
}
|
||||
switch strings.TrimSpace(status.Kind) {
|
||||
case "ram":
|
||||
return "RAM"
|
||||
case "usb":
|
||||
return "USB (" + source + ")"
|
||||
case "cdrom":
|
||||
return "CD-ROM (" + source + ")"
|
||||
case "disk":
|
||||
return "disk (" + source + ")"
|
||||
default:
|
||||
return source
|
||||
}
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||
}
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
||||
if err == nil {
|
||||
t.Fatal("expected verification failure when media is still on USB")
|
||||
}
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
||||
t.Fatalf("error=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||
t.Fatalf("got %q want RAM", got)
|
||||
}
|
||||
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,11 +12,11 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -76,15 +76,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
Name string
|
||||
MemoryMB int
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
MemoryMB int `json:"memory_mb"`
|
||||
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
type AMDGPUInfo struct {
|
||||
Index int
|
||||
Name string
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||
|
||||
@@ -10,17 +10,30 @@ import (
|
||||
func (s *System) ListBeeServices() ([]string, error) {
|
||||
seen := map[string]bool{}
|
||||
var out []string
|
||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
||||
for _, pattern := range []string{
|
||||
"/etc/systemd/system/bee-*.service",
|
||||
"/lib/systemd/system/bee-*.service",
|
||||
"/etc/systemd/system/bee-*.timer",
|
||||
"/lib/systemd/system/bee-*.timer",
|
||||
} {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
base := filepath.Base(match)
|
||||
name := base
|
||||
if strings.HasSuffix(base, ".service") {
|
||||
name = strings.TrimSuffix(base, ".service")
|
||||
}
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
|
||||
@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
|
||||
}
|
||||
|
||||
type RemovableTarget struct {
|
||||
Device string
|
||||
FSType string
|
||||
Size string
|
||||
Label string
|
||||
Model string
|
||||
Mountpoint string
|
||||
Device string `json:"device"`
|
||||
FSType string `json:"fs_type"`
|
||||
Size string `json:"size"`
|
||||
Label string `json:"label"`
|
||||
Model string `json:"model"`
|
||||
Mountpoint string `json:"mountpoint"`
|
||||
}
|
||||
|
||||
type ToolStatus struct {
|
||||
|
||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
data, err := json.Marshal(RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "exfat",
|
||||
Size: "1.8T",
|
||||
Label: "USB",
|
||||
Model: "Flash",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
raw := string(data)
|
||||
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||
if !strings.Contains(raw, key) {
|
||||
t.Fatalf("json missing key %s: %s", key, raw)
|
||||
}
|
||||
}
|
||||
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||
t.Fatalf("json still contains Go field names: %s", raw)
|
||||
}
|
||||
}
|
||||
@@ -232,6 +232,54 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Profile string `json:"profile"`
|
||||
SizeMB int `json:"size_mb"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
runNCCL := true
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("benchmark-nvidia"),
|
||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
||||
Target: "nvidia-benchmark",
|
||||
Priority: 15,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
DisplayName: body.DisplayName,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
if id == "" {
|
||||
@@ -491,6 +539,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
gpus, err := h.opts.App.ListNvidiaGPUs()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if gpus == nil {
|
||||
gpus = []platform.NvidiaGPU{}
|
||||
}
|
||||
writeJSON(w, gpus)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
@@ -516,8 +580,10 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
|
||||
_, amdErr := os.Stat("/dev/kfd")
|
||||
nvidiaUp := nvidiaErr == nil
|
||||
amdUp := amdErr == nil
|
||||
_, dcgmErr := exec.LookPath("dcgmi")
|
||||
writeJSON(w, []toolEntry{
|
||||
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "dcgm", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
|
||||
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
||||
|
||||
@@ -64,6 +64,42 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-benchmark" {
|
||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
}
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
|
||||
@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
case "nvidia", "nvidia-benchmark", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
|
||||
@@ -91,6 +91,7 @@ func layoutNav(active string, buildLabel string) string {
|
||||
{"audit", "Audit", "/audit", ""},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
}
|
||||
@@ -140,6 +141,10 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
body = renderBurn()
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "Benchmark"
|
||||
body = renderBenchmark()
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
@@ -781,6 +786,193 @@ func renderSATCard(id, label, extra string) string {
|
||||
label, extra, id, id)
|
||||
}
|
||||
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBenchmark() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">NVIDIA Benchmark</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — about 15 minutes</option>
|
||||
<option value="stability">Stability — 1 to 2 hours</option>
|
||||
<option value="overnight">Overnight — 8 hours</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label>GPU Selection</label>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="checkbox" id="benchmark-run-nccl" checked>
|
||||
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>▶ Run Benchmark</button>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p>
|
||||
<table>
|
||||
<tr><th>Profile</th><th>Purpose</th></tr>
|
||||
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr>
|
||||
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr>
|
||||
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const btn = document.getElementById('benchmark-run-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
const nccl = document.getElementById('benchmark-run-nccl');
|
||||
if (!selected.length) {
|
||||
btn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
btn.disabled = false;
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '.';
|
||||
if (nccl && nccl.checked && selected.length < 2) {
|
||||
note.textContent += ' NCCL will be skipped because fewer than 2 GPUs are selected.';
|
||||
} else if (nccl && nccl.checked) {
|
||||
note.textContent += ' NCCL interconnect will use only these GPUs.';
|
||||
}
|
||||
}
|
||||
|
||||
function benchmarkRenderGPUList(gpus) {
|
||||
const root = document.getElementById('benchmark-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="benchmark-gpu-row">'
|
||||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
|
||||
function benchmarkLoadGPUs() {
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
status.textContent = '';
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
benchmarkRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
|
||||
function benchmarkSelectAll() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
|
||||
function benchmarkSelectNone() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
|
||||
function runNvidiaBenchmark() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
status.textContent = 'Select at least one GPU.';
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
||||
display_name: 'NVIDIA Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
fetch('/api/benchmark/nvidia/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
status.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
status.textContent = e.data ? 'Failed.' : 'Completed.';
|
||||
});
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
|
||||
document.getElementById('benchmark-run-nccl').addEventListener('change', benchmarkUpdateSelectionNote);
|
||||
benchmarkLoadGPUs();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBurn() string {
|
||||
@@ -805,11 +997,12 @@ func renderBurn() string {
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">NVIDIA tools run on all discovered GPUs. DCGM is the official NVIDIA diagnostic path. NCCL exercises multi-GPU fabric and is not a full compute burn.</p>
|
||||
<div id="gpu-tools-list">
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-dcgm" value="dcgm" disabled><span>DCGM Diagnostics (Official NVIDIA) <span class="cb-note" id="note-dcgm"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf (Interconnect) <span class="cb-note" id="note-nccl"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||
</div>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">▶ Run GPU Stress</button>
|
||||
@@ -881,17 +1074,18 @@ function streamTask(taskId, label) {
|
||||
}
|
||||
|
||||
function runGPUStress() {
|
||||
const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
let last = null;
|
||||
ids.filter(id => {
|
||||
const el = document.getElementById(id);
|
||||
const tasks = [
|
||||
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
|
||||
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
|
||||
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
|
||||
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
|
||||
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
|
||||
];
|
||||
tasks.filter(t => {
|
||||
const el = document.getElementById(t.id);
|
||||
return el && el.checked && !el.disabled;
|
||||
}).forEach(id => {
|
||||
const target = targetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
|
||||
}).forEach(t => {
|
||||
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -928,13 +1122,15 @@ function runAll() {
|
||||
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
|
||||
|
||||
// GPU tests
|
||||
const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
|
||||
const target = gpuTargetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
|
||||
const gpuTasks = [
|
||||
{id:'burn-gpu-bee', target:'nvidia-stress', label:'bee-gpu-burn', extra:{loader:'builtin'}},
|
||||
{id:'burn-gpu-dcgm', target:'nvidia', label:'DCGM Diagnostics (Official NVIDIA)', extra:{display_name:'NVIDIA DCGM Diagnostics (Official)'}},
|
||||
{id:'burn-gpu-john', target:'nvidia-stress', label:'John GPU Stress', extra:{loader:'john'}},
|
||||
{id:'burn-gpu-nccl', target:'nvidia-stress', label:'NCCL Interconnect Stress', extra:{loader:'nccl', display_name:'NCCL Interconnect Stress'}},
|
||||
{id:'burn-gpu-rvs', target:'amd-stress', label:'RVS GST', extra:{}},
|
||||
];
|
||||
gpuTasks.filter(t => { const el = document.getElementById(t.id); return el && el.checked && !el.disabled; }).forEach(t => {
|
||||
enqueueTask(t.target, t.extra).then(d => { streamTask(d.task_id, t.label); done(); });
|
||||
});
|
||||
|
||||
// Compute tests
|
||||
@@ -955,17 +1151,19 @@ function runAll() {
|
||||
|
||||
// Load GPU tool availability
|
||||
fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
|
||||
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
|
||||
const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
|
||||
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','dcgm':'burn-gpu-dcgm','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
|
||||
const noteMap = {'bee-gpu-burn':'note-bee','dcgm':'note-dcgm','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
|
||||
tools.forEach(t => {
|
||||
const cb = document.getElementById(nvidiaMap[t.id]);
|
||||
const note = document.getElementById(noteMap[t.id]);
|
||||
if (!cb) return;
|
||||
if (t.available) {
|
||||
cb.disabled = false;
|
||||
if (t.id === 'bee-gpu-burn') cb.checked = true;
|
||||
if (t.id === 'bee-gpu-burn' || t.id === 'dcgm') cb.checked = true;
|
||||
} else {
|
||||
const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
|
||||
let reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
|
||||
if (t.id === 'dcgm' && t.vendor === 'nvidia') reason = 'dcgmi not available or NVIDIA driver not running';
|
||||
if (t.id === 'nccl' && t.vendor === 'nvidia') reason = 'NCCL interconnect tool unavailable or NVIDIA driver not running';
|
||||
if (note) note.textContent = '— ' + reason;
|
||||
}
|
||||
});
|
||||
@@ -1125,7 +1323,8 @@ func renderNetwork() string {
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||
<div class="card-head">Output</div>
|
||||
@@ -1151,7 +1350,7 @@ function loadServices() {
|
||||
'</td></tr>';
|
||||
}).join('');
|
||||
document.getElementById('svc-table').innerHTML =
|
||||
'<table><tr><th>Service</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
|
||||
'<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
function toggleBody(id) {
|
||||
|
||||
@@ -253,6 +253,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
|
||||
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
@@ -289,6 +290,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||
|
||||
// System
|
||||
|
||||
@@ -514,6 +514,47 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`href="/benchmark"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/benchmark/nvidia/run`,
|
||||
`benchmark-run-nccl`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersOfficialNVIDIADCGMAndNCCLInterconnectLabel(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`DCGM Diagnostics (Official NVIDIA)`,
|
||||
`NCCL all_reduce_perf (Interconnect)`,
|
||||
`DCGM is the official NVIDIA diagnostic path`,
|
||||
`burn-gpu-dcgm`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -30,22 +30,23 @@ const (
|
||||
|
||||
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-benchmark": "NVIDIA Benchmark",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
@@ -108,8 +109,11 @@ type taskParams struct {
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
@@ -547,6 +551,18 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-benchmark":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
}, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
Reference in New Issue
Block a user