Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 |
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
|
|||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
|
|||||||
@@ -383,11 +383,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||||
|
status := "ok"
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
status = "error"
|
||||||
return
|
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
// Always return 200 with output so the frontend can display the actual
|
||||||
|
// systemctl error message instead of a generic "exit status 1".
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Network ───────────────────────────────────────────────────────────────────
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -33,6 +33,9 @@ a{color:var(--accent);text-decoration:none}
|
|||||||
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
.nav{flex:1}
|
.nav{flex:1}
|
||||||
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
@@ -107,6 +110,15 @@ func layoutNav(active string, buildLabel string) string {
|
|||||||
buildLabel = "dev"
|
buildLabel = "dev"
|
||||||
}
|
}
|
||||||
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
gspMode := strings.TrimSpace(string(raw))
|
||||||
|
switch gspMode {
|
||||||
|
case "gsp-off":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
|
||||||
|
case "gsp-stuck":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
b.WriteString(`<nav class="nav">`)
|
b.WriteString(`<nav class="nav">`)
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
cls := "nav-item"
|
cls := "nav-item"
|
||||||
@@ -1056,17 +1068,23 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
` + renderSATCard("nvidia-selection", "NVIDIA GPU Selection", "", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
|
||||||
|
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
|
||||||
|
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
`Runs one GPU at a time. Diag level is taken from Validate Profile.`,
|
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
||||||
)) +
|
)) +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
`Runs one GPU at a time with the fixed DCGM targeted stress recipe.`,
|
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
||||||
)) +
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
@@ -1088,6 +1106,8 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
.validate-card-body { padding:0; }
|
.validate-card-body { padding:0; }
|
||||||
.validate-card-section { padding:12px 16px 0; }
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
.validate-card-section:last-child { padding-bottom:16px; }
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||||
</style>
|
</style>
|
||||||
<script>
|
<script>
|
||||||
@@ -1116,6 +1136,59 @@ function loadSatNvidiaGPUs() {
|
|||||||
}
|
}
|
||||||
return satNvidiaGPUsPromise;
|
return satNvidiaGPUsPromise;
|
||||||
}
|
}
|
||||||
|
// Collects the GPU indices of every NVIDIA checkbox that is both ticked and
// enabled. Values that do not parse as base-10 integers are skipped and the
// result is returned in ascending numeric order.
function satSelectedGPUIndices() {
  const indices = [];
  const boxes = Array.from(document.querySelectorAll('.sat-nvidia-checkbox'));
  boxes.forEach(function(box) {
    if (!box.checked || box.disabled) return;
    const idx = parseInt(box.value, 10);
    if (Number.isNaN(idx)) return;
    indices.push(idx);
  });
  indices.sort(function(lhs, rhs) { return lhs - rhs; });
  return indices;
}
|
||||||
|
// Refreshes the hint under the GPU checkbox list: either the list of selected
// GPU indices or a prompt to pick at least one. Does nothing when the note
// element is not on the page.
function satUpdateGPUSelectionNote() {
  const noteEl = document.getElementById('sat-gpu-selection-note');
  if (!noteEl) return;
  const picked = satSelectedGPUIndices();
  noteEl.textContent = picked.length
    ? 'Selected NVIDIA GPUs: ' + picked.join(', ') + '.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
}
|
||||||
|
// Renders one pre-checked checkbox row per GPU into #sat-gpu-list, or a
// "none detected" message when the list is empty or missing, then refreshes
// the selection note.
//
// gpus: array of objects with index, name and memory_mb fields.
//
// Every GPU field is HTML-escaped before being placed into innerHTML so that
// an unexpected character in a device name (or a tampered API response)
// cannot inject markup or script into the page.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
  const esc = function(value) {
    return String(value).replace(/[&<>"']/g, function(ch) { return entities[ch]; });
  };
  root.innerHTML = gpus.map(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + esc(gpu.memory_mb) + ' MiB' : '';
    return '<label class="sat-gpu-row">'
      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + esc(gpu.index) + '" checked onchange="satUpdateGPUSelectionNote()">'
      + '<span><strong>GPU ' + esc(gpu.index) + '</strong> — ' + esc(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Ticks every NVIDIA GPU checkbox and refreshes the selection note.
function satSelectAllGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  for (let i = 0; i < boxes.length; i++) {
    boxes[i].checked = true;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Clears every NVIDIA GPU checkbox and refreshes the selection note.
function satSelectNoGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  for (let i = 0; i < boxes.length; i++) {
    boxes[i].checked = false;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Loads the NVIDIA GPU list and renders the selection checkboxes. On failure
// the error text is shown inside #sat-gpu-list and the selection note is
// refreshed.
//
// The error text is written through textContent rather than innerHTML so it
// is never interpreted as markup, and a rejection value without a message
// (including null/undefined) falls back to its string form instead of
// throwing inside the catch handler.
function satLoadGPUs() {
  loadSatNvidiaGPUs().then(function(gpus) {
    satRenderGPUList(gpus);
  }).catch(function(err) {
    const root = document.getElementById('sat-gpu-list');
    if (root) {
      const msg = (err && err.message) ? err.message : String(err);
      const p = document.createElement('p');
      p.style.cssText = 'color:var(--crit-fg);font-size:13px';
      p.textContent = 'Error: ' + msg;
      root.innerHTML = '';
      root.appendChild(p);
    }
    satUpdateGPUSelectionNote();
  });
}
|
||||||
function satGPUDisplayName(gpu) {
|
function satGPUDisplayName(gpu) {
|
||||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||||
@@ -1137,6 +1210,36 @@ function enqueueSATTarget(target, overrides) {
|
|||||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
.then(r => r.json());
|
.then(r => r.json());
|
||||||
}
|
}
|
||||||
|
// Shows the SAT output panel and streams the log of one task into the SAT
// terminal over an EventSource.
//
// taskId:        id used to build the /api/tasks/<id>/stream URL.
// title:         text shown after the dash in the panel title.
// resetTerminal: when truthy, the terminal is cleared before streaming.
//
// Returns a Promise that resolves (never rejects) with {ok, error} once the
// server sends a "done" event or the stream reports an error. Any stream
// already held in satES is closed first.
function streamSATTask(taskId, title, resetTerminal) {
  if (satES) {
    satES.close();
    satES = null;
  }
  document.getElementById('sat-output').style.display = 'block';
  document.getElementById('sat-title').textContent = '— ' + title;
  const term = document.getElementById('sat-terminal');
  if (resetTerminal) term.textContent = '';
  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';

  // Appends text to the terminal and keeps the view pinned to the bottom.
  const appendAndScroll = (text) => {
    term.textContent += text;
    term.scrollTop = term.scrollHeight;
  };

  return new Promise((resolve) => {
    satES = new EventSource('/api/tasks/' + taskId + '/stream');
    satES.onmessage = (e) => {
      appendAndScroll(e.data + '\n');
    };
    satES.addEventListener('done', (e) => {
      satES.close();
      satES = null;
      // A non-empty payload on "done" carries the task error message.
      const tail = e.data ? '\nERROR: ' + e.data : '\nCompleted.';
      appendAndScroll(tail + '\n');
      resolve({ok: !e.data, error: e.data || ''});
    });
    satES.onerror = () => {
      if (satES) {
        satES.close();
        satES = null;
      }
      appendAndScroll('\nERROR: stream disconnected.\n');
      resolve({ok: false, error: 'stream disconnected'});
    };
  });
}
|
||||||
function selectedAMDValidateTargets() {
|
function selectedAMDValidateTargets() {
|
||||||
const targets = [];
|
const targets = [];
|
||||||
const gpu = document.getElementById('sat-amd-target');
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
@@ -1151,24 +1254,23 @@ function runSAT(target) {
|
|||||||
return runSATWithOverrides(target, null);
|
return runSATWithOverrides(target, null);
|
||||||
}
|
}
|
||||||
function runSATWithOverrides(target, overrides) {
|
function runSATWithOverrides(target, overrides) {
|
||||||
if (satES) { satES.close(); satES = null; }
|
const title = (overrides && overrides.display_name) || target;
|
||||||
document.getElementById('sat-output').style.display='block';
|
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
|
||||||
const term = document.getElementById('sat-terminal');
|
const term = document.getElementById('sat-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' test...\n';
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
return enqueueSATTarget(target, overrides)
|
return enqueueSATTarget(target, overrides)
|
||||||
.then(d => {
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
|
||||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
|
||||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
|
||||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
function expandSATTarget(target) {
|
function expandSATTarget(target) {
|
||||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
||||||
return Promise.resolve([{target: target}]);
|
return Promise.resolve([{target: target}]);
|
||||||
}
|
}
|
||||||
return loadSatNvidiaGPUs().then(gpus => gpus.map(gpu => ({
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||||
target: target,
|
target: target,
|
||||||
overrides: {
|
overrides: {
|
||||||
gpu_indices: [Number(gpu.index)],
|
gpu_indices: [Number(gpu.index)],
|
||||||
@@ -1179,65 +1281,61 @@ function expandSATTarget(target) {
|
|||||||
}
|
}
|
||||||
function runNvidiaValidateSet(target) {
|
function runNvidiaValidateSet(target) {
|
||||||
return loadSatNvidiaGPUs().then(gpus => {
|
return loadSatNvidiaGPUs().then(gpus => {
|
||||||
if (!gpus.length) return;
|
const selected = satSelectedGPUIndices();
|
||||||
if (gpus.length === 1) {
|
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||||||
const gpu = gpus[0];
|
if (!picked.length) {
|
||||||
|
throw new Error('Select at least one NVIDIA GPU.');
|
||||||
|
}
|
||||||
|
if (picked.length === 1) {
|
||||||
|
const gpu = picked[0];
|
||||||
return runSATWithOverrides(target, {
|
return runSATWithOverrides(target, {
|
||||||
gpu_indices: [Number(gpu.index)],
|
gpu_indices: [Number(gpu.index)],
|
||||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
if (satES) { satES.close(); satES = null; }
|
|
||||||
document.getElementById('sat-output').style.display='block';
|
document.getElementById('sat-output').style.display='block';
|
||||||
document.getElementById('sat-title').textContent = '— ' + target;
|
document.getElementById('sat-title').textContent = '— ' + target;
|
||||||
const term = document.getElementById('sat-terminal');
|
const term = document.getElementById('sat-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' tests one GPU at a time...\n';
|
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||||||
const enqueueNext = (idx) => {
|
const runNext = (idx) => {
|
||||||
if (idx >= gpus.length) return;
|
if (idx >= picked.length) return Promise.resolve();
|
||||||
const gpu = gpus[idx];
|
const gpu = picked[idx];
|
||||||
const gpuLabel = satGPUDisplayName(gpu);
|
const gpuLabel = satGPUDisplayName(gpu);
|
||||||
enqueueSATTarget(target, {
|
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||||||
|
return enqueueSATTarget(target, {
|
||||||
gpu_indices: [Number(gpu.index)],
|
gpu_indices: [Number(gpu.index)],
|
||||||
display_name: labelBase + ' (' + gpuLabel + ')'
|
display_name: labelBase + ' (' + gpuLabel + ')'
|
||||||
}).then(d => {
|
}).then(d => {
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued for ' + gpuLabel + '.\n';
|
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||||||
if (idx === gpus.length - 1) {
|
}).then(function() {
|
||||||
satES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
return runNext(idx + 1);
|
||||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
|
||||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
|
||||||
}
|
|
||||||
enqueueNext(idx + 1);
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
enqueueNext(0);
|
return runNext(0);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function runAMDValidateSet() {
|
function runAMDValidateSet() {
|
||||||
const targets = selectedAMDValidateTargets();
|
const targets = selectedAMDValidateTargets();
|
||||||
if (!targets.length) return;
|
if (!targets.length) return;
|
||||||
if (targets.length === 1) return runSAT(targets[0]);
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
if (satES) { satES.close(); satES = null; }
|
|
||||||
document.getElementById('sat-output').style.display='block';
|
document.getElementById('sat-output').style.display='block';
|
||||||
document.getElementById('sat-title').textContent = '— amd';
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
const term = document.getElementById('sat-terminal');
|
const term = document.getElementById('sat-terminal');
|
||||||
term.textContent = 'Enqueuing AMD validate set...\n';
|
term.textContent = 'Running AMD validate set one by one...\n';
|
||||||
const labels = satLabels();
|
const labels = satLabels();
|
||||||
const enqueueNext = (idx) => {
|
const runNext = (idx) => {
|
||||||
if (idx >= targets.length) return;
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
const target = targets[idx];
|
const target = targets[idx];
|
||||||
enqueueSATTarget(target)
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||||
|
return enqueueSATTarget(target)
|
||||||
.then(d => {
|
.then(d => {
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued for ' + labels[target] + '.\n';
|
return streamSATTask(d.task_id, labels[target], false);
|
||||||
if (idx === targets.length - 1) {
|
}).then(function() {
|
||||||
satES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
return runNext(idx + 1);
|
||||||
satES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
|
||||||
satES.addEventListener('done', e => { satES.close(); satES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
|
||||||
}
|
|
||||||
enqueueNext(idx + 1);
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
enqueueNext(0);
|
return runNext(0);
|
||||||
}
|
}
|
||||||
function runAllSAT() {
|
function runAllSAT() {
|
||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
@@ -1259,17 +1357,17 @@ function runAllSAT() {
|
|||||||
status.textContent = 'No tasks selected.';
|
status.textContent = 'No tasks selected.';
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const enqueueNext = (idx) => {
|
const runNext = (idx) => {
|
||||||
if (idx >= expanded.length) { status.textContent = 'Enqueued ' + total + ' tasks.'; return; }
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
const item = expanded[idx];
|
const item = expanded[idx];
|
||||||
enqueueSATTarget(item.target, item.overrides)
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides)
|
||||||
.then(() => {
|
.then(() => {
|
||||||
enqueued++;
|
enqueued++;
|
||||||
status.textContent = 'Enqueued ' + enqueued + '/' + total + '...';
|
return runNext(idx + 1);
|
||||||
enqueueNext(idx + 1);
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
enqueueNext(0);
|
return runNext(0);
|
||||||
}).catch(err => {
|
}).catch(err => {
|
||||||
status.textContent = 'Error: ' + err.message;
|
status.textContent = 'Error: ' + err.message;
|
||||||
});
|
});
|
||||||
@@ -1282,6 +1380,7 @@ fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
|||||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
});
|
});
|
||||||
|
satLoadGPUs();
|
||||||
function disableSATAMDOptions(reason) {
|
function disableSATAMDOptions(reason) {
|
||||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||||
const cb = document.getElementById(id);
|
const cb = document.getElementById(id);
|
||||||
@@ -1874,6 +1973,36 @@ function streamTask(taskId, label) {
|
|||||||
term.scrollTop = term.scrollHeight;
|
term.scrollTop = term.scrollHeight;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
biES.onerror = function() {
|
||||||
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
function runBurnTaskSet(tasks, statusElId) {
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
const enabled = tasks.filter(function(t) {
|
const enabled = tasks.filter(function(t) {
|
||||||
@@ -1886,19 +2015,33 @@ function runBurnTaskSet(tasks, statusElId) {
|
|||||||
if (status) status.textContent = 'No tasks selected.';
|
if (status) status.textContent = 'No tasks selected.';
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
enabled.forEach(function(t) {
|
const term = document.getElementById('bi-terminal');
|
||||||
enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||||
|
term.textContent = '';
|
||||||
|
const runNext = function(idx) {
|
||||||
|
if (idx >= enabled.length) {
|
||||||
|
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||||
|
return Promise.resolve();
|
||||||
|
}
|
||||||
|
const t = enabled[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||||
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
.then(function(d) {
|
.then(function(d) {
|
||||||
if (status) status.textContent = enabled.length + ' task(s) queued.';
|
return streamBurnTask(d.task_id, t.label, false);
|
||||||
streamTask(d.task_id, t.label);
|
})
|
||||||
|
.then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
})
|
})
|
||||||
.catch(function(err) {
|
.catch(function(err) {
|
||||||
if (status) status.textContent = 'Error: ' + err.message;
|
if (status) status.textContent = 'Error: ' + err.message;
|
||||||
const term = document.getElementById('bi-terminal');
|
|
||||||
document.getElementById('bi-output').style.display = 'block';
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
return Promise.reject(err);
|
||||||
});
|
});
|
||||||
});
|
};
|
||||||
|
return runNext(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
function runPlatformStress() {
|
function runPlatformStress() {
|
||||||
@@ -2107,9 +2250,12 @@ func renderServicesInline() string {
|
|||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
<div class="card-head">Output</div>
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div>
|
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="svc-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function loadServices() {
|
function loadServices() {
|
||||||
@@ -2125,9 +2271,9 @@ function loadServices() {
|
|||||||
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
|
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
|
||||||
'</td>' +
|
'</td>' +
|
||||||
'<td style="white-space:nowrap">' +
|
'<td style="white-space:nowrap">' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'start\')">Start</button> ' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'stop\')">Stop</button> ' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'restart\')">Restart</button>' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
|
||||||
'</td></tr>';
|
'</td></tr>';
|
||||||
}).join('');
|
}).join('');
|
||||||
document.getElementById('svc-table').innerHTML =
|
document.getElementById('svc-table').innerHTML =
|
||||||
@@ -2138,16 +2284,45 @@ function toggleBody(id) {
|
|||||||
const el = document.getElementById(id);
|
const el = document.getElementById(id);
|
||||||
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
||||||
}
|
}
|
||||||
function svcAction(name, action) {
|
function svcAction(btn, name, action) {
|
||||||
|
var label = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = '...';
|
||||||
|
var out = document.getElementById('svc-out');
|
||||||
|
var term = document.getElementById('svc-terminal');
|
||||||
|
var statusEl = document.getElementById('svc-out-status');
|
||||||
|
var labelEl = document.getElementById('svc-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = action + ' ' + name;
|
||||||
|
term.textContent = 'Running...';
|
||||||
|
statusEl.textContent = '';
|
||||||
|
statusEl.style.color = '';
|
||||||
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
document.getElementById('svc-out').style.display='block';
|
term.textContent = d.output || d.error || '(no output)';
|
||||||
document.getElementById('svc-terminal').textContent = d.output || d.error || action+' '+name;
|
term.scrollTop = term.scrollHeight;
|
||||||
setTimeout(loadServices, 1000);
|
if (d.status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
|
setTimeout(loadServices, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
term.textContent = 'Request failed: ' + e;
|
||||||
|
statusEl.textContent = '✗ error';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function restartGPUDrivers() {
|
function restartGPUDrivers() {
|
||||||
svcAction('bee-nvidia', 'restart');
|
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
|
||||||
|
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
|
||||||
|
svcAction(btn, 'bee-nvidia', 'restart');
|
||||||
}
|
}
|
||||||
loadServices();
|
loadServices();
|
||||||
</script>`
|
</script>`
|
||||||
|
|||||||
@@ -601,8 +601,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
if !strings.Contains(body, `restartGPUDrivers()`) {
|
||||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
@@ -649,6 +649,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
|||||||
`nvidia-targeted-stress`,
|
`nvidia-targeted-stress`,
|
||||||
`controlled NVIDIA DCGM load`,
|
`controlled NVIDIA DCGM load`,
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
`NVIDIA GPU Selection`,
|
||||||
|
`id="sat-gpu-list"`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
|||||||
@@ -106,23 +106,61 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
|||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
body.WriteString(`<script>
|
body.WriteString(`<script>
|
||||||
function cancelTaskDetail(id) {
|
function cancelTaskDetail(id) {
|
||||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){ window.location.reload(); });
|
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||||
|
var term = document.getElementById('task-live-log');
|
||||||
|
if (term) {
|
||||||
|
term.textContent += '\nCancel requested.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
}
|
}
|
||||||
function loadTaskLiveCharts(taskId) {
|
});
|
||||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
}
|
||||||
|
function renderTaskLiveCharts(taskId, charts) {
|
||||||
const host = document.getElementById('task-live-charts');
|
const host = document.getElementById('task-live-charts');
|
||||||
if (!host) return;
|
if (!host) return;
|
||||||
if (!Array.isArray(charts) || charts.length === 0) {
|
if (!Array.isArray(charts) || charts.length === 0) {
|
||||||
host.innerHTML = 'Waiting for metric samples...';
|
host.innerHTML = 'Waiting for metric samples...';
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
host.innerHTML = charts.map(function(chart) {
|
const seen = {};
|
||||||
return '<div class="card" style="margin:0">' +
|
charts.forEach(function(chart) {
|
||||||
'<div class="card-head">' + chart.title + '</div>' +
|
seen[chart.file] = true;
|
||||||
'<div class="card-body" style="padding:12px">' +
|
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||||
'<img data-task-chart="1" data-base-src="/api/tasks/' + taskId + '/chart/' + chart.file + '" src="/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now() + '" style="width:100%;display:block;border-radius:6px" alt="' + chart.title + '">' +
|
if (img) {
|
||||||
'</div></div>';
|
const card = img.closest('.card');
|
||||||
}).join('');
|
if (card) {
|
||||||
|
const title = card.querySelector('.card-head');
|
||||||
|
if (title) title.textContent = chart.title;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const card = document.createElement('div');
|
||||||
|
card.className = 'card';
|
||||||
|
card.style.margin = '0';
|
||||||
|
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||||
|
card.querySelector('.card-head').textContent = chart.title;
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
img = document.createElement('img');
|
||||||
|
img.setAttribute('data-task-chart', '1');
|
||||||
|
img.setAttribute('data-chart-file', chart.file);
|
||||||
|
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||||
|
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||||
|
img.style.width = '100%';
|
||||||
|
img.style.display = 'block';
|
||||||
|
img.style.borderRadius = '6px';
|
||||||
|
img.alt = chart.title;
|
||||||
|
body.appendChild(img);
|
||||||
|
host.appendChild(card);
|
||||||
|
});
|
||||||
|
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||||
|
const file = img.getAttribute('data-chart-file') || '';
|
||||||
|
if (seen[file]) return;
|
||||||
|
const card = img.closest('.card');
|
||||||
|
if (card) card.remove();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadTaskLiveCharts(taskId) {
|
||||||
|
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||||
|
renderTaskLiveCharts(taskId, charts);
|
||||||
}).catch(function(){
|
}).catch(function(){
|
||||||
const host = document.getElementById('task-live-charts');
|
const host = document.getElementById('task-live-charts');
|
||||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||||
@@ -138,12 +176,31 @@ function refreshTaskLiveCharts() {
|
|||||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||||
var _taskChartTimer = null;
|
var _taskChartTimer = null;
|
||||||
|
var _taskChartsFrozen = false;
|
||||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||||
_taskDetailES.addEventListener('done', function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
|
_taskDetailES.addEventListener('done', function(e){
|
||||||
_taskDetailES.onerror = function(){ if (_taskChartTimer) clearInterval(_taskChartTimer); _taskDetailES.close(); };
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
_taskChartsFrozen = true;
|
||||||
|
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||||
|
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
});
|
||||||
|
_taskDetailES.onerror = function(){
|
||||||
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
if (_taskDetailES) {
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
_taskChartTimer = setInterval(function(){ refreshTaskLiveCharts(); loadTaskLiveCharts('` + html.EscapeString(task.ID) + `'); }, 2000);
|
_taskChartTimer = setInterval(function(){
|
||||||
|
if (_taskChartsFrozen) return;
|
||||||
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
}, 2000);
|
||||||
</script>`)
|
</script>`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -423,13 +423,14 @@ func (q *taskQueue) worker() {
|
|||||||
setCPUGovernor("performance")
|
setCPUGovernor("performance")
|
||||||
defer setCPUGovernor("powersave")
|
defer setCPUGovernor("powersave")
|
||||||
|
|
||||||
// Drain all pending tasks and start them in parallel.
|
|
||||||
q.mu.Lock()
|
|
||||||
var batch []*Task
|
|
||||||
for {
|
for {
|
||||||
|
q.mu.Lock()
|
||||||
t := q.nextPending()
|
t := q.nextPending()
|
||||||
if t == nil {
|
if t == nil {
|
||||||
break
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.Status = TaskRunning
|
t.Status = TaskRunning
|
||||||
@@ -438,29 +439,14 @@ func (q *taskQueue) worker() {
|
|||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||||
t.job = j
|
t.job = j
|
||||||
batch = append(batch, t)
|
|
||||||
}
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
}
|
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
|
||||||
for _, t := range batch {
|
|
||||||
t := t
|
|
||||||
j := t.job
|
|
||||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||||
j.cancel = taskCancel
|
j.cancel = taskCancel
|
||||||
wg.Add(1)
|
|
||||||
goRecoverOnce("task "+t.Target, func() {
|
|
||||||
defer wg.Done()
|
|
||||||
defer taskCancel()
|
|
||||||
q.executeTask(t, j, taskCtx)
|
q.executeTask(t, j, taskCtx)
|
||||||
})
|
taskCancel()
|
||||||
}
|
|
||||||
wg.Wait()
|
|
||||||
|
|
||||||
if len(batch) > 0 {
|
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
q.prune()
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
|
|||||||
@@ -15,30 +15,22 @@ menuentry "EASY-BEE" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
|||||||
@@ -44,23 +44,27 @@ else:
|
|||||||
img = Image.new('RGB', (W, H), (0, 0, 0))
|
img = Image.new('RGB', (W, H), (0, 0, 0))
|
||||||
draw = ImageDraw.Draw(img)
|
draw = ImageDraw.Draw(img)
|
||||||
|
|
||||||
# Measure logo block
|
# Measure logo block line by line to avoid font ascender offset
|
||||||
lines = LOGO.split('\n')
|
lines = LOGO.split('\n')
|
||||||
bbox = draw.textbbox((0, 0), LOGO, font=font_logo)
|
|
||||||
text_w = bbox[2] - bbox[0]
|
|
||||||
text_h = bbox[3] - bbox[1]
|
|
||||||
|
|
||||||
x = (W - text_w) // 2
|
|
||||||
y = (H - text_h) // 2
|
|
||||||
|
|
||||||
# Draw logo lines: first 6 in amber, last line (subtitle) dimmer
|
|
||||||
logo_lines = lines[:6]
|
logo_lines = lines[:6]
|
||||||
sub_line = lines[6] if len(lines) > 6 else ''
|
sub_line = lines[6] if len(lines) > 6 else ''
|
||||||
|
|
||||||
|
line_h = SIZE + 2
|
||||||
|
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
|
||||||
|
|
||||||
|
# Width: measure the widest logo line
|
||||||
|
max_w = 0
|
||||||
|
for line in logo_lines:
|
||||||
|
bb = draw.textbbox((0, 0), line, font=font_logo)
|
||||||
|
max_w = max(max_w, bb[2] - bb[0])
|
||||||
|
|
||||||
|
x = (W - max_w) // 2
|
||||||
|
y = (H - block_h) // 2
|
||||||
|
|
||||||
cy = y
|
cy = y
|
||||||
for line in logo_lines:
|
for line in logo_lines:
|
||||||
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
||||||
cy += SIZE + 2
|
cy += line_h
|
||||||
cy += 8
|
cy += 8
|
||||||
if sub_line:
|
if sub_line:
|
||||||
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
||||||
|
|||||||
@@ -65,6 +65,10 @@ python3-pil
|
|||||||
xorg
|
xorg
|
||||||
xterm
|
xterm
|
||||||
chromium
|
chromium
|
||||||
|
mousepad
|
||||||
|
pcmanfm
|
||||||
|
ristretto
|
||||||
|
mupdf
|
||||||
xserver-xorg-video-fbdev
|
xserver-xorg-video-fbdev
|
||||||
xserver-xorg-video-vesa
|
xserver-xorg-video-vesa
|
||||||
lightdm
|
lightdm
|
||||||
|
|||||||
@@ -50,11 +50,93 @@ load_module() {
|
|||||||
log "WARN: not found: $ko"
|
log "WARN: not found: $ko"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
if insmod "$ko" "$@"; then
|
if timeout 90 insmod "$ko" "$@"; then
|
||||||
log "loaded: $mod $*"
|
log "loaded: $mod $*"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
log "WARN: failed to load: $mod"
|
log "WARN: failed to load: $mod (exit $?)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
nvidia_is_functional() {
|
||||||
|
grep -q ' nvidiactl$' /proc/devices 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
load_module_with_gsp_fallback() {
|
||||||
|
ko="$NVIDIA_KO_DIR/nvidia.ko"
|
||||||
|
if [ ! -f "$ko" ]; then
|
||||||
|
log "ERROR: not found: $ko"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Run insmod in background — on some converted SXM→PCIe cards GSP enters an
|
||||||
|
# infinite crash/reload loop and insmod never returns. We check for successful
|
||||||
|
# initialization by polling /proc/devices for nvidiactl instead of waiting for
|
||||||
|
# insmod to exit.
|
||||||
|
log "loading nvidia (GSP enabled, timeout 90s)"
|
||||||
|
insmod "$ko" &
|
||||||
|
_insmod_pid=$!
|
||||||
|
|
||||||
|
_waited=0
|
||||||
|
while [ $_waited -lt 90 ]; do
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||||
|
echo "gsp-on" > /run/bee-nvidia-mode
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
# Check if insmod exited with an error before timeout
|
||||||
|
if ! kill -0 "$_insmod_pid" 2>/dev/null; then
|
||||||
|
wait "$_insmod_pid"
|
||||||
|
_rc=$?
|
||||||
|
if [ $_rc -ne 0 ]; then
|
||||||
|
log "nvidia load failed (exit $_rc)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
# insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
|
||||||
|
sleep 2
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
log "insmod exited 0 but nvidiactl missing — treating as failure"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
_waited=$((_waited + 1))
|
||||||
|
done
|
||||||
|
|
||||||
|
# GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
|
||||||
|
log "nvidia GSP init timed out after 90s"
|
||||||
|
kill "$_insmod_pid" 2>/dev/null || true
|
||||||
|
wait "$_insmod_pid" 2>/dev/null || true
|
||||||
|
|
||||||
|
# Attempt to unload the partially-initialized module
|
||||||
|
if ! rmmod nvidia 2>/dev/null; then
|
||||||
|
# Module is stuck in the kernel — cannot reload with different params.
|
||||||
|
# User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
|
||||||
|
log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
|
||||||
|
log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
|
||||||
|
echo "gsp-stuck" > /run/bee-nvidia-mode
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 2
|
||||||
|
log "retrying with NVreg_EnableGpuFirmware=0"
|
||||||
|
log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
|
||||||
|
|
||||||
|
if insmod "$ko" NVreg_EnableGpuFirmware=0; then
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP disabled)"
|
||||||
|
echo "gsp-off" > /run/bee-nvidia-mode
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
log "insmod gsp-off exited 0 but nvidiactl missing"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "nvidia load failed (GSP=off)"
|
||||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
@@ -70,7 +152,7 @@ load_host_module() {
|
|||||||
|
|
||||||
case "$nvidia_mode" in
|
case "$nvidia_mode" in
|
||||||
normal|full)
|
normal|full)
|
||||||
if ! load_module nvidia; then
|
if ! load_module_with_gsp_fallback; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
|
|||||||
Reference in New Issue
Block a user