Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 |
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -382,6 +382,13 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
|
|||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
|
|||||||
@@ -383,11 +383,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||||
|
status := "ok"
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
status = "error"
|
||||||
return
|
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
// Always return 200 with output so the frontend can display the actual
|
||||||
|
// systemctl error message instead of a generic "exit status 1".
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Network ───────────────────────────────────────────────────────────────────
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -33,6 +33,9 @@ a{color:var(--accent);text-decoration:none}
|
|||||||
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
.nav{flex:1}
|
.nav{flex:1}
|
||||||
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
@@ -107,6 +110,15 @@ func layoutNav(active string, buildLabel string) string {
|
|||||||
buildLabel = "dev"
|
buildLabel = "dev"
|
||||||
}
|
}
|
||||||
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
gspMode := strings.TrimSpace(string(raw))
|
||||||
|
switch gspMode {
|
||||||
|
case "gsp-off":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
|
||||||
|
case "gsp-stuck":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
b.WriteString(`<nav class="nav">`)
|
b.WriteString(`<nav class="nav">`)
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
cls := "nav-item"
|
cls := "nav-item"
|
||||||
@@ -2107,9 +2119,12 @@ func renderServicesInline() string {
|
|||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
<div class="card-head">Output</div>
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
<div class="card-body" style="padding:10px"><div id="svc-terminal" class="terminal" style="max-height:150px"></div></div>
|
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="svc-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function loadServices() {
|
function loadServices() {
|
||||||
@@ -2125,9 +2140,9 @@ function loadServices() {
|
|||||||
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
|
'<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
|
||||||
'</td>' +
|
'</td>' +
|
||||||
'<td style="white-space:nowrap">' +
|
'<td style="white-space:nowrap">' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'start\')">Start</button> ' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'stop\')">Stop</button> ' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="svcAction(\''+s.name+'\',\'restart\')">Restart</button>' +
|
'<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
|
||||||
'</td></tr>';
|
'</td></tr>';
|
||||||
}).join('');
|
}).join('');
|
||||||
document.getElementById('svc-table').innerHTML =
|
document.getElementById('svc-table').innerHTML =
|
||||||
@@ -2138,16 +2153,45 @@ function toggleBody(id) {
|
|||||||
const el = document.getElementById(id);
|
const el = document.getElementById(id);
|
||||||
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
if (el) el.style.display = el.style.display==='none' ? 'block' : 'none';
|
||||||
}
|
}
|
||||||
function svcAction(name, action) {
|
function svcAction(btn, name, action) {
|
||||||
|
var label = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = '...';
|
||||||
|
var out = document.getElementById('svc-out');
|
||||||
|
var term = document.getElementById('svc-terminal');
|
||||||
|
var statusEl = document.getElementById('svc-out-status');
|
||||||
|
var labelEl = document.getElementById('svc-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = action + ' ' + name;
|
||||||
|
term.textContent = 'Running...';
|
||||||
|
statusEl.textContent = '';
|
||||||
|
statusEl.style.color = '';
|
||||||
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
document.getElementById('svc-out').style.display='block';
|
term.textContent = d.output || d.error || '(no output)';
|
||||||
document.getElementById('svc-terminal').textContent = d.output || d.error || action+' '+name;
|
term.scrollTop = term.scrollHeight;
|
||||||
setTimeout(loadServices, 1000);
|
if (d.status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
|
setTimeout(loadServices, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
term.textContent = 'Request failed: ' + e;
|
||||||
|
statusEl.textContent = '✗ error';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
btn.textContent = label;
|
||||||
|
btn.disabled = false;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function restartGPUDrivers() {
|
function restartGPUDrivers() {
|
||||||
svcAction('bee-nvidia', 'restart');
|
var btn = document.querySelector('[onclick*="restartGPUDrivers"]');
|
||||||
|
if (!btn) { svcAction({textContent:'',disabled:false}, 'bee-nvidia', 'restart'); return; }
|
||||||
|
svcAction(btn, 'bee-nvidia', 'restart');
|
||||||
}
|
}
|
||||||
loadServices();
|
loadServices();
|
||||||
</script>`
|
</script>`
|
||||||
|
|||||||
@@ -11,33 +11,25 @@ echo " Hardware Audit LiveCD"
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
initrd @INITRD_LIVE@
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
}
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE — nomodeset" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
|||||||
@@ -65,6 +65,10 @@ python3-pil
|
|||||||
xorg
|
xorg
|
||||||
xterm
|
xterm
|
||||||
chromium
|
chromium
|
||||||
|
mousepad
|
||||||
|
pcmanfm
|
||||||
|
ristretto
|
||||||
|
mupdf
|
||||||
xserver-xorg-video-fbdev
|
xserver-xorg-video-fbdev
|
||||||
xserver-xorg-video-vesa
|
xserver-xorg-video-vesa
|
||||||
lightdm
|
lightdm
|
||||||
|
|||||||
@@ -50,11 +50,93 @@ load_module() {
|
|||||||
log "WARN: not found: $ko"
|
log "WARN: not found: $ko"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
if insmod "$ko" "$@"; then
|
if timeout 90 insmod "$ko" "$@"; then
|
||||||
log "loaded: $mod $*"
|
log "loaded: $mod $*"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
log "WARN: failed to load: $mod"
|
log "WARN: failed to load: $mod (exit $?)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
nvidia_is_functional() {
|
||||||
|
grep -q ' nvidiactl$' /proc/devices 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
load_module_with_gsp_fallback() {
|
||||||
|
ko="$NVIDIA_KO_DIR/nvidia.ko"
|
||||||
|
if [ ! -f "$ko" ]; then
|
||||||
|
log "ERROR: not found: $ko"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Run insmod in background — on some converted SXM→PCIe cards GSP enters an
|
||||||
|
# infinite crash/reload loop and insmod never returns. We check for successful
|
||||||
|
# initialization by polling /proc/devices for nvidiactl instead of waiting for
|
||||||
|
# insmod to exit.
|
||||||
|
log "loading nvidia (GSP enabled, timeout 90s)"
|
||||||
|
insmod "$ko" &
|
||||||
|
_insmod_pid=$!
|
||||||
|
|
||||||
|
_waited=0
|
||||||
|
while [ $_waited -lt 90 ]; do
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||||
|
echo "gsp-on" > /run/bee-nvidia-mode
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
# Check if insmod exited with an error before timeout
|
||||||
|
if ! kill -0 "$_insmod_pid" 2>/dev/null; then
|
||||||
|
wait "$_insmod_pid"
|
||||||
|
_rc=$?
|
||||||
|
if [ $_rc -ne 0 ]; then
|
||||||
|
log "nvidia load failed (exit $_rc)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
# insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
|
||||||
|
sleep 2
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
log "insmod exited 0 but nvidiactl missing — treating as failure"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
_waited=$((_waited + 1))
|
||||||
|
done
|
||||||
|
|
||||||
|
# GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
|
||||||
|
log "nvidia GSP init timed out after 90s"
|
||||||
|
kill "$_insmod_pid" 2>/dev/null || true
|
||||||
|
wait "$_insmod_pid" 2>/dev/null || true
|
||||||
|
|
||||||
|
# Attempt to unload the partially-initialized module
|
||||||
|
if ! rmmod nvidia 2>/dev/null; then
|
||||||
|
# Module is stuck in the kernel — cannot reload with different params.
|
||||||
|
# User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
|
||||||
|
log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
|
||||||
|
log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
|
||||||
|
echo "gsp-stuck" > /run/bee-nvidia-mode
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 2
|
||||||
|
log "retrying with NVreg_EnableGpuFirmware=0"
|
||||||
|
log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
|
||||||
|
|
||||||
|
if insmod "$ko" NVreg_EnableGpuFirmware=0; then
|
||||||
|
if nvidia_is_functional; then
|
||||||
|
log "loaded: nvidia (GSP disabled)"
|
||||||
|
echo "gsp-off" > /run/bee-nvidia-mode
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
log "insmod gsp-off exited 0 but nvidiactl missing"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
log "nvidia load failed (GSP=off)"
|
||||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
@@ -70,7 +152,7 @@ load_host_module() {
|
|||||||
|
|
||||||
case "$nvidia_mode" in
|
case "$nvidia_mode" in
|
||||||
normal|full)
|
normal|full)
|
||||||
if ! load_module nvidia; then
|
if ! load_module_with_gsp_fallback; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
|
|||||||
Reference in New Issue
Block a user