Stabilize live ISO consoles and NVIDIA boot path

This commit is contained in:
Mikhail Chusavitin
2026-03-25 19:05:18 +03:00
parent b345b0d14d
commit d36e8442a9
13 changed files with 124 additions and 38 deletions

View File

@@ -32,6 +32,6 @@ lb config noauto \
--memtest none \ --memtest none \
--iso-volume "EASY-BEE" \ --iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \ --iso-application "EASY-BEE" \
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live components console=ttyS0,115200n8 console=ttyS1,115200n8 loglevel=7 systemd.log_target=console systemd.journald.forward_to_console=1 systemd.journald.max_level_console=debug username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \ --apt-recommends false \
"${@}" "${@}"

View File

@@ -5,6 +5,21 @@ set -e
echo "=== bee chroot setup ===" echo "=== bee chroot setup ==="
# ensure_bee_console_user — make sure the `bee` login account is usable.
#
# Creates the account (home /home/bee, shell /bin/sh, own group) when `id`
# does not know it; otherwise re-points home and shell of the existing account.
# Afterwards the home directory is created and handed to bee, the password is
# set through chpasswd, and bee is added to the sudo group.
# The two usermod calls are best effort: their errors are silenced and ignored.
ensure_bee_console_user() {
    if ! id bee >/dev/null 2>&1; then
        # Account is missing: create it together with its home and group.
        useradd -d /home/bee -m -s /bin/sh -U bee
    else
        # Account already exists: only normalise home directory and shell.
        usermod -d /home/bee -s /bin/sh bee 2>/dev/null || true
    fi

    mkdir -p /home/bee
    chown -R bee:bee /home/bee

    # chpasswd reads "user:password" pairs from stdin.
    printf '%s\n' "bee:eeb" | chpasswd

    usermod -aG sudo bee 2>/dev/null || true
}
ensure_bee_console_user
# Enable bee services # Enable bee services
systemctl enable bee-network.service systemctl enable bee-network.service
systemctl enable bee-nvidia.service systemctl enable bee-nvidia.service
@@ -15,6 +30,8 @@ systemctl enable bee-sshsetup.service
systemctl enable ssh.service systemctl enable ssh.service
systemctl enable qemu-guest-agent.service 2>/dev/null || true systemctl enable qemu-guest-agent.service 2>/dev/null || true
systemctl enable serial-getty@ttyS0.service 2>/dev/null || true systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Ensure scripts are executable # Ensure scripts are executable
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
@@ -23,6 +40,7 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee-tui 2>/dev/null || true chmod +x /usr/local/bin/bee-tui 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
# Reload udev rules # Reload udev rules
udevadm control --reload-rules 2>/dev/null || true udevadm control --reload-rules 2>/dev/null || true

View File

@@ -53,17 +53,23 @@ else
fail "NVIDIA ko dir missing: $KO_DIR" fail "NVIDIA ko dir missing: $KO_DIR"
fi fi
for mod in nvidia nvidia_modeset nvidia_uvm; do if /sbin/lsmod 2>/dev/null | grep -q "^nvidia "; then
ok "module loaded: nvidia"
else
fail "module NOT loaded: nvidia"
fi
for mod in nvidia_modeset nvidia_uvm; do
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
ok "module loaded: $mod" ok "module loaded: $mod"
else else
fail "module NOT loaded: $mod" warn "module not loaded at boot: $mod"
fi fi
done done
echo "" echo ""
echo "-- NVIDIA device nodes --" echo "-- NVIDIA device nodes --"
for dev in nvidiactl nvidia0 nvidia-uvm; do for dev in nvidiactl nvidia0; do
if [ -e "/dev/$dev" ]; then if [ -e "/dev/$dev" ]; then
ok "/dev/$dev exists" ok "/dev/$dev exists"
else else
@@ -71,6 +77,12 @@ for dev in nvidiactl nvidia0 nvidia-uvm; do
fi fi
done done
if [ -e /dev/nvidia-uvm ]; then
ok "/dev/nvidia-uvm exists"
else
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
fi
echo "" echo ""
echo "-- nvidia-smi --" echo "-- nvidia-smi --"
if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then if PATH="/usr/local/bin:$PATH" command -v nvidia-smi >/dev/null 2>&1; then

View File

@@ -0,0 +1,4 @@
# journald settings for the live ISO: forward journal messages to the console,
# with /dev/ttyS0 configured as the console device and debug as the highest
# (most verbose) level that is forwarded.
[Journal]
ForwardToConsole=yes
TTYPath=/dev/ttyS0
MaxLevelConsole=debug

View File

@@ -5,9 +5,9 @@ Before=bee-web.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0' ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
StandardOutput=append:/appdata/bee/export/bee-audit.log StandardOutput=journal
StandardError=append:/appdata/bee/export/bee-audit.log StandardError=journal
RemainAfterExit=yes RemainAfterExit=yes
[Install] [Install]

View File

@@ -0,0 +1,16 @@
# Template unit: the instance name (%I) selects the device node under /dev
# that receives a live copy of the system journal.
[Unit]
Description=Bee: mirror system journal to %I
After=systemd-journald.service
Requires=systemd-journald.service
# Only start when the target device node exists.
ConditionPathExists=/dev/%I
[Service]
Type=simple
# Follow the journal, beginning with the 200 most recent entries, and write it
# to the device. `exec` replaces the wrapper shell with journalctl itself.
ExecStart=/bin/sh -c 'exec journalctl -f -n 200 -o short-monotonic > /dev/%I'
# Bring the follower back one second after it exits, whatever the reason.
Restart=always
RestartSec=1
# journalctl's stdout is redirected inside ExecStart; only stderr is kept.
StandardOutput=null
StandardError=journal
[Install]
WantedBy=multi-user.target

View File

@@ -5,9 +5,9 @@ Before=network-online.target bee-audit.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/usr/local/bin/bee-network.sh ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-network.log /usr/local/bin/bee-network.sh
StandardOutput=append:/appdata/bee/export/bee-network.log StandardOutput=journal
StandardError=append:/appdata/bee/export/bee-network.log StandardError=journal
RemainAfterExit=yes RemainAfterExit=yes
[Install] [Install]

View File

@@ -5,9 +5,9 @@ Before=bee-audit.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/usr/local/bin/bee-nvidia-load ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-nvidia.log /usr/local/bin/bee-nvidia-load
StandardOutput=append:/appdata/bee/export/bee-nvidia.log StandardOutput=journal
StandardError=append:/appdata/bee/export/bee-nvidia.log StandardError=journal
RemainAfterExit=yes RemainAfterExit=yes
[Install] [Install]

View File

@@ -5,9 +5,9 @@ Before=bee-audit.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0' ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/runtime-health.log /bin/sh -c '/usr/local/bin/bee preflight --output file:/appdata/bee/export/runtime-health.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-preflight] WARN: preflight exited with rc=$rc"; fi; exit 0'
StandardOutput=append:/appdata/bee/export/runtime-health.log StandardOutput=journal
StandardError=append:/appdata/bee/export/runtime-health.log StandardError=journal
RemainAfterExit=yes RemainAfterExit=yes
[Install] [Install]

View File

@@ -5,9 +5,9 @@ Before=ssh.service
[Service] [Service]
Type=oneshot Type=oneshot
ExecStart=/usr/local/bin/bee-sshsetup ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-sshsetup.log /usr/local/bin/bee-sshsetup
StandardOutput=append:/appdata/bee/export/bee-sshsetup.log StandardOutput=journal
StandardError=append:/appdata/bee/export/bee-sshsetup.log StandardError=journal
RemainAfterExit=yes RemainAfterExit=yes
[Install] [Install]

View File

@@ -5,11 +5,11 @@ Wants=bee-audit.service
[Service] [Service]
Type=simple Type=simple
ExecStart=/usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit" ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
Restart=always Restart=always
RestartSec=2 RestartSec=2
StandardOutput=append:/appdata/bee/export/bee-web.log StandardOutput=journal
StandardError=append:/appdata/bee/export/bee-web.log StandardError=journal
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@@ -0,0 +1,29 @@
#!/bin/bash
# bee-log-run — wrap a command so that its output is appended to a log file
# while stdout/stderr stay attached to whatever started this script (systemd),
# so journald and the serial console receive the same lines.
#
# Usage: bee-log-run <log-file> <command> [args...]
set -o pipefail

# The first argument names the log file; the rest is the command line to run.
log_file="$1"
shift

# Both a log file and a command are mandatory.
if [ "$#" -eq 0 ] || [ -z "$log_file" ]; then
  printf 'usage: %s <log-file> <command> [args...]\n' "$0" >&2
  exit 2
fi

# Create the directory that will hold the log file if it is missing.
mkdir -p "$(dirname "$log_file")"
# serial_sink TTY — copy stdin to the device TTY on a best-effort basis.
#
# Arguments: $1 - path of the device that should receive a copy of stdin
# Returns:   0 when the copy succeeded, otherwise the status of the drain
#
# stdin is always consumed up to EOF, even when TTY turns out to be unusable.
# This function sits behind a pipe fed by the caller; if it stopped reading
# early, the writer would see a broken pipe and logging for the wrapped
# command could be cut short.
serial_sink() {
  local tty="$1"
  # `-w` only checks permissions. Opening the device can still fail (for
  # example a serial node without hardware behind it) and a write can fail
  # later on, so fall through to the drain whenever the copy does not succeed.
  if [ -w "$tty" ] && cat 2>/dev/null > "$tty"; then
    return 0
  fi
  # Discard whatever input is left so the writer never blocks or breaks.
  cat > /dev/null
}
# Run the wrapped command with stderr merged into stdout. tee appends the
# stream to the log file, hands a copy to one serial_sink per serial port via
# process substitution, and also passes it on to this script's own stdout.
"$@" 2>&1 | tee -a "$log_file" \
>(serial_sink /dev/ttyS0) \
>(serial_sink /dev/ttyS1)
# Exit with the wrapped command's status (first pipeline stage), not tee's.
exit "${PIPESTATUS[0]}"

View File

@@ -22,24 +22,33 @@ fi
log "module dir: $NVIDIA_KO_DIR" log "module dir: $NVIDIA_KO_DIR"
ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true ls "$NVIDIA_KO_DIR"/*.ko 2>/dev/null | sed 's/^/ /' || true
# Some kernels expose backlight helper symbols only after loading `video`. load_module() {
modprobe video >/dev/null 2>&1 && log "loaded helper module: video" || log "helper module unavailable: video" mod="$1"
shift
# Load modules via insmod (direct load — no depmod needed)
for mod in nvidia nvidia-modeset nvidia-uvm; do
ko="$NVIDIA_KO_DIR/${mod}.ko" ko="$NVIDIA_KO_DIR/${mod}.ko"
[ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod//-/_}.ko" [ -f "$ko" ] || ko="$NVIDIA_KO_DIR/${mod//-/_}.ko"
if [ -f "$ko" ]; then if [ ! -f "$ko" ]; then
if insmod "$ko"; then
log "loaded: $mod"
else
log "WARN: failed to load: $mod"
dmesg | tail -n 5 | sed 's/^/ dmesg: /' || true
fi
else
log "WARN: not found: $ko" log "WARN: not found: $ko"
return 1
fi fi
done if insmod "$ko" "$@"; then
log "loaded: $mod $*"
return 0
fi
log "WARN: failed to load: $mod"
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
return 1
}
# Load only the base NVIDIA kernel module on boot.
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
# be disabled via NVreg_EnableGpuFirmware=0. This keeps the live ISO on the
# conservative path until we have a stable repro for the observed boot crash.
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
exit 1
fi
log "skipping nvidia-modeset and nvidia-uvm during boot for stability"
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer) # Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}') nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
@@ -61,8 +70,6 @@ if [ -n "$uvm_major" ]; then
&& log "created /dev/nvidia-uvm (major $uvm_major)" \ && log "created /dev/nvidia-uvm (major $uvm_major)" \
|| log "WARN: /dev/nvidia-uvm already exists" || log "WARN: /dev/nvidia-uvm already exists"
mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true mknod -m 666 /dev/nvidia-uvm-tools c "$uvm_major" 1 || true
else
log "WARN: nvidia-uvm not in /proc/devices"
fi fi
log "done" log "done"