Compare commits

..

5 Commits
v10.15 ... main

Author SHA1 Message Date
Mikhail Chusavitin
7640f20714 Consolidate dist/ into cache/ and release/ subdirs
All intermediate build artifacts (binaries, live-build work dirs, overlay
stages, NVIDIA/NCCL/cuBLAS/john caches) now live under dist/cache/.
Final ISOs go to dist/release/ instead of scattered dist/easy-bee-v*/ and iso/out/.
dist/ is already gitignored, iso/out/ entry removed as redundant.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 12:28:47 +03:00
Mikhail Chusavitin
1593bf3e76 Add scripts/build.sh -- single entry point for ISO builds
Auto-detects build mode: remote VM if BUILDER_HOST is set in .env,
local Docker otherwise. Cache hardcoded to dist/container-cache (gitignored).
All flags forwarded to build-in-container.sh.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 12:24:09 +03:00
Mikhail Chusavitin
ae80d7711e Add continuous hardware health monitoring and component detail view
- kmsg watcher now records kernel errors (GPU Xid, MCE, EDAC, storage I/O) at all times,
  not only during SAT tasks; flushImmediate writes directly to ComponentStatusDB
- New health_poller: polls ipmitool sdr every 60s for PSU health (watchdog:psu source)
- Hardware Summary card auto-refreshes every 30s via htmx without page reload
- Component rows (CPU/Memory/Storage/GPU/PSU) are now clickable -- opens a modal
  with per-component status, source, timestamp and last 20 history entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:56:39 +03:00
Mikhail Chusavitin
ca78b9df65 Add initramfs-level Drive Wipe tool (bee.wipe=all)
Installs a local-premount initramfs hook that intercepts bee.wipe=all before
squashfs is mounted. Shows a numbered disk selection TUI (pure POSIX sh), wipes
selected disks (nvme format / blkdiscard / dd fallback), syncs, and reboots.
Works even when squashfs fails to mount.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:23:05 +03:00
Mikhail Chusavitin
5cafe63f33 Add Drive Wipe boot menu entry and overlay wipe script
Adds a "WIPE ALL DISKS" entry to both GRUB and isolinux menus (bee.wipe=all).
Includes bee-wipe-disks for manual use from a running live system.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:22:59 +03:00
14 changed files with 789 additions and 39 deletions

1
.gitignore vendored
View File

@@ -1,6 +1,5 @@
.env
.DS_Store
dist/
iso/out/
build-cache/
audit/bee

View File

@@ -1679,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
}
// ── Hardware summary / component detail ──────────────────────────────────────
// handleAPIHardwareSummary serves the hardware summary card as an HTML
// fragment; htmx polls it (hx-get="/api/hardware-summary" hx-swap="outerHTML").
// Responses are marked no-store so each poll re-renders fresh data.
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
	hdr := w.Header()
	hdr.Set("Content-Type", "text/html; charset=utf-8")
	hdr.Set("Cache-Control", "no-store")
	fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
}
// handleAPIComponentDetail returns an HTML fragment describing the current and
// historical status for one component type (cpu, memory, storage, gpu, psu).
// Unknown types get a 404. The type → status-key mapping mirrors the one used
// when rendering the Hardware Summary rows.
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
	// matchSpec describes which ComponentStatusDB keys belong to a type:
	// exact keys plus key prefixes.
	type matchSpec struct {
		title    string
		exact    []string
		prefixes []string
	}
	specs := map[string]matchSpec{
		"cpu":     {title: "CPU", exact: []string{"cpu:all"}},
		"memory":  {title: "Memory", exact: []string{"memory:all"}, prefixes: []string{"memory:"}},
		"storage": {title: "Storage", exact: []string{"storage:all"}, prefixes: []string{"storage:"}},
		"gpu":     {title: "GPU", prefixes: []string{"pcie:gpu:"}},
		"psu":     {title: "PSU", prefixes: []string{"psu:"}},
	}
	spec, ok := specs[r.PathValue("type")]
	if !ok {
		http.NotFound(w, r)
		return
	}
	var records []app.ComponentStatusRecord
	if h.opts.App != nil && h.opts.App.StatusDB != nil {
		records = matchedRecords(h.opts.App.StatusDB.All(), spec.exact, spec.prefixes)
	}
	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	w.Header().Set("Cache-Control", "no-store")
	fmt.Fprint(w, renderComponentDetail(spec.title, records))
}
func (h *handler) rollbackPendingNetworkChange() error {
h.pendingNetMu.Lock()
pnc := h.pendingNet

View File

@@ -0,0 +1,76 @@
package webui
import (
"bytes"
"context"
"log/slog"
"os/exec"
"time"
"bee/audit/internal/app"
"bee/audit/internal/collector"
)
// healthPollInterval is the cadence of periodic hardware health checks.
const healthPollInterval = 60 * time.Second
// psuIPMITimeout bounds a single `ipmitool sdr` invocation.
const psuIPMITimeout = 15 * time.Second
// healthPoller runs periodic health checks for hardware components that do not
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
type healthPoller struct {
statusDB *app.ComponentStatusDB // destination for per-component status records; may be nil
}
// newHealthPoller returns a poller that records into statusDB; call start to
// begin polling.
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
return &healthPoller{statusDB: statusDB}
}
// start launches run in a background goroutine via goRecoverLoop — presumably
// a panic-recovering restart loop with a 5s backoff; confirm against its
// definition elsewhere in this package. There is no stop mechanism: the
// goroutine lives for the process lifetime.
func (p *healthPoller) start() {
goRecoverLoop("health poller", 5*time.Second, p.run)
}
// run is the poll loop: one PSU poll per healthPollInterval tick, forever.
// It never returns on its own; it is intended to run inside goRecoverLoop.
func (p *healthPoller) run() {
	tick := time.NewTicker(healthPollInterval)
	defer tick.Stop()
	for {
		<-tick.C
		p.pollPSU()
	}
}
// pollPSU runs `ipmitool sdr`, parses PSU slot states from its output, and
// records one status entry per slot under key "psu:<slot>" with source
// "watchdog:psu". Hosts without IPMI are skipped silently (debug log only).
func (p *healthPoller) pollPSU() {
	if p.statusDB == nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
	defer cancel()
	var sdr bytes.Buffer
	cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
	cmd.Stdout = &sdr
	if err := cmd.Run(); err != nil {
		// IPMI not available or not a server — skip silently.
		slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
		return
	}
	slots := collector.PSUSlotsFromSDR(sdr.String())
	if len(slots) == 0 {
		return
	}
	const source = "watchdog:psu"
	for slot, psu := range slots {
		status := psu.Status
		if status == "" {
			status = "Unknown"
		}
		// Only non-OK states carry an explanatory detail string.
		var detail string
		switch status {
		case "Critical":
			detail = "PSU sensor reported non-OK state"
		case "Warning":
			detail = "PSU sensor in warning state"
		}
		p.statusDB.Record("psu:"+slot, source, status, detail)
	}
}

View File

@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
w.mu.Lock()
if w.window != nil {
w.recordEvent(evt)
} else {
evtCopy := evt
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
}
w.mu.Unlock()
}
@@ -180,6 +183,52 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
}
}
// flushImmediate writes a single kmsg event directly to the status DB without
// a SAT window. Called when an error is detected outside of any SAT task
// (always-on watching).
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
	if w.statusDB == nil {
		return
	}
	const source = "watchdog:kmsg"
	detail := "kernel: " + truncate(evt.raw, 120)

	// Severity comes from the first matching hardware-error pattern; anything
	// unmatched or non-critical is recorded as a warning.
	severity := "Warning"
	for _, pat := range platform.HardwareErrorPatterns {
		if pat.Re.MatchString(evt.raw) {
			if pat.Severity == "critical" {
				severity = "Critical"
			}
			break
		}
	}

	// No per-component IDs — attribute the event to the whole CPU (or memory)
	// subsystem based on category.
	if len(evt.ids) == 0 {
		key := "cpu:all"
		if evt.category == "memory" {
			key = "memory:all"
		}
		w.statusDB.Record(key, source, severity, detail)
		return
	}

	// One record per ID: storage events keyed by device ID, everything else
	// (gpu/pcie and unknown categories alike) by normalized PCIe BDF.
	for _, id := range evt.ids {
		key := "pcie:" + normalizeBDF(id)
		if evt.category == "storage" {
			key = "storage:" + id
		}
		w.statusDB.Record(key, source, severity, detail)
	}
}
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
// any pattern in platform.HardwareErrorPatterns.
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"

View File

@@ -85,6 +85,7 @@ func renderPage(page string, opts HandlerOptions) string {
body +
`</div></div>` +
renderAuditModal() +
`<dialog id="component-detail-dialog" style="min-width:600px;max-width:900px;width:90vw;padding:0;border:1px solid var(--border);border-radius:8px;background:var(--surface)"><div id="component-detail-body" style="padding-bottom:20px"></div></dialog>` +
`<script>
// Add copy button to every .terminal on the page
document.querySelectorAll('.terminal').forEach(function(t){
@@ -184,13 +185,14 @@ func renderAudit() string {
}
func renderHardwareSummaryCard(opts HandlerOptions) string {
const cardAttrs = ` hx-get="/api/hardware-summary" hx-trigger="every 30s" hx-swap="outerHTML"`
data, err := loadSnapshot(opts.AuditPath)
if err != nil {
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
return `<div class="card"` + cardAttrs + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
}
var ingest schema.HardwareIngestRequest
if err := json.Unmarshal(data, &ingest); err != nil {
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
return `<div class="card"` + cardAttrs + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
}
hw := ingest.Hardware
@@ -200,7 +202,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
b.WriteString(`<div class="card"` + cardAttrs + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
// Server identity block above the component table.
{
@@ -229,22 +231,32 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
}
b.WriteString(`<table style="width:auto">`)
writeRow := func(label, value, badgeHTML string) {
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
html.EscapeString(label), html.EscapeString(value), badgeHTML))
// writeRow renders one component row. compType is the URL path segment for the detail
// endpoint (e.g. "cpu"). Pass "" for rows that have no detail view.
writeRow := func(label, value, badgeHTML, compType string) {
var labelHTML string
if compType != "" {
labelHTML = fmt.Sprintf(
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" hx-get="/api/components/%s" hx-target="#component-detail-body" hx-swap="innerHTML" onclick="document.getElementById('component-detail-dialog').showModal()">%s</span>`,
compType, html.EscapeString(label))
} else {
labelHTML = html.EscapeString(label)
}
fmt.Fprintf(&b, `<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
labelHTML, html.EscapeString(value), badgeHTML)
}
writeRow("CPU", hwDescribeCPU(hw),
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)), "cpu")
writeRow("Memory", hwDescribeMemory(hw),
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})), "memory")
writeRow("Storage", hwDescribeStorage(hw),
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})), "storage")
writeRow("GPU", hwDescribeGPU(hw),
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})), "gpu")
psuMatched := matchedRecords(records, nil, []string{"psu:"})
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
@@ -252,10 +264,10 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
psuStatus := hwPSUStatus(hw.PowerSupplies)
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
}
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched), "psu")
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
writeRow("Network", nicDesc, "")
writeRow("Network", nicDesc, "", "")
}
b.WriteString(`</table>`)
@@ -999,3 +1011,67 @@ func rowIssueHTML(issue string) string {
}
return html.EscapeString(issue)
}
// renderComponentDetail renders a modal content fragment for one component
// type. Called by handleAPIComponentDetail and displayed inside
// #component-detail-dialog. Records are sorted by key for deterministic
// output; each record shows its current chip plus up to 20 history entries,
// newest first.
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
	var sb strings.Builder
	sb.WriteString(`<div style="padding:20px 24px 0">`)
	sb.WriteString(`<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:16px">`)
	fmt.Fprintf(&sb, `<span style="font-size:16px;font-weight:700">%s — Status Detail</span>`, html.EscapeString(title))
	sb.WriteString(`<button class="btn btn-sm btn-secondary" onclick="document.getElementById('component-detail-dialog').close()">Close</button>`)
	sb.WriteString(`</div>`)

	if len(records) == 0 {
		sb.WriteString(`<p style="color:var(--muted)">No status data recorded yet for this component type.</p>`)
		sb.WriteString(`</div>`)
		return sb.String()
	}

	sort.Slice(records, func(a, b int) bool {
		return records[a].ComponentKey < records[b].ComponentKey
	})

	for _, rec := range records {
		letter, cls := chipLetterClass(rec.Status)
		sb.WriteString(`<div style="margin-bottom:20px">`)
		sb.WriteString(`<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px">`)
		fmt.Fprintf(&sb, `<span class="chip %s">%s</span>`, cls, letter)
		fmt.Fprintf(&sb, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
		if !rec.LastCheckedAt.IsZero() {
			fmt.Fprintf(&sb, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
		}
		sb.WriteString(`</div>`)
		if rec.ErrorSummary != "" {
			fmt.Fprintf(&sb, `<div style="font-size:12px;margin-bottom:8px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
		}

		// History table — newest first, capped at the 20 most recent entries.
		hist := rec.History
		if n := len(hist); n > 20 {
			hist = hist[n-20:]
		}
		sb.WriteString(`<table style="width:100%;font-size:12px;border-collapse:collapse">`)
		sb.WriteString(`<tr style="color:var(--muted)"><th style="text-align:left;padding:2px 10px 2px 0;white-space:nowrap">Time</th><th style="text-align:left;padding:2px 10px 2px 0">Status</th><th style="text-align:left;padding:2px 10px 2px 0">Source</th><th style="text-align:left;padding:2px 0">Detail</th></tr>`)
		for idx := len(hist) - 1; idx >= 0; idx-- {
			entry := hist[idx]
			entryLetter, entryCls := chipLetterClass(entry.Status)
			detail := entry.Detail
			if detail == "" {
				detail = "—"
			}
			fmt.Fprintf(&sb,
				`<tr><td style="padding:3px 10px 3px 0;white-space:nowrap;color:var(--muted)">%s</td><td style="padding:3px 10px 3px 0"><span class="chip %s" style="font-size:10px;width:16px;height:16px">%s</span></td><td style="padding:3px 10px 3px 0;white-space:nowrap">%s</td><td style="padding:3px 0;color:var(--muted)">%s</td></tr>`,
				html.EscapeString(entry.At.Format("2006-01-02 15:04:05")),
				entryCls, entryLetter,
				html.EscapeString(entry.Source),
				html.EscapeString(detail),
			)
		}
		sb.WriteString(`</table>`)
		sb.WriteString(`</div>`)
	}
	sb.WriteString(`</div>`)
	return sb.String()
}

View File

@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
h.kmsg.start()
globalQueue.kmsgWatcher = h.kmsg
// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
if opts.App.StatusDB != nil {
newHealthPoller(opts.App.StatusDB).start()
}
}
globalQueue.startWorker(&opts)
@@ -328,6 +333,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
// Hardware component detail (fragment for modal in Hardware Summary card)
mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)

View File

@@ -8,7 +8,7 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/cache}"
AUTH_KEYS=""
CLEAN_CACHE=0
VARIANT="all"
@@ -54,14 +54,14 @@ if [ "$CLEAN_CACHE" = "1" ]; then
"${CACHE_DIR:?}/bee" \
"${CACHE_DIR:?}/lb-packages"
echo "=== cleaning live-build work dirs ==="
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia"
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia-legacy"
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-amd"
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nogpu"
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia"
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia-legacy"
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-amd"
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nogpu"
echo "=== caches cleared, proceeding with build ==="
fi

View File

@@ -51,8 +51,8 @@ case "$BUILD_VARIANT" in
;;
esac
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
BUILD_WORK_DIR="${DIST_DIR}/cache/live-build-work-${BUILD_VARIANT}"
OVERLAY_STAGE_DIR="${DIST_DIR}/cache/overlay-stage-${BUILD_VARIANT}"
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
@@ -63,7 +63,7 @@ export PATH="$PATH:/usr/local/go/bin"
# Allow git to read the bind-mounted repo (different UID inside container).
git config --global safe.directory "${REPO_ROOT}"
mkdir -p "${DIST_DIR}"
mkdir -p "${DIST_DIR}/cache" "${DIST_DIR}/release"
mkdir -p "${CACHE_ROOT}"
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
@@ -1105,7 +1105,7 @@ PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
SQUASHFS_FILENAME="filesystem-v${PROJECT_VERSION_EFFECTIVE}.squashfs"
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
OUT_DIR="${DIST_DIR}/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
OUT_DIR="${DIST_DIR}/release/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
mkdir -p "${OUT_DIR}"
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
@@ -1290,7 +1290,7 @@ run_step "sync git submodules" "05-git-submodules" \
# --- compile bee binary (static, Linux amd64) ---
# Shared between variants — built once, reused on second pass.
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
BEE_BIN="${DIST_DIR}/cache/bee-linux-amd64"
NEED_BUILD=1
if [ -f "$BEE_BIN" ]; then
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
@@ -1321,16 +1321,16 @@ else
fi
# --- NVIDIA-only build steps ---
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
GPU_BURN_WORKER_BIN="${DIST_DIR}/cache/bee-gpu-burn-worker-linux-amd64"
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
"${DIST_DIR}/cache"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
CUBLAS_CACHE="${DIST_DIR}/cache/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
echo "=== bee-gpu-burn FP4 header probe ==="
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
@@ -1456,7 +1456,7 @@ fi
# --- copy bee binary into overlay ---
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
cp "$BEE_BIN" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
@@ -1486,10 +1486,10 @@ done
# --- NVIDIA kernel modules and userspace libs ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}/cache" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
KVER="${DEBIAN_KERNEL_ABI}-amd64"
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
NVIDIA_CACHE="${DIST_DIR}/cache/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
# Inject .ko files into overlay at /usr/local/lib/nvidia/
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
@@ -1515,9 +1515,9 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
# --- build / download NCCL ---
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}/cache" "${NCCL_SHA256:-}"
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
NCCL_CACHE="${DIST_DIR}/cache/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
@@ -1533,19 +1533,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}" \
"${DIST_DIR}/cache" \
"${NVCC_VERSION}" \
"${DEBIAN_VERSION}"
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
NCCL_TESTS_CACHE="${DIST_DIR}/cache/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
echo "=== all_reduce_perf injected ==="
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}/cache"
JOHN_CACHE="${DIST_DIR}/cache/john-${JOHN_JUMBO_COMMIT}"
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"

View File

@@ -16,6 +16,11 @@ menuentry "EASY-BEE v@VERSION@ -- no GUI / no X11" {
}
menuentry "*** WIPE ALL DISKS (irreversible!) ***" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
initrd @INITRD_LIVE@
}
if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi

View File

@@ -41,6 +41,12 @@ label live-@FLAVOUR@-failsafe
initrd @INITRD@
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
label wipe-disks
menu label *** WIPE ALL DISKS (irreversible!) ***
linux @LINUX@
initrd @INITRD@
append @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin

View File

@@ -0,0 +1,57 @@
#!/bin/sh
# 9012-wipe.hook.chroot
#
# Adds bee-initramfs-wipe to the initramfs so that selecting the
# "WIPE ALL DISKS" boot menu entry runs the wipe tool before squashfs
# is mounted — i.e. it works even when live boot fails.
#
# Two files are installed inside the chroot:
#   /etc/initramfs-tools/hooks/bee-wipe                  — copies binaries into initrd
#   /etc/initramfs-tools/scripts/local-premount/bee-wipe — runs at boot
set -e
HOOK_DIR="/etc/initramfs-tools/hooks"
SCRIPT_DIR="/etc/initramfs-tools/scripts/local-premount"
mkdir -p "${HOOK_DIR}" "${SCRIPT_DIR}"
# ── initramfs hook: copy binaries ────────────────────────────────────────────
cat > "${HOOK_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac
. /usr/share/initramfs-tools/hook-functions
# Wipe helpers. FIX: resolve nvme via PATH like the others instead of the
# hard-coded /usr/sbin/nvme, so it is still copied if packaged elsewhere.
# Any binary missing from the chroot is simply skipped.
for bin in lsblk blkid blkdiscard blockdev nvme; do
b=$(command -v "$bin" 2>/dev/null) && copy_exec "$b" /bin
done
copy_exec /usr/local/bin/bee-initramfs-wipe /bin/bee-wipe
EOF
chmod +x "${HOOK_DIR}/bee-wipe"
# ── initramfs premount script: trigger on bee.wipe=all ───────────────────────
cat > "${SCRIPT_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac
# FIX: escape the dot — as a regex metacharacter it matched any character
# (e.g. a hypothetical "beeXwipe=all" token would also have triggered a wipe).
grep -qw 'bee\.wipe=all' /proc/cmdline 2>/dev/null || exit 0
exec /bin/bee-wipe
EOF
chmod +x "${SCRIPT_DIR}/bee-wipe"
echo "9012-wipe: installed initramfs hook and premount script"
# Rebuild the initramfs for the newest installed kernel so the hook is baked in.
KVER=$(ls /lib/modules | sort -V | tail -1)
echo "9012-wipe: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9012-wipe: done"

View File

@@ -0,0 +1,166 @@
#!/bin/sh
# bee-initramfs-wipe — interactive disk wipe running entirely in the initramfs.
# Triggered by bee.wipe=all on the kernel cmdline (via local-premount hook).
# Works before squashfs is mounted, so it runs even when live boot fails.
# ANSI color codes for the selection TUI (bold red/yellow/green/cyan + reset).
RED='\033[1;31m'
YEL='\033[1;33m'
GRN='\033[1;32m'
CYN='\033[1;36m'
NC='\033[0m'
# p prints its arguments with %b escape interpretation and a trailing newline;
# pp is the same without the newline (used for inline prompts).
p() { printf '%b\n' "$*"; }
pp() { printf '%b' "$*"; }
# banner draws the red warning box shown before any disk selection.
banner() {
p ""
p "${RED}╔══════════════════════════════════════════════════════════╗${NC}"
p "${RED}║ BEE DRIVE WIPE — initramfs stage ║${NC}"
p "${RED}╚══════════════════════════════════════════════════════════╝${NC}"
p ""
}
# ── find boot device ─────────────────────────────────────────────────────────
# Prints the whole-disk device backing the live boot medium (e.g. /dev/sdb),
# or nothing if it cannot be determined.
#
# FIX: the previous sed suffix-strip (s/p\?[0-9]\+$//) mangled unpartitioned
# NVMe devices — /dev/nvme0n1 became /dev/nvme0n — so the boot disk was NOT
# excluded from the wipe list. Resolve partition → parent disk via lsblk
# PKNAME instead (lsblk is copied into the initramfs by the hook); an empty
# PKNAME means the device is already a whole disk.
boot_dev() {
local label token dev parent
for token in $(cat /proc/cmdline 2>/dev/null); do
case "$token" in
live-media-label=*) label="${token#*=}" ;;
esac
done
[ -z "$label" ] && return
dev=$(blkid -L "$label" 2>/dev/null) || return
parent=$(lsblk -d -n -o PKNAME "$dev" 2>/dev/null | head -n 1)
if [ -n "$parent" ]; then
echo "/dev/$parent"
else
echo "$dev"
fi
}
# ── enumerate candidate disks ─────────────────────────────────────────────────
# Emits one tab-separated line per wipe candidate: "<dev>\t<size>\t<model>".
# Whole disks only; zero-size virtual media and the boot device are skipped.
list_disks() {
local exclude entry name kind cap mdl
exclude=$(boot_dev)
lsblk -d -n -o NAME,TYPE,SIZE,MODEL 2>/dev/null | while read -r name kind cap mdl; do
[ "$kind" = "disk" ] || continue
[ "$cap" = "0B" ] && continue
entry="/dev/$name"
[ "$entry" = "$exclude" ] && continue
printf '%s\t%s\t%s\n' "$entry" "$cap" "${mdl:-}"
done
}
# ── wipe one disk ─────────────────────────────────────────────────────────────
# Strategy, fastest first: NVMe secure format → blkdiscard (TRIM) → dd zeroing
# of the first and last 32 MB (kills the partition table and the GPT backup).
wipe_one() {
local dev="$1" size_bytes mb32 skip
p ""
p "=== ${YEL}${dev}${NC} ==="
if echo "$dev" | grep -q '^/dev/nvme'; then
if nvme format --ses=1 "$dev" 2>&1; then
p " ${GRN}nvme format OK${NC}"
blockdev --flushbufs "$dev" 2>/dev/null || true
return
fi
p " nvme format failed — falling back to blkdiscard"
fi
if blkdiscard -f "$dev" 2>&1; then
p " ${GRN}blkdiscard OK${NC}"
blockdev --flushbufs "$dev" 2>/dev/null || true
return
fi
p " blkdiscard not supported — zeroing partition tables (HDD fallback)"
size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
mb32=$(( 32 * 1024 * 1024 ))
# FIX: dropped status=progress. In the initramfs dd is the busybox applet,
# which may reject the operand; that failure was masked by "|| true", so the
# zeroing could be skipped silently while the script still reported success.
dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync 2>&1 || true
if [ "$size_bytes" -gt $(( mb32 * 2 )) ]; then
skip=$(( (size_bytes - mb32) / (4 * 1024 * 1024) ))
dd if=/dev/zero of="$dev" bs=4M count=8 seek="$skip" conv=fsync 2>&1 || true
fi
blockdev --flushbufs "$dev" 2>/dev/null || true
p " ${GRN}done (partition tables zeroed)${NC}"
}
# ── main ──────────────────────────────────────────────────────────────────────
banner
BOOT=$(boot_dev)
[ -n "$BOOT" ] && p "Boot device (excluded): ${CYN}${BOOT}${NC}\n"
# Build an indexed list of candidate disks. Split list_disks output on
# newlines only, since disk models may contain spaces.
i=0
DEVS=""
IFS='
'
for line in $(list_disks); do
i=$(( i + 1 ))
dev=$(echo "$line" | cut -f1)
size=$(echo "$line" | cut -f2)
model=$(echo "$line" | cut -f3)
DEVS="${DEVS}${i}:${dev}:${size}:${model}
"
printf " ${CYN}[%d]${NC} %-16s %8s %s\n" "$i" "$dev" "$size" "$model"
done
# BUGFIX: restore default word splitting. IFS was left as newline-only, so
# "for num in $SELECTION" and "for dev in $SELECTED" below never split on
# spaces — any multi-disk selection (including "all" with >1 disk) passed a
# single bogus "/dev/sda /dev/sdb" token to wipe_one and wiped nothing.
unset IFS
if [ "$i" -eq 0 ]; then
p "\nNo physical disks found (boot device excluded)."
p "Dropping to shell — type 'exit' to continue boot."
exec /bin/sh
fi
p ""
pp "Enter numbers to wipe (space-separated), ${YEL}all${NC} for all, ${YEL}q${NC} to abort: "
read -r SELECTION
case "$SELECTION" in
q|Q|'') p "\nAborted."; exec /bin/sh ;;
esac
# resolve selection → list of devs
SELECTED=""
if [ "$SELECTION" = "all" ] || [ "$SELECTION" = "ALL" ]; then
SELECTED=$(echo "$DEVS" | grep -v '^$' | cut -d: -f2 | tr '\n' ' ')
else
for num in $SELECTION; do
match=$(echo "$DEVS" | grep "^${num}:" | cut -d: -f2)
if [ -z "$match" ]; then
p "${RED}Unknown index: ${num}${NC}"; exec /bin/sh
fi
SELECTED="${SELECTED}${match} "
done
fi
SELECTED=$(echo "$SELECTED" | tr -s ' ' | sed 's/ $//')
p ""
p "Selected for wipe: ${YEL}${SELECTED}${NC}"
p "${RED}WARNING: This is IRREVERSIBLE. All data on the selected disks will be lost.${NC}"
p ""
pp "Type YES to confirm, anything else to abort: "
read -r CONFIRM
if [ "$CONFIRM" != "YES" ]; then
p "\nAborted — no disks were touched."
exec /bin/sh
fi
p "\nStarting wipe..."
for dev in $SELECTED; do
wipe_one "$dev"
done
sync
p ""
p "${GRN}=== All selected disks wiped and flushed. ===${NC}"
p ""
pp "Press Enter to reboot..."
read -r _
reboot

View File

@@ -0,0 +1,132 @@
#!/bin/bash
# bee-wipe-disks — erase all physical disks (interactive, confirmation required)
#
# Triggered automatically when the kernel cmdline contains bee.wipe=all.
# Can also be run manually from a root shell.
#
# Wipe strategy:
# NVMe — nvme format (ATA-style secure erase, fast)
# Other — blkdiscard -f (TRIM/UNMAP, fast on SSDs)
# dd if=/dev/zero (fallback for HDDs, zeros first+last 32 MB)
# Abort on any unguarded command failure, unset variable, or pipe failure.
set -euo pipefail
# ANSI color escapes; $'…' is bash-only syntax (matches the bash shebang).
RED=$'\033[1;31m'
YEL=$'\033[1;33m'
GRN=$'\033[1;32m'
NC=$'\033[0m'
banner() {
    # Print the red warning banner shown before any destructive action.
    printf '%s\n' \
        "" \
        "${RED}╔══════════════════════════════════════════════════════════╗${NC}" \
        "${RED}║ BEE DISK WIPE — ALL DATA WILL BE DESTROYED ║${NC}" \
        "${RED}╚══════════════════════════════════════════════════════════╝${NC}" \
        ""
}
# ── find boot device to skip ──────────────────────────────────────────────────
live_dev() {
    # Print the whole-disk device backing the live boot medium, so it can
    # be excluded from wiping.  Prints nothing if the medium is unresolvable.
    local src pk typ
    src=$(findmnt -n -o SOURCE /run/live/medium 2>/dev/null || true)
    [ -z "$src" ] && return
    # Prefer lsblk's parent lookup: the sed fallback mangles whole-disk
    # NVMe sources (/dev/nvme0n1 → /dev/nvme0n, because the trailing "1"
    # matches even without a "p" prefix).
    pk=$(lsblk -n -o PKNAME "$src" 2>/dev/null | head -n1)
    if [ -n "$pk" ]; then
        echo "/dev/$pk"
        return
    fi
    # No parent → the source may already be a whole disk; report it as-is.
    typ=$(lsblk -d -n -o TYPE "$src" 2>/dev/null || true)
    if [ "$typ" = "disk" ]; then
        echo "$src"
        return
    fi
    # Last resort: strip partition suffix (/dev/sdb1 → /dev/sdb,
    # /dev/nvme0n1p1 → /dev/nvme0n1).
    echo "$src" | sed 's/p\?[0-9]\+$//'
}
# ── enumerate target disks ────────────────────────────────────────────────────
find_disks() {
    # Emit one "dev<TAB>size<TAB>model" line per wipe candidate:
    # physical disks only, non-empty media, live-boot device excluded.
    local excluded
    excluded=$(live_dev)
    lsblk -d -n -o NAME,TYPE,SIZE,MODEL | while read -r name type size model; do
        case "$type" in
            disk) ;;                          # keep real disks only
            *) continue ;;
        esac
        [ "$size" != "0B" ] || continue       # empty virtual media
        local candidate="/dev/$name"
        [ "$candidate" != "$excluded" ] || continue  # skip boot device
        printf '%s\t%s\t%s\n' "$candidate" "$size" "$model"
    done
}
# ── wipe one disk ─────────────────────────────────────────────────────────────
wipe_disk() {
    # Erase a single disk, fastest method first:
    #   1. nvme format --ses=1 (NVMe user-data erase)
    #   2. blkdiscard -f        (TRIM/UNMAP)
    #   3. dd zero of first+last 32 MB (partition tables / backup GPT)
    local dev="$1"
    echo ""
    echo "=== ${YEL}${dev}${NC} ==="
    if echo "$dev" | grep -q '^/dev/nvme'; then
        # NVMe format (ses=1 = user data erase)
        if nvme format --ses=1 "$dev" 2>&1; then
            echo " ${GRN}nvme format OK${NC}"
            return
        fi
        echo " nvme format failed, falling back to blkdiscard"
    fi
    if blkdiscard -f "$dev" 2>&1; then
        echo " ${GRN}blkdiscard OK${NC}"
        return
    fi
    echo " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    local size_bytes
    # Guarded: under `set -e` an unguarded blockdev failure would abort the
    # whole wipe run mid-way; 0 simply skips the backup-GPT pass below.
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    local mb32=$(( 32 * 1024 * 1024 ))
    # Zero first 32 MB (MBR, GPT, filesystem superblocks)
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync status=progress 2>&1 || true
    # Zero last 32 MB (backup GPT)
    if [ "$size_bytes" -gt $(( mb32 * 2 )) ]; then
        local skip=$(( (size_bytes - mb32) / (4 * 1024 * 1024) ))
        dd if=/dev/zero of="$dev" bs=4M count=8 seek="$skip" conv=fsync status=progress 2>&1 || true
    fi
    # Drop cached buffers so stale partition data is not re-read
    # (same post-wipe step as the initramfs wipe_one helper).
    blockdev --flushbufs "$dev" 2>/dev/null || true
    echo " ${GRN}done (partition tables zeroed)${NC}"
}
# ── main ──────────────────────────────────────────────────────────────────────
banner
# Enumerate candidates exactly once: the original called find_disks twice,
# so the confirmed list and the wiped list could disagree if a device
# appeared or disappeared between the two calls.
DISK_TABLE=$(find_disks)
mapfile -t DISKS < <(printf '%s\n' "$DISK_TABLE" | awk 'NF {print $1}')
if [ ${#DISKS[@]} -eq 0 ]; then
    echo "No physical disks found (boot device excluded)."
    echo "Nothing to wipe."
    exit 0
fi
echo "Disks to be ${RED}COMPLETELY ERASED${NC}:"
echo ""
# Re-print from the captured table so the user confirms exactly what
# will be wiped.
printf '%s\n' "$DISK_TABLE" | while IFS=$'\t' read -r dev size model; do
    printf " ${YEL}%-16s${NC} %8s %s\n" "$dev" "$size" "$model"
done
echo ""
echo "${RED}WARNING: This is IRREVERSIBLE. All data on the listed disks will be lost.${NC}"
echo ""
printf "Type YES to confirm wipe, anything else to abort: "
# `|| CONFIRM=""` keeps set -e from killing the script on EOF (non-tty).
read -r CONFIRM || CONFIRM=""
if [ "$CONFIRM" != "YES" ]; then
    echo ""
    echo "Aborted — no disks were touched."
    exit 0
fi
echo ""
echo "Starting wipe..."
for dev in "${DISKS[@]}"; do
    wipe_disk "$dev"
done
# Flush to stable storage before offering a reboot (matches the
# initramfs wipe script).
sync
echo ""
echo "${GRN}=== All disks wiped. ===${NC}"
echo ""
printf "Reboot now to return to the boot menu? [Y/n] "
read -r REBOOT || REBOOT=Y
case "${REBOOT:-Y}" in
    [Nn]*) echo "You can reboot manually when ready." ;;
    *) echo "Rebooting..."; sleep 2; reboot ;;
esac

125
scripts/build.sh Executable file
View File

@@ -0,0 +1,125 @@
#!/bin/sh
# build.sh -- single entry point for ISO builds.
#
# Local build (default):
# sh scripts/build.sh
# sh scripts/build.sh --variant nvidia
# sh scripts/build.sh --clean-build
#
# Remote build (set BUILDER_HOST + BUILDER_USER in .env):
# sh scripts/build.sh
# sh scripts/build.sh --authorized-keys ~/.ssh/authorized_keys
#
# All flags are forwarded to build-in-container.sh.
set -e
# Resolve the repo root relative to this script so it works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
# Optional user config; may define BUILDER_HOST / BUILDER_USER (remote mode).
if [ -f "$ENV_FILE" ]; then
# shellcheck disable=SC1090
. "$ENV_FILE"
fi
# Empty defaults: an unset BUILDER_HOST selects the local build path below.
BUILDER_HOST="${BUILDER_HOST:-}"
BUILDER_USER="${BUILDER_USER:-}"
# Cache lives inside the repo under dist/ (gitignored).
CACHE_DIR="${REPO_ROOT}/dist/cache"
# Forward all arguments as-is to the underlying build script.
# NOTE(review): "$*" flattens the argument vector into one string, so a
# flag value containing whitespace loses its quoting downstream — confirm
# callers never pass such values.
EXTRA_ARGS="$*"
# ── Remote build ────────────────────────────────────────────────────────────
if [ -n "$BUILDER_HOST" ]; then
if [ -z "$BUILDER_USER" ]; then
echo "ERROR: BUILDER_USER not set. Set it in .env."
exit 1
fi
echo "=== bee builder (remote: ${BUILDER_USER}@${BUILDER_HOST}) ==="
echo ""
# Refuse to build unless local HEAD matches origin/main: the VM pulls and
# builds the remote repo, so an out-of-sync tree would build the wrong code.
cd "${REPO_ROOT}"
git fetch --quiet origin main
LOCAL=$(git rev-parse HEAD)
REMOTE=$(git rev-parse origin/main)
if [ "$LOCAL" != "$REMOTE" ]; then
echo "ERROR: local repo is not in sync with remote."
echo " local: $LOCAL"
echo " remote: $REMOTE"
echo ""
echo "Push or pull before building:"
echo " git push -- if you have unpushed commits"
echo " git pull -- if remote is ahead"
exit 1
fi
echo "repo: in sync with remote ($LOCAL)"
echo ""
# Unquoted heredoc: ${BUILDER_USER}/${EXTRA_ARGS} expand here, \$-escaped
# names expand on the builder VM.  The build runs in a detached screen
# session so it survives SSH disconnects; its exit code is persisted to
# /tmp/bee-build-exit on the VM and re-raised as this ssh invocation's
# exit status (set -e then aborts the wrapper on failure).
ssh -o StrictHostKeyChecking=no "${BUILDER_USER}@${BUILDER_HOST}" /bin/sh <<ENDSSH
set -e
REPO="/home/${BUILDER_USER}/bee"
LOG=/tmp/bee-build.log
if [ ! -d "\$REPO/.git" ]; then
echo "--- cloning bee repo ---"
git clone https://git.mchus.pro/reanimator/bee.git "\$REPO"
fi
cd "\$REPO"
echo "--- pulling latest ---"
sudo git checkout -- .
git pull --ff-only
chmod +x iso/overlay/usr/local/bin/* 2>/dev/null || true
screen -S bee-build -X quit 2>/dev/null || true
echo "--- starting build in screen session (survives SSH disconnect) ---"
echo "--- log: \$LOG ---"
screen -dmS bee-build sh -c "sh iso/builder/build-in-container.sh --cache-dir \$REPO/dist/cache ${EXTRA_ARGS} > \$LOG 2>&1; echo \$? > /tmp/bee-build-exit"
echo "--- streaming build log (Ctrl+C safe -- build continues on VM) ---"
tail -n +1 -f "\$LOG" 2>/dev/null &
TAIL_PID=\$!
while screen -list 2>/dev/null | grep -q bee-build; do
sleep 2
done
sleep 1
kill \$TAIL_PID 2>/dev/null || true
tail -n 20 "\$LOG" 2>/dev/null || true
EXIT_CODE=\$(cat /tmp/bee-build-exit 2>/dev/null || echo 1)
exit \$EXIT_CODE
ENDSSH
echo ""
echo "=== downloading ISO ==="
# NOTE(review): the ISO is fetched from dist/*.iso on the builder —
# confirm this matches where build-in-container.sh writes final images
# (a recent layout change moved releases to dist/release/).
LOCAL_ISO_DIR="${REPO_ROOT}/dist/release"
mkdir -p "${LOCAL_ISO_DIR}"
# Prefer rsync (compression, progress, resume) when both ends have it;
# fall back to scp otherwise.
if command -v rsync >/dev/null 2>&1 && ssh -o StrictHostKeyChecking=no "${BUILDER_USER}@${BUILDER_HOST}" command -v rsync >/dev/null 2>&1; then
rsync -az --progress \
-e "ssh -o StrictHostKeyChecking=no" \
"${BUILDER_USER}@${BUILDER_HOST}:/home/${BUILDER_USER}/bee/dist/*.iso" \
"${LOCAL_ISO_DIR}/"
else
scp -o StrictHostKeyChecking=no \
"${BUILDER_USER}@${BUILDER_HOST}:/home/${BUILDER_USER}/bee/dist/*.iso" \
"${LOCAL_ISO_DIR}/"
fi
echo ""
echo "=== build complete ==="
echo "ISO saved to: ${LOCAL_ISO_DIR}/"
ls -lh "${LOCAL_ISO_DIR}/"*.iso 2>/dev/null || true
exit 0
fi
# ── Local build ─────────────────────────────────────────────────────────────
echo "=== bee builder (local) ==="
echo "cache: ${CACHE_DIR}"
echo ""
# Forward the original argument vector verbatim: "$@" preserves each
# flag's quoting, which the flattened $EXTRA_ARGS string (built from "$*")
# cannot — a flag value containing spaces would be re-split here.
exec sh "${REPO_ROOT}/iso/builder/build-in-container.sh" --cache-dir "${CACHE_DIR}" "$@"