Unify live RAM runtime state

This commit is contained in:
Mikhail Chusavitin
2026-04-14 16:18:33 +03:00
parent 2be7ae6d28
commit 54338dbae5
9 changed files with 277 additions and 72 deletions

View File

@@ -84,6 +84,7 @@ type installer interface {
InstallToDisk(ctx context.Context, device string, logFile string) error
IsLiveMediaInRAM() bool
LiveBootSource() platform.LiveBootSource
LiveMediaRAMState() platform.LiveMediaRAMState
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
}
@@ -108,6 +109,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
return a.installer.LiveBootSource()
}
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
return a.installer.LiveMediaRAMState()
}
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
return a.installer.RunInstallToRAM(ctx, logFunc)
}

View File

@@ -11,20 +11,10 @@ import (
"strings"
)
const installToRAMDir = "/dev/shm/bee-live"
func (s *System) IsLiveMediaInRAM() bool {
fsType := mountFSType("/run/live/medium")
if fsType == "" {
// No medium mount at all — fall back to toram kernel parameter.
return toramActive()
}
if strings.EqualFold(fsType, "tmpfs") {
return true
}
// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
// mount of /run/live/medium fails (common for CD-ROM boots), the medium
// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
return len(files) > 0
return s.LiveMediaRAMState().InRAM
}
func (s *System) LiveBootSource() LiveBootSource {
@@ -56,14 +46,95 @@ func (s *System) LiveBootSource() LiveBootSource {
return status
}
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
return evaluateLiveMediaRAMState(
s.LiveBootSource(),
toramActive(),
globPaths("/run/live/medium/live/*.squashfs"),
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
)
}
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
state := LiveMediaRAMState{
LiveBootSource: status,
ToramActive: toram,
CopyPresent: len(copiedSquashfs) > 0,
}
if status.InRAM {
state.State = "in_ram"
state.Status = "ok"
state.CopyComplete = true
state.Message = "Running from RAM — installation media can be safely disconnected."
return state
}
expected := pathBaseSet(sourceSquashfs)
copied := pathBaseSet(copiedSquashfs)
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
switch {
case state.CopyComplete:
state.State = "partial"
state.Status = "partial"
state.CanStartCopy = true
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
case state.CopyPresent:
state.State = "partial"
state.Status = "partial"
state.CanStartCopy = true
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
case toram:
state.State = "toram_failed"
state.Status = "failed"
state.CanStartCopy = true
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
default:
state.State = "not_in_ram"
state.Status = "warning"
state.CanStartCopy = true
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
}
return state
}
func globPaths(pattern string) []string {
matches, _ := filepath.Glob(pattern)
return matches
}
func pathBaseSet(paths []string) map[string]struct{} {
out := make(map[string]struct{}, len(paths))
for _, path := range paths {
base := strings.TrimSpace(filepath.Base(path))
if base != "" {
out[base] = struct{}{}
}
}
return out
}
func setContainsAll(have, want map[string]struct{}) bool {
if len(want) == 0 {
return false
}
for name := range want {
if _, ok := have[name]; !ok {
return false
}
}
return true
}
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
log := func(msg string) {
if logFunc != nil {
logFunc(msg)
}
}
if s.IsLiveMediaInRAM() {
state := s.LiveMediaRAMState()
if state.InRAM {
log("Already running from RAM — installation media can be safely disconnected.")
return nil
}
@@ -88,10 +159,21 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
humanBytes(needed+headroom), humanBytes(free))
}
dstDir := "/dev/shm/bee-live"
dstDir := installToRAMDir
if state.CopyPresent {
log("Removing stale partial RAM copy before retry...")
}
_ = os.RemoveAll(dstDir)
if err := os.MkdirAll(dstDir, 0755); err != nil {
return fmt.Errorf("create tmpfs dir: %v", err)
}
defer func() {
if retErr == nil {
return
}
_ = os.RemoveAll(dstDir)
log("Removed incomplete RAM copy.")
}()
for _, sf := range squashfsFiles {
if err := ctx.Err(); err != nil {

View File

@@ -58,3 +58,46 @@ func TestDescribeLiveBootSource(t *testing.T) {
t.Fatalf("got %q want /run/live/medium", got)
}
}
func TestEvaluateLiveMediaRAMState(t *testing.T) {
t.Parallel()
t.Run("in_ram", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
false,
nil,
nil,
)
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
})
t.Run("partial_copy_after_cancel", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
false,
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
)
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
if state.CopyComplete {
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
}
})
t.Run("toram_failed", func(t *testing.T) {
state := evaluateLiveMediaRAMState(
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
true,
nil,
nil,
)
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
t.Fatalf("state=%+v", state)
}
})
}

View File

@@ -171,25 +171,28 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
return ToolStatus{Name: display}
}
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
// incomplete RAM copy exists but runtime still depends on the boot medium,
// "failed" = toram was requested but medium is not in RAM.
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
inRAM := s.IsLiveMediaInRAM()
active := toramActive()
switch {
case inRAM:
health.ToRAMStatus = "ok"
case active:
// toram was requested but medium is not yet/no longer in RAM
health.ToRAMStatus = "failed"
state := s.LiveMediaRAMState()
health.ToRAMStatus = state.Status
switch state.Status {
case "ok":
return
case "failed":
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "toram_copy_failed",
Severity: "warning",
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
Description: state.Message,
})
case "partial":
health.Issues = append(health.Issues, schema.RuntimeIssue{
Code: "toram_copy_partial",
Severity: "warning",
Description: state.Message,
})
default:
health.ToRAMStatus = "warning"
}
}
@@ -211,13 +214,13 @@ func findUSBExportMount() string {
// fs types that are expected on USB export drives
exportFSTypes := map[string]bool{
"vfat": true,
"exfat": true,
"ext2": true,
"ext3": true,
"ext4": true,
"ntfs": true,
"ntfs3": true,
"vfat": true,
"exfat": true,
"ext2": true,
"ext3": true,
"ext4": true,
"ntfs": true,
"ntfs3": true,
"fuseblk": true,
}

View File

@@ -9,6 +9,17 @@ type LiveBootSource struct {
Device string `json:"device,omitempty"`
}
type LiveMediaRAMState struct {
LiveBootSource
State string `json:"state"`
Status string `json:"status"`
ToramActive bool `json:"toram_active,omitempty"`
CopyPresent bool `json:"copy_present,omitempty"`
CopyComplete bool `json:"copy_complete,omitempty"`
CanStartCopy bool `json:"can_start_copy,omitempty"`
Message string `json:"message,omitempty"`
}
type InterfaceInfo struct {
Name string
State string

View File

@@ -15,17 +15,17 @@ type HardwareIngestRequest struct {
}
type RuntimeHealth struct {
Status string `json:"status"`
CheckedAt string `json:"checked_at"`
ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"`
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
ToRAMStatus string `json:"toram_status,omitempty"`
Status string `json:"status"`
CheckedAt string `json:"checked_at"`
ExportDir string `json:"export_dir,omitempty"`
DriverReady bool `json:"driver_ready,omitempty"`
CUDAReady bool `json:"cuda_ready,omitempty"`
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
NetworkStatus string `json:"network_status,omitempty"`
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
ToRAMStatus string `json:"toram_status,omitempty"`
// USBExportPath: mount point of the first writable USB drive found, empty if none.
USBExportPath string `json:"usb_export_path,omitempty"`
USBExportPath string `json:"usb_export_path,omitempty"`
Issues []RuntimeIssue `json:"issues,omitempty"`
Tools []RuntimeToolStatus `json:"tools,omitempty"`
Services []RuntimeServiceStatus `json:"services,omitempty"`

View File

@@ -526,14 +526,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
return
}
var body struct {
Duration int `json:"duration"`
StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
ParallelGPUs bool `json:"parallel_gpus"`
Loader string `json:"loader"`
var body struct {
Duration int `json:"duration"`
StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
ParallelGPUs bool `json:"parallel_gpus"`
Loader string `json:"loader"`
Profile string `json:"profile"`
DisplayName string `json:"display_name"`
PlatformComponents []string `json:"platform_components"`
@@ -549,14 +549,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
params := taskParams{
Duration: body.Duration,
StressMode: body.StressMode,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
ParallelGPUs: body.ParallelGPUs,
Loader: body.Loader,
params := taskParams{
Duration: body.Duration,
StressMode: body.StressMode,
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
ParallelGPUs: body.ParallelGPUs,
Loader: body.Loader,
BurnProfile: body.Profile,
DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents,
@@ -1072,18 +1072,55 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
status := h.opts.App.LiveBootSource()
status := h.currentRAMStatus()
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(status)
}
type ramStatusResponse struct {
platform.LiveMediaRAMState
InstallTaskActive bool `json:"install_task_active,omitempty"`
CopyTaskActive bool `json:"copy_task_active,omitempty"`
CanStartTask bool `json:"can_start_task,omitempty"`
BlockedReason string `json:"blocked_reason,omitempty"`
}
func (h *handler) currentRAMStatus() ramStatusResponse {
state := h.opts.App.LiveMediaRAMState()
resp := ramStatusResponse{LiveMediaRAMState: state}
if globalQueue.hasActiveTarget("install") {
resp.InstallTaskActive = true
resp.BlockedReason = "install to disk is already running"
return resp
}
if globalQueue.hasActiveTarget("install-to-ram") {
resp.CopyTaskActive = true
resp.BlockedReason = "install to RAM task is already pending or running"
return resp
}
if state.InRAM {
resp.BlockedReason = "system is already running from RAM"
return resp
}
resp.CanStartTask = state.CanStartCopy
if !resp.CanStartTask && resp.BlockedReason == "" {
resp.BlockedReason = state.Message
}
return resp
}
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
if h.opts.App == nil {
writeError(w, http.StatusServiceUnavailable, "app not configured")
return
}
if globalQueue.hasActiveTarget("install") {
writeError(w, http.StatusConflict, "install to disk is already running")
status := h.currentRAMStatus()
if !status.CanStartTask {
msg := strings.TrimSpace(status.BlockedReason)
if msg == "" {
msg = "install to RAM is not available"
}
writeError(w, http.StatusConflict, msg)
return
}
t := &Task{

View File

@@ -845,6 +845,13 @@ func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
Source: "live-boot / /proc/mounts",
Issue: "",
}
case "partial":
return runtimeHealthRow{
Title: "LiveCD in RAM",
Status: "WARNING",
Source: "live-boot / /proc/mounts / /dev/shm/bee-live",
Issue: "Partial or staged RAM copy detected. System is not fully running from RAM; Copy to RAM can be retried.",
}
case "failed":
return runtimeHealthRow{
Title: "LiveCD in RAM",
@@ -2280,7 +2287,6 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
return maxGPUIndex, runs
}
// ── Burn ──────────────────────────────────────────────────────────────────────
func renderBurn() string {
@@ -3245,12 +3251,19 @@ fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source;
boot.textContent = 'Current boot source: ' + label + '.';
if (d.in_ram) {
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
txt.textContent = d.message || 'Checking...';
if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else {
txt.textContent = 'Live media is mounted from installation device. Copy to RAM to allow media removal.';
txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
}
});
function installToRAM() {

View File

@@ -11,6 +11,7 @@ import (
"time"
"bee/audit/internal/platform"
"bee/audit/internal/schema"
)
func TestChartLegendNumber(t *testing.T) {
@@ -78,6 +79,16 @@ func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
}
}
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
if row.Status != "WARNING" {
t.Fatalf("status=%q want WARNING", row.Status)
}
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
t.Fatalf("issue=%q", row.Issue)
}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{
@@ -1113,8 +1124,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
`>Storage<`,
`>GPU<`,
`>PSU<`,
`badge-warn`, // cpu Warning badge
`badge-err`, // storage Critical badge
`badge-warn`, // cpu Warning badge
`badge-err`, // storage Critical badge
} {
if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body)