fix(iso): increase boot verbosity for service startup visibility

Raise loglevel from 3 to 6 (INFO) and add systemd.show_status=1 so kernel driver messages and systemd [ OK ]/[ FAILED ] lines are visible during boot instead of showing only a blank cursor.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
feat(watchdog): hardware error monitor + unified component status store
2026-04-02 19:33:27 +03:00 · 2026-04-02 19:20:59 +03:00 · 2026-04-02 15:42:28 +03:00 · 2026-04-02 15:36:32 +03:00 · 2026-04-02 15:30:23 +03:00 · 2026-04-02 13:44:58 +03:00
18 changed files with 1283 additions and 53 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -40,6 +40,8 @@ type App struct {
 	sat       satRunner
 	runtime   runtimeChecker
 	installer installer
+	// StatusDB is the unified component health store (nil if unavailable).
+	StatusDB *ComponentStatusDB
 }

 type ActionResult struct {
@@ -80,6 +82,7 @@ type installer interface {
 	ListInstallDisks() ([]platform.InstallDisk, error)
 	InstallToDisk(ctx context.Context, device string, logFile string) error
 	IsLiveMediaInRAM() bool
+	LiveBootSource() platform.LiveBootSource
 	RunInstallToRAM(ctx context.Context, logFunc func(string)) error
 }

@@ -100,6 +103,10 @@ func (a *App) IsLiveMediaInRAM() bool {
 	return a.installer.IsLiveMediaInRAM()
 }

+func (a *App) LiveBootSource() platform.LiveBootSource {
+	return a.installer.LiveBootSource()
+}
+
 func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
 	return a.installer.RunInstallToRAM(ctx, logFunc)
 }
@@ -131,7 +138,7 @@ type runtimeChecker interface {
 }

 func New(platform *platform.System) *App {
-	return &App{
+	a := &App{
 		network:   platform,
 		services:  platform,
 		exports:   platform,
@@ -140,6 +147,10 @@ func New(platform *platform.System) *App {
 		runtime:   platform,
 		installer: platform,
 	}
+	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
+		a.StatusDB = db
+	}
+	return a
 }

 // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
@@ -149,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
 	if err != nil {
 		return nil, err
 	}
-	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
 	return json.MarshalIndent(snap, "", "  ")
 }

@@ -169,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,266 @@
+package app
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+)
+
+// ComponentStatusDB is a persistent store of hardware component health records; each record's history only grows.
+// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
+// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
+// the component stays at the highest observed severity (this file provides no reset operation).
+type ComponentStatusDB struct {
+	path    string
+	mu      sync.Mutex
+	records map[string]*ComponentStatusRecord
+}
+
+// ComponentStatusRecord is the persisted state of one component: its merged Status, the times of the last check and last status change, the detail that last raised severity, and every observation in History.
+type ComponentStatusRecord struct {
+	ComponentKey  string                 `json:"component_key"`
+	Status        string                 `json:"status"` // "OK", "Warning", "Critical", "Unknown"
+	LastCheckedAt time.Time              `json:"last_checked_at"`
+	LastChangedAt time.Time              `json:"last_changed_at"`
+	ErrorSummary  string                 `json:"error_summary,omitempty"`
+	History       []ComponentStatusEntry `json:"history"`
+}
+
+// ComponentStatusEntry is one observation written to a component's history.
+type ComponentStatusEntry struct {
+	At     time.Time `json:"at"`
+	Status string    `json:"status"`
+	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
+	Detail string    `json:"detail,omitempty"`
+}
+
+// OpenComponentStatusDB loads the JSON status DB at path, creating the parent directory if needed; a missing file yields an empty DB and nothing is written until Record is called.
+func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
+	db := &ComponentStatusDB{
+		path:    path,
+		records: make(map[string]*ComponentStatusRecord),
+	}
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return nil, err
+	}
+	data, err := os.ReadFile(path)
+	if err != nil && !os.IsNotExist(err) { // only a missing file is tolerated; other read errors fail the open
+		return nil, err
+	}
+	if len(data) > 0 {
+		var records []ComponentStatusRecord
+		if err := json.Unmarshal(data, &records); err == nil { // an undecodable file is ignored: the DB starts empty and the next save overwrites it
+			for i := range records {
+				db.records[records[i].ComponentKey] = &records[i] // a later record with the same key replaces an earlier one
+			}
+		}
+	}
+	return db, nil
+}
+
+// Record writes one observation for the given component key and saves the DB.
+// source is a short label like "sat:nvidia" or "watchdog:kmsg".
+// status is "OK", "Warning", "Critical", or "Unknown".
+// OK never downgrades an existing Warning or Critical status.
+func (db *ComponentStatusDB) Record(key, source, status, detail string) {
+	if db == nil || strings.TrimSpace(key) == "" { // nil receiver and blank keys are no-ops
+		return
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+
+	now := time.Now().UTC()
+	rec, exists := db.records[key]
+	if !exists {
+		rec = &ComponentStatusRecord{ComponentKey: key}
+		db.records[key] = rec
+	}
+	rec.LastCheckedAt = now
+
+	entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
+	rec.History = append(rec.History, entry) // every call is kept; history is never trimmed here
+
+	// Status merge: OK never downgrades Warning/Critical.
+	newSev := componentSeverity(status)
+	curSev := componentSeverity(rec.Status)
+	if newSev > curSev {
+		rec.Status = status
+		rec.LastChangedAt = now
+		rec.ErrorSummary = detail // summary follows the observation that raised severity
+	} else if rec.Status == "" { // record has no status yet and the new one ranks 0 (e.g. "Unknown")
+		rec.Status = status
+		rec.LastChangedAt = now
+	}
+
+	_ = db.saveLocked() // best-effort persistence: a failed write is dropped, the in-memory state stays updated
+}
+
+// Get returns the current record for a component key.
+func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
+	if db == nil {
+		return ComponentStatusRecord{}, false
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	r, ok := db.records[key]
+	if !ok {
+		return ComponentStatusRecord{}, false
+	}
+	return *r, true
+}
+
+// All returns a snapshot of all records.
+func (db *ComponentStatusDB) All() []ComponentStatusRecord {
+	if db == nil {
+		return nil
+	}
+	db.mu.Lock()
+	defer db.mu.Unlock()
+	out := make([]ComponentStatusRecord, 0, len(db.records))
+	for _, r := range db.records {
+		out = append(out, *r)
+	}
+	return out
+}
+
+func (db *ComponentStatusDB) saveLocked() error { // does not lock; Record calls it with db.mu held. Rewrites the whole file from the in-memory map
+	records := make([]ComponentStatusRecord, 0, len(db.records))
+	for _, r := range db.records { // map iteration order is unspecified, so record order in the file can differ between saves
+		records = append(records, *r)
+	}
+	data, err := json.MarshalIndent(records, "", "  ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(db.path, data, 0644) // in-place write, no temp-file-and-rename step
+}
+
+// componentSeverity returns a numeric severity so higher values win.
+func componentSeverity(status string) int {
+	switch strings.TrimSpace(status) {
+	case "Critical":
+		return 3
+	case "Warning":
+		return 2
+	case "OK":
+		return 1
+	default:
+		return 0
+	}
+}
+
+// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
+// and writes component status records to db for the given SAT target.
+// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
+func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
+	if db == nil || strings.TrimSpace(archivePath) == "" {
+		return
+	}
+	archivePath = extractArchivePath(archivePath)
+	if archivePath == "" {
+		return
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz") // run directory is the archive path minus ".tar.gz"
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return
+	}
+	kv := parseSATKV(string(data))
+	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
+	if overall == "" { // no overall_status value: nothing is recorded
+		return
+	}
+
+	source := "sat:" + target
+	dbStatus := satStatusToDBStatus(overall)
+
+	// Map SAT target to component keys.
+	switch target { // targets not listed below record nothing
+	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
+		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall) // keyed by SAT target, not by BDF
+	case "memory", "memory-stress", "sat-stress":
+		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
+	case "cpu", "platform-stress":
+		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
+	case "storage":
+		// Try to record per-device if available in summary.
+		recordedAny := false
+		for key, val := range kv { // map iteration: the order of Record calls is unspecified
+			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
+				continue
+			}
+			base := strings.TrimSuffix(key, "_status")
+			idx := strings.Index(base, "_")
+			if idx <= 0 { // needs a non-empty device name followed by "_"
+				continue
+			}
+			devName := base[:idx] // device name is the text before the first "_"
+			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
+			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
+			recordedAny = true
+		}
+		if !recordedAny { // no per-device keys found: fall back to one record for all storage
+			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
+		}
+	}
+}
+
+func satStatusToDBStatus(overall string) string { // maps a SAT status onto a DB status; matching is case-sensitive and callers in this file upper-case first
+	switch overall {
+	case "OK":
+		return "OK"
+	case "FAILED":
+		return "Warning" // a SAT failure is recorded as Warning, never Critical
+	case "PARTIAL", "UNSUPPORTED": // same result as the default branch
+		return "Unknown"
+	default:
+		return "Unknown"
+	}
+}
+
+// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
+// "Archive written to /path/foo.tar.gz" or already a bare path.
+func ExtractArchivePath(s string) string {
+	return extractArchivePath(s)
+}
+
+// ReadSATOverallStatus reads the overall_status value (upper-cased) from the summary.txt
+// file in the run directory derived by stripping ".tar.gz" from archivePath. archivePath is used
+// as given (no "Archive written to" prefix handling). Returns "" if the file cannot be read.
+func ReadSATOverallStatus(archivePath string) string {
+	if strings.TrimSpace(archivePath) == "" {
+		return ""
+	}
+	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
+	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
+	if err != nil {
+		return ""
+	}
+	kv := parseSATKV(string(data))
+	return strings.ToUpper(strings.TrimSpace(kv["overall_status"])) // also "" when the key is absent
+}
+
+func extractArchivePath(s string) string {
+	s = strings.TrimSpace(s)
+	if strings.HasSuffix(s, ".tar.gz") {
+		parts := strings.Fields(s)
+		if len(parts) > 0 {
+			return parts[len(parts)-1]
+		}
+	}
+	return s
+}
+
+func parseSATKV(raw string) map[string]string {
+	kv := make(map[string]string)
+	for _, line := range strings.Split(raw, "\n") {
+		k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
+		if ok {
+			kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
+		}
+	}
+	return kv
+}
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -9,7 +9,7 @@ import (
 	"bee/audit/internal/schema"
 )

-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
+	// Apply unified component status DB — overlaid last so it can only upgrade severity.
+	applyComponentStatusDB(snap, db)
 }

 type satSummary struct {
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }

+func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) { // overlays every DB record onto the matching snapshot components via mergeComponentStatus
+	if snap == nil || db == nil {
+		return
+	}
+	for _, rec := range db.All() {
+		key := rec.ComponentKey
+		status := dbStatusToSATStatus(rec.Status)
+		if status == "" { // records with an unrecognised status are skipped
+			continue
+		}
+		detail := rec.ErrorSummary
+		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
+
+		switch {
+		case strings.HasPrefix(key, "pcie:"):
+			bdf := strings.TrimPrefix(key, "pcie:")
+			bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
+			// skip when nothing usable remains; a target-style key such as "pcie:gpu:nvidia" leaves "nvidia", which is compared below but will not equal a real BDF
+			if sanitizeBDFForLookup(bdf) == "" {
+				break // exits the switch only; the record loop continues
+			}
+			normalized := sanitizeBDFForLookup(bdf)
+			for i := range snap.PCIeDevices {
+				if snap.PCIeDevices[i].BDF == nil {
+					continue
+				}
+				if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
+					mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
+				}
+			}
+		case strings.HasPrefix(key, "storage:"):
+			devName := strings.TrimPrefix(key, "storage:")
+			if devName == "all" {
+				for i := range snap.Storage {
+					mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+				}
+			} else {
+				for i := range snap.Storage {
+					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string) // missing or non-string telemetry yields ""
+					if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
+						mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
+					}
+				}
+			}
+		case strings.HasPrefix(key, "memory:"): // any memory:* key applies to every entry in snap.Memory
+			for i := range snap.Memory {
+				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
+			}
+		case strings.HasPrefix(key, "cpu:"): // any cpu:* key applies to every entry in snap.CPUs
+			for i := range snap.CPUs {
+				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
+			}
+		}
+	}
+}
+
+// dbStatusToSATStatus validates a ComponentStatusDB status for mergeComponentStatus and returns it
+// with surrounding whitespace removed; anything other than "OK", "Warning", "Critical" or "Unknown" yields "".
+func dbStatusToSATStatus(s string) string {
+	switch s = strings.TrimSpace(s); s { // return the same trimmed value that was validated
+	case "OK", "Warning", "Critical", "Unknown":
+		return s
+	default:
+		return ""
+	}
+}
+
+// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
+// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
+func sanitizeBDFForLookup(bdf string) string {
+	bdf = strings.ToLower(strings.TrimSpace(bdf))
+	if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
+		return ""
+	}
+	if strings.Count(bdf, ":") == 1 {
+		bdf = "0000:" + bdf
+	}
+	return bdf
+}
+
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}

-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)

 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -27,13 +27,35 @@ var supportBundleCommands = []struct {
 	cmd  []string
 }{
 	{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
+	{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
 	{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
+	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
-	{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
+	{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
+	{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
+	{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
+for d in /sys/bus/pci/devices/*/; do
+  vendor=$(cat "$d/vendor" 2>/dev/null)
+  [ "$vendor" = "0x10de" ] || continue
+  dev=$(basename "$d")
+  echo "=== $dev ==="
+  for f in current_link_speed current_link_width max_link_speed max_link_width; do
+    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
+  done
+done
+`}},
+}
+
+var supportBundleOptionalFiles = []struct {
+	name string
+	src  string
+}{
+	{name: "system/kern.log", src: "/var/log/kern.log"},
+	{name: "system/syslog.txt", src: "/var/log/syslog"},
 }

 const supportBundleGlob = "bee-support-*.tar.gz"
@@ -77,6 +99,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
 			return "", err
 		}
 	}
+	for _, item := range supportBundleOptionalFiles {
+		_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
+	}
 	if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
 		return "", err
 	}
@@ -184,6 +209,24 @@ func writeCommandOutput(dst string, cmd []string) error {
 	return os.WriteFile(dst, raw, 0644)
 }

+func copyOptionalFile(src, dst string) error { // copies src to dst, creating dst's parent directory; BuildSupportBundle discards the returned error
+	in, err := os.Open(src)
+	if err != nil { // source is opened first, so a missing file leaves nothing behind at dst
+		return err
+	}
+	defer in.Close()
+	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
+		return err
+	}
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer out.Close() // the Close error is not checked
+	_, err = io.Copy(out, in)
+	return err
+}
+
 func writeManifest(dst, exportDir, stageRoot string) error {
 	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
 		return err
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
+package platform
+
+import "regexp"
+
+// ErrorPattern describes a kernel log pattern that indicates a hardware error.
+// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
+type ErrorPattern struct {
+	// Name is a short machine-readable label for logging and deduplication.
+	Name string
+	// Re is the compiled regular expression matched against a single kmsg line.
+	Re *regexp.Regexp
+	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
+	Category string
+	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
+	Severity string
+	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
+	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
+	BDFGroup int
+	// DevGroup is the capture group index (1-based) that contains a device name
+	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
+	DevGroup int
+}
+
+// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
+// To add a new pattern: append a new ErrorPattern struct to this slice.
+var HardwareErrorPatterns = []ErrorPattern{
+	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
+	{
+		Name:     "nvidia-rminitadapter",
+		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-msi-fail",
+		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
+		Category: "gpu",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvidia-aer",
+		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "nvidia-xid",
+		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
+		Category: "gpu",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
+	{
+		Name:     "pcie-aer",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-uncorrectable",
+		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+	{
+		Name:     "pcie-link-down",
+		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
+		Category: "pcie",
+		Severity: "warning",
+		BDFGroup: 1,
+	},
+
+	// ── Storage ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "blk-io-error",
+		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "nvme-timeout",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+	{
+		Name:     "scsi-failed",
+		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
+		Category: "storage",
+		Severity: "warning",
+	},
+	{
+		Name:     "nvme-reset",
+		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
+		Category: "storage",
+		Severity: "warning",
+		DevGroup: 1,
+	},
+
+	// ── Machine Check Exceptions ────────────────────────────────────────────────
+	{
+		Name:     "mce-hardware-error",
+		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
+		Category: "mce",
+		Severity: "warning",
+	},
+	{
+		Name:     "mce-corrected",
+		Re:       mustPat(`(?i)mce:.*[Cc]orrected`),
+		Category: "mce",
+		Severity: "warning",
+	},
+
+	// ── Memory ─────────────────────────────────────────────────────────────────
+	{
+		Name:     "edac-ue",
+		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+	{
+		Name:     "edac-ce",
+		Re:       mustPat(`(?i)EDAC.*[Cc]orrectable`),
+		Category: "memory",
+		Severity: "warning",
+	},
+}
+
+func mustPat(s string) *regexp.Regexp {
+	return regexp.MustCompile(s)
+}
--- a/audit/internal/platform/install.go
+++ b/audit/internal/platform/install.go
@@ -11,10 +11,10 @@ import (

 // InstallDisk describes a candidate disk for installation.
 type InstallDisk struct {
-	Device      string   // e.g. /dev/sda
-	Model       string
-	Size        string   // human-readable, e.g. "500G"
-	SizeBytes   int64    // raw byte count from lsblk
+	Device       string // e.g. /dev/sda
+	Model        string
+	Size         string   // human-readable, e.g. "500G"
+	SizeBytes    int64    // raw byte count from lsblk
 	MountedParts []string // partition mount points currently active
 }

@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
 	return "/dev/" + strings.TrimSpace(string(out2))
 }

+func mountSource(target string) string {
+	out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(out))
+}
+
+func mountFSType(target string) string {
+	out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(out))
+}
+
+func blockDeviceType(device string) string {
+	if strings.TrimSpace(device) == "" {
+		return ""
+	}
+	out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(out))
+}
+
+func blockDeviceTransport(device string) string {
+	if strings.TrimSpace(device) == "" {
+		return ""
+	}
+	out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(out))
+}
+
+func inferLiveBootKind(fsType, source, deviceType, transport string) string { // classifies the live medium; cases are tested top to bottom and the first match wins
+	switch {
+	case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
+		return "ram"
+	case strings.EqualFold(strings.TrimSpace(deviceType), "rom"): // checked before usb, so a USB-attached "rom" device reports cdrom
+		return "cdrom"
+	case strings.EqualFold(strings.TrimSpace(transport), "usb"):
+		return "usb"
+	case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"): // fallback when the device type was not reported as "rom"
+		return "cdrom"
+	case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
+		return "disk"
+	default:
+		return "unknown" // never returns ""
+	}
+}
+
 // MinInstallBytes returns the minimum recommended disk size for installation:
 // squashfs size × 1.5 to allow for extracted filesystem and bootloader.
 // Returns 0 if the squashfs is not available (non-live environment).
--- a/audit/internal/platform/install_to_ram.go
+++ b/audit/internal/platform/install_to_ram.go
@@ -12,11 +12,40 @@ import (
 )

 func (s *System) IsLiveMediaInRAM() bool {
-	out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
-	if err != nil {
+	fsType := mountFSType("/run/live/medium")
+	if fsType == "" {
 		return toramActive()
 	}
-	return strings.TrimSpace(string(out)) == "tmpfs"
+	return strings.EqualFold(fsType, "tmpfs")
+}
+
+func (s *System) LiveBootSource() LiveBootSource {
+	fsType := mountFSType("/run/live/medium")
+	source := mountSource("/run/live/medium")
+	device := findLiveBootDevice()
+	status := LiveBootSource{
+		InRAM:  strings.EqualFold(fsType, "tmpfs"),
+		Source: source,
+		Device: device,
+	}
+	if fsType == "" && source == "" && device == "" {
+		if toramActive() {
+			status.InRAM = true
+			status.Kind = "ram"
+			status.Source = "tmpfs"
+			return status
+		}
+		status.Kind = "unknown"
+		return status
+	}
+	status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
+	if status.Kind == "" {
+		status.Kind = "unknown"
+	}
+	if status.InRAM && strings.TrimSpace(status.Source) == "" {
+		status.Source = "tmpfs"
+	}
+	return status
 }

 func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
--- a/audit/internal/platform/install_to_ram_test.go
+++ b/audit/internal/platform/install_to_ram_test.go
@@ -0,0 +1,28 @@
+package platform
+
+import "testing"
+
+func TestInferLiveBootKind(t *testing.T) {
+	tests := []struct {
+		name       string
+		fsType     string
+		source     string
+		deviceType string
+		transport  string
+		want       string
+	}{
+		{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
+		{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
+		{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
+		{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
+		{name: "unknown", source: "overlay", want: "unknown"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
+			if got != tc.want {
+				t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
+			}
+		})
+	}
+}
--- a/audit/internal/platform/types.go
+++ b/audit/internal/platform/types.go
@@ -2,6 +2,13 @@ package platform

 type System struct{}

+type LiveBootSource struct {
+	InRAM  bool   `json:"in_ram"`
+	Kind   string `json:"kind"`
+	Source string `json:"source,omitempty"`
+	Device string `json:"device,omitempty"`
+}
+
 type InterfaceInfo struct {
 	Name  string
 	State string
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -63,6 +63,10 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
 	if !sseStart(w) {
 		return
 	}
+	streamSubscribedJob(w, r, j)
+}
+
+func streamSubscribedJob(w http.ResponseWriter, r *http.Request, j *jobState) {
 	existing, ch := j.subscribe()
 	for _, line := range existing {
 		sseWrite(w, "", line)
@@ -428,7 +432,6 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, entries)
 }

-
 func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
 	if h.opts.App == nil {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
@@ -523,9 +526,9 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
 		writeError(w, http.StatusServiceUnavailable, "app not configured")
 		return
 	}
-	inRAM := h.opts.App.IsLiveMediaInRAM()
+	status := h.opts.App.LiveBootSource()
 	w.Header().Set("Content-Type", "application/json")
-	_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
+	_ = json.NewEncoder(w).Encode(status)
 }

 func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,230 @@
+package webui
+
+import (
+	"bufio"
+	"io"
+	"log/slog"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
+	"bee/audit/internal/app"
+	"bee/audit/internal/platform"
+)
+
+// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
+// During an active SAT task window it records matching lines; on task finish
+// it writes Warning status records to the component status DB.
+type kmsgWatcher struct {
+	mu           sync.Mutex
+	activeWindow *kmsgWindow
+	statusDB     *app.ComponentStatusDB
+}
+
+type kmsgWindow struct {
+	taskID    string
+	target    string
+	startedAt time.Time
+	seen      map[kmsgEventKey]bool
+	events    []kmsgEvent
+}
+
+type kmsgEventKey struct {
+	id       string // BDF or device name
+	category string
+}
+
+type kmsgEvent struct {
+	timestamp time.Time
+	raw       string
+	ids       []string // BDF addresses or device names extracted
+	category  string
+}
+
+func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
+	return &kmsgWatcher{statusDB: statusDB}
+}
+
+// start launches the background kmsg reading goroutine.
+func (w *kmsgWatcher) start() {
+	go w.run()
+}
+
+func (w *kmsgWatcher) run() { // reads /dev/kmsg line by line until the read fails, feeding pattern matches into the active window
+	f, err := os.Open("/dev/kmsg")
+	if err != nil {
+		slog.Warn("kmsg watcher unavailable", "err", err)
+		return // the goroutine ends here; nothing in this function retries
+	}
+	defer f.Close()
+
+	// Best-effort seek to end so we only capture events from now forward.
+	_, _ = f.Seek(0, io.SeekEnd)
+
+	scanner := bufio.NewScanner(f)
+	scanner.Buffer(make([]byte, 64*1024), 64*1024) // fixed 64 KiB limit: a longer line stops the scan with an error
+	for scanner.Scan() {
+		line := scanner.Text()
+		evt, ok := parseKmsgLine(line)
+		if !ok {
+			continue
+		}
+		w.mu.Lock()
+		if w.activeWindow != nil { // matches seen while no task window is open are dropped
+			w.recordEvent(evt)
+		}
+		w.mu.Unlock()
+	}
+	if err := scanner.Err(); err != nil { // a read error ends the loop; it is not restarted here
+		slog.Warn("kmsg watcher stopped", "err", err)
+	}
+}
+
+// recordEvent appends evt to the active window, deduplicating by (id, category).
+// Must be called with w.mu held.
+func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
+	if len(evt.ids) == 0 {
+		// Events without a device ID (e.g. MCE) — deduplicate by category.
+		key := kmsgEventKey{id: "", category: evt.category}
+		if !w.activeWindow.seen[key] {
+			w.activeWindow.seen[key] = true
+			w.activeWindow.events = append(w.activeWindow.events, evt)
+		}
+		return
+	}
+	for _, id := range evt.ids {
+		key := kmsgEventKey{id: id, category: evt.category}
+		if !w.activeWindow.seen[key] {
+			w.activeWindow.seen[key] = true
+			w.activeWindow.events = append(w.activeWindow.events, evt)
+		}
+	}
+}
+
+// NotifyTaskStarted opens a new event window for the given SAT task.
+func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	w.activeWindow = &kmsgWindow{
+		taskID:    taskID,
+		target:    target,
+		startedAt: time.Now(),
+		seen:      make(map[kmsgEventKey]bool),
+	}
+}
+
+// NotifyTaskFinished closes the window opened for taskID and flushes its events asynchronously; a window that belongs to another task stays active and is not flushed.
+func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
+	w.mu.Lock()
+	window := w.activeWindow
+	owned := window != nil && window.taskID == taskID
+	if owned {
+		w.activeWindow = nil // detach under the lock so run() can no longer append to it
+	}
+	w.mu.Unlock()
+	if !owned || len(window.events) == 0 { // flushing a window still in use would race with recordEvent
+		return
+	}
+	go w.flushWindow(window)
+}
+
+func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
+	if w.statusDB == nil {
+		return
+	}
+	source := "watchdog:kmsg"
+	// Collect unique component keys from events.
+	seen := map[string]string{} // componentKey → first raw line
+	for _, evt := range window.events {
+		if len(evt.ids) == 0 {
+			// MCE or un-identified error.
+			key := "cpu:all"
+			if evt.category == "memory" {
+				key = "memory:all"
+			}
+			if _, exists := seen[key]; !exists {
+				seen[key] = evt.raw
+			}
+			continue
+		}
+		for _, id := range evt.ids {
+			var key string
+			switch evt.category {
+			case "gpu", "pcie":
+				key = "pcie:" + normalizeBDF(id)
+			case "storage":
+				key = "storage:" + id
+			default:
+				key = "pcie:" + normalizeBDF(id)
+			}
+			if _, exists := seen[key]; !exists {
+				seen[key] = evt.raw
+			}
+		}
+	}
+	for key, detail := range seen {
+		detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
+		w.statusDB.Record(key, source, "Warning", detail)
+	}
+}
+
+// parseKmsgLine parses a single /dev/kmsg line and returns an event for the first
+// pattern in platform.HardwareErrorPatterns that matches the message text.
+// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
+func parseKmsgLine(raw string) (kmsgEvent, bool) {
+	msg := raw
+	if idx := strings.Index(raw, ";"); idx >= 0 { // message is everything after the first ";"; a line without one is matched whole
+		msg = strings.TrimSpace(raw[idx+1:])
+	}
+	if msg == "" {
+		return kmsgEvent{}, false
+	}
+
+	for _, p := range platform.HardwareErrorPatterns {
+		m := p.Re.FindStringSubmatch(msg)
+		if m == nil {
+			continue
+		}
+		evt := kmsgEvent{
+			timestamp: time.Now(), // time of parsing, not the timestamp field of the kmsg record
+			raw:       msg,
+			category:  p.Category,
+		}
+		if p.BDFGroup > 0 && p.BDFGroup < len(m) { // bounds check guards against a group index the regexp does not have
+			evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
+		}
+		if p.DevGroup > 0 && p.DevGroup < len(m) {
+			evt.ids = append(evt.ids, m[p.DevGroup])
+		}
+		return evt, true // first match wins; later patterns are not tried
+	}
+	return kmsgEvent{}, false
+}
+
+// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
+func normalizeBDF(bdf string) string {
+	bdf = strings.ToLower(strings.TrimSpace(bdf))
+	if strings.Count(bdf, ":") == 1 {
+		return "0000:" + bdf
+	}
+	return bdf
+}
+
+func truncate(s string, max int) string { // shortens s to at most max bytes (not runes) and appends "..." when it was cut
+	if len(s) <= max {
+		return s
+	}
+	return s[:max] + "..." // byte slice: can split a multi-byte UTF-8 sequence; result is max+3 bytes long
+}
+
+// isSATTarget returns true for task targets that run hardware acceptance tests.
+func isSATTarget(target string) bool {
+	switch target {
+	case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
+		"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
+		"platform-stress":
+		return true
+	}
+	return false
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1013,7 +1013,7 @@ func renderNetwork() string {
 // ── Services ──────────────────────────────────────────────────────────────────

 func renderServicesInline() string {
-	return `<div style="display:flex;justify-content:flex-end;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
+	return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">&#8635; Refresh</button></div>
 <div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
 <div id="svc-out" style="display:none;margin-top:8px" class="card">
  <div class="card-head">Output</div>
@@ -1054,6 +1054,9 @@ function svcAction(name, action) {
      setTimeout(loadServices, 1000);
    });
 }
+function restartGPUDrivers() {
+  svcAction('bee-nvidia', 'restart');
+}
 loadServices();
 </script>`
 }
@@ -1279,6 +1282,7 @@ func renderTools() string {
  <div class="card-body">
    <div style="margin-bottom:20px">
    <div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
+    <p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
    <p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
    <button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">&#9654; Copy to RAM</button>
    </div>
@@ -1290,8 +1294,18 @@ func renderTools() string {
 </div>
 <script>
 fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
+  const boot = document.getElementById('boot-source-text');
  const txt = document.getElementById('ram-status-text');
  const btn = document.getElementById('ram-install-btn');
+  let source = d.device || d.source || 'unknown source';
+  let kind = d.kind || 'unknown';
+  let label = source;
+  if (kind === 'ram') label = 'RAM';
+  else if (kind === 'usb') label = 'USB (' + source + ')';
+  else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
+  else if (kind === 'disk') label = 'disk (' + source + ')';
+  else label = source;
+  boot.textContent = 'Current boot source: ' + label + '.';
  if (d.in_ram) {
    txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
    txt.style.color = 'var(--ok, green)';
@@ -1559,23 +1573,37 @@ func renderTasks() string {
 <div class="card">
 <div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
 </div>
-<div id="task-log-section" style="display:none;margin-top:16px" class="card">
-  <div class="card-head">Logs — <span id="task-log-title"></span>
-    <button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">&#10005;</button>
+<div id="task-log-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.58);z-index:120;align-items:center;justify-content:center;padding:16px">
+  <div style="background:#fff;border-radius:6px;box-shadow:0 24px 60px rgba(0,0,0,.35);width:calc(100vw - 32px);max-width:1600px;height:calc(100vh - 32px);display:flex;flex-direction:column;overflow:hidden;position:relative">
+    <div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
+      <button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">&#10005;</button>
+    </div>
+    <div class="card-body" style="padding:16px;flex:1;min-height:0"><div id="task-log-terminal" class="terminal" style="height:100%;max-height:none"></div></div>
  </div>
-  <div class="card-body"><div id="task-log-terminal" class="terminal" style="max-height:500px"></div></div>
 </div>
 <script>
 var _taskLogES = null;
 var _taskRefreshTimer = null;
+var _tasksAll = [];
+var _taskPage = 1;
+var _taskPageSize = 50;
+var _taskLogID = '';

 function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
-    if (!tasks || tasks.length === 0) {
+    _tasksAll = Array.isArray(tasks) ? tasks : [];
+    if (_tasksAll.length === 0) {
+      _taskPage = 1;
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
+      syncTaskLogFromHash();
      return;
    }
-    const rows = tasks.map(t => {
+    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+    if (_taskPage > totalPages) _taskPage = totalPages;
+    if (_taskPage < 1) _taskPage = 1;
+    const start = (_taskPage - 1) * _taskPageSize;
+    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
+    const rows = pageTasks.map(t => {
      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||t.status;
@@ -1594,8 +1622,20 @@ function loadTasks() {
        '<td>'+t.priority+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
+    const showingFrom = start + 1;
+    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
+    const pager =
+      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
+        '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
+        '<div style="display:flex;align-items:center;gap:8px">' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
+          '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
+          '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
+        '</div>' +
+      '</div>';
    document.getElementById('tasks-table').innerHTML =
-      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>';
+      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
+    syncTaskLogFromHash();
  });
 }

@@ -1607,6 +1647,11 @@ function formatDurSec(sec) {
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
 }
+function setTaskPage(page) {
+  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
+  _taskPage = Math.min(totalPages, Math.max(1, page));
+  loadTasks();
+}

 function cancelTask(id) {
  fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
@@ -1633,24 +1678,59 @@ function setPriority(id, delta) {
  fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
 }
+function resetTaskLog(term, text) {
+  term.textContent = text ? text + '\n' : '';
+  if (text) term.dataset.placeholder = '1';
+  else delete term.dataset.placeholder;
+}
+function prependTaskLogLine(term, line) {
+  if (term.dataset.placeholder === '1') {
+    term.textContent = '';
+    delete term.dataset.placeholder;
+  }
+  term.prepend(document.createTextNode(line + '\n'));
+  term.scrollTop = 0;
+}
 function viewLog(id, name) {
  if (_taskLogES) { _taskLogES.close(); _taskLogES = null; }
-  document.getElementById('task-log-section').style.display = '';
+  _taskLogID = id;
+  window.location.hash = id;
+  document.getElementById('task-log-overlay').style.display = 'flex';
  document.getElementById('task-log-title').textContent = name;
  const term = document.getElementById('task-log-terminal');
-  term.textContent = 'Connecting...\n';
+  resetTaskLog(term, 'Connecting...');
  _taskLogES = new EventSource('/api/tasks/'+id+'/stream');
-  _taskLogES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
+  _taskLogES.onopen = () => {
+    if (term.dataset.placeholder === '1') resetTaskLog(term, 'Connected. Waiting for output...');
+  };
+  _taskLogES.onmessage = e => { prependTaskLogLine(term, e.data); };
  _taskLogES.addEventListener('done', e => {
    _taskLogES.close(); _taskLogES=null;
-    term.textContent += (e.data ? '\nERROR: '+e.data : '\nDone.')+'\n';
+    prependTaskLogLine(term, e.data ? 'ERROR: '+e.data : 'Done.');
  });
 }
+function syncTaskLogFromHash() {
+  const id = (window.location.hash || '').replace(/^#/, '');
+  if (!id || id === _taskLogID) return;
+  const task = _tasksAll.find(t => t.id === id);
+  if (!task) return;
+  viewLog(task.id, task.name || task.id);
+}
 function closeTaskLog() {
  if (_taskLogES) { _taskLogES.close(); _taskLogES=null; }
-  document.getElementById('task-log-section').style.display='none';
+  _taskLogID = '';
+  if (window.location.hash) history.replaceState(null, '', '/tasks');
+  document.getElementById('task-log-overlay').style.display='none';
 }

+document.getElementById('task-log-overlay').addEventListener('click', function(e) {
+  if (e.target === this) closeTaskLog();
+});
+window.addEventListener('hashchange', syncTaskLogFromHash);
+window.addEventListener('keydown', function(e) {
+  if (e.key === 'Escape' && document.getElementById('task-log-overlay').style.display !== 'none') closeTaskLog();
+});
+
 loadTasks();
 _taskRefreshTimer = setInterval(loadTasks, 2000);
 </script>`
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -164,6 +164,8 @@ type handler struct {
 	// pending network change (rollback on timeout)
 	pendingNet   *pendingNetChange
 	pendingNetMu sync.Mutex
+	// kmsg hardware error watcher
+	kmsg *kmsgWatcher
 }

 // NewHandler creates the HTTP mux with all routes.
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	}
 	h.startMetricsCollector()

+	// Start kmsg hardware error watcher if the app (and its status DB) is available.
+	if opts.App != nil {
+		h.kmsg = newKmsgWatcher(opts.App.StatusDB)
+		h.kmsg.start()
+		globalQueue.kmsgWatcher = h.kmsg
+	}
+
 	globalQueue.startWorker(&opts)
 	mux := http.NewServeMux()

--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -359,6 +359,44 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
 	}
 }

+// TestTasksPageRendersLogModalAndPaginationControls requests /tasks from a handler
+// built with empty HandlerOptions and checks, by substring match on the response
+// body, that the page carries the log overlay element, the page-size setting and
+// both pager button labels.
+func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `id="task-log-overlay"`) {
+		t.Fatalf("tasks page missing log modal overlay: %s", body)
+	}
+	if !strings.Contains(body, `_taskPageSize = 50`) {
+		t.Fatalf("tasks page missing pagination size config: %s", body)
+	}
+	// The pager is built client-side, so these labels are matched inside the
+	// page's script source rather than in rendered markup.
+	if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
+		t.Fatalf("tasks page missing pagination controls: %s", body)
+	}
+}
+
+// TestToolsPageRendersRestartGPUDriversButton requests /tools from a handler built
+// with empty HandlerOptions and checks, by substring match on the response body,
+// for the "Restart GPU Drivers" button label, the bee-nvidia restart call it
+// triggers, and the boot-source placeholder element.
+func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
+	handler := NewHandler(HandlerOptions{})
+	rec := httptest.NewRecorder()
+	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d", rec.Code)
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, `Restart GPU Drivers`) {
+		t.Fatalf("tools page missing restart gpu drivers button: %s", body)
+	}
+	if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
+		t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
+	}
+	if !strings.Contains(body, `id="boot-source-text"`) {
+		t.Fatalf("tools page missing boot source field: %s", body)
+	}
+}
+
 func TestViewerRendersLatestSnapshot(t *testing.T) {
 	dir := t.TempDir()
 	path := filepath.Join(dir, "audit.json")
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions

 // taskQueue manages a priority-ordered list of tasks and runs them one at a time.
 type taskQueue struct {
-	mu        sync.Mutex
-	tasks     []*Task
-	trigger   chan struct{}
-	opts      *HandlerOptions // set by startWorker
-	statePath string
-	logsDir   string
-	started   bool
+	mu          sync.Mutex
+	tasks       []*Task
+	trigger     chan struct{}
+	opts        *HandlerOptions // set by startWorker
+	statePath   string
+	logsDir     string
+	started     bool
+	kmsgWatcher *kmsgWatcher
 }

 var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
@@ -291,6 +292,30 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
 	return t.job, true
 }

+// taskStreamSource is a copy of the task fields the log stream handler needs,
+// so the handler can decide how to serve a stream without holding the queue lock.
+type taskStreamSource struct {
+	status  string    // copy of Task.Status
+	errMsg  string    // copy of Task.ErrMsg
+	logPath string    // copy of Task.LogPath
+	job     *jobState // copy of Task.job; nil when the task has no job attached
+}
+
+// taskStreamSource looks up the task with the given id under the queue mutex and
+// returns a copy of its status, error message, log path and job pointer.
+// The boolean result is false when no task has that id.
+func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	for _, task := range q.tasks {
+		if task.ID == id {
+			found := taskStreamSource{
+				status:  task.Status,
+				errMsg:  task.ErrMsg,
+				logPath: task.LogPath,
+				job:     task.job,
+			}
+			return found, true
+		}
+	}
+	return taskStreamSource{}, false
+}
+
 func (q *taskQueue) hasActiveTarget(target string) bool {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -305,7 +330,7 @@ func (q *taskQueue) hasActiveTarget(target string) bool {
 	return false
 }

-// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
+// snapshot returns a copy of all tasks sorted for display with newest tasks first.
 func (q *taskQueue) snapshot() []Task {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -315,6 +340,9 @@ func (q *taskQueue) snapshot() []Task {
 		out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
 	}
 	sort.SliceStable(out, func(i, j int) bool {
+		if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
+			return out[i].CreatedAt.After(out[j].CreatedAt)
+		}
 		si := statusOrder(out[i].Status)
 		sj := statusOrder(out[j].Status)
 		if si != sj {
@@ -323,7 +351,7 @@ func (q *taskQueue) snapshot() []Task {
 		if out[i].Priority != out[j].Priority {
 			return out[i].Priority > out[j].Priority
 		}
-		return out[i].CreatedAt.Before(out[j].CreatedAt)
+		return out[i].Name < out[j].Name
 	})
 	return out
 }
@@ -384,8 +412,16 @@ func (q *taskQueue) worker() {
 			q.persistLocked()
 			q.mu.Unlock()

+			if q.kmsgWatcher != nil && isSATTarget(t.Target) {
+				q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
+			}
+
 			q.runTask(t, j, ctx)

+			if q.kmsgWatcher != nil {
+				q.kmsgWatcher.NotifyTaskFinished(t.ID)
+			}
+
 			q.mu.Lock()
 			now2 := time.Now()
 			t.DoneAt = &now2
@@ -591,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		return
 	}

+	// If the SAT archive was produced, check overall_status and write to component DB.
+	if archive != "" {
+		archivePath := app.ExtractArchivePath(archive)
+		if err == nil {
+			if app.ReadSATOverallStatus(archivePath) == "FAILED" {
+				err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
+			}
+		}
+		if db := q.statusDB(); db != nil {
+			app.ApplySATResultToDB(db, t.Target, archivePath)
+		}
+	}
+
 	if err != nil {
 		if ctx.Err() != nil {
 			j.append("Aborted.")
@@ -607,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	j.finish("")
 }

+// statusDB returns the component status store of the app attached to the queue's
+// handler options, or nil when the options or the app are not set.
+func (q *taskQueue) statusDB() *app.ComponentStatusDB {
+	if q.opts != nil && q.opts.App != nil {
+		return q.opts.App.StatusDB
+	}
+	return nil
+}
+
 func splitLines(s string) []string {
 	var out []string
 	for _, l := range splitNL(s) {
@@ -750,21 +806,49 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque

 func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
 	id := r.PathValue("id")
-	// Wait up to 5s for the task to get a job (it may be pending)
-	deadline := time.Now().Add(5 * time.Second)
-	var j *jobState
-	for time.Now().Before(deadline) {
-		if jj, ok := globalQueue.findJob(id); ok {
-			j = jj
-			break
-		}
-		time.Sleep(200 * time.Millisecond)
-	}
-	if j == nil {
-		http.Error(w, "task not found or not yet started", http.StatusNotFound)
+	src, ok := globalQueue.taskStreamSource(id)
+	if !ok {
+		http.Error(w, "task not found", http.StatusNotFound)
 		return
 	}
-	streamJob(w, r, j)
+	if src.job != nil {
+		streamJob(w, r, src.job)
+		return
+	}
+	if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
+		j := newTaskJobState(src.logPath)
+		j.finish(src.errMsg)
+		streamJob(w, r, j)
+		return
+	}
+	if !sseStart(w) {
+		return
+	}
+	sseWrite(w, "", "Task is queued. Waiting for worker...")
+	ticker := time.NewTicker(200 * time.Millisecond)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ticker.C:
+			src, ok = globalQueue.taskStreamSource(id)
+			if !ok {
+				sseWrite(w, "done", "task not found")
+				return
+			}
+			if src.job != nil {
+				streamSubscribedJob(w, r, src.job)
+				return
+			}
+			if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
+				j := newTaskJobState(src.logPath)
+				j.finish(src.errMsg)
+				streamSubscribedJob(w, r, j)
+				return
+			}
+		case <-r.Context().Done():
+			return
+		}
+	}
 }

 func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
--- a/audit/internal/webui/tasks_test.go
+++ b/audit/internal/webui/tasks_test.go
@@ -2,6 +2,8 @@ package webui

 import (
 	"context"
+	"net/http"
+	"net/http/httptest"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -122,6 +124,130 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
 	}
 }

+// TestTaskQueueSnapshotSortsNewestFirst builds a queue of three tasks with distinct
+// creation times, statuses and priorities and checks that snapshot() orders them
+// by CreatedAt descending, i.e. that status and priority do not override recency.
+func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
+	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
+	q := &taskQueue{
+		tasks: []*Task{
+			{
+				// Oldest task, but running and highest priority.
+				ID:        "old-running",
+				Name:      "Old Running",
+				Status:    TaskRunning,
+				Priority:  10,
+				CreatedAt: now.Add(-3 * time.Minute),
+			},
+			{
+				// Newest task, already done and lowest priority.
+				ID:        "new-done",
+				Name:      "New Done",
+				Status:    TaskDone,
+				Priority:  0,
+				CreatedAt: now.Add(-1 * time.Minute),
+			},
+			{
+				ID:        "mid-pending",
+				Name:      "Mid Pending",
+				Status:    TaskPending,
+				Priority:  1,
+				CreatedAt: now.Add(-2 * time.Minute),
+			},
+		},
+	}
+
+	got := q.snapshot()
+	if len(got) != 3 {
+		t.Fatalf("snapshot len=%d want 3", len(got))
+	}
+	if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
+		t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
+	}
+}
+
+// TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob checks that streaming a
+// finished task that has a log file but no job attached replays the file's lines
+// as SSE data frames and ends with a "done" event.
+func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
+	dir := t.TempDir()
+	logPath := filepath.Join(dir, "task.log")
+	if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Swap the global queue's task list for a single done task; the original list
+	// is restored in Cleanup so other tests see unchanged state.
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "done-1",
+		Name:      "Done Task",
+		Status:    TaskDone,
+		CreatedAt: time.Now(),
+		LogPath:   logPath,
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	// The handler is called directly (no mux), so the path value is set by hand.
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
+	req.SetPathValue("id", "done-1")
+	rec := httptest.NewRecorder()
+
+	h := &handler{}
+	h.handleAPITasksStream(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+	}
+	body := rec.Body.String()
+	if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
+		t.Fatalf("body=%q", body)
+	}
+	if !strings.Contains(body, "event: done\n") {
+		t.Fatalf("missing done event: %q", body)
+	}
+}
+
+// TestHandleAPITasksStreamPendingTaskStartsSSEImmediately checks that streaming a
+// pending task emits the "Task is queued" message within two seconds rather than
+// blocking until a job exists, and that the handler returns once the request
+// context is cancelled.
+//
+// NOTE(review): the polling loop reads rec.Body while the handler goroutine may
+// still be writing to the same recorder; httptest.ResponseRecorder does not
+// appear to synchronize this, so the test may be flagged under -race — confirm.
+func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
+	// Swap the global queue's task list for a single pending task; the original
+	// list is restored in Cleanup.
+	globalQueue.mu.Lock()
+	origTasks := globalQueue.tasks
+	globalQueue.tasks = []*Task{{
+		ID:        "pending-1",
+		Name:      "Pending Task",
+		Status:    TaskPending,
+		CreatedAt: time.Now(),
+	}}
+	globalQueue.mu.Unlock()
+	t.Cleanup(func() {
+		globalQueue.mu.Lock()
+		globalQueue.tasks = origTasks
+		globalQueue.mu.Unlock()
+	})
+
+	// A cancellable request context is the only way to end the handler's wait
+	// loop, because the task never leaves the pending state in this test.
+	ctx, cancel := context.WithCancel(context.Background())
+	req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
+	req.SetPathValue("id", "pending-1")
+	rec := httptest.NewRecorder()
+
+	done := make(chan struct{})
+	go func() {
+		h := &handler{}
+		h.handleAPITasksStream(rec, req)
+		close(done)
+	}()
+
+	// Poll for the queued message, then cancel and wait for the handler to exit
+	// before inspecting the status code.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
+			cancel()
+			<-done
+			if rec.Code != http.StatusOK {
+				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
+			}
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	cancel()
+	<-done
+	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
+}
+
 func TestResolveBurnPreset(t *testing.T) {
 	tests := []struct {
 		profile string
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
Author	SHA1	Message	Date
Mikhail Chusavitin	444a7d16cc	fix(iso): increase boot verbosity for service startup visibility Raise loglevel from 3 to 6 (INFO) and add systemd.show_status=1 so kernel driver messages and systemd [ OK ]/[ FAILED ] lines are visible during boot instead of showing only a blank cursor. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:33:27 +03:00
Mikhail Chusavitin	fd722692a4	feat(watchdog): hardware error monitor + unified component status store - Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:20:59 +03:00
Mikhail Chusavitin	99cece524c	feat(support-bundle): add PCIe link diagnostics and system logs - Add full dmesg (was tail -200), kern.log, syslog - Add /proc/cmdline, lspci -vvv, nvidia-smi -q - Add per-GPU PCIe link speed/width from sysfs (NVIDIA devices only) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 15:42:28 +03:00
Mikhail Chusavitin	c27449c60e	feat(webui): show current boot source	2026-04-02 15:36:32 +03:00
Mikhail Chusavitin	5ef879e307	feat(webui): add gpu driver restart action	2026-04-02 15:30:23 +03:00
Mikhail Chusavitin	e7df63bae1	fix(app): include extra system logs in support bundle	2026-04-02 13:44:58 +03:00
Mikhail Chusavitin	17ff3811f8	fix(webui): improve tasks logs and ordering	2026-04-02 13:43:59 +03:00