Harden NIC probing for empty SFP ports

Unblock bee-web startup and expand support bundle diagnostics
Use plain repo tags for build version
2026-04-04 15:23:15 +03:00 · 2026-04-04 15:18:43 +03:00 · 2026-04-03 10:48:51 +03:00 · 2026-04-03 10:36:11 +03:00 · 2026-04-03 10:08:00 +03:00 · 2026-04-03 09:50:33 +03:00
27 changed files with 1280 additions and 161 deletions
--- a/audit/Makefile
+++ b/audit/Makefile
@@ -1,5 +1,7 @@
 LISTEN ?= :8080
 AUDIT_PATH ?=
 VERSION ?= $(shell sh ./scripts/resolve-version.sh)
 GO_LDFLAGS := -X main.Version=$(VERSION)
 RUN_ARGS := web --listen $(LISTEN)
 ifneq ($(AUDIT_PATH),)
@@ -9,10 +11,10 @@ endif
 .PHONY: run build test
 run:
-	go run ./cmd/bee $(RUN_ARGS)
+	go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
 build:
-	go build -o bee ./cmd/bee
+	go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
 test:
 	go test ./...
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -7,7 +7,6 @@ import (
 	"io"
 	"log/slog"
 	"os"
 	"runtime/debug"
 	"strings"
 	"bee/audit/internal/app"
@@ -21,30 +20,7 @@ var Version = "dev"
 func buildLabel() string {
 	label := strings.TrimSpace(Version)
 	if label == "" {
-		label = "dev"
+		return "dev"
 	}
 	if info, ok := debug.ReadBuildInfo(); ok {
 		var revision string
 		var modified bool
 		for _, setting := range info.Settings {
 			switch setting.Key {
 			case "vcs.revision":
 				revision = setting.Value
 			case "vcs.modified":
 				modified = setting.Value == "true"
 			}
 		}
 		if revision != "" {
 			short := revision
 			if len(short) > 12 {
 				short = short[:12]
 			}
 			label += " (" + short
 			if modified {
 				label += "+"
 			}
 			label += ")"
 		}
 	}
 	return label
 }
--- a/audit/cmd/bee/main_test.go
+++ b/audit/cmd/bee/main_test.go
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
 }
 func TestRunVersion(t *testing.T) {
 	t.Parallel()
 	old := Version
 	Version = "test-version"
 	t.Cleanup(func() { Version = old })
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
 	}
 }
 // TestBuildLabelUsesVersionAsIs checks that buildLabel returns the package-level
 // Version string unchanged. The test overrides Version and restores it on cleanup,
 // so it does not call t.Parallel.
 func TestBuildLabelUsesVersionAsIs(t *testing.T) {
 	const want = "1.2.3"
 	saved := Version
 	t.Cleanup(func() { Version = saved })
 	Version = want
 	got := buildLabel()
 	if got != want {
 		t.Fatalf("buildLabel=%q want %q", got, want)
 	}
 }
 func TestRunExportRequiresTarget(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -40,6 +40,8 @@ type App struct {
 	sat       satRunner
 	runtime   runtimeChecker
 	installer installer
 	// StatusDB is the unified component health store (nil if unavailable).
 	StatusDB *ComponentStatusDB
 }
 type ActionResult struct {
@@ -136,7 +138,7 @@ type runtimeChecker interface {
 }
 func New(platform *platform.System) *App {
-	return &App{
+	a := &App{
 		network:   platform,
 		services:  platform,
 		exports:   platform,
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
 		runtime:   platform,
 		installer: platform,
 	}
 	if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
 		a.StatusDB = db
 	}
 	return a
 }
 // ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
 	if err != nil {
 		return nil, err
 	}
-	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
 	return json.MarshalIndent(snap, "", "  ")
 }
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
-	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -754,6 +754,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
 		}
 	}
 	for _, want := range []string{
 		"/system/ip-link.txt",
 		"/system/ip-link-stats.txt",
 		"/system/ethtool-info.txt",
 		"/system/ethtool-link.txt",
 		"/system/ethtool-module.txt",
 		"/system/mstflint-query.txt",
 	} {
 		var found bool
 		for _, name := range names {
 			if contains(name, want) {
 				found = true
 				break
 			}
 		}
 		if !found {
 			t.Fatalf("support bundle missing %s, names=%v", want, names)
 		}
 	}
 	var foundRaw bool
 	for _, name := range names {
 		if contains(name, "/export/bee-sat/memory-run/verbose.log") {
--- a/audit/internal/app/component_status_db.go
+++ b/audit/internal/app/component_status_db.go
@@ -0,0 +1,266 @@
 package app
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"
 	"time"
 )
 // ComponentStatusDB is a persistent, append-only store of hardware component health records.
 // Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
 // Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
 // the component stays at the highest observed severity until explicitly reset.
 // NOTE(review): no reset operation is visible in this part of the file — confirm one exists.
 // All methods tolerate a nil receiver and treat it as "no database available".
 type ComponentStatusDB struct {
 	// path is the JSON file the records are loaded from and saved to.
 	path    string
 	// mu guards records.
 	mu      sync.Mutex
 	// records maps a component key to its current record.
 	records map[string]*ComponentStatusRecord
 }
 // ComponentStatusRecord holds the current and historical health of one hardware component.
 // It is the element type of the JSON array persisted by ComponentStatusDB.
 type ComponentStatusRecord struct {
 	// ComponentKey is the identity string the record is stored under.
 	ComponentKey  string                  `json:"component_key"`
 	Status        string                  `json:"status"` // "OK", "Warning", "Critical", "Unknown"
 	// LastCheckedAt is updated on every recorded observation.
 	LastCheckedAt time.Time               `json:"last_checked_at"`
 	// LastChangedAt is updated only when Status is set or raised.
 	LastChangedAt time.Time               `json:"last_changed_at"`
 	// ErrorSummary is the detail text of the observation that last raised Status.
 	ErrorSummary  string                  `json:"error_summary,omitempty"`
 	// History lists every observation in the order it was recorded.
 	History       []ComponentStatusEntry  `json:"history"`
 }
 // ComponentStatusEntry is one observation written to a component's history.
 type ComponentStatusEntry struct {
 	// At is the UTC time at which the observation was recorded.
 	At     time.Time `json:"at"`
 	// Status is the status string exactly as passed to Record.
 	Status string    `json:"status"`
 	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
 	// Detail is optional free-form text describing the observation.
 	Detail string    `json:"detail,omitempty"`
 }
 // OpenComponentStatusDB opens (or creates) the JSON status DB at path.
 //
 // The parent directory is created if needed. A missing or empty file yields an
 // empty database. A file that cannot be parsed also yields an empty database,
 // but it is first moved aside to path+".corrupt" (best effort) so the next save
 // does not silently destroy the only copy of the previous history.
 //
 // An error is returned only when the directory cannot be created or the file
 // exists but cannot be read.
 func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
 	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
 		return nil, err
 	}
 	db := &ComponentStatusDB{
 		path:    path,
 		records: make(map[string]*ComponentStatusRecord),
 	}
 	data, err := os.ReadFile(path)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return db, nil
 		}
 		return nil, err
 	}
 	if len(data) == 0 {
 		return db, nil
 	}
 	var records []ComponentStatusRecord
 	if err := json.Unmarshal(data, &records); err != nil {
 		// Start empty rather than failing startup, but keep the unreadable
 		// file for diagnostics. The rename is best effort: if it fails, the
 		// behaviour is the same as before (file overwritten on next save).
 		_ = os.Rename(path, path+".corrupt")
 		return db, nil
 	}
 	for i := range records {
 		db.records[records[i].ComponentKey] = &records[i]
 	}
 	return db, nil
 }
 // Record appends one observation for the component identified by key and
 // persists the database.
 //
 // source is a short label like "sat:nvidia" or "watchdog:kmsg".
 // status is "OK", "Warning", "Critical", or "Unknown".
 // The stored status only ever moves to a higher severity: OK never replaces an
 // existing Warning or Critical. A nil receiver or a blank key is a no-op.
 func (db *ComponentStatusDB) Record(key, source, status, detail string) {
 	if db == nil {
 		return
 	}
 	if strings.TrimSpace(key) == "" {
 		return
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	observedAt := time.Now().UTC()
 	rec, known := db.records[key]
 	if !known {
 		rec = &ComponentStatusRecord{ComponentKey: key}
 		db.records[key] = rec
 	}
 	rec.History = append(rec.History, ComponentStatusEntry{
 		At:     observedAt,
 		Status: status,
 		Source: source,
 		Detail: detail,
 	})
 	rec.LastCheckedAt = observedAt
 	switch {
 	case componentSeverity(status) > componentSeverity(rec.Status):
 		// Severity went up: take the new status and remember why.
 		rec.Status = status
 		rec.LastChangedAt = observedAt
 		rec.ErrorSummary = detail
 	case rec.Status == "":
 		// First observation with a zero-severity status (e.g. "Unknown").
 		rec.Status = status
 		rec.LastChangedAt = observedAt
 	}
 	// Persistence is best effort; Record has no way to report a save failure.
 	_ = db.saveLocked()
 }
 // Get looks up the record stored under key and returns a copy of it.
 // The boolean is false when the receiver is nil or the key is unknown.
 // The copy is shallow: its History slice shares storage with the stored record.
 func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
 	var none ComponentStatusRecord
 	if db == nil {
 		return none, false
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	if rec, found := db.records[key]; found {
 		return *rec, true
 	}
 	return none, false
 }
 // All returns a copy of every record currently held.
 // The order is unspecified because it follows map iteration.
 // A nil receiver yields a nil slice.
 func (db *ComponentStatusDB) All() []ComponentStatusRecord {
 	if db == nil {
 		return nil
 	}
 	db.mu.Lock()
 	defer db.mu.Unlock()
 	snapshot := make([]ComponentStatusRecord, len(db.records))
 	next := 0
 	for _, rec := range db.records {
 		snapshot[next] = *rec
 		next++
 	}
 	return snapshot
 }
 // saveLocked serialises all records as an indented JSON array and writes them
 // to db.path. The caller must hold db.mu.
 //
 // The data is written to a sibling temporary file and then renamed over the
 // destination, so a crash or power loss mid-write cannot leave a truncated
 // database behind. The temporary file is removed if either step fails.
 func (db *ComponentStatusDB) saveLocked() error {
 	records := make([]ComponentStatusRecord, 0, len(db.records))
 	for _, r := range db.records {
 		records = append(records, *r)
 	}
 	data, err := json.MarshalIndent(records, "", "  ")
 	if err != nil {
 		return err
 	}
 	tmp := db.path + ".tmp"
 	if err := os.WriteFile(tmp, data, 0644); err != nil {
 		_ = os.Remove(tmp) // best effort: do not leave a partial temp file
 		return err
 	}
 	if err := os.Rename(tmp, db.path); err != nil {
 		_ = os.Remove(tmp) // best effort cleanup; the previous DB file is untouched
 		return err
 	}
 	return nil
 }
 // componentSeverity ranks a status string so that a larger number means a more
 // severe state: "Critical" is 3, "Warning" is 2, "OK" is 1, and anything else
 // (including "Unknown" and the empty string) is 0. Surrounding whitespace is
 // ignored; the comparison is case-sensitive.
 func componentSeverity(status string) int {
 	trimmed := strings.TrimSpace(status)
 	if trimmed == "OK" {
 		return 1
 	}
 	if trimmed == "Warning" {
 		return 2
 	}
 	if trimmed == "Critical" {
 		return 3
 	}
 	return 0
 }
 // ApplySATResultToDB reads summary.txt from the run directory that sits next to
 // archivePath (the archive path with ".tar.gz" removed) and records the outcome
 // in db for the given SAT target.
 //
 // archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
 // Nothing is recorded when db is nil, the path is blank, the summary cannot be
 // read, or it carries no overall_status. Targets not listed in the switch are ignored.
 func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
 	if db == nil || strings.TrimSpace(archivePath) == "" {
 		return
 	}
 	bare := extractArchivePath(archivePath)
 	if bare == "" {
 		return
 	}
 	summaryPath := filepath.Join(strings.TrimSuffix(bare, ".tar.gz"), "summary.txt")
 	raw, err := os.ReadFile(summaryPath)
 	if err != nil {
 		return
 	}
 	fields := parseSATKV(string(raw))
 	overall := strings.ToUpper(strings.TrimSpace(fields["overall_status"]))
 	if overall == "" {
 		return
 	}
 	source := "sat:" + target
 	overallDB := satStatusToDBStatus(overall)
 	overallDetail := target + " SAT: " + overall
 	// Each SAT target maps onto one or more component keys.
 	switch target {
 	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
 		db.Record("pcie:gpu:"+target, source, overallDB, overallDetail)
 	case "memory", "memory-stress", "sat-stress":
 		db.Record("memory:all", source, overallDB, overallDetail)
 	case "cpu", "platform-stress":
 		db.Record("cpu:all", source, overallDB, overallDetail)
 	case "storage":
 		// Prefer per-device results: keys shaped "<dev>_<something>_status".
 		// The device name is everything before the first underscore.
 		perDevice := 0
 		for name, value := range fields {
 			if name == "overall_status" || !strings.HasSuffix(name, "_status") {
 				continue
 			}
 			stem := strings.TrimSuffix(name, "_status")
 			sep := strings.Index(stem, "_")
 			if sep <= 0 {
 				continue
 			}
 			deviceStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(value)))
 			db.Record("storage:"+stem[:sep], source, deviceStatus, "storage SAT: "+value)
 			perDevice++
 		}
 		if perDevice == 0 {
 			// No per-device lines: fall back to one record for all storage.
 			db.Record("storage:all", source, overallDB, overallDetail)
 		}
 	}
 }
 // satStatusToDBStatus translates a SAT summary status into a ComponentStatusDB
 // status: "OK" stays "OK", "FAILED" becomes "Warning", and every other value
 // (including "PARTIAL" and "UNSUPPORTED") becomes "Unknown". The match is exact
 // and case-sensitive.
 func satStatusToDBStatus(overall string) string {
 	if overall == "OK" {
 		return "OK"
 	}
 	if overall == "FAILED" {
 		return "Warning"
 	}
 	return "Unknown"
 }
 // ExtractArchivePath extracts a bare .tar.gz path from a string that may be
 // "Archive written to /path/foo.tar.gz" or already a bare path.
 // It is the exported form of extractArchivePath and returns whatever that
 // helper returns for s.
 func ExtractArchivePath(s string) string {
 	return extractArchivePath(s)
 }
 // ReadSATOverallStatus reads the overall_status value from the summary.txt
 // file located in the run directory alongside archivePath.
 // Returns "" if the file cannot be read.
 func ReadSATOverallStatus(archivePath string) string {
 	if strings.TrimSpace(archivePath) == "" {
 		return ""
 	}
 	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
 	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
 	if err != nil {
 		return ""
 	}
 	kv := parseSATKV(string(data))
 	return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
 }
 // extractArchivePath trims s and, when the result ends in ".tar.gz", returns
 // its last whitespace-separated field — the path in a message such as
 // "Archive written to /path/foo.tar.gz". Any other input is returned trimmed
 // but otherwise unchanged. A path that itself contains whitespace is therefore
 // cut at its last whitespace character.
 func extractArchivePath(s string) string {
 	trimmed := strings.TrimSpace(s)
 	if !strings.HasSuffix(trimmed, ".tar.gz") {
 		return trimmed
 	}
 	fields := strings.Fields(trimmed)
 	if len(fields) == 0 {
 		return trimmed
 	}
 	return fields[len(fields)-1]
 }
 // parseSATKV parses newline-separated "key=value" text into a map.
 // Each line is trimmed and split at its first "="; key and value are trimmed
 // again. Lines without "=" are skipped, and a later duplicate key overwrites an
 // earlier one.
 func parseSATKV(raw string) map[string]string {
 	pairs := map[string]string{}
 	lines := strings.Split(raw, "\n")
 	for i := range lines {
 		entry := strings.TrimSpace(lines[i])
 		eq := strings.IndexByte(entry, '=')
 		if eq < 0 {
 			continue
 		}
 		pairs[strings.TrimSpace(entry[:eq])] = strings.TrimSpace(entry[eq+1:])
 	}
 	return pairs
 }
--- a/audit/internal/app/sat_overlay.go
+++ b/audit/internal/app/sat_overlay.go
@@ -9,7 +9,7 @@ import (
 	"bee/audit/internal/schema"
 )
-func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
 	if snap == nil || strings.TrimSpace(baseDir) == "" {
 		return
 	}
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
 	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
 		applyStorageSAT(snap.Storage, summary)
 	}
 	// Apply unified component status DB — overlaid last so it can only upgrade severity.
 	applyComponentStatusDB(snap, db)
 }
 type satSummary struct {
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
 	}
 }
 // applyComponentStatusDB merges every record in db into the matching components
 // of snap via mergeComponentStatus. Records are routed by key prefix:
 //
 //   - "pcie:<bdf>" (optionally "pcie:gpu:<bdf>") goes to PCIe devices with that BDF;
 //   - "storage:all" goes to every storage device, and "storage:<dev>" to devices
 //     whose "linux_device" telemetry has that base name;
 //   - "memory:…" goes to every memory entry and "cpu:…" to every CPU.
 //
 // Records whose status is not a recognised value are skipped, as are keys with
 // any other prefix. A nil snap or db makes this a no-op.
 func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
 	if snap == nil || db == nil {
 		return
 	}
 	for _, rec := range db.All() {
 		status := dbStatusToSATStatus(rec.Status)
 		if status == "" {
 			continue
 		}
 		kind, rest, hasSep := strings.Cut(rec.ComponentKey, ":")
 		if !hasSep {
 			continue
 		}
 		summary := rec.ErrorSummary
 		changedAt := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
 		switch kind {
 		case "pcie":
 			// Drop the optional "gpu:" sub-type before normalising the address.
 			want := sanitizeBDFForLookup(strings.TrimPrefix(rest, "gpu:"))
 			if want == "" {
 				continue
 			}
 			// NOTE(review): keys such as "pcie:gpu:nvidia" leave a non-BDF value
 			// ("nvidia") here; it is compared like a BDF and appears to match no
 			// device — confirm whether vendor-level GPU records should be applied.
 			for i := range snap.PCIeDevices {
 				dev := &snap.PCIeDevices[i]
 				if dev.BDF != nil && sanitizeBDFForLookup(*dev.BDF) == want {
 					mergeComponentStatus(&dev.HardwareComponentStatus, changedAt, status, summary)
 				}
 			}
 		case "storage":
 			for i := range snap.Storage {
 				disk := &snap.Storage[i]
 				if rest != "all" {
 					linuxDev, _ := disk.Telemetry["linux_device"].(string)
 					if filepath.Base(strings.TrimSpace(linuxDev)) != rest {
 						continue
 					}
 				}
 				mergeComponentStatus(&disk.HardwareComponentStatus, changedAt, status, summary)
 			}
 		case "memory":
 			for i := range snap.Memory {
 				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, changedAt, status, summary)
 			}
 		case "cpu":
 			for i := range snap.CPUs {
 				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, changedAt, status, summary)
 			}
 		}
 	}
 }
 // dbStatusToSATStatus converts ComponentStatusDB status strings to the format
 // expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
 //
 // Surrounding whitespace is ignored and the trimmed value is returned, so the
 // result is always exactly one of the four recognised strings. Any other input
 // yields "".
 func dbStatusToSATStatus(s string) string {
 	trimmed := strings.TrimSpace(s)
 	switch trimmed {
 	case "OK", "Warning", "Critical", "Unknown":
 		return trimmed
 	default:
 		return ""
 	}
 }
 // sanitizeBDFForLookup lower-cases and trims a PCIe BDF address so two
 // spellings of the same address compare equal. A value with a single colon is
 // assumed to lack the domain and gets "0000:" prepended ("c8:00.0" becomes
 // "0000:c8:00.0"); other values are returned as they are. The empty string,
 // the literal "gpu", and anything containing a space or tab yield "".
 func sanitizeBDFForLookup(bdf string) string {
 	addr := strings.ToLower(strings.TrimSpace(bdf))
 	switch {
 	case addr == "", addr == "gpu":
 		return ""
 	case strings.ContainsAny(addr, " \t"):
 		return ""
 	case strings.Count(addr, ":") == 1:
 		return "0000:" + addr
 	}
 	return addr
 }
 func ptrString(v *string) string {
 	if v == nil {
 		return ""
--- a/audit/internal/app/sat_overlay_test.go
+++ b/audit/internal/app/sat_overlay_test.go
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
 	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
 	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
 		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
 		}},
 	}
-	applyLatestSATStatuses(&snap, baseDir)
+	applyLatestSATStatuses(&snap, baseDir, nil)
 	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
 		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
--- a/audit/internal/app/support_bundle.go
+++ b/audit/internal/app/support_bundle.go
@@ -32,6 +32,8 @@ var supportBundleCommands = []struct {
 	{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
 	{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
 	{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
 	{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
 	{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
 	{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
 	{name: "system/mount.txt", cmd: []string{"mount"}},
 	{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
@@ -47,6 +49,83 @@ for d in /sys/bus/pci/devices/*/; do
    printf "  %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
  done
 done
 `}},
 	{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -i "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
 if ! command -v ethtool >/dev/null 2>&1; then
  echo "ethtool not found"
  exit 0
 fi
 found=0
 for path in /sys/class/net/*; do
  [ -e "$path" ] || continue
  iface=$(basename "$path")
  [ "$iface" = "lo" ] && continue
  found=1
  echo "=== $iface ==="
  ethtool -m "$iface" 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no interfaces found"
 fi
 `}},
 	{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
 if ! command -v mstflint >/dev/null 2>&1; then
  echo "mstflint not found"
  exit 0
 fi
 found=0
 for path in /sys/bus/pci/devices/*; do
  [ -e "$path/vendor" ] || continue
  vendor=$(cat "$path/vendor" 2>/dev/null)
  [ "$vendor" = "0x15b3" ] || continue
  bdf=$(basename "$path")
  found=1
  echo "=== $bdf ==="
  mstflint -d "$bdf" q 2>&1 || true
  echo
 done
 if [ "$found" -eq 0 ]; then
  echo "no Mellanox/NVIDIA networking devices found"
 fi
 `}},
 }
--- a/audit/internal/collector/nic_mellanox.go
+++ b/audit/internal/collector/nic_mellanox.go
@@ -2,18 +2,21 @@ package collector
 import (
 	"bee/audit/internal/schema"
 	"context"
 	"log/slog"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 	"time"
 )
 const mellanoxVendorID = 0x15b3
 const nicProbeTimeout = 2 * time.Second
 var (
 	mstflintQuery = func(bdf string) (string, error) {
-		out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
 		if err != nil {
 			return "", err
 		}
@@ -21,7 +24,7 @@ var (
 	}
 	ethtoolInfoQuery = func(iface string) (string, error) {
-		out, err := exec.Command("ethtool", "-i", iface).Output()
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
 		if err != nil {
 			return "", err
 		}
@@ -29,6 +32,14 @@ var (
 	}
 	netIfacesByBDF = listNetIfacesByBDF
 	readNetCarrierFile = func(iface string) (string, error) {
 		path := filepath.Join("/sys/class/net", iface, "carrier")
 		raw, err := os.ReadFile(path)
 		if err != nil {
 			return "", err
 		}
 		return strings.TrimSpace(string(raw)), nil
 	}
 )
 // enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
 	}
 	return ifaces
 }
 // commandOutputWithTimeout runs name with args and returns its standard output.
 // The command is started with a context that expires after timeout, so a probe
 // that hangs is terminated instead of blocking the caller indefinitely.
 func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
 	deadlineCtx, release := context.WithTimeout(context.Background(), timeout)
 	defer release()
 	cmd := exec.CommandContext(deadlineCtx, name, args...)
 	return cmd.Output()
 }
 // interfaceHasCarrier reports whether the carrier value read for iface through
 // readNetCarrierFile is "1". A read error counts as no carrier.
 func interfaceHasCarrier(iface string) bool {
 	if state, err := readNetCarrierFile(iface); err == nil {
 		return strings.TrimSpace(state) == "1"
 	}
 	return false
 }
--- a/audit/internal/collector/nic_telemetry.go
+++ b/audit/internal/collector/nic_telemetry.go
@@ -12,7 +12,7 @@ import (
 var (
 	ethtoolModuleQuery = func(iface string) (string, error) {
-		out, err := raidToolQuery("ethtool", "-m", iface)
+		out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
 		if err != nil {
 			return "", err
 		}
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
 			}
 		}
-		if out, err := ethtoolModuleQuery(iface); err == nil {
+		if interfaceHasCarrier(iface) {
-			if injectSFPDOMTelemetry(&devs[i], out) {
+			if out, err := ethtoolModuleQuery(iface); err == nil {
-				enriched++
+				if injectSFPDOMTelemetry(&devs[i], out) {
-				continue
+					enriched++
 					continue
 				}
 			}
 		}
 		if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
--- a/audit/internal/collector/nic_telemetry_test.go
+++ b/audit/internal/collector/nic_telemetry_test.go
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	origReadMAC := readNetAddressFile
 	origEth := ethtoolInfoQuery
 	origModule := ethtoolModuleQuery
 	origCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		queryPCILSPCIDetail = origDetail
 		readPCIVPDFile = origVPD
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		readNetAddressFile = origReadMAC
 		ethtoolInfoQuery = origEth
 		ethtoolModuleQuery = origModule
 		readNetCarrierFile = origCarrier
 	})
 	queryPCILSPCIDetail = func(bdf string) (string, error) {
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 		}
 		return "aa:bb:cc:dd:ee:ff", nil
 	}
 	readNetCarrierFile = func(string) (string, error) { return "1", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
 	}
 }
 // TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier checks that an
 // interface reporting carrier "0" is never probed with the module query, while
 // its MAC address is still collected. The package-level probe hooks are stubbed
 // and restored on cleanup.
 func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
 	savedIfaces, savedMAC := netIfacesByBDF, readNetAddressFile
 	savedInfo, savedModule := ethtoolInfoQuery, ethtoolModuleQuery
 	savedCarrier := readNetCarrierFile
 	t.Cleanup(func() {
 		netIfacesByBDF, readNetAddressFile = savedIfaces, savedMAC
 		ethtoolInfoQuery, ethtoolModuleQuery = savedInfo, savedModule
 		readNetCarrierFile = savedCarrier
 	})
 	const wantMAC = "aa:bb:cc:dd:ee:ff"
 	netIfacesByBDF = func(string) []string { return []string{"eth0"} }
 	readNetAddressFile = func(string) (string, error) { return wantMAC, nil }
 	// Carrier "0" means no link.
 	readNetCarrierFile = func(string) (string, error) { return "0", nil }
 	ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
 	ethtoolModuleQuery = func(string) (string, error) {
 		t.Fatal("ethtool -m should not be called without carrier")
 		return "", nil
 	}
 	deviceClass, address := "EthernetController", "0000:18:00.0"
 	enriched := enrichPCIeWithNICTelemetry([]schema.HardwarePCIeDevice{{
 		DeviceClass: &deviceClass,
 		BDF:         &address,
 	}})
 	macs := enriched[0].MacAddresses
 	if len(macs) != 1 || macs[0] != wantMAC {
 		t.Fatalf("mac_addresses=%v", macs)
 	}
 }
 func TestDBMValue(t *testing.T) {
 	tests := []struct {
 		in   string
--- a/audit/internal/platform/error_patterns.go
+++ b/audit/internal/platform/error_patterns.go
@@ -0,0 +1,139 @@
 package platform
 import "regexp"
 // ErrorPattern describes a kernel log pattern that indicates a hardware error.
 // Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
 type ErrorPattern struct {
 	// Name is a short machine-readable label for logging and deduplication.
 	Name string
 	// Re is the compiled regular expression matched against a single kmsg line.
 	Re *regexp.Regexp
 	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
 	Category string
 	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
 	Severity string
 	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
 	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
 	BDFGroup int
 	// DevGroup is the capture group index (1-based) that contains a device name
 	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
 	DevGroup int
 }
 // HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
 // To add a new pattern: append a new ErrorPattern struct to this slice.
 // All patterns are case-insensitive. Patterns are written so that one line is
 // not claimed by two entries of the same category where that can be avoided:
 // "edac-ce" requires "correctable" to start at a word boundary so it does not
 // also fire on "Uncorrectable" lines, which belong to "edac-ue".
 var HardwareErrorPatterns = []ErrorPattern{
 	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
 	{
 		Name:     "nvidia-rminitadapter",
 		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "nvidia-msi-fail",
 		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
 		Category: "gpu",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvidia-aer",
 		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "nvidia-xid",
 		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
 		Category: "gpu",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
 	{
 		Name:     "pcie-aer",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-uncorrectable",
 		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	{
 		Name:     "pcie-link-down",
 		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
 		Category: "pcie",
 		Severity: "warning",
 		BDFGroup: 1,
 	},
 	// ── Storage ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "blk-io-error",
 		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "nvme-timeout",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	{
 		Name:     "scsi-failed",
 		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
 		Category: "storage",
 		Severity: "warning",
 	},
 	{
 		Name:     "nvme-reset",
 		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
 		Category: "storage",
 		Severity: "warning",
 		DevGroup: 1,
 	},
 	// ── Machine Check Exceptions ────────────────────────────────────────────────
 	{
 		Name:     "mce-hardware-error",
 		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	{
 		Name:     "mce-corrected",
 		Re:       mustPat(`(?i)mce:.*[Cc]orrected`),
 		Category: "mce",
 		Severity: "warning",
 	},
 	// ── Memory ─────────────────────────────────────────────────────────────────
 	{
 		Name:     "edac-ue",
 		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 	{
 		Name: "edac-ce",
 		// \b keeps this from matching the "correctable" inside "Uncorrectable".
 		Re:       mustPat(`(?i)EDAC.*\b[Cc]orrectable`),
 		Category: "memory",
 		Severity: "warning",
 	},
 }
 // mustPat compiles s and panics if it is not a valid regular expression.
 // It is only used to build the package-level pattern table above.
 func mustPat(s string) *regexp.Regexp {
 	return regexp.MustCompile(s)
 }
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
 // gpuIndices: specific GPU indices to test (empty = all GPUs).
 // ctx cancellation kills the running job.
 func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
-	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
+	resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
 	if err != nil {
 		return "", err
 	}
 	return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
 }
 // resolveDCGMGPUIndices returns the GPU indices to test. An explicit selection
 // is passed through dedupeSortedIndices. An empty selection is replaced by the
 // result of listNvidiaGPUIndices, and it is an error if that lookup fails or
 // returns no GPUs.
 func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
 	if len(gpuIndices) != 0 {
 		return dedupeSortedIndices(gpuIndices), nil
 	}
 	detected, err := listNvidiaGPUIndices()
 	switch {
 	case err != nil:
 		return nil, err
 	case len(detected) == 0:
 		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
 	}
 	return detected, nil
 }
 func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
 	}
 }
 // TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset checks that an empty
 // selection falls back to the GPUs reported by nvidia-smi, in sorted order.
 //
 // The test deliberately does not call t.Parallel: it replaces the package-level
 // satExecCommand hook, and running alongside other parallel tests that read the
 // hook would be a data race and could hand them the stubbed command.
 func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
 	oldExecCommand := satExecCommand
 	t.Cleanup(func() { satExecCommand = oldExecCommand })
 	satExecCommand = func(name string, args ...string) *exec.Cmd {
 		if name == "nvidia-smi" {
 			// Emit the indices out of order to prove the result is sorted.
 			return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
 		}
 		return exec.Command(name, args...)
 	}
 	got, err := resolveDCGMGPUIndices(nil)
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if want := "0,1,2"; joinIndexList(got) != want {
 		t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
 	}
 }
 // TestResolveDCGMGPUIndicesKeepsExplicitSelection checks that an explicit GPU
 // selection is returned sorted and without duplicates.
 func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
 	t.Parallel()
 	const want = "1,3"
 	indices, err := resolveDCGMGPUIndices([]int{3, 1, 3})
 	if err != nil {
 		t.Fatalf("resolveDCGMGPUIndices error: %v", err)
 	}
 	if joined := joinIndexList(indices); joined != want {
 		t.Fatalf("gpuIndices=%q want %q", joined, want)
 	}
 }
 func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/webui/kmsg_watcher.go
+++ b/audit/internal/webui/kmsg_watcher.go
@@ -0,0 +1,238 @@
 package webui
 import (
 	"bufio"
 	"errors"
 	"io"
 	"log/slog"
 	"os"
 	"strings"
 	"sync"
 	"syscall"
 	"time"
 	"bee/audit/internal/app"
 	"bee/audit/internal/platform"
 )
 // kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
 // It supports multiple concurrent SAT tasks: a shared event window is open
 // while any SAT task is running, and flushed when all tasks complete.
 type kmsgWatcher struct {
 	// mu guards activeCount and window.
 	mu sync.Mutex
 	// activeCount is the number of in-flight SAT tasks.
 	activeCount int
 	// window is the shared event window; it is nil while no task is active.
 	window *kmsgWindow
 	// statusDB receives flushed events; flushing is skipped when it is nil.
 	statusDB *app.ComponentStatusDB
 }
 // kmsgWindow holds what was collected between the first SAT task starting and
 // the last one finishing: the targets that ran, the time the window was opened,
 // the (id, category) keys already recorded, and the recorded events themselves.
 type kmsgWindow struct {
 	targets   []string // SAT targets running concurrently
 	startedAt time.Time
 	seen      map[kmsgEventKey]bool
 	events    []kmsgEvent
 }
 // kmsgEventKey is the deduplication key used inside a window: one entry per
 // (identifier, category) pair. Events without an identifier use an empty id.
 type kmsgEventKey struct {
 	id       string // BDF or device name
 	category string
 }
 // kmsgEvent is one kernel log message that matched a hardware error pattern.
 // timestamp is the time the line was parsed, raw is the message text, and
 // category is copied from the pattern that matched.
 type kmsgEvent struct {
 	timestamp time.Time
 	raw       string
 	ids       []string // BDF addresses or device names extracted
 	category  string
 }
 // newKmsgWatcher builds an idle watcher bound to statusDB. No goroutine is
 // running and no event window is open until start and NotifyTaskStarted are
 // called.
 func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
 	w := new(kmsgWatcher)
 	w.statusDB = statusDB
 	return w
 }
 // start launches the background kmsg reading goroutine (w.run) and returns
 // immediately. The watcher exposes no way to stop or wait for that goroutine;
 // it ends only when run returns.
 func (w *kmsgWatcher) start() {
 	go w.run()
 }
 // run is the reader loop behind start. It opens /dev/kmsg, seeks to the end so
 // only new records are seen, and hands every line accepted by parseKmsgLine to
 // recordEvent while a window is open; matching lines that arrive while no
 // window is open are dropped.
 //
 // A read of /dev/kmsg fails with EPIPE when the kernel overwrote records this
 // reader had not consumed yet. The kernel then repositions the reader and
 // subsequent reads succeed, so that error is logged and reading resumes instead
 // of ending the watcher. Any other read error, or end of input, ends the loop.
 func (w *kmsgWatcher) run() {
 	f, err := os.Open("/dev/kmsg")
 	if err != nil {
 		slog.Warn("kmsg watcher unavailable", "err", err)
 		return
 	}
 	defer f.Close()
 	// Best-effort seek to end so we only capture events from now forward.
 	_, _ = f.Seek(0, io.SeekEnd)
 	buf := make([]byte, 64*1024)
 	for {
 		// A Scanner stops for good at its first read error, so a fresh one
 		// (reusing the same buffer) is created for every resume.
 		scanner := bufio.NewScanner(f)
 		scanner.Buffer(buf, len(buf))
 		for scanner.Scan() {
 			evt, ok := parseKmsgLine(scanner.Text())
 			if !ok {
 				continue
 			}
 			w.mu.Lock()
 			if w.window != nil {
 				w.recordEvent(evt)
 			}
 			w.mu.Unlock()
 		}
 		err := scanner.Err()
 		if errors.Is(err, syscall.EPIPE) {
 			slog.Warn("kmsg watcher missed records (ring buffer overrun)", "err", err)
 			continue
 		}
 		if err != nil {
 			slog.Warn("kmsg watcher stopped", "err", err)
 		}
 		return
 	}
 }
 // recordEvent adds evt to the active window unless every (id, category) key it
 // carries has already been seen. An event without ids is keyed by an empty id,
 // so at most one such event is kept per category. The event is appended at most
 // once, even when it introduces several new ids.
 // Must be called with w.mu held and w.window non-nil.
 func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
 	ids := evt.ids
 	if len(ids) == 0 {
 		ids = []string{""}
 	}
 	fresh := false
 	for _, id := range ids {
 		key := kmsgEventKey{id: id, category: evt.category}
 		if !w.window.seen[key] {
 			w.window.seen[key] = true
 			fresh = true
 		}
 	}
 	if fresh {
 		w.window.events = append(w.window.events, evt)
 	}
 }
 // NotifyTaskStarted registers one more running SAT task. The first task to
 // start opens a fresh shared event window; every task's target is appended to
 // the window's target list. taskID is accepted but not used.
 func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
 	w.mu.Lock()
 	defer w.mu.Unlock()
 	first := w.activeCount == 0
 	w.activeCount++
 	if first {
 		w.window = &kmsgWindow{
 			startedAt: time.Now(),
 			seen:      map[kmsgEventKey]bool{},
 		}
 	}
 	if win := w.window; win != nil {
 		win.targets = append(win.targets, target)
 	}
 }
 // NotifyTaskFinished unregisters one running task. When the counter reaches
 // zero (it is clamped there, never negative) the window is detached and, if it
 // holds any events, flushed to the status DB on a separate goroutine.
 // taskID is accepted but not used, so the counter is decremented on every
 // call: a call that was not preceded by NotifyTaskStarted closes the window
 // early.
 func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
 	closed := func() *kmsgWindow {
 		w.mu.Lock()
 		defer w.mu.Unlock()
 		w.activeCount--
 		if w.activeCount > 0 {
 			return nil
 		}
 		w.activeCount = 0
 		win := w.window
 		w.window = nil
 		return win
 	}()
 	if closed != nil && len(closed.events) > 0 {
 		go w.flushWindow(closed)
 	}
 }
 // flushWindow writes one "Warning" record per affected component to the status
 // DB, using the raw text of the first event seen for that component (truncated
 // to 120 bytes) as detail. It does nothing when no status DB is configured.
 //
 // Component keys: events without ids map to "memory:all" for the "memory"
 // category and "cpu:all" otherwise; events with ids map to "storage:<id>" for
 // the "storage" category and "pcie:<normalized BDF>" for every other category.
 func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
 	if w.statusDB == nil {
 		return
 	}
 	const source = "watchdog:kmsg"
 	// firstRaw maps component key → raw line of the first event that hit it.
 	firstRaw := map[string]string{}
 	remember := func(key, raw string) {
 		if _, dup := firstRaw[key]; !dup {
 			firstRaw[key] = raw
 		}
 	}
 	for _, evt := range window.events {
 		if len(evt.ids) == 0 {
 			// MCE or un-identified error.
 			if evt.category == "memory" {
 				remember("memory:all", evt.raw)
 			} else {
 				remember("cpu:all", evt.raw)
 			}
 			continue
 		}
 		for _, id := range evt.ids {
 			if evt.category == "storage" {
 				remember("storage:"+id, evt.raw)
 			} else {
 				remember("pcie:"+normalizeBDF(id), evt.raw)
 			}
 		}
 	}
 	prefix := "kernel error during SAT (" + strings.Join(window.targets, ",") + "): "
 	for key, raw := range firstRaw {
 		w.statusDB.Record(key, source, "Warning", prefix+truncate(raw, 120))
 	}
 }
 // parseKmsgLine turns one /dev/kmsg line into an event when its message text
 // matches a pattern in platform.HardwareErrorPatterns; the first matching
 // pattern wins. The second result is false for empty or non-matching messages.
 // kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
 // Everything after the first ';' (trimmed) is the message; a line without ';'
 // is matched as-is. The event timestamp is the parse time, not the kmsg one.
 func parseKmsgLine(raw string) (kmsgEvent, bool) {
 	msg := raw
 	if _, text, found := strings.Cut(raw, ";"); found {
 		msg = strings.TrimSpace(text)
 	}
 	if msg == "" {
 		return kmsgEvent{}, false
 	}
 	for _, p := range platform.HardwareErrorPatterns {
 		groups := p.Re.FindStringSubmatch(msg)
 		if groups == nil {
 			continue
 		}
 		var ids []string
 		// A group index of 0 (or out of range) means the pattern has no such capture.
 		if g := p.BDFGroup; g > 0 && g < len(groups) {
 			ids = append(ids, normalizeBDF(groups[g]))
 		}
 		if g := p.DevGroup; g > 0 && g < len(groups) {
 			ids = append(ids, groups[g])
 		}
 		return kmsgEvent{
 			timestamp: time.Now(),
 			raw:       msg,
 			ids:       ids,
 			category:  p.Category,
 		}, true
 	}
 	return kmsgEvent{}, false
 }
 // normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
 // The input is trimmed and lower-cased; a value containing exactly one ':'
 // gets the "0000:" domain prepended, anything else is returned as cleaned.
 func normalizeBDF(bdf string) string {
 	addr := strings.TrimSpace(bdf)
 	addr = strings.ToLower(addr)
 	if strings.Count(addr, ":") != 1 {
 		return addr
 	}
 	return "0000:" + addr
 }
 // truncate shortens s to at most max bytes and appends "..." when anything was
 // cut; strings of max bytes or fewer are returned unchanged. The cut is moved
 // back to the nearest rune boundary so a multi-byte UTF-8 character is never
 // split, and a negative max is treated like zero instead of panicking.
 func truncate(s string, max int) string {
 	if len(s) <= max {
 		return s
 	}
 	cut := 0
 	// Ranging over a string yields the byte offset at which each rune starts.
 	for i := range s {
 		if i > max {
 			break
 		}
 		cut = i
 	}
 	return s[:cut] + "..."
 }
 // isSATTarget reports whether target names a hardware acceptance test task.
 // The match is exact and case-sensitive.
 func isSATTarget(target string) bool {
 	switch target {
 	case "nvidia", "nvidia-stress":
 	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
 	case "memory", "memory-stress":
 	case "storage", "cpu":
 	case "sat-stress", "platform-stress":
 	default:
 		return false
 	}
 	return true
 }
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -29,6 +29,7 @@ a{color:var(--accent);text-decoration:none}
 .sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
 .sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
 .sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
 .sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
 .nav{flex:1}
 .nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
 .nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
@@ -96,6 +97,10 @@ func layoutNav(active string, buildLabel string) string {
 	var b strings.Builder
 	b.WriteString(`<aside class="sidebar">`)
 	b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
 	if strings.TrimSpace(buildLabel) == "" {
 		buildLabel = "dev"
 	}
 	b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
 	b.WriteString(`<nav class="nav">`)
 	for _, item := range items {
 		cls := "nav-item"
@@ -110,11 +115,7 @@ func layoutNav(active string, buildLabel string) string {
 				cls, item.href, item.label))
 		}
 	}
 	if strings.TrimSpace(buildLabel) == "" {
 		buildLabel = "dev"
 	}
 	b.WriteString(`</nav>`)
 	b.WriteString(`<div style="padding:12px 16px;border-top:1px solid rgba(255,255,255,.08);font-size:11px;color:rgba(255,255,255,.45)">Build ` + html.EscapeString(buildLabel) + `</div>`)
 	b.WriteString(`</aside>`)
 	return b.String()
 }
@@ -1089,72 +1090,7 @@ func renderExport(exportDir string) string {
 </div></div>
 </div>
-<div class="card" style="margin-top:16px">
+` + renderUSBExportCard()
  <div class="card-head">Export to USB
    <button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">&#8635; Refresh</button>
  </div>
  <div class="card-body">
    <p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
    <div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
    <div id="usb-targets" style="margin-top:12px"></div>
    <div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
  </div>
 </div>
 <script>
 (function(){
 function usbRefresh() {
  document.getElementById('usb-status').textContent = 'Scanning...';
  document.getElementById('usb-targets').innerHTML = '';
  document.getElementById('usb-msg').textContent = '';
  fetch('/api/export/usb').then(r=>r.json()).then(targets => {
    const st = document.getElementById('usb-status');
    const ct = document.getElementById('usb-targets');
    if (!targets || targets.length === 0) {
      st.textContent = 'No removable USB devices found.';
      return;
    }
    st.textContent = targets.length + ' device(s) found:';
    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
      targets.map(t => {
        const dev = t.device || '';
        const label = t.label || '';
        const model = t.model || '';
        return '<tr>' +
          '<td style="font-family:monospace">'+dev+'</td>' +
          '<td>'+t.fs_type+'</td>' +
          '<td>'+t.size+'</td>' +
          '<td>'+label+'</td>' +
          '<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
          '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
          '</td></tr>';
      }).join('') + '</table>';
  }).catch(e => {
    document.getElementById('usb-status').textContent = 'Error: ' + e;
  });
 }
 window.usbExport = function(type, target) {
  const msg = document.getElementById('usb-msg');
  msg.style.color = 'var(--muted)';
  msg.textContent = 'Exporting to ' + (target.device||'') + '...';
  fetch('/api/export/usb/'+type, {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(target)
  }).then(r=>r.json()).then(d => {
    if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
    msg.style.color = 'var(--ok,green)';
    msg.textContent = d.message || 'Done.';
  }).catch(e => {
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: '+e;
  });
 };
 window.usbRefresh = usbRefresh;
 usbRefresh();
 })();
 </script>`
 }
 func listExportFiles(exportDir string) ([]string, error) {
@@ -1224,6 +1160,77 @@ window.supportBundleDownload = function() {
 </script>`
 }
 // renderUSBExportCard wraps the inline USB export widget in a standalone card
 // with an "Export to USB" header and a Refresh button that calls usbRefresh().
 func renderUSBExportCard() string {
 	const head = `<div class="card" style="margin-top:16px">
  <div class="card-head">Export to USB
    <button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">&#8635; Refresh</button>
  </div>
  <div class="card-body">`
 	const tail = `</div>
 </div>`
 	return head + renderUSBExportInline() + tail
 }
 // renderUSBExportInline returns the USB export widget: status, target table and
 // message placeholders plus the script that drives them. The script lists
 // removable targets from GET /api/export/usb and, per target, offers buttons
 // that POST the target as JSON to /api/export/usb/audit or /api/export/usb/bundle.
 // It publishes window.usbRefresh and window.usbExport and runs one refresh on load.
 //
 // Every value taken from the target list is HTML-escaped before it is written
 // into innerHTML. That covers the visible cells (a volume label or model string
 // must not be able to inject markup) and the JSON payload embedded in the
 // double-quoted onclick attribute, whose quotes would otherwise end the
 // attribute early and break the button handler.
 func renderUSBExportInline() string {
 	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
 <div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
 <div id="usb-targets" style="margin-top:12px"></div>
 <div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
 <script>
 (function(){
 function esc(v) {
  return String(v == null ? '' : v).replace(/[&<>"']/g, function(c) {
    return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
  });
 }
 function usbRefresh() {
  document.getElementById('usb-status').textContent = 'Scanning...';
  document.getElementById('usb-targets').innerHTML = '';
  document.getElementById('usb-msg').textContent = '';
  fetch('/api/export/usb').then(r=>r.json()).then(targets => {
    const st = document.getElementById('usb-status');
    const ct = document.getElementById('usb-targets');
    if (!targets || targets.length === 0) {
      st.textContent = 'No removable USB devices found.';
      return;
    }
    st.textContent = targets.length + ' device(s) found:';
    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
      targets.map(t => {
        const payload = esc(JSON.stringify(t));
        return '<tr>' +
          '<td style="font-family:monospace">'+esc(t.device)+'</td>' +
          '<td>'+esc(t.fs_type)+'</td>' +
          '<td>'+esc(t.size)+'</td>' +
          '<td>'+esc(t.label)+'</td>' +
          '<td style="font-size:12px;color:var(--muted)">'+esc(t.model)+'</td>' +
          '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+payload+')">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+payload+')">Support Bundle</button>' +
          '</td></tr>';
      }).join('') + '</table>';
  }).catch(e => {
    document.getElementById('usb-status').textContent = 'Error: ' + e;
  });
 }
 window.usbExport = function(type, target) {
  const msg = document.getElementById('usb-msg');
  msg.style.color = 'var(--muted)';
  msg.textContent = 'Exporting to ' + (target.device||'') + '...';
  fetch('/api/export/usb/'+type, {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(target)
  }).then(r=>r.json()).then(d => {
    if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
    msg.style.color = 'var(--ok,green)';
    msg.textContent = d.message || 'Done.';
  }).catch(e => {
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: '+e;
  });
 };
 window.usbRefresh = usbRefresh;
 usbRefresh();
 })();
 </script>`
 }
 // ── Display Resolution ────────────────────────────────────────────────────────
 func renderDisplayInline() string {
@@ -1325,6 +1332,10 @@ function installToRAM() {
 <div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
 <p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
 ` + renderSupportBundleInline() + `
 <div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
  <div style="font-weight:600;margin-bottom:8px">Export to USB</div>
  ` + renderUSBExportInline() + `
 </div>
 </div></div>
 <div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">&#8635; Check</button></div>
@@ -1578,7 +1589,11 @@ func renderTasks() string {
    <div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
      <button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">&#10005;</button>
    </div>
-    <div class="card-body" style="padding:16px;flex:1;min-height:0"><div id="task-log-terminal" class="terminal" style="height:100%;max-height:none"></div></div>
+    <div class="card-body" style="padding:16px;flex:1;min-height:0;overflow:hidden">
      <div style="height:100%;min-height:0;overflow:auto">
        <div id="task-log-terminal" class="terminal" style="margin:0;max-height:none;overflow:visible"></div>
      </div>
    </div>
  </div>
 </div>
 <script>
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -164,6 +164,8 @@ type handler struct {
 	// pending network change (rollback on timeout)
 	pendingNet   *pendingNetChange
 	pendingNetMu sync.Mutex
 	// kmsg hardware error watcher
 	kmsg *kmsgWatcher
 }
 // NewHandler creates the HTTP mux with all routes.
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	}
 	h.startMetricsCollector()
 	// Start kmsg hardware error watcher if the app (and its status DB) is available.
 	if opts.App != nil {
 		h.kmsg = newKmsgWatcher(opts.App.StatusDB)
 		h.kmsg.start()
 		globalQueue.kmsgWatcher = h.kmsg
 	}
 	globalQueue.startWorker(&opts)
 	mux := http.NewServeMux()
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -275,9 +275,10 @@ func TestRootRendersDashboard(t *testing.T) {
 	}
 	handler := NewHandler(HandlerOptions{
-		Title:     "Bee Hardware Audit",
+		Title:      "Bee Hardware Audit",
-		AuditPath: path,
+		BuildLabel: "1.2.3",
-		ExportDir: exportDir,
+		AuditPath:  path,
 		ExportDir:  exportDir,
 	})
 	first := httptest.NewRecorder()
@@ -292,6 +293,11 @@ func TestRootRendersDashboard(t *testing.T) {
 	if !strings.Contains(first.Body.String(), `/viewer`) {
 		t.Fatalf("first body missing viewer link: %s", first.Body.String())
 	}
 	versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
 	navIdx := strings.Index(first.Body.String(), `href="/"`)
 	if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
 		t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
 	}
 	if got := first.Header().Get("Cache-Control"); got != "no-store" {
 		t.Fatalf("first cache-control=%q", got)
 	}
@@ -395,6 +401,46 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
 	if !strings.Contains(body, `id="boot-source-text"`) {
 		t.Fatalf("tools page missing boot source field: %s", body)
 	}
 	if !strings.Contains(body, `Export to USB`) {
 		t.Fatalf("tools page missing export to usb section: %s", body)
 	}
 	if !strings.Contains(body, `Support Bundle</button>`) {
 		t.Fatalf("tools page missing support bundle usb button: %s", body)
 	}
 }
 // TestTasksPageRendersScrollableLogModal renders /tasks against a minimal audit
 // snapshot and checks that the log modal markup carries the style fragments
 // that bound its height and make its content scroll.
 func TestTasksPageRendersScrollableLogModal(t *testing.T) {
 	root := t.TempDir()
 	auditPath := filepath.Join(root, "audit.json")
 	exportDir := filepath.Join(root, "export")
 	if err := os.MkdirAll(exportDir, 0755); err != nil {
 		t.Fatal(err)
 	}
 	snapshot := []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`)
 	if err := os.WriteFile(auditPath, snapshot, 0644); err != nil {
 		t.Fatal(err)
 	}
 	handler := NewHandler(HandlerOptions{
 		Title:     "Bee Hardware Audit",
 		AuditPath: auditPath,
 		ExportDir: exportDir,
 	})
 	rec := httptest.NewRecorder()
 	req := httptest.NewRequest(http.MethodGet, "/tasks", nil)
 	handler.ServeHTTP(rec, req)
 	if rec.Code != http.StatusOK {
 		t.Fatalf("status=%d", rec.Code)
 	}
 	body := rec.Body.String()
 	// Checked in order; the first missing fragment fails the test.
 	checks := []struct{ fragment, problem string }{
 		{`height:calc(100vh - 32px)`, "tasks page missing bounded log modal height"},
 		{`flex:1;min-height:0;overflow:hidden`, "tasks page missing log modal overflow guard"},
 		{`height:100%;min-height:0;overflow:auto`, "tasks page missing scrollable log wrapper"},
 	}
 	for _, c := range checks {
 		if !strings.Contains(body, c.fragment) {
 			t.Fatalf("%s: %s", c.problem, body)
 		}
 	}
 }
 func TestViewerRendersLatestSnapshot(t *testing.T) {
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
 // taskQueue manages a priority-ordered list of tasks and runs them one at a time.
 type taskQueue struct {
-	mu        sync.Mutex
+	mu          sync.Mutex
-	tasks     []*Task
+	tasks       []*Task
-	trigger   chan struct{}
+	trigger     chan struct{}
-	opts      *HandlerOptions // set by startWorker
+	opts        *HandlerOptions // set by startWorker
-	statePath string
+	statePath   string
-	logsDir   string
+	logsDir     string
-	started   bool
+	started     bool
 	kmsgWatcher *kmsgWatcher
 }
 var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
@@ -392,11 +393,13 @@ func (q *taskQueue) worker() {
 	for {
 		<-q.trigger
 		setCPUGovernor("performance")
 		// Drain all pending tasks and start them in parallel.
 		q.mu.Lock()
 		var batch []*Task
 		for {
 			q.mu.Lock()
 			t := q.nextPending()
 			if t == nil {
 				q.mu.Unlock()
 				break
 			}
 			now := time.Now()
@@ -405,29 +408,58 @@ func (q *taskQueue) worker() {
 			t.DoneAt = nil
 			t.ErrMsg = ""
 			j := newTaskJobState(t.LogPath)
 			ctx, cancel := context.WithCancel(context.Background())
 			j.cancel = cancel
 			t.job = j
 			batch = append(batch, t)
 		}
 		if len(batch) > 0 {
 			q.persistLocked()
-			q.mu.Unlock()
+		}
 		q.mu.Unlock()
-			q.runTask(t, j, ctx)
+		var wg sync.WaitGroup
 		for _, t := range batch {
 			t := t
 			j := t.job
 			taskCtx, taskCancel := context.WithCancel(context.Background())
 			j.cancel = taskCancel
 			wg.Add(1)
 			go func() {
 				defer wg.Done()
-			q.mu.Lock()
+				if q.kmsgWatcher != nil && isSATTarget(t.Target) {
-			now2 := time.Now()
+					q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
 			t.DoneAt = &now2
 			if t.Status == TaskRunning { // not cancelled externally
 				if j.err != "" {
 					t.Status = TaskFailed
 					t.ErrMsg = j.err
 				} else {
 					t.Status = TaskDone
 				}
-			}
+
 				q.runTask(t, j, taskCtx)
 				if q.kmsgWatcher != nil {
 					q.kmsgWatcher.NotifyTaskFinished(t.ID)
 				}
 				q.mu.Lock()
 				now2 := time.Now()
 				t.DoneAt = &now2
 				if t.Status == TaskRunning {
 					if j.err != "" {
 						t.Status = TaskFailed
 						t.ErrMsg = j.err
 					} else {
 						t.Status = TaskDone
 					}
 				}
 				q.persistLocked()
 				q.mu.Unlock()
 			}()
 		}
 		wg.Wait()
 		if len(batch) > 0 {
 			q.mu.Lock()
 			q.prune()
 			q.persistLocked()
 			q.mu.Unlock()
 		}
 		setCPUGovernor("powersave")
 	}
 }
@@ -618,6 +650,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 		return
 	}
 	// If the SAT archive was produced, check overall_status and write to component DB.
 	if archive != "" {
 		archivePath := app.ExtractArchivePath(archive)
 		if err == nil {
 			if app.ReadSATOverallStatus(archivePath) == "FAILED" {
 				err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
 			}
 		}
 		if db := q.statusDB(); db != nil {
 			app.ApplySATResultToDB(db, t.Target, archivePath)
 		}
 	}
 	if err != nil {
 		if ctx.Err() != nil {
 			j.append("Aborted.")
@@ -634,6 +679,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 	j.finish("")
 }
 // statusDB returns the component status DB of the app the worker was started
 // with, or nil when the queue has no options or no app configured.
 func (q *taskQueue) statusDB() *app.ComponentStatusDB {
 	if opts := q.opts; opts != nil && opts.App != nil {
 		return opts.App.StatusDB
 	}
 	return nil
 }
 func splitLines(s string) []string {
 	var out []string
 	for _, l := range splitNL(s) {
--- a/audit/scripts/resolve-version.sh
+++ b/audit/scripts/resolve-version.sh
@@ -0,0 +1,16 @@
 #!/bin/sh
 # Print the build version derived from the nearest git tag matching 'v[0-9]*'.
 # Output is the `git describe` result with one leading "v" removed, or "dev"
 # when git describe yields nothing (its failure is deliberately ignored).
 set -eu
 tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
 if [ -z "${tag}" ]; then
 	printf 'dev\n'
 else
 	# ${tag#v} strips a leading "v" if present and leaves any other value unchanged.
 	printf '%s\n' "${tag#v}"
 fi
--- a/iso/builder/auto/config
+++ b/iso/builder/auto/config
@@ -32,7 +32,7 @@ lb config noauto \
    --memtest memtest86+ \
    --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
    --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
-    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
+    --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
    --apt-recommends false \
    --chroot-squashfs-compression-type zstd \
    "${@}"
--- a/iso/builder/build.sh
+++ b/iso/builder/build.sh
@@ -54,15 +54,8 @@ resolve_audit_version() {
        return 0
    fi
-    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
+    tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    if [ -z "${tag}" ]; then
        tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
    fi
    case "${tag}" in
        audit/v*)
            echo "${tag#audit/v}"
            return 0
            ;;
        v*)
            echo "${tag#v}"
            return 0
--- a/iso/overlay/etc/systemd/system/bee-audit.service
+++ b/iso/overlay/etc/systemd/system/bee-audit.service
@@ -1,7 +1,6 @@
 [Unit]
 Description=Bee: hardware audit
 After=bee-preflight.service bee-network.service bee-nvidia.service
 Before=bee-web.service
 [Service]
 Type=oneshot
--- a/iso/overlay/etc/systemd/system/bee-web.service
+++ b/iso/overlay/etc/systemd/system/bee-web.service
@@ -1,6 +1,5 @@
 [Unit]
 Description=Bee: hardware audit web viewer
 After=bee-audit.service
 [Service]
 Type=simple
--- a/iso/overlay/usr/local/bin/bee-john-gpu-stress
+++ b/iso/overlay/usr/local/bin/bee-john-gpu-stress
@@ -1,10 +1,11 @@
 #!/bin/sh
 set -eu
-SECONDS=300
+DURATION_SEC=300
 DEVICES=""
 EXCLUDE=""
 FORMAT=""
 TEST_SLICE_SECONDS=300
 JOHN_DIR="/usr/local/lib/bee/john/run"
 JOHN_BIN="${JOHN_DIR}/john"
 export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
 while [ "$#" -gt 0 ]; do
    case "$1" in
-        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
+        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -189,14 +190,51 @@ CHOSEN_FORMAT=$(choose_format) || {
    exit 1
 }
-echo "format=${CHOSEN_FORMAT}"
+run_john_loop() {
     # Run john benchmark slices on one OpenCL device until a deadline passes.
     #   $1  OpenCL device id passed to john via --devices
     #   $2  deadline as seconds since the epoch (compared against `date +%s`)
     # Returns 1 as soon as one john invocation fails; otherwise falls out of
     # the loop once no time remains.
     opencl_id="$1"
     deadline="$2"
     round=0
     while :; do
         now=$(date +%s)
         remaining=$((deadline - now))
         if [ "${remaining}" -le 0 ]; then
             break
         fi
         round=$((round + 1))
         # Each slice lasts at most TEST_SLICE_SECONDS and never exceeds the
         # time left until the deadline.
         slice="${remaining}"
         if [ "${slice}" -gt "${TEST_SLICE_SECONDS}" ]; then
             slice="${TEST_SLICE_SECONDS}"
         fi
         echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
         ./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" || return 1
     done
 }
 PIDS=""
 # cleanup is the EXIT/INT/TERM handler: it signals every background worker
 # listed in PIDS, waits for each of them, and exits with the status that was
 # current when the handler was entered.
 cleanup() {
     # Capture the status first; every later command would overwrite $?.
     rc=$?
     # Drop the traps so the exit below does not re-enter this handler.
     trap - EXIT INT TERM
     for pid in ${PIDS}; do
         kill "${pid}" 2>/dev/null || true
     done
     for pid in ${PIDS}; do
         wait "${pid}" 2>/dev/null || true
     done
     exit "${rc}"
 }
 trap cleanup EXIT INT TERM
 echo "format=${CHOSEN_FORMAT}"
 echo "target_seconds=${DURATION_SEC}"
 echo "slice_seconds=${TEST_SLICE_SECONDS}"
 DEADLINE=$(( $(date +%s) + DURATION_SEC ))
 _first=1
 for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    [ "${_first}" = "1" ] || sleep 3
    _first=0
-    ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
+    run_john_loop "${opencl_id}" "${DEADLINE}" &
-    PIDS="${PIDS} $!"
+    pid=$!
    PIDS="${PIDS} ${pid}"
 done
 FAIL=0
 for pid in ${PIDS}; do
--- a/iso/overlay/usr/local/bin/bee-nvidia-load
+++ b/iso/overlay/usr/local/bin/bee-nvidia-load
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
 log "ldconfig refreshed"
 # Start DCGM host engine so dcgmi can discover GPUs.
-# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
+# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
-# "group is empty" even when GPUs and modules are present.
+# If it started too early (for example via systemd before bee-nvidia-load), it can
-# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
+# keep a stale empty inventory and dcgmi diag later reports no testable entities.
 if command -v nv-hostengine >/dev/null 2>&1; then
    if pgrep -x nv-hostengine >/dev/null 2>&1; then
-        log "nv-hostengine already running — skipping"
+        if command -v pkill >/dev/null 2>&1; then
-    else
+            pkill -x nv-hostengine >/dev/null 2>&1 || true
            tries=0
            while pgrep -x nv-hostengine >/dev/null 2>&1; do
                tries=$((tries + 1))
                if [ "${tries}" -ge 10 ]; then
                    log "WARN: nv-hostengine is still running after restart request"
                    break
                fi
                sleep 1
            done
            if pgrep -x nv-hostengine >/dev/null 2>&1; then
                log "WARN: keeping existing nv-hostengine process"
            else
                log "nv-hostengine restarted"
            fi
        else
            log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
        fi
    fi
    if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
        nv-hostengine
        log "nv-hostengine started"
    fi
Author	SHA1	Message	Date
Mikhail Chusavitin	f3c14cd893	Harden NIC probing for empty SFP ports	2026-04-04 15:23:15 +03:00
Mikhail Chusavitin	728270dc8e	Unblock bee-web startup and expand support bundle diagnostics	2026-04-04 15:18:43 +03:00
Mikhail Chusavitin	8692f825bc	Use plain repo tags for build version	2026-04-03 10:48:51 +03:00
Mikhail Chusavitin	11f52ac710	Fix task log modal scrolling	2026-04-03 10:36:11 +03:00
Mikhail Chusavitin	1cb398fe83	Show tag version at top of sidebar	2026-04-03 10:08:00 +03:00
Mikhail Chusavitin	7a843be6b0	Stabilize DCGM GPU discovery	2026-04-03 09:50:33 +03:00
Mikhail Chusavitin	7f6386dccc	Restore USB support bundle export on tools page	2026-04-03 09:48:22 +03:00
Mikhail Chusavitin	eea2591bcc	Fix John GPU stress duration semantics	2026-04-03 09:46:16 +03:00
Mikhail Chusavitin	295a19b93a	feat(tasks): run all queued tasks in parallel Tasks are now started simultaneously when multiple are enqueued (e.g. Run All). The worker drains all pending tasks at once and launches each in its own goroutine, waiting via WaitGroup. kmsg watcher updated to use a shared event window with a reference counter across concurrent tasks. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-03 09:15:06 +03:00
Mikhail Chusavitin	444a7d16cc	fix(iso): increase boot verbosity for service startup visibility Raise loglevel from 3 to 6 (INFO) and add systemd.show_status=1 so kernel driver messages and systemd [ OK ]/[ FAILED ] lines are visible during boot instead of showing only a blank cursor. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:33:27 +03:00
Mikhail Chusavitin	fd722692a4	feat(watchdog): hardware error monitor + unified component status store - Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-04-02 19:20:59 +03:00