Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 |
@@ -1,5 +1,7 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
@@ -9,10 +11,10 @@ endif
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
go run ./cmd/bee $(RUN_ARGS)
|
||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -o bee ./cmd/bee
|
||||
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -21,30 +20,7 @@ var Version = "dev"
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
label = "dev"
|
||||
}
|
||||
if info, ok := debug.ReadBuildInfo(); ok {
|
||||
var revision string
|
||||
var modified bool
|
||||
for _, setting := range info.Settings {
|
||||
switch setting.Key {
|
||||
case "vcs.revision":
|
||||
revision = setting.Value
|
||||
case "vcs.modified":
|
||||
modified = setting.Value == "true"
|
||||
}
|
||||
}
|
||||
if revision != "" {
|
||||
short := revision
|
||||
if len(short) > 12 {
|
||||
short = short[:12]
|
||||
}
|
||||
label += " (" + short
|
||||
if modified {
|
||||
label += "+"
|
||||
}
|
||||
label += ")"
|
||||
}
|
||||
return "dev"
|
||||
}
|
||||
return label
|
||||
}
|
||||
|
||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
old := Version
|
||||
Version = "test-version"
|
||||
t.Cleanup(func() { Version = old })
|
||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||
old := Version
|
||||
Version = "1.2.3"
|
||||
t.Cleanup(func() { Version = old })
|
||||
|
||||
if got := buildLabel(); got != "1.2.3" {
|
||||
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunExportRequiresTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -40,6 +40,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -80,6 +82,7 @@ type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -100,6 +103,10 @@ func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -131,7 +138,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -140,6 +147,10 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
@@ -149,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
@@ -169,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
|
||||
@@ -754,6 +754,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
"/system/mstflint-query.txt",
|
||||
} {
|
||||
var found bool
|
||||
for _, name := range names {
|
||||
if contains(name, want) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
|
||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
|
||||
type ComponentStatusEntry struct {
|
||||
At time.Time `json:"at"`
|
||||
Status string `json:"status"`
|
||||
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity returns a numeric severity so higher values win.
|
||||
func componentSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func satStatusToDBStatus(overall string) string {
|
||||
switch overall {
|
||||
case "OK":
|
||||
return "OK"
|
||||
case "FAILED":
|
||||
return "Warning"
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return "Unknown"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
func extractArchivePath(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if strings.HasSuffix(s, ".tar.gz") {
|
||||
parts := strings.Fields(s)
|
||||
if len(parts) > 0 {
|
||||
return parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func parseSATKV(raw string) map[string]string {
|
||||
kv := make(map[string]string)
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||
if ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return kv
|
||||
}
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||
func dbStatusToSATStatus(s string) string {
|
||||
switch strings.TrimSpace(s) {
|
||||
case "OK", "Warning", "Critical", "Unknown":
|
||||
return s
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||
func sanitizeBDFForLookup(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||
return ""
|
||||
}
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
bdf = "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
|
||||
@@ -27,13 +27,114 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -i "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -m "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v mstflint >/dev/null 2>&1; then
|
||||
echo "mstflint not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/bus/pci/devices/*; do
|
||||
[ -e "$path/vendor" ] || continue
|
||||
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x15b3" ] || continue
|
||||
bdf=$(basename "$path")
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
mstflint -d "$bdf" q 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no Mellanox/NVIDIA networking devices found"
|
||||
fi
|
||||
`}},
|
||||
}
|
||||
|
||||
var supportBundleOptionalFiles = []struct {
|
||||
name string
|
||||
src string
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
@@ -77,6 +178,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -184,6 +288,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
|
||||
@@ -2,18 +2,21 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
mstflintQuery = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -21,7 +24,7 @@ var (
|
||||
}
|
||||
|
||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -29,6 +32,14 @@ var (
|
||||
}
|
||||
|
||||
netIfacesByBDF = listNetIfacesByBDF
|
||||
readNetCarrierFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
|
||||
}
|
||||
return ifaces
|
||||
}
|
||||
|
||||
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
func interfaceHasCarrier(iface string) bool {
|
||||
raw, err := readNetCarrierFile(iface)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(raw) == "1"
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
var (
|
||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
}
|
||||
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
if interfaceHasCarrier(iface) {
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||
|
||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) {
|
||||
t.Fatal("ethtool -m should not be called without carrier")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
func mountSource(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func mountFSType(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceType(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceTransport(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
switch {
|
||||
case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
|
||||
return "ram"
|
||||
case strings.EqualFold(strings.TrimSpace(deviceType), "rom"):
|
||||
return "cdrom"
|
||||
case strings.EqualFold(strings.TrimSpace(transport), "usb"):
|
||||
return "usb"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"):
|
||||
return "cdrom"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
|
||||
return "disk"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
|
||||
@@ -12,11 +12,40 @@ import (
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
return strings.EqualFold(fsType, "tmpfs")
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
|
||||
28
audit/internal/platform/install_to_ram_test.go
Normal file
28
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
if len(gpuIndices) > 0 {
|
||||
return dedupeSortedIndices(gpuIndices), nil
|
||||
}
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
|
||||
@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
got, err := resolveDCGMGPUIndices(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "0,1,2"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "1,3"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -2,6 +2,13 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
type LiveBootSource struct {
|
||||
InRAM bool `json:"in_ram"`
|
||||
Kind string `json:"kind"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
|
||||
@@ -63,6 +63,10 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
streamSubscribedJob(w, r, j)
|
||||
}
|
||||
|
||||
func streamSubscribedJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
existing, ch := j.subscribe()
|
||||
for _, line := range existing {
|
||||
sseWrite(w, "", line)
|
||||
@@ -428,7 +432,6 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, entries)
|
||||
}
|
||||
|
||||
|
||||
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
@@ -523,9 +526,9 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
inRAM := h.opts.App.IsLiveMediaInRAM()
|
||||
status := h.opts.App.LiveBootSource()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
238
audit/internal/webui/kmsg_watcher.go
Normal file
238
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||
// while any SAT task is running, and flushed when all tasks complete.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeCount int // number of in-flight SAT tasks
|
||||
window *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
targets []string // SAT targets running concurrently
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
type kmsgEventKey struct {
|
||||
id string // BDF or device name
|
||||
category string
|
||||
}
|
||||
|
||||
type kmsgEvent struct {
|
||||
timestamp time.Time
|
||||
raw string
|
||||
ids []string // BDF addresses or device names extracted
|
||||
category string
|
||||
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
go w.run()
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||
// if this is the first task starting.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.activeCount == 0 {
|
||||
w.window = &kmsgWindow{
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
w.activeCount++
|
||||
if w.window != nil {
|
||||
w.window.targets = append(w.window.targets, target)
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||
// it flushes the accumulated events to the status DB.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
w.activeCount--
|
||||
var window *kmsgWindow
|
||||
if w.activeCount <= 0 {
|
||||
w.activeCount = 0
|
||||
window = w.window
|
||||
w.window = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
go w.flushWindow(window)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||
func normalizeBDF(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
return "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -29,6 +29,7 @@ a{color:var(--accent);text-decoration:none}
|
||||
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||
.nav{flex:1}
|
||||
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||
@@ -96,6 +97,10 @@ func layoutNav(active string, buildLabel string) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||
if strings.TrimSpace(buildLabel) == "" {
|
||||
buildLabel = "dev"
|
||||
}
|
||||
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||
b.WriteString(`<nav class="nav">`)
|
||||
for _, item := range items {
|
||||
cls := "nav-item"
|
||||
@@ -110,11 +115,7 @@ func layoutNav(active string, buildLabel string) string {
|
||||
cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
if strings.TrimSpace(buildLabel) == "" {
|
||||
buildLabel = "dev"
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`<div style="padding:12px 16px;border-top:1px solid rgba(255,255,255,.08);font-size:11px;color:rgba(255,255,255,.45)">Build ` + html.EscapeString(buildLabel) + `</div>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
}
|
||||
@@ -1013,7 +1014,7 @@ func renderNetwork() string {
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<div style="display:flex;justify-content:flex-end;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||
<div class="card-head">Output</div>
|
||||
@@ -1054,6 +1055,9 @@ function svcAction(name, action) {
|
||||
setTimeout(loadServices, 1000);
|
||||
});
|
||||
}
|
||||
function restartGPUDrivers() {
|
||||
svcAction('bee-nvidia', 'restart');
|
||||
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
@@ -1086,72 +1090,7 @@ func renderExport(exportDir string) string {
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||
<div id="usb-targets" style="margin-top:12px"></div>
|
||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
(function(){
|
||||
function usbRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||
document.getElementById('usb-targets').innerHTML = '';
|
||||
document.getElementById('usb-msg').textContent = '';
|
||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
||||
const st = document.getElementById('usb-status');
|
||||
const ct = document.getElementById('usb-targets');
|
||||
if (!targets || targets.length === 0) {
|
||||
st.textContent = 'No removable USB devices found.';
|
||||
return;
|
||||
}
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
||||
targets.map(t => {
|
||||
const dev = t.device || '';
|
||||
const label = t.label || '';
|
||||
const model = t.model || '';
|
||||
return '<tr>' +
|
||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||
'<td>'+t.fs_type+'</td>' +
|
||||
'<td>'+t.size+'</td>' +
|
||||
'<td>'+label+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||
'<td style="white-space:nowrap">' +
|
||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
|
||||
'</td></tr>';
|
||||
}).join('') + '</table>';
|
||||
}).catch(e => {
|
||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||
});
|
||||
}
|
||||
window.usbExport = function(type, target) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Exporting to ' + (target.device||'') + '...';
|
||||
fetch('/api/export/usb/'+type, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(target)
|
||||
}).then(r=>r.json()).then(d => {
|
||||
if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
|
||||
msg.style.color = 'var(--ok,green)';
|
||||
msg.textContent = d.message || 'Done.';
|
||||
}).catch(e => {
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: '+e;
|
||||
});
|
||||
};
|
||||
window.usbRefresh = usbRefresh;
|
||||
usbRefresh();
|
||||
})();
|
||||
</script>`
|
||||
` + renderUSBExportCard()
|
||||
}
|
||||
|
||||
func listExportFiles(exportDir string) ([]string, error) {
|
||||
@@ -1221,6 +1160,77 @@ window.supportBundleDownload = function() {
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
func renderUSBExportInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||
<div id="usb-targets" style="margin-top:12px"></div>
|
||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function usbRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||
document.getElementById('usb-targets').innerHTML = '';
|
||||
document.getElementById('usb-msg').textContent = '';
|
||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
||||
const st = document.getElementById('usb-status');
|
||||
const ct = document.getElementById('usb-targets');
|
||||
if (!targets || targets.length === 0) {
|
||||
st.textContent = 'No removable USB devices found.';
|
||||
return;
|
||||
}
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
||||
targets.map(t => {
|
||||
const dev = t.device || '';
|
||||
const label = t.label || '';
|
||||
const model = t.model || '';
|
||||
return '<tr>' +
|
||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||
'<td>'+t.fs_type+'</td>' +
|
||||
'<td>'+t.size+'</td>' +
|
||||
'<td>'+label+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||
'<td style="white-space:nowrap">' +
|
||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
|
||||
'</td></tr>';
|
||||
}).join('') + '</table>';
|
||||
}).catch(e => {
|
||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||
});
|
||||
}
|
||||
window.usbExport = function(type, target) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Exporting to ' + (target.device||'') + '...';
|
||||
fetch('/api/export/usb/'+type, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(target)
|
||||
}).then(r=>r.json()).then(d => {
|
||||
if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
|
||||
msg.style.color = 'var(--ok,green)';
|
||||
msg.textContent = d.message || 'Done.';
|
||||
}).catch(e => {
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: '+e;
|
||||
});
|
||||
};
|
||||
window.usbRefresh = usbRefresh;
|
||||
usbRefresh();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||
|
||||
func renderDisplayInline() string {
|
||||
@@ -1279,6 +1289,7 @@ func renderTools() string {
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
@@ -1290,8 +1301,18 @@ func renderTools() string {
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
if (d.in_ram) {
|
||||
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
@@ -1311,6 +1332,10 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
@@ -1559,23 +1584,41 @@ func renderTasks() string {
|
||||
<div class="card">
|
||||
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
|
||||
</div>
|
||||
<div id="task-log-section" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
<div id="task-log-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.58);z-index:120;align-items:center;justify-content:center;padding:16px">
|
||||
<div style="background:#fff;border-radius:6px;box-shadow:0 24px 60px rgba(0,0,0,.35);width:calc(100vw - 32px);max-width:1600px;height:calc(100vh - 32px);display:flex;flex-direction:column;overflow:hidden;position:relative">
|
||||
<div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
</div>
|
||||
<div class="card-body" style="padding:16px;flex:1;min-height:0;overflow:hidden">
|
||||
<div style="height:100%;min-height:0;overflow:auto">
|
||||
<div id="task-log-terminal" class="terminal" style="margin:0;max-height:none;overflow:visible"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body"><div id="task-log-terminal" class="terminal" style="max-height:500px"></div></div>
|
||||
</div>
|
||||
<script>
|
||||
var _taskLogES = null;
|
||||
var _taskRefreshTimer = null;
|
||||
var _tasksAll = [];
|
||||
var _taskPage = 1;
|
||||
var _taskPageSize = 50;
|
||||
var _taskLogID = '';
|
||||
|
||||
function loadTasks() {
|
||||
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
|
||||
if (!tasks || tasks.length === 0) {
|
||||
_tasksAll = Array.isArray(tasks) ? tasks : [];
|
||||
if (_tasksAll.length === 0) {
|
||||
_taskPage = 1;
|
||||
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
|
||||
syncTaskLogFromHash();
|
||||
return;
|
||||
}
|
||||
const rows = tasks.map(t => {
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
if (_taskPage > totalPages) _taskPage = totalPages;
|
||||
if (_taskPage < 1) _taskPage = 1;
|
||||
const start = (_taskPage - 1) * _taskPageSize;
|
||||
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
|
||||
const rows = pageTasks.map(t => {
|
||||
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||
@@ -1594,8 +1637,20 @@ function loadTasks() {
|
||||
'<td>'+t.priority+'</td>' +
|
||||
'<td>'+actions+'</td></tr>';
|
||||
}).join('');
|
||||
const showingFrom = start + 1;
|
||||
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
|
||||
const pager =
|
||||
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
|
||||
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
|
||||
'<div style="display:flex;align-items:center;gap:8px">' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
|
||||
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
|
||||
'</div>' +
|
||||
'</div>';
|
||||
document.getElementById('tasks-table').innerHTML =
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>';
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
|
||||
syncTaskLogFromHash();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1607,6 +1662,11 @@ function formatDurSec(sec) {
|
||||
const m = Math.floor(sec/60), ss = sec%60;
|
||||
return m+'m '+ss+'s';
|
||||
}
|
||||
function setTaskPage(page) {
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
_taskPage = Math.min(totalPages, Math.max(1, page));
|
||||
loadTasks();
|
||||
}
|
||||
|
||||
function cancelTask(id) {
|
||||
fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
|
||||
@@ -1633,24 +1693,59 @@ function setPriority(id, delta) {
|
||||
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
|
||||
.then(()=>loadTasks());
|
||||
}
|
||||
function resetTaskLog(term, text) {
|
||||
term.textContent = text ? text + '\n' : '';
|
||||
if (text) term.dataset.placeholder = '1';
|
||||
else delete term.dataset.placeholder;
|
||||
}
|
||||
function prependTaskLogLine(term, line) {
|
||||
if (term.dataset.placeholder === '1') {
|
||||
term.textContent = '';
|
||||
delete term.dataset.placeholder;
|
||||
}
|
||||
term.prepend(document.createTextNode(line + '\n'));
|
||||
term.scrollTop = 0;
|
||||
}
|
||||
function viewLog(id, name) {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES = null; }
|
||||
document.getElementById('task-log-section').style.display = '';
|
||||
_taskLogID = id;
|
||||
window.location.hash = id;
|
||||
document.getElementById('task-log-overlay').style.display = 'flex';
|
||||
document.getElementById('task-log-title').textContent = name;
|
||||
const term = document.getElementById('task-log-terminal');
|
||||
term.textContent = 'Connecting...\n';
|
||||
resetTaskLog(term, 'Connecting...');
|
||||
_taskLogES = new EventSource('/api/tasks/'+id+'/stream');
|
||||
_taskLogES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
_taskLogES.onopen = () => {
|
||||
if (term.dataset.placeholder === '1') resetTaskLog(term, 'Connected. Waiting for output...');
|
||||
};
|
||||
_taskLogES.onmessage = e => { prependTaskLogLine(term, e.data); };
|
||||
_taskLogES.addEventListener('done', e => {
|
||||
_taskLogES.close(); _taskLogES=null;
|
||||
term.textContent += (e.data ? '\nERROR: '+e.data : '\nDone.')+'\n';
|
||||
prependTaskLogLine(term, e.data ? 'ERROR: '+e.data : 'Done.');
|
||||
});
|
||||
}
|
||||
function syncTaskLogFromHash() {
|
||||
const id = (window.location.hash || '').replace(/^#/, '');
|
||||
if (!id || id === _taskLogID) return;
|
||||
const task = _tasksAll.find(t => t.id === id);
|
||||
if (!task) return;
|
||||
viewLog(task.id, task.name || task.id);
|
||||
}
|
||||
function closeTaskLog() {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES=null; }
|
||||
document.getElementById('task-log-section').style.display='none';
|
||||
_taskLogID = '';
|
||||
if (window.location.hash) history.replaceState(null, '', '/tasks');
|
||||
document.getElementById('task-log-overlay').style.display='none';
|
||||
}
|
||||
|
||||
document.getElementById('task-log-overlay').addEventListener('click', function(e) {
|
||||
if (e.target === this) closeTaskLog();
|
||||
});
|
||||
window.addEventListener('hashchange', syncTaskLogFromHash);
|
||||
window.addEventListener('keydown', function(e) {
|
||||
if (e.key === 'Escape' && document.getElementById('task-log-overlay').style.display !== 'none') closeTaskLog();
|
||||
});
|
||||
|
||||
loadTasks();
|
||||
_taskRefreshTimer = setInterval(loadTasks, 2000);
|
||||
</script>`
|
||||
|
||||
@@ -164,6 +164,8 @@ type handler struct {
|
||||
// pending network change (rollback on timeout)
|
||||
pendingNet *pendingNetChange
|
||||
pendingNetMu sync.Mutex
|
||||
// kmsg hardware error watcher
|
||||
kmsg *kmsgWatcher
|
||||
}
|
||||
|
||||
// NewHandler creates the HTTP mux with all routes.
|
||||
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||
if opts.App != nil {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
|
||||
|
||||
@@ -275,9 +275,10 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
Title: "Bee Hardware Audit",
|
||||
BuildLabel: "1.2.3",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
first := httptest.NewRecorder()
|
||||
@@ -292,6 +293,11 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||
}
|
||||
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||
}
|
||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||
t.Fatalf("first cache-control=%q", got)
|
||||
}
|
||||
@@ -359,6 +365,84 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
||||
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
||||
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
||||
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
kmsgWatcher *kmsgWatcher
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
@@ -291,6 +292,30 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
type taskStreamSource struct {
|
||||
status string
|
||||
errMsg string
|
||||
logPath string
|
||||
job *jobState
|
||||
}
|
||||
|
||||
func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID != id {
|
||||
continue
|
||||
}
|
||||
return taskStreamSource{
|
||||
status: t.Status,
|
||||
errMsg: t.ErrMsg,
|
||||
logPath: t.LogPath,
|
||||
job: t.job,
|
||||
}, true
|
||||
}
|
||||
return taskStreamSource{}, false
|
||||
}
|
||||
|
||||
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
@@ -305,7 +330,7 @@ func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||
// snapshot returns a copy of all tasks sorted for display with newest tasks first.
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
@@ -315,6 +340,9 @@ func (q *taskQueue) snapshot() []Task {
|
||||
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
|
||||
return out[i].CreatedAt.After(out[j].CreatedAt)
|
||||
}
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
@@ -323,7 +351,7 @@ func (q *taskQueue) snapshot() []Task {
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||
return out[i].Name < out[j].Name
|
||||
})
|
||||
return out
|
||||
}
|
||||
@@ -365,11 +393,13 @@ func (q *taskQueue) worker() {
|
||||
for {
|
||||
<-q.trigger
|
||||
setCPUGovernor("performance")
|
||||
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
q.mu.Lock()
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
q.mu.Unlock()
|
||||
break
|
||||
}
|
||||
now := time.Now()
|
||||
@@ -378,29 +408,58 @@ func (q *taskQueue) worker() {
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning { // not cancelled externally
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
}
|
||||
|
||||
q.runTask(t, j, taskCtx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning {
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
}
|
||||
}
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
|
||||
setCPUGovernor("powersave")
|
||||
}
|
||||
}
|
||||
@@ -591,6 +650,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil {
|
||||
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
}
|
||||
if db := q.statusDB(); db != nil {
|
||||
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
@@ -607,6 +679,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
return nil
|
||||
}
|
||||
return q.opts.App.StatusDB
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
@@ -750,21 +829,49 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var j *jobState
|
||||
for time.Now().Before(deadline) {
|
||||
if jj, ok := globalQueue.findJob(id); ok {
|
||||
j = jj
|
||||
break
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if j == nil {
|
||||
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||
src, ok := globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
http.Error(w, "task not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
if src.job != nil {
|
||||
streamJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
sseWrite(w, "", "Task is queued. Waiting for worker...")
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
src, ok = globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
sseWrite(w, "done", "task not found")
|
||||
return
|
||||
}
|
||||
if src.job != nil {
|
||||
streamSubscribedJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamSubscribedJob(w, r, j)
|
||||
return
|
||||
}
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
|
||||
@@ -2,6 +2,8 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -122,6 +124,130 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
tasks: []*Task{
|
||||
{
|
||||
ID: "old-running",
|
||||
Name: "Old Running",
|
||||
Status: TaskRunning,
|
||||
Priority: 10,
|
||||
CreatedAt: now.Add(-3 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "new-done",
|
||||
Name: "New Done",
|
||||
Status: TaskDone,
|
||||
Priority: 0,
|
||||
CreatedAt: now.Add(-1 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "mid-pending",
|
||||
Name: "Mid Pending",
|
||||
Status: TaskPending,
|
||||
Priority: 1,
|
||||
CreatedAt: now.Add(-2 * time.Minute),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := q.snapshot()
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||
}
|
||||
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
|
||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
|
||||
case "${tag}" in
|
||||
v*)
|
||||
printf '%s\n' "${tag#v}"
|
||||
;;
|
||||
"")
|
||||
printf 'dev\n'
|
||||
;;
|
||||
*)
|
||||
printf '%s\n' "${tag}"
|
||||
;;
|
||||
esac
|
||||
@@ -32,7 +32,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -54,15 +54,8 @@ resolve_audit_version() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
if [ -z "${tag}" ]; then
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
fi
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
audit/v*)
|
||||
echo "${tag#audit/v}"
|
||||
return 0
|
||||
;;
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||
Before=bee-web.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
After=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=300
|
||||
DURATION_SEC=300
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
TEST_SLICE_SECONDS=300
|
||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||
JOHN_BIN="${JOHN_DIR}/john"
|
||||
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
@@ -189,14 +190,51 @@ CHOSEN_FORMAT=$(choose_format) || {
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
run_john_loop() {
|
||||
opencl_id="$1"
|
||||
deadline="$2"
|
||||
round=0
|
||||
while :; do
|
||||
now=$(date +%s)
|
||||
remaining=$((deadline - now))
|
||||
if [ "${remaining}" -le 0 ]; then
|
||||
break
|
||||
fi
|
||||
round=$((round + 1))
|
||||
slice="${remaining}"
|
||||
if [ "${slice}" -gt "${TEST_SLICE_SECONDS}" ]; then
|
||||
slice="${TEST_SLICE_SECONDS}"
|
||||
fi
|
||||
echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
|
||||
./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" || return 1
|
||||
done
|
||||
}
|
||||
|
||||
PIDS=""
|
||||
cleanup() {
|
||||
rc=$?
|
||||
trap - EXIT INT TERM
|
||||
for pid in ${PIDS}; do
|
||||
kill "${pid}" 2>/dev/null || true
|
||||
done
|
||||
for pid in ${PIDS}; do
|
||||
wait "${pid}" 2>/dev/null || true
|
||||
done
|
||||
exit "${rc}"
|
||||
}
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
echo "target_seconds=${DURATION_SEC}"
|
||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||
_first=1
|
||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||
[ "${_first}" = "1" ] || sleep 3
|
||||
_first=0
|
||||
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
||||
PIDS="${PIDS} $!"
|
||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||
pid=$!
|
||||
PIDS="${PIDS} ${pid}"
|
||||
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
|
||||
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
||||
# "group is empty" even when GPUs and modules are present.
|
||||
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
|
||||
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "nv-hostengine already running — skipping"
|
||||
else
|
||||
if command -v pkill >/dev/null 2>&1; then
|
||||
pkill -x nv-hostengine >/dev/null 2>&1 || true
|
||||
tries=0
|
||||
while pgrep -x nv-hostengine >/dev/null 2>&1; do
|
||||
tries=$((tries + 1))
|
||||
if [ "${tries}" -ge 10 ]; then
|
||||
log "WARN: nv-hostengine is still running after restart request"
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "WARN: keeping existing nv-hostengine process"
|
||||
else
|
||||
log "nv-hostengine restarted"
|
||||
fi
|
||||
else
|
||||
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
|
||||
fi
|
||||
fi
|
||||
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
nv-hostengine
|
||||
log "nv-hostengine started"
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user