Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 |
@@ -1,5 +1,7 @@
|
|||||||
LISTEN ?= :8080
|
LISTEN ?= :8080
|
||||||
AUDIT_PATH ?=
|
AUDIT_PATH ?=
|
||||||
|
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||||
|
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||||
|
|
||||||
RUN_ARGS := web --listen $(LISTEN)
|
RUN_ARGS := web --listen $(LISTEN)
|
||||||
ifneq ($(AUDIT_PATH),)
|
ifneq ($(AUDIT_PATH),)
|
||||||
@@ -9,10 +11,10 @@ endif
|
|||||||
.PHONY: run build test
|
.PHONY: run build test
|
||||||
|
|
||||||
run:
|
run:
|
||||||
go run ./cmd/bee $(RUN_ARGS)
|
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||||
|
|
||||||
build:
|
build:
|
||||||
go build -o bee ./cmd/bee
|
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||||
|
|
||||||
test:
|
test:
|
||||||
go test ./...
|
go test ./...
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"runtime/debug"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -21,30 +20,7 @@ var Version = "dev"
|
|||||||
func buildLabel() string {
|
func buildLabel() string {
|
||||||
label := strings.TrimSpace(Version)
|
label := strings.TrimSpace(Version)
|
||||||
if label == "" {
|
if label == "" {
|
||||||
label = "dev"
|
return "dev"
|
||||||
}
|
|
||||||
if info, ok := debug.ReadBuildInfo(); ok {
|
|
||||||
var revision string
|
|
||||||
var modified bool
|
|
||||||
for _, setting := range info.Settings {
|
|
||||||
switch setting.Key {
|
|
||||||
case "vcs.revision":
|
|
||||||
revision = setting.Value
|
|
||||||
case "vcs.modified":
|
|
||||||
modified = setting.Value == "true"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if revision != "" {
|
|
||||||
short := revision
|
|
||||||
if len(short) > 12 {
|
|
||||||
short = short[:12]
|
|
||||||
}
|
|
||||||
label += " (" + short
|
|
||||||
if modified {
|
|
||||||
label += "+"
|
|
||||||
}
|
|
||||||
label += ")"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return label
|
return label
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunVersion(t *testing.T) {
|
func TestRunVersion(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
old := Version
|
old := Version
|
||||||
Version = "test-version"
|
Version = "test-version"
|
||||||
t.Cleanup(func() { Version = old })
|
t.Cleanup(func() { Version = old })
|
||||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||||
|
old := Version
|
||||||
|
Version = "1.2.3"
|
||||||
|
t.Cleanup(func() { Version = old })
|
||||||
|
|
||||||
|
if got := buildLabel(); got != "1.2.3" {
|
||||||
|
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunExportRequiresTarget(t *testing.T) {
|
func TestRunExportRequiresTarget(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,8 @@ type App struct {
|
|||||||
sat satRunner
|
sat satRunner
|
||||||
runtime runtimeChecker
|
runtime runtimeChecker
|
||||||
installer installer
|
installer installer
|
||||||
|
// StatusDB is the unified component health store (nil if unavailable).
|
||||||
|
StatusDB *ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type ActionResult struct {
|
type ActionResult struct {
|
||||||
@@ -136,7 +138,7 @@ type runtimeChecker interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(platform *platform.System) *App {
|
func New(platform *platform.System) *App {
|
||||||
return &App{
|
a := &App{
|
||||||
network: platform,
|
network: platform,
|
||||||
services: platform,
|
services: platform,
|
||||||
exports: platform,
|
exports: platform,
|
||||||
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
|
|||||||
runtime: platform,
|
runtime: platform,
|
||||||
installer: platform,
|
installer: platform,
|
||||||
}
|
}
|
||||||
|
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||||
|
a.StatusDB = db
|
||||||
|
}
|
||||||
|
return a
|
||||||
}
|
}
|
||||||
|
|
||||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||||
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||||
return json.MarshalIndent(snap, "", " ")
|
return json.MarshalIndent(snap, "", " ")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -754,6 +754,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, want := range []string{
|
||||||
|
"/system/ip-link.txt",
|
||||||
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/ethtool-info.txt",
|
||||||
|
"/system/ethtool-link.txt",
|
||||||
|
"/system/ethtool-module.txt",
|
||||||
|
"/system/mstflint-query.txt",
|
||||||
|
} {
|
||||||
|
var found bool
|
||||||
|
for _, name := range names {
|
||||||
|
if contains(name, want) {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var foundRaw bool
|
var foundRaw bool
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||||
|
|||||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||||
|
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||||
|
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||||
|
// the component stays at the highest observed severity until explicitly reset.
|
||||||
|
type ComponentStatusDB struct {
|
||||||
|
path string
|
||||||
|
mu sync.Mutex
|
||||||
|
records map[string]*ComponentStatusRecord
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
|
type ComponentStatusRecord struct {
|
||||||
|
ComponentKey string `json:"component_key"`
|
||||||
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
|
History []ComponentStatusEntry `json:"history"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusEntry is a single observation stored in the history of a
// component record.
type ComponentStatusEntry struct {
	// At is the time the observation was recorded.
	At time.Time `json:"at"`
	// Status is the status reported by this observation.
	Status string `json:"status"`
	// Source names the reporter, e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg".
	Source string `json:"source"`
	// Detail is optional free-form text describing the observation.
	Detail string `json:"detail,omitempty"`
}
|
||||||
|
|
||||||
|
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||||
|
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||||
|
db := &ComponentStatusDB{
|
||||||
|
path: path,
|
||||||
|
records: make(map[string]*ComponentStatusRecord),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(data) > 0 {
|
||||||
|
var records []ComponentStatusRecord
|
||||||
|
if err := json.Unmarshal(data, &records); err == nil {
|
||||||
|
for i := range records {
|
||||||
|
db.records[records[i].ComponentKey] = &records[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return db, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record writes one observation for the given component key.
|
||||||
|
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||||
|
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||||
|
// OK never downgrades an existing Warning or Critical status.
|
||||||
|
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||||
|
if db == nil || strings.TrimSpace(key) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
rec, exists := db.records[key]
|
||||||
|
if !exists {
|
||||||
|
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||||
|
db.records[key] = rec
|
||||||
|
}
|
||||||
|
rec.LastCheckedAt = now
|
||||||
|
|
||||||
|
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||||
|
rec.History = append(rec.History, entry)
|
||||||
|
|
||||||
|
// Status merge: OK never downgrades Warning/Critical.
|
||||||
|
newSev := componentSeverity(status)
|
||||||
|
curSev := componentSeverity(rec.Status)
|
||||||
|
if newSev > curSev {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
rec.ErrorSummary = detail
|
||||||
|
} else if rec.Status == "" {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = db.saveLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get returns the current record for a component key.
|
||||||
|
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||||
|
if db == nil {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
r, ok := db.records[key]
|
||||||
|
if !ok {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
return *r, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// All returns a snapshot of all records.
|
||||||
|
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||||
|
if db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
out = append(out, *r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *ComponentStatusDB) saveLocked() error {
|
||||||
|
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
records = append(records, *r)
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(records, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(db.path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// componentSeverity ranks a status string so that a larger number means a more
// severe status: "OK" is 1, "Warning" is 2, "Critical" is 3. Surrounding
// whitespace is ignored; any other value (including "Unknown" and "") ranks 0.
// The comparison is case-sensitive.
func componentSeverity(status string) int {
	// Ordered from least to most severe; the rank is the position plus one.
	ranked := [...]string{"OK", "Warning", "Critical"}

	name := strings.TrimSpace(status)
	for pos, candidate := range ranked {
		if candidate == name {
			return pos + 1
		}
	}
	return 0
}
|
||||||
|
|
||||||
|
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||||
|
// and writes component status records to db for the given SAT target.
|
||||||
|
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||||
|
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||||
|
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
archivePath = extractArchivePath(archivePath)
|
||||||
|
if archivePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
if overall == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
source := "sat:" + target
|
||||||
|
dbStatus := satStatusToDBStatus(overall)
|
||||||
|
|
||||||
|
// Map SAT target to component keys.
|
||||||
|
switch target {
|
||||||
|
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "memory", "memory-stress", "sat-stress":
|
||||||
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "cpu", "platform-stress":
|
||||||
|
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "storage":
|
||||||
|
// Try to record per-device if available in summary.
|
||||||
|
recordedAny := false
|
||||||
|
for key, val := range kv {
|
||||||
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
base := strings.TrimSuffix(key, "_status")
|
||||||
|
idx := strings.Index(base, "_")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devName := base[:idx]
|
||||||
|
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||||
|
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||||
|
recordedAny = true
|
||||||
|
}
|
||||||
|
if !recordedAny {
|
||||||
|
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// satStatusToDBStatus translates an upper-case SAT result into the status
// vocabulary of the component status DB: "OK" stays "OK", "FAILED" becomes
// "Warning", and everything else (including "PARTIAL" and "UNSUPPORTED")
// becomes "Unknown". The input is compared as given, without trimming.
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	return "Unknown"
}
|
||||||
|
|
||||||
|
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||||
|
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||||
|
func ExtractArchivePath(s string) string {
|
||||||
|
return extractArchivePath(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||||
|
// file located in the run directory alongside archivePath.
|
||||||
|
// Returns "" if the file cannot be read.
|
||||||
|
func ReadSATOverallStatus(archivePath string) string {
|
||||||
|
if strings.TrimSpace(archivePath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractArchivePath trims s and, when the result ends in ".tar.gz", returns
// only its last whitespace-separated field. This turns a message such as
// "Archive written to /path/foo.tar.gz" into "/path/foo.tar.gz". Input that
// does not end in ".tar.gz" is returned trimmed but otherwise unchanged.
func extractArchivePath(s string) string {
	trimmed := strings.TrimSpace(s)
	if !strings.HasSuffix(trimmed, ".tar.gz") {
		return trimmed
	}

	fields := strings.Fields(trimmed)
	if len(fields) == 0 {
		return trimmed
	}
	return fields[len(fields)-1]
}
|
||||||
|
|
||||||
|
// parseSATKV reads "key=value" lines from raw into a map. Each line is split
// at its first "="; key and value are trimmed of surrounding whitespace. Lines
// without "=" are skipped, and a later occurrence of a key replaces an earlier
// one. The returned map is never nil.
func parseSATKV(raw string) map[string]string {
	pairs := make(map[string]string)

	rest := raw
	for rest != "" {
		var line string
		line, rest, _ = strings.Cut(rest, "\n")

		name, value, found := strings.Cut(strings.TrimSpace(line), "=")
		if !found {
			continue
		}
		pairs[strings.TrimSpace(name)] = strings.TrimSpace(value)
	}
	return pairs
}
|
||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
|||||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||||
applyStorageSAT(snap.Storage, summary)
|
applyStorageSAT(snap.Storage, summary)
|
||||||
}
|
}
|
||||||
|
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||||
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||||
|
if snap == nil || db == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, rec := range db.All() {
|
||||||
|
key := rec.ComponentKey
|
||||||
|
status := dbStatusToSATStatus(rec.Status)
|
||||||
|
if status == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
detail := rec.ErrorSummary
|
||||||
|
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(key, "pcie:"):
|
||||||
|
bdf := strings.TrimPrefix(key, "pcie:")
|
||||||
|
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||||
|
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||||
|
if sanitizeBDFForLookup(bdf) == "" {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
normalized := sanitizeBDFForLookup(bdf)
|
||||||
|
for i := range snap.PCIeDevices {
|
||||||
|
if snap.PCIeDevices[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||||
|
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "storage:"):
|
||||||
|
devName := strings.TrimPrefix(key, "storage:")
|
||||||
|
if devName == "all" {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||||
|
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "memory:"):
|
||||||
|
for i := range snap.Memory {
|
||||||
|
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "cpu:"):
|
||||||
|
for i := range snap.CPUs {
|
||||||
|
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// dbStatusToSATStatus validates a status string read from the component status
// DB before it is handed to mergeComponentStatus.
//
// Surrounding whitespace is ignored. It returns the trimmed status when it is
// one of "OK", "Warning", "Critical" or "Unknown", and "" for anything else.
// The trimmed value is returned (rather than the raw input) so that a stored
// value like " Warning " reaches the caller in its canonical form.
func dbStatusToSATStatus(s string) string {
	trimmed := strings.TrimSpace(s)
	switch trimmed {
	case "OK", "Warning", "Critical", "Unknown":
		return trimmed
	default:
		return ""
	}
}
|
||||||
|
|
||||||
|
// sanitizeBDFForLookup brings a PCIe BDF address into a lower-case form that
// can be compared for equality. An address with a single ":" gets the "0000:"
// domain prefix ("c8:00.0" becomes "0000:c8:00.0"); other non-empty values are
// only trimmed and lower-cased. It returns "" for blank input, for the literal
// "gpu", and for values containing a space or tab.
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))

	switch {
	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
		return ""
	case strings.Count(addr, ":") == 1:
		return "0000:" + addr
	}
	return addr
}
|
||||||
|
|
||||||
func ptrString(v *string) string {
|
func ptrString(v *string) string {
|
||||||
if v == nil {
|
if v == nil {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
|||||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
|
|||||||
@@ -32,6 +32,8 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||||
|
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||||
|
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
@@ -47,6 +49,83 @@ for d in /sys/bus/pci/devices/*/; do
|
|||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -i "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -m "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v mstflint >/dev/null 2>&1; then
|
||||||
|
echo "mstflint not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$path/vendor" ] || continue
|
||||||
|
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||||
|
[ "$vendor" = "0x15b3" ] || continue
|
||||||
|
bdf=$(basename "$path")
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
mstflint -d "$bdf" q 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no Mellanox/NVIDIA networking devices found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,18 +2,21 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"context"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const mellanoxVendorID = 0x15b3
|
const mellanoxVendorID = 0x15b3
|
||||||
|
const nicProbeTimeout = 2 * time.Second
|
||||||
|
|
||||||
var (
|
var (
|
||||||
mstflintQuery = func(bdf string) (string, error) {
|
mstflintQuery = func(bdf string) (string, error) {
|
||||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -21,7 +24,7 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -29,6 +32,14 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
netIfacesByBDF = listNetIfacesByBDF
|
netIfacesByBDF = listNetIfacesByBDF
|
||||||
|
readNetCarrierFile = func(iface string) (string, error) {
|
||||||
|
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(raw)), nil
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||||
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
|
|||||||
}
|
}
|
||||||
return ifaces
|
return ifaces
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// commandOutputWithTimeout runs the named program with args and returns its
// standard output. The command is bound to a context that expires after
// timeout, so a probe that hangs does not block the caller indefinitely.
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
	deadlineCtx, stop := context.WithTimeout(context.Background(), timeout)
	defer stop()

	cmd := exec.CommandContext(deadlineCtx, name, args...)
	return cmd.Output()
}
|
||||||
|
|
||||||
|
func interfaceHasCarrier(iface string) bool {
|
||||||
|
raw, err := readNetCarrierFile(iface)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(raw) == "1"
|
||||||
|
}
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if interfaceHasCarrier(iface) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
enriched++
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
continue
|
enriched++
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
origReadMAC := readNetAddressFile
|
origReadMAC := readNetAddressFile
|
||||||
origEth := ethtoolInfoQuery
|
origEth := ethtoolInfoQuery
|
||||||
origModule := ethtoolModuleQuery
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
queryPCILSPCIDetail = origDetail
|
queryPCILSPCIDetail = origDetail
|
||||||
readPCIVPDFile = origVPD
|
readPCIVPDFile = origVPD
|
||||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
readNetAddressFile = origReadMAC
|
readNetAddressFile = origReadMAC
|
||||||
ethtoolInfoQuery = origEth
|
ethtoolInfoQuery = origEth
|
||||||
ethtoolModuleQuery = origModule
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
})
|
})
|
||||||
|
|
||||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return "aa:bb:cc:dd:ee:ff", nil
|
return "aa:bb:cc:dd:ee:ff", nil
|
||||||
}
|
}
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||||
|
|
||||||
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||||
|
origIfaces := netIfacesByBDF
|
||||||
|
origReadMAC := readNetAddressFile
|
||||||
|
origEth := ethtoolInfoQuery
|
||||||
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
|
t.Cleanup(func() {
|
||||||
|
netIfacesByBDF = origIfaces
|
||||||
|
readNetAddressFile = origReadMAC
|
||||||
|
ethtoolInfoQuery = origEth
|
||||||
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
|
})
|
||||||
|
|
||||||
|
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||||
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
|
ethtoolModuleQuery = func(string) (string, error) {
|
||||||
|
t.Fatal("ethtool -m should not be called without carrier")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "EthernetController"
|
||||||
|
bdf := "0000:18:00.0"
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
DeviceClass: &class,
|
||||||
|
BDF: &bdf,
|
||||||
|
}}
|
||||||
|
|
||||||
|
out := enrichPCIeWithNICTelemetry(devs)
|
||||||
|
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||||
|
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDBMValue(t *testing.T) {
|
func TestDBMValue(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
in string
|
in string
|
||||||
|
|||||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// ErrorPattern is one entry of the hardware-error pattern table: a regular
// expression applied to a single kernel log line plus the metadata needed to
// classify a match. New patterns are added by appending to
// HardwareErrorPatterns; no other code changes are needed.
type ErrorPattern struct {
	// Name is a short machine-readable label for logging and deduplication.
	Name string

	// Re is the compiled regular expression matched against a single kmsg line.
	Re *regexp.Regexp

	// Category groups related errors:
	// "gpu", "pcie", "storage", "mce", "memory", "cpu".
	Category string

	// Severity is "warning" for recoverable/uncertain faults and "critical"
	// for definitive failures.
	Severity string

	// BDFGroup is the 1-based capture group index holding a PCIe BDF address
	// (e.g. "0000:c8:00.0"). Zero means the pattern captures no BDF.
	BDFGroup int

	// DevGroup is the 1-based capture group index holding a device name
	// (e.g. "sda", "nvme0"). Zero means the pattern captures no device name.
	DevGroup int
}
|
||||||
|
|
||||||
|
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||||
|
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||||
|
var HardwareErrorPatterns = []ErrorPattern{
|
||||||
|
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "nvidia-rminitadapter",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-msi-fail",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-aer",
|
||||||
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-xid",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "pcie-aer",
|
||||||
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "pcie-uncorrectable",
|
||||||
|
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "pcie-link-down",
|
||||||
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "blk-io-error",
|
||||||
|
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvme-timeout",
|
||||||
|
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "scsi-failed",
|
||||||
|
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvme-reset",
|
||||||
|
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "mce-hardware-error",
|
||||||
|
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||||
|
Category: "mce",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "mce-corrected",
|
||||||
|
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||||
|
Category: "mce",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "edac-ue",
|
||||||
|
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||||
|
Category: "memory",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "edac-ce",
|
||||||
|
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||||
|
Category: "memory",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// mustPat compiles the pattern s with regexp.MustCompile, which panics when the
// expression is invalid.
func mustPat(s string) *regexp.Regexp {
	compiled := regexp.MustCompile(s)
	return compiled
}
|
||||||
@@ -286,7 +286,25 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
|
|||||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||||
// ctx cancellation kills the running job.
|
// ctx cancellation kills the running job.
|
||||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
return dedupeSortedIndices(gpuIndices), nil
|
||||||
|
}
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
return all, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
|||||||
@@ -162,6 +162,39 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices(nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "0,1,2"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "1,3"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
238
audit/internal/webui/kmsg_watcher.go
Normal file
238
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
	"bufio"
	"errors"
	"io"
	"log/slog"
	"os"
	"strings"
	"sync"
	"syscall"
	"time"

	"bee/audit/internal/app"
	"bee/audit/internal/platform"
)
|
||||||
|
|
||||||
|
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||||
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
|
type kmsgWatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
activeCount int // number of in-flight SAT tasks
|
||||||
|
window *kmsgWindow
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgWindow struct {
|
||||||
|
targets []string // SAT targets running concurrently
|
||||||
|
startedAt time.Time
|
||||||
|
seen map[kmsgEventKey]bool
|
||||||
|
events []kmsgEvent
|
||||||
|
}
|
||||||
|
|
||||||
|
// kmsgEventKey is the deduplication key for events inside a window.
type kmsgEventKey struct {
	// id is a BDF or device name.
	id string
	// category is the pattern category of the event.
	category string
}
|
||||||
|
|
||||||
|
// kmsgEvent is one kernel log line that matched a hardware error pattern.
type kmsgEvent struct {
	// timestamp is when the line was parsed.
	timestamp time.Time
	// raw is the message text of the line.
	raw string
	// ids holds the BDF addresses or device names extracted from the line.
	ids []string
	// category is the category of the pattern that matched.
	category string
}
|
||||||
|
|
||||||
|
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||||
|
return &kmsgWatcher{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
// start launches the background kmsg reading goroutine.
|
||||||
|
func (w *kmsgWatcher) start() {
|
||||||
|
go w.run()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) run() {
|
||||||
|
f, err := os.Open("/dev/kmsg")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||||
|
// Must be called with w.mu held.
|
||||||
|
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
key := kmsgEventKey{id: "", category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
key := kmsgEventKey{id: id, category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||||
|
// if this is the first task starting.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
if w.activeCount == 0 {
|
||||||
|
w.window = &kmsgWindow{
|
||||||
|
startedAt: time.Now(),
|
||||||
|
seen: make(map[kmsgEventKey]bool),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.activeCount++
|
||||||
|
if w.window != nil {
|
||||||
|
w.window.targets = append(w.window.targets, target)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||||
|
// it flushes the accumulated events to the status DB.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
w.activeCount--
|
||||||
|
var window *kmsgWindow
|
||||||
|
if w.activeCount <= 0 {
|
||||||
|
w.activeCount = 0
|
||||||
|
window = w.window
|
||||||
|
w.window = nil
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
if window == nil || len(window.events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
go w.flushWindow(window)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
source := "watchdog:kmsg"
|
||||||
|
// Collect unique component keys from events.
|
||||||
|
seen := map[string]string{} // componentKey → first raw line
|
||||||
|
for _, evt := range window.events {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
// MCE or un-identified error.
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu", "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, detail := range seen {
|
||||||
|
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||||
|
w.statusDB.Record(key, source, "Warning", detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||||
|
msg := raw
|
||||||
|
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||||
|
msg = strings.TrimSpace(raw[idx+1:])
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
m := p.Re.FindStringSubmatch(msg)
|
||||||
|
if m == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
evt := kmsgEvent{
|
||||||
|
timestamp: time.Now(),
|
||||||
|
raw: msg,
|
||||||
|
category: p.Category,
|
||||||
|
}
|
||||||
|
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||||
|
}
|
||||||
|
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||||
|
}
|
||||||
|
return evt, true
|
||||||
|
}
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
//
// The input is trimmed and lower-cased. A missing domain ("c8:00.0") is filled
// in as "0000". A missing function ("0000:c8:00") is filled in as ".0", so that
// addresses printed without a function still produce the documented form;
// function 0 is an assumption here — confirm for multi-function devices.
// Other strings, including the empty string, are returned as they are after
// trimming and lower-casing.
func normalizeBDF(bdf string) string {
	bdf = strings.ToLower(strings.TrimSpace(bdf))
	if strings.Count(bdf, ":") == 1 {
		bdf = "0000:" + bdf
	}
	if strings.Count(bdf, ":") == 2 && !strings.Contains(bdf, ".") {
		bdf += ".0"
	}
	return bdf
}
|
||||||
|
|
||||||
|
// truncate returns s unchanged when it is at most max bytes long. Otherwise it
// returns a prefix of at most max bytes followed by "...". The cut is moved
// back to the start of a UTF-8 sequence so a multi-byte character is never
// split, and a negative max is treated as zero.
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	if max < 0 {
		max = 0
	}
	// max < len(s) here, so s[max] is in range. Bytes of the form 10xxxxxx are
	// UTF-8 continuation bytes; step back until the cut sits before a lead byte.
	for max > 0 && s[max]&0xC0 == 0x80 {
		max--
	}
	return s[:max] + "..."
}
|
||||||
|
|
||||||
|
// isSATTarget reports whether target is one of the task targets that run
// hardware acceptance tests. The comparison is exact.
func isSATTarget(target string) bool {
	satTargets := [...]string{
		"nvidia", "nvidia-stress",
		"memory", "memory-stress",
		"storage", "cpu", "sat-stress",
		"amd", "amd-mem", "amd-bandwidth", "amd-stress",
		"platform-stress",
	}
	for _, known := range satTargets {
		if target == known {
			return true
		}
	}
	return false
}
|
||||||
@@ -29,6 +29,7 @@ a{color:var(--accent);text-decoration:none}
|
|||||||
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||||
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
.nav{flex:1}
|
.nav{flex:1}
|
||||||
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
@@ -96,6 +97,10 @@ func layoutNav(active string, buildLabel string) string {
|
|||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<aside class="sidebar">`)
|
b.WriteString(`<aside class="sidebar">`)
|
||||||
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||||
|
if strings.TrimSpace(buildLabel) == "" {
|
||||||
|
buildLabel = "dev"
|
||||||
|
}
|
||||||
|
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
b.WriteString(`<nav class="nav">`)
|
b.WriteString(`<nav class="nav">`)
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
cls := "nav-item"
|
cls := "nav-item"
|
||||||
@@ -110,11 +115,7 @@ func layoutNav(active string, buildLabel string) string {
|
|||||||
cls, item.href, item.label))
|
cls, item.href, item.label))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(buildLabel) == "" {
|
|
||||||
buildLabel = "dev"
|
|
||||||
}
|
|
||||||
b.WriteString(`</nav>`)
|
b.WriteString(`</nav>`)
|
||||||
b.WriteString(`<div style="padding:12px 16px;border-top:1px solid rgba(255,255,255,.08);font-size:11px;color:rgba(255,255,255,.45)">Build ` + html.EscapeString(buildLabel) + `</div>`)
|
|
||||||
b.WriteString(`</aside>`)
|
b.WriteString(`</aside>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
@@ -1089,72 +1090,7 @@ func renderExport(exportDir string) string {
|
|||||||
</div></div>
|
</div></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="card" style="margin-top:16px">
|
` + renderUSBExportCard()
|
||||||
<div class="card-head">Export to USB
|
|
||||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
|
||||||
</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
|
||||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
|
||||||
<div id="usb-targets" style="margin-top:12px"></div>
|
|
||||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<script>
|
|
||||||
(function(){
|
|
||||||
function usbRefresh() {
|
|
||||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
|
||||||
document.getElementById('usb-targets').innerHTML = '';
|
|
||||||
document.getElementById('usb-msg').textContent = '';
|
|
||||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
|
||||||
const st = document.getElementById('usb-status');
|
|
||||||
const ct = document.getElementById('usb-targets');
|
|
||||||
if (!targets || targets.length === 0) {
|
|
||||||
st.textContent = 'No removable USB devices found.';
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
st.textContent = targets.length + ' device(s) found:';
|
|
||||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
|
||||||
targets.map(t => {
|
|
||||||
const dev = t.device || '';
|
|
||||||
const label = t.label || '';
|
|
||||||
const model = t.model || '';
|
|
||||||
return '<tr>' +
|
|
||||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
|
||||||
'<td>'+t.fs_type+'</td>' +
|
|
||||||
'<td>'+t.size+'</td>' +
|
|
||||||
'<td>'+label+'</td>' +
|
|
||||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
|
||||||
'<td style="white-space:nowrap">' +
|
|
||||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
|
|
||||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
|
|
||||||
'</td></tr>';
|
|
||||||
}).join('') + '</table>';
|
|
||||||
}).catch(e => {
|
|
||||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
window.usbExport = function(type, target) {
|
|
||||||
const msg = document.getElementById('usb-msg');
|
|
||||||
msg.style.color = 'var(--muted)';
|
|
||||||
msg.textContent = 'Exporting to ' + (target.device||'') + '...';
|
|
||||||
fetch('/api/export/usb/'+type, {
|
|
||||||
method: 'POST',
|
|
||||||
headers: {'Content-Type':'application/json'},
|
|
||||||
body: JSON.stringify(target)
|
|
||||||
}).then(r=>r.json()).then(d => {
|
|
||||||
if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
|
|
||||||
msg.style.color = 'var(--ok,green)';
|
|
||||||
msg.textContent = d.message || 'Done.';
|
|
||||||
}).catch(e => {
|
|
||||||
msg.style.color = 'var(--err,red)';
|
|
||||||
msg.textContent = 'Error: '+e;
|
|
||||||
});
|
|
||||||
};
|
|
||||||
window.usbRefresh = usbRefresh;
|
|
||||||
usbRefresh();
|
|
||||||
})();
|
|
||||||
</script>`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func listExportFiles(exportDir string) ([]string, error) {
|
func listExportFiles(exportDir string) ([]string, error) {
|
||||||
@@ -1224,6 +1160,77 @@ window.supportBundleDownload = function() {
|
|||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderUSBExportCard() string {
|
||||||
|
return `<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">Export to USB
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderUSBExportInline returns the USB export widget: a status line, a target
// table and a message area, plus the script that drives them. The script
// fetches /api/export/usb to list devices and POSTs the chosen device as JSON
// to /api/export/usb/<type>. It publishes window.usbRefresh and
// window.usbExport and triggers one refresh as soon as it runs.
//
// Device fields come from the removable media (the volume label is chosen by
// whoever formatted the drive), so they are HTML-escaped before being placed in
// innerHTML. The buttons pass the row index rather than inlined JSON: JSON text
// contains double quotes, which would terminate the onclick="..." attribute.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
var usbTargets = [];
function usbEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function usbRefresh() {
  document.getElementById('usb-status').textContent = 'Scanning...';
  document.getElementById('usb-targets').innerHTML = '';
  document.getElementById('usb-msg').textContent = '';
  fetch('/api/export/usb').then(r=>r.json()).then(targets => {
    const st = document.getElementById('usb-status');
    const ct = document.getElementById('usb-targets');
    usbTargets = targets || [];
    if (usbTargets.length === 0) {
      st.textContent = 'No removable USB devices found.';
      return;
    }
    st.textContent = usbTargets.length + ' device(s) found:';
    ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
      usbTargets.map((t, i) => {
        return '<tr>' +
          '<td style="font-family:monospace">'+usbEsc(t.device)+'</td>' +
          '<td>'+usbEsc(t.fs_type)+'</td>' +
          '<td>'+usbEsc(t.size)+'</td>' +
          '<td>'+usbEsc(t.label)+'</td>' +
          '<td style="font-size:12px;color:var(--muted)">'+usbEsc(t.model)+'</td>' +
          '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+i+')">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+i+')">Support Bundle</button>' +
          '</td></tr>';
      }).join('') + '</table>';
  }).catch(e => {
    document.getElementById('usb-status').textContent = 'Error: ' + e;
  });
}
window.usbExport = function(type, target) {
  if (typeof target === 'number') { target = usbTargets[target]; }
  if (!target) { return; }
  const msg = document.getElementById('usb-msg');
  msg.style.color = 'var(--muted)';
  msg.textContent = 'Exporting to ' + (target.device||'') + '...';
  fetch('/api/export/usb/'+type, {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(target)
  }).then(r=>r.json()).then(d => {
    if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
    msg.style.color = 'var(--ok,green)';
    msg.textContent = d.message || 'Done.';
  }).catch(e => {
    msg.style.color = 'var(--err,red)';
    msg.textContent = 'Error: '+e;
  });
};
window.usbRefresh = usbRefresh;
usbRefresh();
})();
</script>`
}
|
||||||
|
|
||||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderDisplayInline() string {
|
func renderDisplayInline() string {
|
||||||
@@ -1325,6 +1332,10 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
` + renderSupportBundleInline() + `
|
` + renderSupportBundleInline() + `
|
||||||
|
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||||
|
` + renderUSBExportInline() + `
|
||||||
|
</div>
|
||||||
</div></div>
|
</div></div>
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||||
@@ -1578,7 +1589,11 @@ func renderTasks() string {
|
|||||||
<div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
|
<div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
|
||||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-body" style="padding:16px;flex:1;min-height:0"><div id="task-log-terminal" class="terminal" style="height:100%;max-height:none"></div></div>
|
<div class="card-body" style="padding:16px;flex:1;min-height:0;overflow:hidden">
|
||||||
|
<div style="height:100%;min-height:0;overflow:auto">
|
||||||
|
<div id="task-log-terminal" class="terminal" style="margin:0;max-height:none;overflow:visible"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
|
|||||||
@@ -164,6 +164,8 @@ type handler struct {
|
|||||||
// pending network change (rollback on timeout)
|
// pending network change (rollback on timeout)
|
||||||
pendingNet *pendingNetChange
|
pendingNet *pendingNetChange
|
||||||
pendingNetMu sync.Mutex
|
pendingNetMu sync.Mutex
|
||||||
|
// kmsg hardware error watcher
|
||||||
|
kmsg *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewHandler creates the HTTP mux with all routes.
|
// NewHandler creates the HTTP mux with all routes.
|
||||||
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
}
|
}
|
||||||
h.startMetricsCollector()
|
h.startMetricsCollector()
|
||||||
|
|
||||||
|
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||||
|
if opts.App != nil {
|
||||||
|
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||||
|
h.kmsg.start()
|
||||||
|
globalQueue.kmsgWatcher = h.kmsg
|
||||||
|
}
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
|
|
||||||
|
|||||||
@@ -275,9 +275,10 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{
|
handler := NewHandler(HandlerOptions{
|
||||||
Title: "Bee Hardware Audit",
|
Title: "Bee Hardware Audit",
|
||||||
AuditPath: path,
|
BuildLabel: "1.2.3",
|
||||||
ExportDir: exportDir,
|
AuditPath: path,
|
||||||
|
ExportDir: exportDir,
|
||||||
})
|
})
|
||||||
|
|
||||||
first := httptest.NewRecorder()
|
first := httptest.NewRecorder()
|
||||||
@@ -292,6 +293,11 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
|
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||||
|
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||||
|
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||||
|
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||||
|
}
|
||||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||||
t.Fatalf("first cache-control=%q", got)
|
t.Fatalf("first cache-control=%q", got)
|
||||||
}
|
}
|
||||||
@@ -395,6 +401,46 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
|
if !strings.Contains(body, `Export to USB`) {
|
||||||
|
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||||
|
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{
|
||||||
|
Title: "Bee Hardware Audit",
|
||||||
|
AuditPath: path,
|
||||||
|
ExportDir: exportDir,
|
||||||
|
})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
||||||
|
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
||||||
|
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
||||||
|
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||||
|
|||||||
@@ -173,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
|
|||||||
|
|
||||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||||
type taskQueue struct {
|
type taskQueue struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
tasks []*Task
|
tasks []*Task
|
||||||
trigger chan struct{}
|
trigger chan struct{}
|
||||||
opts *HandlerOptions // set by startWorker
|
opts *HandlerOptions // set by startWorker
|
||||||
statePath string
|
statePath string
|
||||||
logsDir string
|
logsDir string
|
||||||
started bool
|
started bool
|
||||||
|
kmsgWatcher *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||||
@@ -392,11 +393,13 @@ func (q *taskQueue) worker() {
|
|||||||
for {
|
for {
|
||||||
<-q.trigger
|
<-q.trigger
|
||||||
setCPUGovernor("performance")
|
setCPUGovernor("performance")
|
||||||
|
|
||||||
|
// Drain all pending tasks and start them in parallel.
|
||||||
|
q.mu.Lock()
|
||||||
|
var batch []*Task
|
||||||
for {
|
for {
|
||||||
q.mu.Lock()
|
|
||||||
t := q.nextPending()
|
t := q.nextPending()
|
||||||
if t == nil {
|
if t == nil {
|
||||||
q.mu.Unlock()
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
@@ -405,29 +408,58 @@ func (q *taskQueue) worker() {
|
|||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
j := newTaskJobState(t.LogPath)
|
j := newTaskJobState(t.LogPath)
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
j.cancel = cancel
|
|
||||||
t.job = j
|
t.job = j
|
||||||
|
batch = append(batch, t)
|
||||||
|
}
|
||||||
|
if len(batch) > 0 {
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
}
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
q.runTask(t, j, ctx)
|
var wg sync.WaitGroup
|
||||||
|
for _, t := range batch {
|
||||||
|
t := t
|
||||||
|
j := t.job
|
||||||
|
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = taskCancel
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
q.mu.Lock()
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
now2 := time.Now()
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
t.DoneAt = &now2
|
|
||||||
if t.Status == TaskRunning { // not cancelled externally
|
|
||||||
if j.err != "" {
|
|
||||||
t.Status = TaskFailed
|
|
||||||
t.ErrMsg = j.err
|
|
||||||
} else {
|
|
||||||
t.Status = TaskDone
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
q.runTask(t, j, taskCtx)
|
||||||
|
|
||||||
|
if q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
now2 := time.Now()
|
||||||
|
t.DoneAt = &now2
|
||||||
|
if t.Status == TaskRunning {
|
||||||
|
if j.err != "" {
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = j.err
|
||||||
|
} else {
|
||||||
|
t.Status = TaskDone
|
||||||
|
}
|
||||||
|
}
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(batch) > 0 {
|
||||||
|
q.mu.Lock()
|
||||||
q.prune()
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
setCPUGovernor("powersave")
|
setCPUGovernor("powersave")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -618,6 +650,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||||
|
if archive != "" {
|
||||||
|
archivePath := app.ExtractArchivePath(archive)
|
||||||
|
if err == nil {
|
||||||
|
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||||
|
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if db := q.statusDB(); db != nil {
|
||||||
|
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
j.append("Aborted.")
|
j.append("Aborted.")
|
||||||
@@ -634,6 +679,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
j.finish("")
|
j.finish("")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||||
|
if q.opts == nil || q.opts.App == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return q.opts.App.StatusDB
|
||||||
|
}
|
||||||
|
|
||||||
func splitLines(s string) []string {
|
func splitLines(s string) []string {
|
||||||
var out []string
|
var out []string
|
||||||
for _, l := range splitNL(s) {
|
for _, l := range splitNL(s) {
|
||||||
|
|||||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
|
|
||||||
|
case "${tag}" in
|
||||||
|
v*)
|
||||||
|
printf '%s\n' "${tag#v}"
|
||||||
|
;;
|
||||||
|
"")
|
||||||
|
printf 'dev\n'
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
printf '%s\n' "${tag}"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -54,15 +54,8 @@ resolve_audit_version() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
if [ -z "${tag}" ]; then
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
|
||||||
fi
|
|
||||||
case "${tag}" in
|
case "${tag}" in
|
||||||
audit/v*)
|
|
||||||
echo "${tag#audit/v}"
|
|
||||||
return 0
|
|
||||||
;;
|
|
||||||
v*)
|
v*)
|
||||||
echo "${tag#v}"
|
echo "${tag#v}"
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||||
Before=bee-web.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
After=bee-audit.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=300
|
DURATION_SEC=300
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
|
TEST_SLICE_SECONDS=300
|
||||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||||
JOHN_BIN="${JOHN_DIR}/john"
|
JOHN_BIN="${JOHN_DIR}/john"
|
||||||
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||||
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
|
|||||||
|
|
||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -189,14 +190,51 @@ CHOSEN_FORMAT=$(choose_format) || {
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "format=${CHOSEN_FORMAT}"
|
run_john_loop() {
|
||||||
|
opencl_id="$1"
|
||||||
|
deadline="$2"
|
||||||
|
round=0
|
||||||
|
while :; do
|
||||||
|
now=$(date +%s)
|
||||||
|
remaining=$((deadline - now))
|
||||||
|
if [ "${remaining}" -le 0 ]; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
round=$((round + 1))
|
||||||
|
slice="${remaining}"
|
||||||
|
if [ "${slice}" -gt "${TEST_SLICE_SECONDS}" ]; then
|
||||||
|
slice="${TEST_SLICE_SECONDS}"
|
||||||
|
fi
|
||||||
|
echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
|
||||||
|
./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" || return 1
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
PIDS=""
|
PIDS=""
|
||||||
|
cleanup() {
|
||||||
|
rc=$?
|
||||||
|
trap - EXIT INT TERM
|
||||||
|
for pid in ${PIDS}; do
|
||||||
|
kill "${pid}" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
for pid in ${PIDS}; do
|
||||||
|
wait "${pid}" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
exit "${rc}"
|
||||||
|
}
|
||||||
|
trap cleanup EXIT INT TERM
|
||||||
|
|
||||||
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
|
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||||
_first=1
|
_first=1
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||||
PIDS="${PIDS} $!"
|
pid=$!
|
||||||
|
PIDS="${PIDS} ${pid}"
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -128,13 +128,32 @@ ldconfig 2>/dev/null || true
|
|||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# "group is empty" even when GPUs and modules are present.
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
|
||||||
if command -v nv-hostengine >/dev/null 2>&1; then
|
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
log "nv-hostengine already running — skipping"
|
if command -v pkill >/dev/null 2>&1; then
|
||||||
else
|
pkill -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
tries=0
|
||||||
|
while pgrep -x nv-hostengine >/dev/null 2>&1; do
|
||||||
|
tries=$((tries + 1))
|
||||||
|
if [ "${tries}" -ge 10 ]; then
|
||||||
|
log "WARN: nv-hostengine is still running after restart request"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
log "WARN: keeping existing nv-hostengine process"
|
||||||
|
else
|
||||||
|
log "nv-hostengine restarted"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
nv-hostengine
|
nv-hostengine
|
||||||
log "nv-hostengine started"
|
log "nv-hostengine started"
|
||||||
fi
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user