feat(watchdog): hardware error monitor + unified component status store
- Add platform/error_patterns.go: pluggable table of kernel log patterns (NVIDIA/GPU, PCIe AER, storage I/O, MCE, EDAC) — extend by adding one struct - Add app/component_status_db.go: persistent JSON store (component-status.json) keyed by "pcie:BDF", "storage:dev", "cpu:all", "memory:all"; OK never downgrades Warning or Critical - Add webui/kmsg_watcher.go: goroutine reads /dev/kmsg during SAT tasks, writes Warning to DB for matched hardware errors - Fix task status: overall_status=FAILED in summary.txt now marks task failed - Audit routine overlays component DB statuses into bee-audit.json on every read Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,8 @@ type App struct {
|
|||||||
sat satRunner
|
sat satRunner
|
||||||
runtime runtimeChecker
|
runtime runtimeChecker
|
||||||
installer installer
|
installer installer
|
||||||
|
// StatusDB is the unified component health store (nil if unavailable).
|
||||||
|
StatusDB *ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type ActionResult struct {
|
type ActionResult struct {
|
||||||
@@ -136,7 +138,7 @@ type runtimeChecker interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(platform *platform.System) *App {
|
func New(platform *platform.System) *App {
|
||||||
return &App{
|
a := &App{
|
||||||
network: platform,
|
network: platform,
|
||||||
services: platform,
|
services: platform,
|
||||||
exports: platform,
|
exports: platform,
|
||||||
@@ -145,6 +147,10 @@ func New(platform *platform.System) *App {
|
|||||||
runtime: platform,
|
runtime: platform,
|
||||||
installer: platform,
|
installer: platform,
|
||||||
}
|
}
|
||||||
|
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||||
|
a.StatusDB = db
|
||||||
|
}
|
||||||
|
return a
|
||||||
}
|
}
|
||||||
|
|
||||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||||
@@ -154,7 +160,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||||
return json.MarshalIndent(snap, "", " ")
|
return json.MarshalIndent(snap, "", " ")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
|
|||||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||||
|
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||||
|
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||||
|
// the component stays at the highest observed severity until explicitly reset.
|
||||||
|
type ComponentStatusDB struct {
|
||||||
|
path string
|
||||||
|
mu sync.Mutex
|
||||||
|
records map[string]*ComponentStatusRecord
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
|
type ComponentStatusRecord struct {
|
||||||
|
ComponentKey string `json:"component_key"`
|
||||||
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
|
History []ComponentStatusEntry `json:"history"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusEntry is one observation written to a component's history.
|
||||||
|
type ComponentStatusEntry struct {
|
||||||
|
At time.Time `json:"at"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||||
|
Detail string `json:"detail,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||||
|
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||||
|
db := &ComponentStatusDB{
|
||||||
|
path: path,
|
||||||
|
records: make(map[string]*ComponentStatusRecord),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(data) > 0 {
|
||||||
|
var records []ComponentStatusRecord
|
||||||
|
if err := json.Unmarshal(data, &records); err == nil {
|
||||||
|
for i := range records {
|
||||||
|
db.records[records[i].ComponentKey] = &records[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return db, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record writes one observation for the given component key.
|
||||||
|
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||||
|
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||||
|
// OK never downgrades an existing Warning or Critical status.
|
||||||
|
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||||
|
if db == nil || strings.TrimSpace(key) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
rec, exists := db.records[key]
|
||||||
|
if !exists {
|
||||||
|
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||||
|
db.records[key] = rec
|
||||||
|
}
|
||||||
|
rec.LastCheckedAt = now
|
||||||
|
|
||||||
|
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||||
|
rec.History = append(rec.History, entry)
|
||||||
|
|
||||||
|
// Status merge: OK never downgrades Warning/Critical.
|
||||||
|
newSev := componentSeverity(status)
|
||||||
|
curSev := componentSeverity(rec.Status)
|
||||||
|
if newSev > curSev {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
rec.ErrorSummary = detail
|
||||||
|
} else if rec.Status == "" {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = db.saveLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get returns the current record for a component key.
|
||||||
|
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||||
|
if db == nil {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
r, ok := db.records[key]
|
||||||
|
if !ok {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
return *r, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// All returns a snapshot of all records.
|
||||||
|
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||||
|
if db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
out = append(out, *r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *ComponentStatusDB) saveLocked() error {
|
||||||
|
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
records = append(records, *r)
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(records, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(db.path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// componentSeverity returns a numeric severity so higher values win.
|
||||||
|
func componentSeverity(status string) int {
|
||||||
|
switch strings.TrimSpace(status) {
|
||||||
|
case "Critical":
|
||||||
|
return 3
|
||||||
|
case "Warning":
|
||||||
|
return 2
|
||||||
|
case "OK":
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||||
|
// and writes component status records to db for the given SAT target.
|
||||||
|
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||||
|
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||||
|
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
archivePath = extractArchivePath(archivePath)
|
||||||
|
if archivePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
if overall == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
source := "sat:" + target
|
||||||
|
dbStatus := satStatusToDBStatus(overall)
|
||||||
|
|
||||||
|
// Map SAT target to component keys.
|
||||||
|
switch target {
|
||||||
|
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "memory", "memory-stress", "sat-stress":
|
||||||
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "cpu", "platform-stress":
|
||||||
|
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "storage":
|
||||||
|
// Try to record per-device if available in summary.
|
||||||
|
recordedAny := false
|
||||||
|
for key, val := range kv {
|
||||||
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
base := strings.TrimSuffix(key, "_status")
|
||||||
|
idx := strings.Index(base, "_")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devName := base[:idx]
|
||||||
|
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||||
|
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||||
|
recordedAny = true
|
||||||
|
}
|
||||||
|
if !recordedAny {
|
||||||
|
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func satStatusToDBStatus(overall string) string {
|
||||||
|
switch overall {
|
||||||
|
case "OK":
|
||||||
|
return "OK"
|
||||||
|
case "FAILED":
|
||||||
|
return "Warning"
|
||||||
|
case "PARTIAL", "UNSUPPORTED":
|
||||||
|
return "Unknown"
|
||||||
|
default:
|
||||||
|
return "Unknown"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||||
|
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||||
|
func ExtractArchivePath(s string) string {
|
||||||
|
return extractArchivePath(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||||
|
// file located in the run directory alongside archivePath.
|
||||||
|
// Returns "" if the file cannot be read.
|
||||||
|
func ReadSATOverallStatus(archivePath string) string {
|
||||||
|
if strings.TrimSpace(archivePath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractArchivePath(s string) string {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
if strings.HasSuffix(s, ".tar.gz") {
|
||||||
|
parts := strings.Fields(s)
|
||||||
|
if len(parts) > 0 {
|
||||||
|
return parts[len(parts)-1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSATKV(raw string) map[string]string {
|
||||||
|
kv := make(map[string]string)
|
||||||
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
|
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||||
|
if ok {
|
||||||
|
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return kv
|
||||||
|
}
|
||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
|||||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||||
applyStorageSAT(snap.Storage, summary)
|
applyStorageSAT(snap.Storage, summary)
|
||||||
}
|
}
|
||||||
|
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||||
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||||
|
if snap == nil || db == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, rec := range db.All() {
|
||||||
|
key := rec.ComponentKey
|
||||||
|
status := dbStatusToSATStatus(rec.Status)
|
||||||
|
if status == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
detail := rec.ErrorSummary
|
||||||
|
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(key, "pcie:"):
|
||||||
|
bdf := strings.TrimPrefix(key, "pcie:")
|
||||||
|
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||||
|
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||||
|
if sanitizeBDFForLookup(bdf) == "" {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
normalized := sanitizeBDFForLookup(bdf)
|
||||||
|
for i := range snap.PCIeDevices {
|
||||||
|
if snap.PCIeDevices[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||||
|
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "storage:"):
|
||||||
|
devName := strings.TrimPrefix(key, "storage:")
|
||||||
|
if devName == "all" {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||||
|
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "memory:"):
|
||||||
|
for i := range snap.Memory {
|
||||||
|
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "cpu:"):
|
||||||
|
for i := range snap.CPUs {
|
||||||
|
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||||
|
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||||
|
func dbStatusToSATStatus(s string) string {
|
||||||
|
switch strings.TrimSpace(s) {
|
||||||
|
case "OK", "Warning", "Critical", "Unknown":
|
||||||
|
return s
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||||
|
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||||
|
func sanitizeBDFForLookup(bdf string) string {
|
||||||
|
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||||
|
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if strings.Count(bdf, ":") == 1 {
|
||||||
|
bdf = "0000:" + bdf
|
||||||
|
}
|
||||||
|
return bdf
|
||||||
|
}
|
||||||
|
|
||||||
func ptrString(v *string) string {
|
func ptrString(v *string) string {
|
||||||
if v == nil {
|
if v == nil {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
|||||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
|
|||||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||||
|
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||||
|
type ErrorPattern struct {
|
||||||
|
// Name is a short machine-readable label for logging and deduplication.
|
||||||
|
Name string
|
||||||
|
// Re is the compiled regular expression matched against a single kmsg line.
|
||||||
|
Re *regexp.Regexp
|
||||||
|
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||||
|
Category string
|
||||||
|
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||||
|
Severity string
|
||||||
|
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||||
|
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||||
|
BDFGroup int
|
||||||
|
// DevGroup is the capture group index (1-based) that contains a device name
|
||||||
|
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||||
|
DevGroup int
|
||||||
|
}
|
||||||
|
|
||||||
|
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||||
|
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||||
|
var HardwareErrorPatterns = []ErrorPattern{
|
||||||
|
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "nvidia-rminitadapter",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-msi-fail",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-aer",
|
||||||
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvidia-xid",
|
||||||
|
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "pcie-aer",
|
||||||
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "pcie-uncorrectable",
|
||||||
|
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "pcie-link-down",
|
||||||
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "blk-io-error",
|
||||||
|
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvme-timeout",
|
||||||
|
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "scsi-failed",
|
||||||
|
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "nvme-reset",
|
||||||
|
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||||
|
Category: "storage",
|
||||||
|
Severity: "warning",
|
||||||
|
DevGroup: 1,
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "mce-hardware-error",
|
||||||
|
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||||
|
Category: "mce",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "mce-corrected",
|
||||||
|
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||||
|
Category: "mce",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
|
||||||
|
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
Name: "edac-ue",
|
||||||
|
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||||
|
Category: "memory",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "edac-ce",
|
||||||
|
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||||
|
Category: "memory",
|
||||||
|
Severity: "warning",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
func mustPat(s string) *regexp.Regexp {
|
||||||
|
return regexp.MustCompile(s)
|
||||||
|
}
|
||||||
230
audit/internal/webui/kmsg_watcher.go
Normal file
230
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||||
|
// During an active SAT task window it records matching lines; on task finish
|
||||||
|
// it writes Warning status records to the component status DB.
|
||||||
|
type kmsgWatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
activeWindow *kmsgWindow
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgWindow struct {
|
||||||
|
taskID string
|
||||||
|
target string
|
||||||
|
startedAt time.Time
|
||||||
|
seen map[kmsgEventKey]bool
|
||||||
|
events []kmsgEvent
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgEventKey struct {
|
||||||
|
id string // BDF or device name
|
||||||
|
category string
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgEvent struct {
|
||||||
|
timestamp time.Time
|
||||||
|
raw string
|
||||||
|
ids []string // BDF addresses or device names extracted
|
||||||
|
category string
|
||||||
|
}
|
||||||
|
|
||||||
|
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||||
|
return &kmsgWatcher{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
// start launches the background kmsg reading goroutine.
|
||||||
|
func (w *kmsgWatcher) start() {
|
||||||
|
go w.run()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) run() {
|
||||||
|
f, err := os.Open("/dev/kmsg")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.activeWindow != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||||
|
// Must be called with w.mu held.
|
||||||
|
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
// Events without a device ID (e.g. MCE) — deduplicate by category.
|
||||||
|
key := kmsgEventKey{id: "", category: evt.category}
|
||||||
|
if !w.activeWindow.seen[key] {
|
||||||
|
w.activeWindow.seen[key] = true
|
||||||
|
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
key := kmsgEventKey{id: id, category: evt.category}
|
||||||
|
if !w.activeWindow.seen[key] {
|
||||||
|
w.activeWindow.seen[key] = true
|
||||||
|
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskStarted opens a new event window for the given SAT task.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.activeWindow = &kmsgWindow{
|
||||||
|
taskID: taskID,
|
||||||
|
target: target,
|
||||||
|
startedAt: time.Now(),
|
||||||
|
seen: make(map[kmsgEventKey]bool),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskFinished closes the event window and asynchronously writes status records.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
window := w.activeWindow
|
||||||
|
if window != nil && window.taskID == taskID {
|
||||||
|
w.activeWindow = nil
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
if window == nil || len(window.events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
go w.flushWindow(window)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
source := "watchdog:kmsg"
|
||||||
|
// Collect unique component keys from events.
|
||||||
|
seen := map[string]string{} // componentKey → first raw line
|
||||||
|
for _, evt := range window.events {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
// MCE or un-identified error.
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu", "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, detail := range seen {
|
||||||
|
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
|
||||||
|
w.statusDB.Record(key, source, "Warning", detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||||
|
msg := raw
|
||||||
|
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||||
|
msg = strings.TrimSpace(raw[idx+1:])
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
m := p.Re.FindStringSubmatch(msg)
|
||||||
|
if m == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
evt := kmsgEvent{
|
||||||
|
timestamp: time.Now(),
|
||||||
|
raw: msg,
|
||||||
|
category: p.Category,
|
||||||
|
}
|
||||||
|
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||||
|
}
|
||||||
|
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||||
|
}
|
||||||
|
return evt, true
|
||||||
|
}
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||||
|
func normalizeBDF(bdf string) string {
|
||||||
|
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||||
|
if strings.Count(bdf, ":") == 1 {
|
||||||
|
return "0000:" + bdf
|
||||||
|
}
|
||||||
|
return bdf
|
||||||
|
}
|
||||||
|
|
||||||
|
func truncate(s string, max int) string {
|
||||||
|
if len(s) <= max {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return s[:max] + "..."
|
||||||
|
}
|
||||||
|
|
||||||
|
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||||
|
func isSATTarget(target string) bool {
|
||||||
|
switch target {
|
||||||
|
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||||
|
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||||
|
"platform-stress":
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
@@ -164,6 +164,8 @@ type handler struct {
|
|||||||
// pending network change (rollback on timeout)
|
// pending network change (rollback on timeout)
|
||||||
pendingNet *pendingNetChange
|
pendingNet *pendingNetChange
|
||||||
pendingNetMu sync.Mutex
|
pendingNetMu sync.Mutex
|
||||||
|
// kmsg hardware error watcher
|
||||||
|
kmsg *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewHandler creates the HTTP mux with all routes.
|
// NewHandler creates the HTTP mux with all routes.
|
||||||
@@ -203,6 +205,13 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
}
|
}
|
||||||
h.startMetricsCollector()
|
h.startMetricsCollector()
|
||||||
|
|
||||||
|
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||||
|
if opts.App != nil {
|
||||||
|
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||||
|
h.kmsg.start()
|
||||||
|
globalQueue.kmsgWatcher = h.kmsg
|
||||||
|
}
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
|
|
||||||
|
|||||||
@@ -180,6 +180,7 @@ type taskQueue struct {
|
|||||||
statePath string
|
statePath string
|
||||||
logsDir string
|
logsDir string
|
||||||
started bool
|
started bool
|
||||||
|
kmsgWatcher *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||||
@@ -411,8 +412,16 @@ func (q *taskQueue) worker() {
|
|||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
|
}
|
||||||
|
|
||||||
q.runTask(t, j, ctx)
|
q.runTask(t, j, ctx)
|
||||||
|
|
||||||
|
if q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
|
||||||
q.mu.Lock()
|
q.mu.Lock()
|
||||||
now2 := time.Now()
|
now2 := time.Now()
|
||||||
t.DoneAt = &now2
|
t.DoneAt = &now2
|
||||||
@@ -618,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||||
|
if archive != "" {
|
||||||
|
archivePath := app.ExtractArchivePath(archive)
|
||||||
|
if err == nil {
|
||||||
|
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||||
|
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if db := q.statusDB(); db != nil {
|
||||||
|
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
j.append("Aborted.")
|
j.append("Aborted.")
|
||||||
@@ -634,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
j.finish("")
|
j.finish("")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||||
|
if q.opts == nil || q.opts.App == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return q.opts.App.StatusDB
|
||||||
|
}
|
||||||
|
|
||||||
func splitLines(s string) []string {
|
func splitLines(s string) []string {
|
||||||
var out []string
|
var out []string
|
||||||
for _, l := range splitNL(s) {
|
for _, l := range splitNL(s) {
|
||||||
|
|||||||
Reference in New Issue
Block a user