269 lines
7.6 KiB
Go
269 lines
7.6 KiB
Go
package app
|
|
|
|
import (
|
|
"encoding/json"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
|
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
|
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
|
// the component stays at the highest observed severity until explicitly reset.
|
|
type ComponentStatusDB struct {
|
|
path string
|
|
mu sync.Mutex
|
|
records map[string]*ComponentStatusRecord
|
|
}
|
|
|
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
|
type ComponentStatusRecord struct {
|
|
ComponentKey string `json:"component_key"`
|
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
|
LastChangedAt time.Time `json:"last_changed_at"`
|
|
ErrorSummary string `json:"error_summary,omitempty"`
|
|
History []ComponentStatusEntry `json:"history"`
|
|
}
|
|
|
|
// ComponentStatusEntry is a single observation stored in a component's History.
type ComponentStatusEntry struct {
	// At is the time the observation was recorded.
	At time.Time `json:"at"`

	// Status is the status reported by this observation.
	Status string `json:"status"`

	// Source labels the reporter, e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg".
	Source string `json:"source"`

	// Detail is optional free-form text supplied with the observation.
	Detail string `json:"detail,omitempty"`
}
|
|
|
|
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
|
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
|
db := &ComponentStatusDB{
|
|
path: path,
|
|
records: make(map[string]*ComponentStatusRecord),
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
|
return nil, err
|
|
}
|
|
data, err := os.ReadFile(path)
|
|
if err != nil && !os.IsNotExist(err) {
|
|
return nil, err
|
|
}
|
|
if len(data) > 0 {
|
|
var records []ComponentStatusRecord
|
|
if err := json.Unmarshal(data, &records); err == nil {
|
|
for i := range records {
|
|
db.records[records[i].ComponentKey] = &records[i]
|
|
}
|
|
}
|
|
}
|
|
return db, nil
|
|
}
|
|
|
|
// Record writes one observation for the given component key.
|
|
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
|
// status is "OK", "Warning", "Critical", or "Unknown".
|
|
// OK never downgrades an existing Warning or Critical status.
|
|
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
|
if db == nil || strings.TrimSpace(key) == "" {
|
|
return
|
|
}
|
|
db.mu.Lock()
|
|
defer db.mu.Unlock()
|
|
|
|
now := time.Now().UTC()
|
|
rec, exists := db.records[key]
|
|
if !exists {
|
|
rec = &ComponentStatusRecord{ComponentKey: key}
|
|
db.records[key] = rec
|
|
}
|
|
rec.LastCheckedAt = now
|
|
|
|
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
|
rec.History = append(rec.History, entry)
|
|
|
|
// Status merge: OK never downgrades Warning/Critical.
|
|
newSev := componentSeverity(status)
|
|
curSev := componentSeverity(rec.Status)
|
|
if newSev > curSev {
|
|
rec.Status = status
|
|
rec.LastChangedAt = now
|
|
rec.ErrorSummary = detail
|
|
} else if rec.Status == "" {
|
|
rec.Status = status
|
|
rec.LastChangedAt = now
|
|
}
|
|
|
|
_ = db.saveLocked()
|
|
}
|
|
|
|
// Get returns the current record for a component key.
|
|
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
|
if db == nil {
|
|
return ComponentStatusRecord{}, false
|
|
}
|
|
db.mu.Lock()
|
|
defer db.mu.Unlock()
|
|
r, ok := db.records[key]
|
|
if !ok {
|
|
return ComponentStatusRecord{}, false
|
|
}
|
|
return *r, true
|
|
}
|
|
|
|
// All returns a snapshot of all records.
|
|
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
|
if db == nil {
|
|
return nil
|
|
}
|
|
db.mu.Lock()
|
|
defer db.mu.Unlock()
|
|
out := make([]ComponentStatusRecord, 0, len(db.records))
|
|
for _, r := range db.records {
|
|
out = append(out, *r)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (db *ComponentStatusDB) saveLocked() error {
|
|
records := make([]ComponentStatusRecord, 0, len(db.records))
|
|
for _, r := range db.records {
|
|
records = append(records, *r)
|
|
}
|
|
data, err := json.MarshalIndent(records, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return os.WriteFile(db.path, data, 0644)
|
|
}
|
|
|
|
// componentSeverity ranks a status string so that statuses can be compared:
// "Critical" is 3, "Warning" is 2, "OK" is 1 and anything else (including
// "Unknown" and the empty string) is 0. Surrounding whitespace is ignored;
// the match is case-sensitive.
func componentSeverity(status string) int {
	switch s := strings.TrimSpace(status); s {
	case "OK":
		return 1
	case "Warning":
		return 2
	case "Critical":
		return 3
	}
	return 0
}
|
|
|
|
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
|
// and writes component status records to db for the given SAT target.
|
|
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
|
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
|
if db == nil || strings.TrimSpace(archivePath) == "" {
|
|
return
|
|
}
|
|
archivePath = extractArchivePath(archivePath)
|
|
if archivePath == "" {
|
|
return
|
|
}
|
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
|
if err != nil {
|
|
return
|
|
}
|
|
kv := parseSATKV(string(data))
|
|
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
|
if overall == "" {
|
|
return
|
|
}
|
|
|
|
source := "sat:" + target
|
|
dbStatus := satStatusToDBStatus(overall)
|
|
|
|
// Map SAT target to component keys.
|
|
switch target {
|
|
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
|
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
|
"amd-stress", "amd-mem", "amd-bandwidth":
|
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
|
case "memory", "memory-stress", "sat-stress":
|
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
|
case "cpu", "platform-stress":
|
|
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
|
case "storage":
|
|
// Try to record per-device if available in summary.
|
|
recordedAny := false
|
|
for key, val := range kv {
|
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
|
continue
|
|
}
|
|
base := strings.TrimSuffix(key, "_status")
|
|
idx := strings.Index(base, "_")
|
|
if idx <= 0 {
|
|
continue
|
|
}
|
|
devName := base[:idx]
|
|
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
|
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
|
recordedAny = true
|
|
}
|
|
if !recordedAny {
|
|
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
|
}
|
|
}
|
|
}
|
|
|
|
// satStatusToDBStatus converts an upper-cased SAT status into a DB status:
// "OK" stays "OK", "FAILED" becomes "Warning", and everything else —
// including "PARTIAL" and "UNSUPPORTED" — becomes "Unknown". The comparison is
// exact, so callers are expected to upper-case and trim the value first.
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	// "PARTIAL", "UNSUPPORTED" and any unrecognised value.
	return "Unknown"
}
|
|
|
|
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
|
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
|
func ExtractArchivePath(s string) string {
|
|
return extractArchivePath(s)
|
|
}
|
|
|
|
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
|
// file located in the run directory alongside archivePath.
|
|
// Returns "" if the file cannot be read.
|
|
func ReadSATOverallStatus(archivePath string) string {
|
|
if strings.TrimSpace(archivePath) == "" {
|
|
return ""
|
|
}
|
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
kv := parseSATKV(string(data))
|
|
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
|
}
|
|
|
|
// extractArchivePath returns the .tar.gz path contained in s, which is either
// a bare path or a message ending in one, such as
// "Archive written to /path/foo.tar.gz". Surrounding whitespace is removed.
// When s does not end in ".tar.gz" it is returned trimmed but otherwise
// unchanged.
//
// Paths containing spaces are preserved in the two documented input forms:
// everything after the last "Archive written to " marker is the path, and an
// absolute path is returned as is. Any other single-line or multi-line message
// falls back to its last whitespace-separated token, so a relative bare path
// containing spaces is still reduced to its final token.
func extractArchivePath(s string) string {
	s = strings.TrimSpace(s)
	if !strings.HasSuffix(s, ".tar.gz") {
		return s
	}

	const marker = "Archive written to "
	if i := strings.LastIndex(s, marker); i >= 0 {
		rest := strings.TrimSpace(s[i+len(marker):])
		// A line break means more output followed the marker line; let the
		// token fallback pick the trailing path in that case.
		if !strings.ContainsAny(rest, "\r\n") {
			return rest
		}
	}
	if filepath.IsAbs(s) && !strings.ContainsAny(s, "\r\n") {
		// Already a bare absolute path; splitting on whitespace would truncate
		// a path that contains spaces.
		return s
	}
	if parts := strings.Fields(s); len(parts) > 0 {
		return parts[len(parts)-1]
	}
	return s
}
|
|
|
|
// parseSATKV parses "key=value" lines from raw into a map. Each line is split
// at its first "=", and both key and value are trimmed of surrounding
// whitespace. Lines without "=" are skipped, and a later occurrence of a key
// overwrites an earlier one. The returned map is never nil.
func parseSATKV(raw string) map[string]string {
	out := make(map[string]string)
	for _, ln := range strings.Split(raw, "\n") {
		ln = strings.TrimSpace(ln)
		eq := strings.IndexByte(ln, '=')
		if eq < 0 {
			continue
		}
		out[strings.TrimSpace(ln[:eq])] = strings.TrimSpace(ln[eq+1:])
	}
	return out
}
|