Files
bee/audit/internal/app/component_status_db.go
Michael Chus d52ec67f8f Stability hardening, build script fixes, GRUB bee logo
Stability hardening (webui/app):
- readFileLimited(): защита от OOM при чтении audit JSON (100 MB),
  component-status DB (10 MB) и лога задачи (50 MB)
- jobs.go: буферизованный лог задачи — один открытый fd на задачу
  вместо open/write/close на каждую строку (устраняет тысячи syscall/сек
  при GPU стресс-тестах)
- stability.go: экспоненциальный backoff в goRecoverLoop (2s→4s→…→60s),
  сброс при успешном прогоне >30s, счётчик перезапусков в slog
- kill_workers.go: таймаут 5s на скан /proc, warn при срабатывании
- bee-web.service: MemoryMax=3G — OOM killer защищён

Build script:
- build.sh: удалён блок генерации grub-pc/grub.cfg + live.cfg.in —
  мёртвый код с v8.25; grub-pc игнорируется live-build, а генерируемый
  live.cfg.in перезаписывал правильный статический файл устаревшей
  версией без tuning-параметров ядра и пунктов gsp-off/kms+gsp-off
- build.sh: dump_memtest_debug теперь логирует grub-efi/grub.cfg
  вместо grub-pc/grub.cfg (было всегда "missing")

GRUB:
- live-theme/bee-logo.png: логотип пчелы 400×400px на чёрном фоне
- live-theme/theme.txt: + image компонент по центру в верхней трети
  экрана; меню сдвинуто с 62% до 65%

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 13:08:31 +03:00

269 lines
7.6 KiB
Go

package app
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"sync"
"time"
)
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
// the component stays at the highest observed severity until explicitly reset.
type ComponentStatusDB struct {
path string
mu sync.Mutex
records map[string]*ComponentStatusRecord
}
// ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct {
ComponentKey string `json:"component_key"`
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
LastCheckedAt time.Time `json:"last_checked_at"`
LastChangedAt time.Time `json:"last_changed_at"`
ErrorSummary string `json:"error_summary,omitempty"`
History []ComponentStatusEntry `json:"history"`
}
// ComponentStatusEntry is one observation written to a component's history.
type ComponentStatusEntry struct {
At time.Time `json:"at"`
Status string `json:"status"`
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
Detail string `json:"detail,omitempty"`
}
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
db := &ComponentStatusDB{
path: path,
records: make(map[string]*ComponentStatusRecord),
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return nil, err
}
data, err := readFileLimited(path, 10<<20)
if err != nil && !os.IsNotExist(err) {
return nil, err
}
if len(data) > 0 {
var records []ComponentStatusRecord
if err := json.Unmarshal(data, &records); err == nil {
for i := range records {
db.records[records[i].ComponentKey] = &records[i]
}
}
}
return db, nil
}
// Record writes one observation for the given component key.
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
// status is "OK", "Warning", "Critical", or "Unknown".
// OK never downgrades an existing Warning or Critical status.
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
if db == nil || strings.TrimSpace(key) == "" {
return
}
db.mu.Lock()
defer db.mu.Unlock()
now := time.Now().UTC()
rec, exists := db.records[key]
if !exists {
rec = &ComponentStatusRecord{ComponentKey: key}
db.records[key] = rec
}
rec.LastCheckedAt = now
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
rec.History = append(rec.History, entry)
// Status merge: OK never downgrades Warning/Critical.
newSev := componentSeverity(status)
curSev := componentSeverity(rec.Status)
if newSev > curSev {
rec.Status = status
rec.LastChangedAt = now
rec.ErrorSummary = detail
} else if rec.Status == "" {
rec.Status = status
rec.LastChangedAt = now
}
_ = db.saveLocked()
}
// Get returns the current record for a component key.
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
if db == nil {
return ComponentStatusRecord{}, false
}
db.mu.Lock()
defer db.mu.Unlock()
r, ok := db.records[key]
if !ok {
return ComponentStatusRecord{}, false
}
return *r, true
}
// All returns a snapshot of all records.
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
if db == nil {
return nil
}
db.mu.Lock()
defer db.mu.Unlock()
out := make([]ComponentStatusRecord, 0, len(db.records))
for _, r := range db.records {
out = append(out, *r)
}
return out
}
func (db *ComponentStatusDB) saveLocked() error {
records := make([]ComponentStatusRecord, 0, len(db.records))
for _, r := range db.records {
records = append(records, *r)
}
data, err := json.MarshalIndent(records, "", " ")
if err != nil {
return err
}
return os.WriteFile(db.path, data, 0644)
}
// componentSeverity returns a numeric severity so higher values win.
func componentSeverity(status string) int {
switch strings.TrimSpace(status) {
case "Critical":
return 3
case "Warning":
return 2
case "OK":
return 1
default:
return 0
}
}
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
// and writes component status records to db for the given SAT target.
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
if db == nil || strings.TrimSpace(archivePath) == "" {
return
}
archivePath = extractArchivePath(archivePath)
if archivePath == "" {
return
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return
}
kv := parseSATKV(string(data))
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
if overall == "" {
return
}
source := "sat:" + target
dbStatus := satStatusToDBStatus(overall)
// Map SAT target to component keys.
switch target {
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
"amd-stress", "amd-mem", "amd-bandwidth":
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
case "memory", "memory-stress", "sat-stress":
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
case "cpu", "platform-stress":
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
case "storage":
// Try to record per-device if available in summary.
recordedAny := false
for key, val := range kv {
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
continue
}
base := strings.TrimSuffix(key, "_status")
idx := strings.Index(base, "_")
if idx <= 0 {
continue
}
devName := base[:idx]
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
recordedAny = true
}
if !recordedAny {
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
}
}
}
func satStatusToDBStatus(overall string) string {
switch overall {
case "OK":
return "OK"
case "FAILED":
return "Warning"
case "PARTIAL", "UNSUPPORTED":
return "Unknown"
default:
return "Unknown"
}
}
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
// "Archive written to /path/foo.tar.gz" or already a bare path.
func ExtractArchivePath(s string) string {
return extractArchivePath(s)
}
// ReadSATOverallStatus reads the overall_status value from the summary.txt
// file located in the run directory alongside archivePath.
// Returns "" if the file cannot be read.
func ReadSATOverallStatus(archivePath string) string {
if strings.TrimSpace(archivePath) == "" {
return ""
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return ""
}
kv := parseSATKV(string(data))
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
}
func extractArchivePath(s string) string {
s = strings.TrimSpace(s)
if strings.HasSuffix(s, ".tar.gz") {
parts := strings.Fields(s)
if len(parts) > 0 {
return parts[len(parts)-1]
}
}
return s
}
func parseSATKV(raw string) map[string]string {
kv := make(map[string]string)
for _, line := range strings.Split(raw, "\n") {
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
if ok {
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
}
}
return kv
}